Merge pull request #2190 from dalf/fix-htmltextextractor

[fix] searx.utils.HTMLTextExtractor: invalid HTML no longer raises an Exception
author: Alexandre Flament <alex@al-f.net> 2020-09-19 15:59:03 +0200
committer: GitHub <noreply@github.com> 2020-09-19 15:59:03 +0200
commit: 530fc4bda7ef03b0b80e1587699c8ff29fd79aea (patch)
tree: a167d983a0006b1dddea5b7e6025d18b772dfc10 /searx/utils.py
parent: ae07f4a211ecba0331bcab5903e3263c646f8bdb (diff)
parent: 6deb85072ad00b85d2b3c1981c37aeb75ef68cc7 (diff)
download: searxng-530fc4bda7ef03b0b80e1587699c8ff29fd79aea.tar.gz searxng-530fc4bda7ef03b0b80e1587699c8ff29fd79aea.zip
1 files changed, 9 insertions, 2 deletions
diff --git a/searx/utils.py b/searx/utils.py
index d8842c65f..0eb9f6a34 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -77,6 +77,10 @@ def highlight_content(content, query):
     return content
 
 
+class HTMLTextExtractorException(Exception):
+    pass
+
+
 class HTMLTextExtractor(HTMLParser):
 
     def __init__(self):
@@ -92,7 +96,7 @@ class HTMLTextExtractor(HTMLParser):
             return
 
         if tag != self.tags[-1]:
-            raise Exception("invalid html")
+            raise HTMLTextExtractorException()
 
         self.tags.pop()
 
@@ -128,7 +132,10 @@ def html_to_text(html):
     html = html.replace('\n', ' ')
     html = ' '.join(html.split())
     s = HTMLTextExtractor()
-    s.feed(html)
+    try:
+        s.feed(html)
+    except HTMLTextExtractorException:
+        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
     return s.get_text()
author	Alexandre Flament <alex@al-f.net>	2020-09-19 15:59:03 +0200
committer	GitHub <noreply@github.com>	2020-09-19 15:59:03 +0200
commit	530fc4bda7ef03b0b80e1587699c8ff29fd79aea (patch)
tree	a167d983a0006b1dddea5b7e6025d18b772dfc10 /searx/utils.py
parent	ae07f4a211ecba0331bcab5903e3263c646f8bdb (diff)
parent	6deb85072ad00b85d2b3c1981c37aeb75ef68cc7 (diff)
download	searxng-530fc4bda7ef03b0b80e1587699c8ff29fd79aea.tar.gz searxng-530fc4bda7ef03b0b80e1587699c8ff29fd79aea.zip