summaryrefslogtreecommitdiff
path: root/searx/utils.py
diff options
context:
space:
mode:
author: Alexandre Flament <alex@al-f.net> 2020-09-11 10:23:56 +0200
committer: Alexandre Flament <alex@al-f.net> 2020-09-13 10:28:11 +0200
commit: 6deb85072ad00b85d2b3c1981c37aeb75ef68cc7 (patch)
tree: a167d983a0006b1dddea5b7e6025d18b772dfc10 /searx/utils.py
parent: ae07f4a211ecba0331bcab5903e3263c646f8bdb (diff)
download: searxng-6deb85072ad00b85d2b3c1981c37aeb75ef68cc7.tar.gz
searxng-6deb85072ad00b85d2b3c1981c37aeb75ef68cc7.zip
[fix] searx.utils.HTMLTextExtractor: don't raise an Exception on invalid HTML
Close #2188
Diffstat (limited to 'searx/utils.py')
-rw-r--r-- searx/utils.py 11
1 files changed, 9 insertions, 2 deletions
diff --git a/searx/utils.py b/searx/utils.py
index d8842c65f..0eb9f6a34 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -77,6 +77,10 @@ def highlight_content(content, query):
return content
+class HTMLTextExtractorException(Exception):
+ pass
+
+
class HTMLTextExtractor(HTMLParser):
def __init__(self):
@@ -92,7 +96,7 @@ class HTMLTextExtractor(HTMLParser):
return
if tag != self.tags[-1]:
- raise Exception("invalid html")
+ raise HTMLTextExtractorException()
self.tags.pop()
@@ -128,7 +132,10 @@ def html_to_text(html):
html = html.replace('\n', ' ')
html = ' '.join(html.split())
s = HTMLTextExtractor()
- s.feed(html)
+ try:
+ s.feed(html)
+ except HTMLTextExtractorException:
+ logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
return s.get_text()