summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- searx/utils.py | 11
-rw-r--r-- tests/unit/test_utils.py | 9
2 files changed, 18 insertions, 2 deletions
diff --git a/searx/utils.py b/searx/utils.py
index d8842c65f..0eb9f6a34 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -77,6 +77,10 @@ def highlight_content(content, query):
return content
+class HTMLTextExtractorException(Exception):
+ pass
+
+
class HTMLTextExtractor(HTMLParser):
def __init__(self):
@@ -92,7 +96,7 @@ class HTMLTextExtractor(HTMLParser):
return
if tag != self.tags[-1]:
- raise Exception("invalid html")
+ raise HTMLTextExtractorException()
self.tags.pop()
@@ -128,7 +132,10 @@ def html_to_text(html):
html = html.replace('\n', ' ')
html = ' '.join(html.split())
s = HTMLTextExtractor()
- s.feed(html)
+ try:
+ s.feed(html)
+ except HTMLTextExtractorException:
+ logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
return s.get_text()
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 5f98511c3..08b759542 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -52,6 +52,10 @@ class TestUtils(SearxTestCase):
self.assertIsNotNone(utils.html_to_text(html))
self.assertEqual(utils.html_to_text(html), "Test text")
+ def test_html_to_text_invalid(self):
+ html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
+ self.assertEqual(utils.html_to_text(html), "Lorem ipsum")
+
def test_prettify_url(self):
data = (('https://searx.me/', 'https://searx.me/'),
('https://searx.me/ű', 'https://searx.me/ű'),
@@ -116,6 +120,11 @@ class TestHTMLTextExtractor(SearxTestCase):
self.html_text_extractor.handle_entityref(entity)
self.assertIn(entity, self.html_text_extractor.result)
+ def test_invalid_html(self):
+ text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
+ with self.assertRaises(utils.HTMLTextExtractorException):
+ self.html_text_extractor.feed(text)
+
class TestUnicodeWriter(SearxTestCase):