diff options
-rw-r--r-- | searx/utils.py | 14 | ||||
-rw-r--r-- | tests/unit/test_utils.py | 1 |
2 files changed, 14 insertions, 1 deletions
diff --git a/searx/utils.py b/searx/utils.py index 7f6017617..c009c3144 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -15,6 +15,7 @@ from numbers import Number from os.path import splitext, join from random import choice from html.parser import HTMLParser +from html import escape from urllib.parse import urljoin, urlparse from markdown_it import MarkdownIt @@ -88,7 +89,7 @@ class _HTMLTextExtractorException(Exception): """Internal exception raised when the HTML is invalid""" -class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://bugs.python.org/issue31844) +class _HTMLTextExtractor(HTMLParser): """Internal class to extract text from HTML""" def __init__(self): @@ -137,6 +138,11 @@ class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://b def get_text(self): return ''.join(self.result).strip() + def error(self, message): + # error handle is needed in <py3.10 + # https://github.com/python/cpython/pull/8562/files + raise AssertionError(message) + def html_to_text(html_str: str) -> str: """Extract text from a HTML string @@ -153,12 +159,18 @@ def html_to_text(html_str: str) -> str: >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>') 'Example' + + >>> html_to_text(r'regexp: (?<![a-zA-Z]') + 'regexp: (?<![a-zA-Z]' """ html_str = html_str.replace('\n', ' ').replace('\r', ' ') html_str = ' '.join(html_str.split()) s = _HTMLTextExtractor() try: s.feed(html_str) + except AssertionError: + s = _HTMLTextExtractor() + s.feed(escape(html_str, quote=True)) except _HTMLTextExtractorException: logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str) return s.get_text() diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 2ad4593a1..6398e63f0 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -41,6 +41,7 @@ class TestUtils(SearxTestCase): self.assertIsInstance(utils.html_to_text(html_str), str) self.assertIsNotNone(utils.html_to_text(html_str)) self.assertEqual(utils.html_to_text(html_str), "Test text") + self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]") def test_extract_text(self): html_str = """ |