diff options
author | Markus Heiser <markus.heiser@darmarit.de> | 2023-10-18 14:34:18 +0200 |
---|---|---|
committer | Alexandre Flament <alex@al-f.net> | 2023-10-22 10:35:02 +0200 |
commit | ef56e1d68447742e0035ef8d75634c51e955a59c (patch) | |
tree | ed858dc06efc914a3532e1fec545a817d70b2971 | |
parent | 01b5b9cb8ecc5db2a56e111fd306c1bb1a407709 (diff) | |
download | searxng-ef56e1d68447742e0035ef8d75634c51e955a59c.tar.gz searxng-ef56e1d68447742e0035ef8d75634c51e955a59c.zip |
[fix] HTMLParser: undocumented, unimplemented method
In Python versions <py3.10 there is an issue with the undocumented method
HTMLParser.error() [1][2], which was deprecated in Python 3.4 and removed
in Python 3.5.
To be compatible with newer versions (>=py3.10), an error method is implemented
that raises an AssertionError, as the newer Python versions do [3].
[1] https://github.com/python/cpython/issues/76025
[2] https://bugs.python.org/issue31844
[3] https://github.com/python/cpython/pull/8562
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
-rw-r--r-- | searx/utils.py | 14 | ||||
-rw-r--r-- | tests/unit/test_utils.py | 1 |
2 files changed, 14 insertions, 1 deletions
diff --git a/searx/utils.py b/searx/utils.py index 7f6017617..c009c3144 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -15,6 +15,7 @@ from numbers import Number from os.path import splitext, join from random import choice from html.parser import HTMLParser +from html import escape from urllib.parse import urljoin, urlparse from markdown_it import MarkdownIt @@ -88,7 +89,7 @@ class _HTMLTextExtractorException(Exception): """Internal exception raised when the HTML is invalid""" -class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://bugs.python.org/issue31844) +class _HTMLTextExtractor(HTMLParser): """Internal class to extract text from HTML""" def __init__(self): @@ -137,6 +138,11 @@ class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://b def get_text(self): return ''.join(self.result).strip() + def error(self, message): + # error handle is needed in <py3.10 + # https://github.com/python/cpython/pull/8562/files + raise AssertionError(message) + def html_to_text(html_str: str) -> str: """Extract text from a HTML string @@ -153,12 +159,18 @@ def html_to_text(html_str: str) -> str: >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>') 'Example' + + >>> html_to_text(r'regexp: (?<![a-zA-Z]') + 'regexp: (?<![a-zA-Z]' """ html_str = html_str.replace('\n', ' ').replace('\r', ' ') html_str = ' '.join(html_str.split()) s = _HTMLTextExtractor() try: s.feed(html_str) + except AssertionError: + s = _HTMLTextExtractor() + s.feed(escape(html_str, quote=True)) except _HTMLTextExtractorException: logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str) return s.get_text() diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 2ad4593a1..6398e63f0 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -41,6 +41,7 @@ class TestUtils(SearxTestCase): self.assertIsInstance(utils.html_to_text(html_str), str) self.assertIsNotNone(utils.html_to_text(html_str)) 
self.assertEqual(utils.html_to_text(html_str), "Test text") + self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]") def test_extract_text(self): html_str = """ |