diff options
author | Markus Heiser <markus.heiser@darmarit.de> | 2023-10-18 14:34:18 +0200 |
---|---|---|
committer | Alexandre Flament <alex@al-f.net> | 2023-10-22 10:35:02 +0200 |
commit | ef56e1d68447742e0035ef8d75634c51e955a59c (patch) | |
tree | ed858dc06efc914a3532e1fec545a817d70b2971 /searx/utils.py | |
parent | 01b5b9cb8ecc5db2a56e111fd306c1bb1a407709 (diff) | |
download | searxng-ef56e1d68447742e0035ef8d75634c51e955a59c.tar.gz searxng-ef56e1d68447742e0035ef8d75634c51e955a59c.zip |
[fix] HTMLParser: undocumented not implemented method
In Python versions <py3.10 there is an issue with the undocumented method
HTMLParser.error() [1][2], which was deprecated in Python 3.4 and removed
in Python 3.5.
To be compatible with newer versions (>=py3.10), an error method is implemented
which raises an AssertionError, as the newer Python versions do [3].
[1] https://github.com/python/cpython/issues/76025
[2] https://bugs.python.org/issue31844
[3] https://github.com/python/cpython/pull/8562
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/utils.py')
-rw-r--r-- | searx/utils.py | 14 |
1 files changed, 13 insertions, 1 deletions
diff --git a/searx/utils.py b/searx/utils.py index 7f6017617..c009c3144 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -15,6 +15,7 @@ from numbers import Number from os.path import splitext, join from random import choice from html.parser import HTMLParser +from html import escape from urllib.parse import urljoin, urlparse from markdown_it import MarkdownIt @@ -88,7 +89,7 @@ class _HTMLTextExtractorException(Exception): """Internal exception raised when the HTML is invalid""" -class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://bugs.python.org/issue31844) +class _HTMLTextExtractor(HTMLParser): """Internal class to extract text from HTML""" def __init__(self): @@ -137,6 +138,11 @@ class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://b def get_text(self): return ''.join(self.result).strip() + def error(self, message): + # error handle is needed in <py3.10 + # https://github.com/python/cpython/pull/8562/files + raise AssertionError(message) + def html_to_text(html_str: str) -> str: """Extract text from a HTML string @@ -153,12 +159,18 @@ def html_to_text(html_str: str) -> str: >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>') 'Example' + + >>> html_to_text(r'regexp: (?<![a-zA-Z]') + 'regexp: (?<![a-zA-Z]' """ html_str = html_str.replace('\n', ' ').replace('\r', ' ') html_str = ' '.join(html_str.split()) s = _HTMLTextExtractor() try: s.feed(html_str) + except AssertionError: + s = _HTMLTextExtractor() + s.feed(escape(html_str, quote=True)) except _HTMLTextExtractorException: logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str) return s.get_text() |