summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarit.de>2023-10-18 14:34:18 +0200
committerAlexandre Flament <alex@al-f.net>2023-10-22 10:35:02 +0200
commitef56e1d68447742e0035ef8d75634c51e955a59c (patch)
treeed858dc06efc914a3532e1fec545a817d70b2971
parent01b5b9cb8ecc5db2a56e111fd306c1bb1a407709 (diff)
downloadsearxng-ef56e1d68447742e0035ef8d75634c51e955a59c.tar.gz
searxng-ef56e1d68447742e0035ef8d75634c51e955a59c.zip
[fix] HTMLParser: undocumented not implemented method
In Python versions <py3.10 there is an issue with the undocumented method HTMLParser.error() [1][2], which was deprecated in Python 3.4 and removed in Python 3.5. To be compatible with higher versions (>=py3.10), an error method is implemented which raises an AssertionError exception like the higher Python versions do [3]. [1] https://github.com/python/cpython/issues/76025 [2] https://bugs.python.org/issue31844 [3] https://github.com/python/cpython/pull/8562 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
-rw-r--r--searx/utils.py14
-rw-r--r--tests/unit/test_utils.py1
2 files changed, 14 insertions, 1 deletions
diff --git a/searx/utils.py b/searx/utils.py
index 7f6017617..c009c3144 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -15,6 +15,7 @@ from numbers import Number
from os.path import splitext, join
from random import choice
from html.parser import HTMLParser
+from html import escape
from urllib.parse import urljoin, urlparse
from markdown_it import MarkdownIt
@@ -88,7 +89,7 @@ class _HTMLTextExtractorException(Exception):
"""Internal exception raised when the HTML is invalid"""
-class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://bugs.python.org/issue31844)
+class _HTMLTextExtractor(HTMLParser):
"""Internal class to extract text from HTML"""
def __init__(self):
@@ -137,6 +138,11 @@ class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://b
def get_text(self):
return ''.join(self.result).strip()
+ def error(self, message):
+ # error handle is needed in <py3.10
+ # https://github.com/python/cpython/pull/8562/files
+ raise AssertionError(message)
+
def html_to_text(html_str: str) -> str:
"""Extract text from a HTML string
@@ -153,12 +159,18 @@ def html_to_text(html_str: str) -> str:
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
'Example'
+
+ >>> html_to_text(r'regexp: (?<![a-zA-Z]')
+ 'regexp: (?<![a-zA-Z]'
"""
html_str = html_str.replace('\n', ' ').replace('\r', ' ')
html_str = ' '.join(html_str.split())
s = _HTMLTextExtractor()
try:
s.feed(html_str)
+ except AssertionError:
+ s = _HTMLTextExtractor()
+ s.feed(escape(html_str, quote=True))
except _HTMLTextExtractorException:
logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
return s.get_text()
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 2ad4593a1..6398e63f0 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -41,6 +41,7 @@ class TestUtils(SearxTestCase):
self.assertIsInstance(utils.html_to_text(html_str), str)
self.assertIsNotNone(utils.html_to_text(html_str))
self.assertEqual(utils.html_to_text(html_str), "Test text")
+ self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
def test_extract_text(self):
html_str = """