summaryrefslogtreecommitdiff
path: root/searx/utils.py
diff options
context:
space:
mode:
author: Alexandre Flament <alex@al-f.net> 2022-04-15 13:37:27 +0200
committer: Markus Heiser <markus.heiser@darmarit.de> 2022-04-16 09:45:57 +0200
commit: 4224607c62d14a457ef3f46bf83dca6f271a592d (patch)
tree: 3c3531f2e9e9b5129df5202414754459c57ada91 /searx/utils.py
parent: 1a82e79b5018b578d984bebf667643b9e334a034 (diff)
downloadsearxng-4224607c62d14a457ef3f46bf83dca6f271a592d.tar.gz
searxng-4224607c62d14a457ef3f46bf83dca6f271a592d.zip
searx.utils.html_to_text: replace <br/> by a space
Diffstat (limited to 'searx/utils.py')
-rw-r--r-- searx/utils.py 4
1 files changed, 3 insertions, 1 deletions
diff --git a/searx/utils.py b/searx/utils.py
index 43a7578d7..ffc9a39d6 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -88,6 +88,8 @@ class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://b
def handle_starttag(self, tag, attrs):
self.tags.append(tag)
+ if tag == 'br':
+ self.result.append(' ')
def handle_endtag(self, tag):
if not self.tags:
@@ -142,7 +144,7 @@ def html_to_text(html_str: str) -> str:
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
'Example'
"""
- html_str = html_str.replace('\n', ' ')
+ html_str = html_str.replace('\n', ' ').replace('\r', ' ')
html_str = ' '.join(html_str.split())
s = _HTMLTextExtractor()
try: