diff options
author | Alexandre Flament <alex@al-f.net> | 2022-04-15 13:37:27 +0200 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarit.de> | 2022-04-16 09:45:57 +0200 |
commit | 4224607c62d14a457ef3f46bf83dca6f271a592d (patch) | |
tree | 3c3531f2e9e9b5129df5202414754459c57ada91 /searx/utils.py | |
parent | 1a82e79b5018b578d984bebf667643b9e334a034 (diff) | |
download | searxng-4224607c62d14a457ef3f46bf83dca6f271a592d.tar.gz searxng-4224607c62d14a457ef3f46bf83dca6f271a592d.zip |
searx.utils.html_to_text: replace <br/> by a space
Diffstat (limited to 'searx/utils.py')
-rw-r--r-- | searx/utils.py | 4 |
1 files changed, 3 insertions, 1 deletions
diff --git a/searx/utils.py b/searx/utils.py
index 43a7578d7..ffc9a39d6 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -88,6 +88,8 @@ class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://b
 
     def handle_starttag(self, tag, attrs):
         self.tags.append(tag)
+        if tag == 'br':
+            self.result.append(' ')
 
     def handle_endtag(self, tag):
         if not self.tags:
@@ -142,7 +144,7 @@ def html_to_text(html_str: str) -> str:
         >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
         'Example'
     """
-    html_str = html_str.replace('\n', ' ')
+    html_str = html_str.replace('\n', ' ').replace('\r', ' ')
     html_str = ' '.join(html_str.split())
     s = _HTMLTextExtractor()
     try: