summary | refs | log | tree | commit | diff
path: root/searx
diff options
context:
space:
mode:
author: Markus <markus@venom.fritz.box> 2024-09-01 18:53:56 +0200
committer: Bnyro <bnyro@tutanota.com> 2024-09-03 22:26:59 +0200
commit: 21bfb4996e3fa4042e7ae018587d50ff2281f583 (patch)
tree: 475a0b875c82075406d0d5fe29c55d98e774d9b3 /searx
parent: 94a1f39bde1150a31e4e45341952430cadd14a1b (diff)
download: searxng-21bfb4996e3fa4042e7ae018587d50ff2281f583.tar.gz
          searxng-21bfb4996e3fa4042e7ae018587d50ff2281f583.zip
[fix] engine yahoo: HTML tags are included in result titles
- https://github.com/searxng/searxng/issues/3790

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx')
-rw-r--r-- searx/engines/yahoo.py 15
1 files changed, 12 insertions, 3 deletions
diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py
index 305cf523d..8dba443c7 100644
--- a/searx/engines/yahoo.py
+++ b/searx/engines/yahoo.py
@@ -16,6 +16,7 @@ from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
+ html_to_text,
)
from searx.enginelib.traits import EngineTraits
@@ -133,12 +134,20 @@ def response(resp):
url = parse_url(url)
title = eval_xpath_getindex(result, './/h3//a/@aria-label', 0, default='')
- title = extract_text(title)
+ title: str = extract_text(title)
content = eval_xpath_getindex(result, './/div[contains(@class, "compText")]', 0, default='')
- content = extract_text(content, allow_none=True)
+ content: str = extract_text(content, allow_none=True)
# append result
- results.append({'url': url, 'title': title, 'content': content})
+ results.append(
+ {
+ 'url': url,
+ # title sometimes contains HTML tags / see
+ # https://github.com/searxng/searxng/issues/3790
+ 'title': " ".join(html_to_text(title).strip().split()),
+ 'content': " ".join(html_to_text(content).strip().split()),
+ }
+ )
for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]//table//a'):
# append suggestion