summaryrefslogtreecommitdiff
path: root/searx/engines
diff options
context:
space:
mode:
author: Markus Heiser <markus.heiser@darmarit.de>, 2024-11-27 13:35:21 +0100
committer: Markus Heiser <markus.heiser@darmarIT.de>, 2024-11-27 13:49:45 +0100
commit: 342d321196a84437d51c75da348c533e01626361 (patch)
tree: 7aa919101d17f56a5d45854decb40c7fdb78f8a3 /searx/engines
parent: 55481a63773494bf917dc71542cc6461a9c4b617 (diff)
download: searxng-342d321196a84437d51c75da348c533e01626361.tar.gz
download: searxng-342d321196a84437d51c75da348c533e01626361.zip
[fix] google engine: remove <script> tags from result items
In some results, Google returns a <script> tag that must be removed before extracting the content.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/engines')
-rw-r--r-- searx/engines/google.py | 18
1 file changed, 9 insertions, 9 deletions
diff --git a/searx/engines/google.py b/searx/engines/google.py
index e1871c654..e322aa41b 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -59,11 +59,6 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
# specific xpath variables
# ------------------------
-results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
-title_xpath = './/a/h3[1]'
-href_xpath = './/a[h3]/@href'
-content_xpath = './/div[contains(@data-sncf, "1")]'
-
# Suggestions are links placed in a *card-section*, we extract only the text
# from the links not the links itself.
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
@@ -345,22 +340,27 @@ def response(resp):
# parse results
- for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks
+ for result in eval_xpath_list(dom, './/div[contains(@jscontroller, "SC7lYd")]'):
+ # pylint: disable=too-many-nested-blocks
try:
- title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
+ title_tag = eval_xpath_getindex(result, './/a/h3[1]', 0, default=None)
if title_tag is None:
# this not one of the common google results *section*
logger.debug('ignoring item from the result_xpath list: missing title')
continue
title = extract_text(title_tag)
- url = eval_xpath_getindex(result, href_xpath, 0, None)
+ url = eval_xpath_getindex(result, './/a[h3]/@href', 0, None)
if url is None:
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
continue
- content_nodes = eval_xpath(result, content_xpath)
+ content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]')
+ for item in content_nodes:
+ for script in item.xpath(".//script"):
+ script.getparent().remove(script)
+
content = extract_text(content_nodes)
if not content: