diff options
author | Markus Heiser <markus.heiser@darmarit.de> | 2024-11-27 13:35:21 +0100 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarIT.de> | 2024-11-27 13:49:45 +0100 |
commit | 342d321196a84437d51c75da348c533e01626361 (patch) | |
tree | 7aa919101d17f56a5d45854decb40c7fdb78f8a3 /searx/engines | |
parent | 55481a63773494bf917dc71542cc6461a9c4b617 (diff) | |
download | searxng-342d321196a84437d51c75da348c533e01626361.tar.gz searxng-342d321196a84437d51c75da348c533e01626361.zip |
[fix] google engine: remove <script> tags from result items
In some results, Google returns a <script> tag that must be removed before
extracting the content.
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/engines')
-rw-r--r-- | searx/engines/google.py | 18 |
1 file changed, 9 insertions, 9 deletions
diff --git a/searx/engines/google.py b/searx/engines/google.py index e1871c654..e322aa41b 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -59,11 +59,6 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'} # specific xpath variables # ------------------------ -results_xpath = './/div[contains(@jscontroller, "SC7lYd")]' -title_xpath = './/a/h3[1]' -href_xpath = './/a[h3]/@href' -content_xpath = './/div[contains(@data-sncf, "1")]' - # Suggestions are links placed in a *card-section*, we extract only the text # from the links not the links itself. suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a' @@ -345,22 +340,27 @@ def response(resp): # parse results - for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks + for result in eval_xpath_list(dom, './/div[contains(@jscontroller, "SC7lYd")]'): + # pylint: disable=too-many-nested-blocks try: - title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) + title_tag = eval_xpath_getindex(result, './/a/h3[1]', 0, default=None) if title_tag is None: # this not one of the common google results *section* logger.debug('ignoring item from the result_xpath list: missing title') continue title = extract_text(title_tag) - url = eval_xpath_getindex(result, href_xpath, 0, None) + url = eval_xpath_getindex(result, './/a[h3]/@href', 0, None) if url is None: logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title) continue - content_nodes = eval_xpath(result, content_xpath) + content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]') + for item in content_nodes: + for script in item.xpath(".//script"): + script.getparent().remove(script) + content = extract_text(content_nodes) if not content: |