[fix] google engine - div classes have been renamed in HTML result

Since 1 October 2020, Google has changed the 'class' attributes in the HTML result page. Fix the xpath expressions and ignore <div class="g" ../> sections which do not match the title's xpath expression. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
author: Markus Heiser <markus.heiser@darmarit.de> 2020-10-01 09:44:29 +0200
committer: Markus Heiser <markus.heiser@darmarit.de> 2020-10-01 09:44:29 +0200
commit: 8162d7aff4a61801615d8044b71c7c477b2b2816 (patch)
tree: cceb122f6428e9b69182ccc67dedf5c0c1fe0bec /searx
parent: fd5fe369844e481aecc0d731b08ee8b29c9b47e6 (diff)
download: searxng-8162d7aff4a61801615d8044b71c7c477b2b2816.tar.gz
searxng-8162d7aff4a61801615d8044b71c7c477b2b2816.zip
1 files changed, 11 insertions, 6 deletions
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 7dc6b44ac..c9faadb6e 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -116,12 +116,12 @@ g_section_with_header = './g-section-with-header'
 # the title is a h3 tag relative to the result group
 title_xpath = './/h3[1]'
 
-# in the result group there is <div class="r" ../> it's first child is a <a
-# href=...> (on some results, the <a> is the first "descendant", not ""child")
-href_xpath = './/div[@class="r"]//a/@href'
+# in the result group there is <div class="yuRUbf" ../> it's first child is a <a
+# href=...>
+href_xpath = './/div[@class="yuRUbf"]//a/@href'
 
-# in the result group there is <div class="s" ../> containing he *content*
-content_xpath = './/div[@class="s"]'
+# in the result group there is <div class="IsZvec" ../> containing he *content*
+content_xpath = './/div[@class="IsZvec"]'
 
 # Suggestions are links placed in a *card-section*, we extract only the text
 # from the links not the links itself.
@@ -249,7 +249,12 @@ def response(resp):
             continue
 
         try:
-            title = extract_text(eval_xpath(result, title_xpath)[0])
+            title_tag = eval_xpath(result, title_xpath)
+            if not title_tag:
+                # this not one of the common google results *section*
+                logger.debug('ingoring <div class="g" ../> section: missing title')
+                continue
+            title = extract_text(title_tag[0])
             url = eval_xpath(result, href_xpath)[0]
             content = extract_text_from_dom(result, content_xpath)
             results.append({
author	Markus Heiser <markus.heiser@darmarit.de>	2020-10-01 09:44:29 +0200
committer	Markus Heiser <markus.heiser@darmarit.de>	2020-10-01 09:44:29 +0200
commit	8162d7aff4a61801615d8044b71c7c477b2b2816 (patch)
tree	cceb122f6428e9b69182ccc67dedf5c0c1fe0bec /searx
parent	fd5fe369844e481aecc0d731b08ee8b29c9b47e6 (diff)
download	searxng-8162d7aff4a61801615d8044b71c7c477b2b2816.tar.gz searxng-8162d7aff4a61801615d8044b71c7c477b2b2816.zip