summaryrefslogtreecommitdiff
path: root/searx/engines/google.py
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarit.de>2023-03-28 14:39:16 +0200
committerMarkus Heiser <markus.heiser@darmarit.de>2023-03-28 14:39:16 +0200
commit6f9e678346e5978a09ee453a62fa133cdc0ee0bd (patch)
treeab893ff1f60d8c969ff0f5c2fad0cff49148aa3c /searx/engines/google.py
parent1498202b0b2df1345cc231679f04805877e26683 (diff)
downloadsearxng-6f9e678346e5978a09ee453a62fa133cdc0ee0bd.tar.gz
searxng-6f9e678346e5978a09ee453a62fa133cdc0ee0bd.zip
[fix] engine: google has changed the layout of its response
Since 28 March, Google has changed its response; this patch fixes the Google engine to scrape out the results & images from the newly designed response. closes: https://github.com/searxng/searxng/issues/2287 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/engines/google.py')
-rw-r--r--searx/engines/google.py71
1 files changed, 26 insertions, 45 deletions
diff --git a/searx/engines/google.py b/searx/engines/google.py
index bfdd4f1e5..708068f3a 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -59,13 +59,10 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
# specific xpath variables
# ------------------------
-results_xpath = './/div[@data-sokoban-container]'
+results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
title_xpath = './/a/h3[1]'
href_xpath = './/a[h3]/@href'
-content_xpath = './/div[@data-content-feature]'
-
-# google *sections* are no usual *results*, we ignore them
-g_section_with_header = './g-section-with-header'
+content_xpath = './/div[@data-sncf]'
# Suggestions are links placed in a *card-section*, we extract only the text
# from the links not the links itself.
@@ -303,21 +300,18 @@ def request(query, params):
return params
-# (function(){var s='data:image/jpeg;base64,/9j/4AAQ ...
-# ... DX/Ff5XSpSgdU32xSlKDJ//9k\x3d';var ii=['dimg_21'];_setImagesSrc(ii,s);})();
-RE_DATA_IMAGE = re.compile(r"'(data:image[^']*)'[^']*ii=\['([^']*)'\];_setImagesSrc")
+# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
+# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
+RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
def _parse_data_images(dom):
data_image_map = {}
- for _script in eval_xpath_list(dom, "//script[@nonce]"):
- script = _script.text
- if not script:
- continue
- script = RE_DATA_IMAGE.search(script)
- if not script:
- continue
- data_image_map[script.group(2)] = script.group(1).replace(r'\x3d', '=')
+ for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()):
+ end_pos = data_image.rfind('=')
+ if end_pos > 0:
+ data_image = data_image[: end_pos + 1]
+ data_image_map[img_id] = data_image
logger.debug('data:image objects --> %s', list(data_image_map.keys()))
return data_image_map
@@ -331,11 +325,7 @@ def response(resp):
# convert the text to dom
dom = html.fromstring(resp.text)
-
- data_image_map = {}
- if '_fmt:html' in UI_ASYNC:
- # in this format images are embedded by a bse64 encoded 'data:image'
- data_image_map = _parse_data_images(dom)
+ data_image_map = _parse_data_images(dom)
# results --> answer
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
@@ -349,11 +339,6 @@ def response(resp):
for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks
- # google *sections*
- if extract_text(eval_xpath(result, g_section_with_header)):
- logger.debug("ignoring <g-section-with-header>")
- continue
-
try:
title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
if title_tag is None:
@@ -361,33 +346,29 @@ def response(resp):
logger.debug('ignoring item from the result_xpath list: missing title')
continue
title = extract_text(title_tag)
+
url = eval_xpath_getindex(result, href_xpath, 0, None)
if url is None:
+ logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
continue
- content = []
- img_list = []
- for content_feature in eval_xpath(result, content_xpath):
- val = content_feature.attrib['data-content-feature']
- if val in ['1', '2']:
- txt = extract_text(content_feature, allow_none=True)
- if txt:
- content.append(txt)
- elif '0' in val:
- img = content_feature.xpath('.//img/@src')
- if img:
- img = img[0]
- if img.startswith('data:image'):
- img_id = content_feature.xpath('.//img/@id')
- if img_id:
- img = data_image_map.get(img_id[0])
- img_list.append(img)
+ content_nodes = eval_xpath(result, content_xpath)
+ content = extract_text(content_nodes)
if not content:
logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
continue
- content = ' / '.join(content)
- img_src = img_list[0] if img_list else None
+
+ img_src = content_nodes[0].xpath('.//img/@src')
+ if img_src:
+ img_src = img_src[0]
+ if img_src.startswith('data:image'):
+ img_id = content_nodes[0].xpath('.//img/@id')
+ if img_id:
+ img_src = data_image_map.get(img_id[0])
+ else:
+ img_src = None
+
results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})
except Exception as e: # pylint: disable=broad-except