[fix] engine google-News: fix decoding of URLs

Google-News returns internal links in which the original URL is encoded as a URL-safe base64 string (RFC 4648 §5, the "base64url" alphabet). Closes: https://github.com/searxng/searxng/issues/1959 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
author: Markus Heiser <markus.heiser@darmarit.de> 2023-04-01 19:33:13 +0200
committer: Markus Heiser <markus.heiser@darmarit.de> 2023-04-01 19:33:13 +0200
commit: 8de8070ed9c8c8423521ba722e850f1538ee0b1d (patch)
tree: 91af20507f27657d391dda1191ec3473d5a8d704
parent: 7592d85982d0878940b4c9d57e78e51047adf8d7 (diff)
download: searxng-8de8070ed9c8c8423521ba722e850f1538ee0b1d.tar.gz
searxng-8de8070ed9c8c8423521ba722e850f1538ee0b1d.zip
1 files changed, 13 insertions, 32 deletions
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index ae55ca9cb..2cee2e2c1 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -27,10 +27,8 @@ The google news API ignores some parameters from the common :ref:`google API`:
 
 from typing import TYPE_CHECKING
 
-import binascii
-import re
 from urllib.parse import urlencode
-from base64 import b64decode
+import base64
 from lxml import html
 import babel
 
@@ -144,34 +142,17 @@ def response(resp):
 
     for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):
 
-        # The first <a> tag in the <article> contains the link to the
-        # article The href attribute of the <a> is a google internal link,
-        # we can't use.  The real link is hidden in the jslog attribute:
-        #
-        #   <a ...
-        #      jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
-        #      href="./articles/CAIiENu3nGS...?hl=en-US&amp;gl=US&amp;ceid=US%3Aen"
-        #      ... />
-
-        jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
-        url = re.findall('http[^;]*', jslog)
-        if url:
-            url = url[0]
-        else:
-            # The real URL is base64 encoded in the json attribute:
-            # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
-            jslog = jslog.split(";")[1].split(':')[1].strip()
-            try:
-                padding = (4 - (len(jslog) % 4)) * "="
-                jslog = b64decode(jslog + padding)
-            except binascii.Error:
-                # URL can't be read, skip this result
-                continue
-
-            # now we have : b'[null, ... null,"https://www.cnn.com/.../index.html"]'
-            url = re.findall('http[^;"]*', str(jslog))[0]
-
-        # the first <h3> tag in the <article> contains the title of the link
+        # The first <a> tag in the <article> contains the link to the article
+        # The href attribute of the <a> tag is a google internal link, we have
+        # to decode
+
+        href = eval_xpath_getindex(result, './article/a/@href', 0)
+        href = href.split('?')[0]
+        href = href.split('/')[-1]
+        href = base64.urlsafe_b64decode(href + '====')
+        href = href[4:].split(b'\xd2')[0]
+        href = href.decode()
+
         title = extract_text(eval_xpath(result, './article/h3[1]'))
 
         # The pub_date is mostly a string like 'yesertday', not a real
@@ -189,7 +170,7 @@ def response(resp):
 
         results.append(
             {
-                'url': url,
+                'url': href,
                 'title': title,
                 'content': content,
                 'img_src': img_src,
author	Markus Heiser <markus.heiser@darmarit.de>	2023-04-01 19:33:13 +0200
committer	Markus Heiser <markus.heiser@darmarit.de>	2023-04-01 19:33:13 +0200
commit	8de8070ed9c8c8423521ba722e850f1538ee0b1d (patch)
tree	91af20507f27657d391dda1191ec3473d5a8d704
parent	7592d85982d0878940b4c9d57e78e51047adf8d7 (diff)
download	searxng-8de8070ed9c8c8423521ba722e850f1538ee0b1d.tar.gz searxng-8de8070ed9c8c8423521ba722e850f1538ee0b1d.zip