diff options
author | Markus Heiser <markus.heiser@darmarit.de> | 2023-04-01 19:33:13 +0200 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarit.de> | 2023-04-01 19:33:13 +0200 |
commit | 8de8070ed9c8c8423521ba722e850f1538ee0b1d (patch) | |
tree | 91af20507f27657d391dda1191ec3473d5a8d704 | |
parent | 7592d85982d0878940b4c9d57e78e51047adf8d7 (diff) | |
download | searxng-8de8070ed9c8c8423521ba722e850f1538ee0b1d.tar.gz searxng-8de8070ed9c8c8423521ba722e850f1538ee0b1d.zip |
[fix] engine google-News: fix decoding of URLs
Google-News returns internal links where the origin URL is encoded in a
base64 (RFC 2045 aka URL-safe) string.
Closes: https://github.com/searxng/searxng/issues/1959
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
-rw-r--r-- | searx/engines/google_news.py | 45 |
1 files changed, 13 insertions, 32 deletions
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index ae55ca9cb..2cee2e2c1 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -27,10 +27,8 @@ The google news API ignores some parameters from the common :ref:`google API`: from typing import TYPE_CHECKING -import binascii -import re from urllib.parse import urlencode -from base64 import b64decode +import base64 from lxml import html import babel @@ -144,34 +142,17 @@ def response(resp): for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'): - # The first <a> tag in the <article> contains the link to the - # article The href attribute of the <a> is a google internal link, - # we can't use. The real link is hidden in the jslog attribute: - # - # <a ... - # jslog="95014; 4:https://www.cnn.com/.../index.html; track:click" - # href="./articles/CAIiENu3nGS...?hl=en-US&gl=US&ceid=US%3Aen" - # ... /> - - jslog = eval_xpath_getindex(result, './article/a/@jslog', 0) - url = re.findall('http[^;]*', jslog) - if url: - url = url[0] - else: - # The real URL is base64 encoded in the json attribute: - # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click" - jslog = jslog.split(";")[1].split(':')[1].strip() - try: - padding = (4 - (len(jslog) % 4)) * "=" - jslog = b64decode(jslog + padding) - except binascii.Error: - # URL can't be read, skip this result - continue - - # now we have : b'[null, ... null,"https://www.cnn.com/.../index.html"]' - url = re.findall('http[^;"]*', str(jslog))[0] - - # the first <h3> tag in the <article> contains the title of the link + # The first <a> tag in the <article> contains the link to the article + # The href attribute of the <a> tag is a google internal link, we have + # to decode + + href = eval_xpath_getindex(result, './article/a/@href', 0) + href = href.split('?')[0] + href = href.split('/')[-1] + href = base64.urlsafe_b64decode(href + '====') + href = href[4:].split(b'\xd2')[0] + href = href.decode() + title = extract_text(eval_xpath(result, './article/h3[1]')) # The pub_date is mostly a string like 'yesertday', not a real @@ -189,7 +170,7 @@ def response(resp): results.append( { - 'url': url, + 'url': href, 'title': title, 'content': content, 'img_src': img_src, |