summaryrefslogtreecommitdiff
path: root/searx/engines/google_news.py
diff options
context:
space:
mode:
Diffstat (limited to 'searx/engines/google_news.py')
-rw-r--r--searx/engines/google_news.py250
1 files changed, 199 insertions, 51 deletions
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index 1ada2d64d..ae55ca9cb 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -1,24 +1,40 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
-"""This is the implementation of the google news engine. The google news API
-ignores some parameters from the common :ref:`google API`:
+"""This is the implementation of the Google News engine.
-- num_ : the number of search results is ignored
+Google News has a different region handling compared to Google WEB.
+
+- the ``ceid`` argument has to be set (:py:obj:`ceid_list`)
+- the hl_ argument has to be set correctly (and different to Google WEB)
+- the gl_ argument is mandatory
+
+If one of these arguments is not set correctly, the request is redirected to
+CONSENT dialog::
+
+ https://consent.google.com/m?continue=
+
+The google news API ignores some parameters from the common :ref:`google API`:
+
+- num_ : the number of search results is ignored / there is no paging; all
+  results for a query term are in the first response.
- save_ : is ignored / Google-News results are always *SafeSearch*
+.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp
+.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp
.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
-
"""
-# pylint: disable=invalid-name
+from typing import TYPE_CHECKING
import binascii
import re
from urllib.parse import urlencode
from base64 import b64decode
from lxml import html
+import babel
+from searx import locales
from searx.utils import (
eval_xpath,
eval_xpath_list,
@@ -26,18 +42,19 @@ from searx.utils import (
extract_text,
)
-# pylint: disable=unused-import
+from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
- supported_languages_url,
- _fetch_supported_languages,
+ get_google_info,
+ detect_google_sorry,
)
+from searx.enginelib.traits import EngineTraits
-# pylint: enable=unused-import
+if TYPE_CHECKING:
+ import logging
-from searx.engines.google import (
- get_lang_info,
- detect_google_sorry,
-)
+ logger: logging.Logger
+
+traits: EngineTraits
# about
about = {
@@ -49,70 +66,77 @@ about = {
"results": 'HTML',
}
-# compared to other google engines google-news has a different time range
-# support. The time range is included in the search term.
-time_range_dict = {
- 'day': 'when:1d',
- 'week': 'when:7d',
- 'month': 'when:1m',
- 'year': 'when:1y',
-}
-
# engine dependent config
-
categories = ['news']
paging = False
-use_locale_domain = True
-time_range_support = True
+time_range_support = False
# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
# True here, otherwise checker will report safesearch-errors::
#
#    safesearch : results are identical for safesearch=0 and safesearch=2
-safesearch = False
-send_accept_language_header = True
+safesearch = True
+# send_accept_language_header = True
def request(query, params):
"""Google-News search request"""
- lang_info = get_lang_info(params, supported_languages, language_aliases, False)
+ sxng_locale = params.get('searxng_locale', 'en-US')
+ ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en')
+ google_info = get_google_info(params, traits)
+ google_info['subdomain'] = 'news.google.com' # google news has only one domain
- # google news has only one domain
- lang_info['subdomain'] = 'news.google.com'
+ ceid_region, ceid_lang = ceid.split(':')
+ ceid_lang, ceid_suffix = (
+ ceid_lang.split('-')
+ + [
+ None,
+ ]
+ )[:2]
- ceid = "%s:%s" % (lang_info['country'], lang_info['language'])
+ google_info['params']['hl'] = ceid_lang
- # google news redirects en to en-US
- if lang_info['params']['hl'] == 'en':
- lang_info['params']['hl'] = 'en-US'
+ if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']:
- # Very special to google-news compared to other google engines, the time
- # range is included in the search term.
- if params['time_range']:
- query += ' ' + time_range_dict[params['time_range']]
+ if ceid_region.lower() == ceid_lang:
+ google_info['params']['hl'] = ceid_lang + '-' + ceid_region
+ else:
+ google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix
+
+ elif ceid_region.lower() != ceid_lang:
+
+ if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']:
+ google_info['params']['hl'] = ceid_lang
+ else:
+ google_info['params']['hl'] = ceid_lang + '-' + ceid_region
+
+ google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0]
+ google_info['params']['gl'] = ceid_region
query_url = (
'https://'
- + lang_info['subdomain']
- + '/search'
- + "?"
- + urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'gl': lang_info['country']})
+ + google_info['subdomain']
+ + "/search?"
+ + urlencode(
+ {
+ 'q': query,
+ **google_info['params'],
+ }
+ )
+ # ceid includes a ':' character which must not be urlencoded
+ ('&ceid=%s' % ceid)
- ) # ceid includes a ':' character which must not be urlencoded
- params['url'] = query_url
-
- params['cookies']['CONSENT'] = "YES+"
- params['headers'].update(lang_info['headers'])
- params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+ )
+ params['url'] = query_url
+ params['cookies'] = google_info['cookies']
+ params['headers'].update(google_info['headers'])
return params
def response(resp):
"""Get response from google's search request"""
results = []
-
detect_google_sorry(resp)
# convert the text to dom
@@ -152,8 +176,8 @@ def response(resp):
# The pub_date is mostly a string like 'yesterday', not a real
# timezone date or time. Therefore we can't use publishedDate.
- pub_date = extract_text(eval_xpath(result, './article/div[1]/div[1]/time'))
- pub_origin = extract_text(eval_xpath(result, './article/div[1]/div[1]/a'))
+ pub_date = extract_text(eval_xpath(result, './article//time'))
+ pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]'))
content = ' / '.join([x for x in [pub_origin, pub_date] if x])
@@ -174,3 +198,127 @@ def response(resp):
# return results
return results
+
+
+ceid_list = [
+ 'AE:ar',
+ 'AR:es-419',
+ 'AT:de',
+ 'AU:en',
+ 'BD:bn',
+ 'BE:fr',
+ 'BE:nl',
+ 'BG:bg',
+ 'BR:pt-419',
+ 'BW:en',
+ 'CA:en',
+ 'CA:fr',
+ 'CH:de',
+ 'CH:fr',
+ 'CL:es-419',
+ 'CN:zh-Hans',
+ 'CO:es-419',
+ 'CU:es-419',
+ 'CZ:cs',
+ 'DE:de',
+ 'EG:ar',
+ 'ES:es',
+ 'ET:en',
+ 'FR:fr',
+ 'GB:en',
+ 'GH:en',
+ 'GR:el',
+ 'HK:zh-Hant',
+ 'HU:hu',
+ 'ID:en',
+ 'ID:id',
+ 'IE:en',
+ 'IL:en',
+ 'IL:he',
+ 'IN:bn',
+ 'IN:en',
+ 'IN:hi',
+ 'IN:ml',
+ 'IN:mr',
+ 'IN:ta',
+ 'IN:te',
+ 'IT:it',
+ 'JP:ja',
+ 'KE:en',
+ 'KR:ko',
+ 'LB:ar',
+ 'LT:lt',
+ 'LV:en',
+ 'LV:lv',
+ 'MA:fr',
+ 'MX:es-419',
+ 'MY:en',
+ 'NA:en',
+ 'NG:en',
+ 'NL:nl',
+ 'NO:no',
+ 'NZ:en',
+ 'PE:es-419',
+ 'PH:en',
+ 'PK:en',
+ 'PL:pl',
+ 'PT:pt-150',
+ 'RO:ro',
+ 'RS:sr',
+ 'RU:ru',
+ 'SA:ar',
+ 'SE:sv',
+ 'SG:en',
+ 'SI:sl',
+ 'SK:sk',
+ 'SN:fr',
+ 'TH:th',
+ 'TR:tr',
+ 'TW:zh-Hant',
+ 'TZ:en',
+ 'UA:ru',
+ 'UA:uk',
+ 'UG:en',
+ 'US:en',
+ 'US:es-419',
+ 'VE:es-419',
+ 'VN:vi',
+ 'ZA:en',
+ 'ZW:en',
+]
+"""List of region/language combinations supported by Google News. Values of the
+``ceid`` argument of the Google News REST API."""
+
+
+_skip_values = [
+ 'ET:en', # english (ethiopia)
+ 'ID:en', # english (indonesia)
+ 'LV:en', # english (latvia)
+]
+
+_ceid_locale_map = {'NO:no': 'nb-NO'}
+
+
+def fetch_traits(engine_traits: EngineTraits):
+ _fetch_traits(engine_traits, add_domains=False)
+
+ engine_traits.custom['ceid'] = {}
+
+ for ceid in ceid_list:
+ if ceid in _skip_values:
+ continue
+
+ region, lang = ceid.split(':')
+ x = lang.split('-')
+ if len(x) > 1:
+ if x[1] not in ['Hant', 'Hans']:
+ lang = x[0]
+
+ sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region)
+ try:
+ locale = babel.Locale.parse(sxng_locale, sep='-')
+ except babel.UnknownLocaleError:
+ print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale))
+ continue
+
+ engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid