diff options
Diffstat (limited to 'searx/engines/google_news.py')
-rw-r--r-- | searx/engines/google_news.py | 69 |
1 file changed, 35 insertions, 34 deletions
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index e83b2ba48..b10f77005 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -2,13 +2,16 @@ """Google (News) For detailed description of the *REST-full* API see: `Query Parameter -Definitions`_. Not all parameters can be appied, e.g. num_ (the number of -search results to return) is ignored. +Definitions`_. Not all parameters can be appied: + +- num_ : the number of search results is ignored +- save_ : is ignored / Google-News results are always *SafeSearch* .. _Query Parameter Definitions: https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp +.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp """ @@ -32,20 +35,19 @@ from searx.utils import ( from searx.engines.google import ( supported_languages_url, _fetch_supported_languages, - detect_google_sorry, ) # pylint: enable=unused-import from searx.engines.google import ( - get_lang_country, - filter_mapping, + get_lang_info, + detect_google_sorry, ) # about about = { "website": 'https://news.google.com', "wikidata_id": 'Q12020', - "official_api_documentation": None, + "official_api_documentation": 'https://developers.google.com/custom-search', "use_official_api": False, "require_api_key": False, "results": 'HTML', @@ -69,51 +71,53 @@ paging = False language_support = True use_locale_domain = True time_range_support = True -safesearch = True # not really, but it is not generated by google + +# Google-News results are always *SafeSearch*. 
Option 'safesearch' is set to +# False here, otherwise checker will report safesearch-errors:: +# +# safesearch : results are identitical for safesearch=0 and safesearch=2 +safesearch = False def request(query, params): """Google-News search request""" - language, country, lang_country = get_lang_country( + lang_info = get_lang_info( # pylint: disable=undefined-variable params, supported_languages, language_aliases ) - subdomain = 'news.google.com' - if params['time_range']: # in time_range_dict: + # google news has only one domain + lang_info['subdomain'] = 'news.google.com' + + ceid = "%s:%s" % (lang_info['country'], lang_info['language']) + + # google news redirects en to en-US + if lang_info['hl'] == 'en': + lang_info['hl'] = 'en-US' + + # Very special to google-news compared to other google engines, the time + # range is included in the search term. + if params['time_range']: query += ' ' + time_range_dict[params['time_range']] - query_url = 'https://'+ subdomain + '/search' + "?" + urlencode({ + query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" 
+ urlencode({ 'q': query, - 'hl': language, - 'lr': "lang_" + language, + 'hl': lang_info['hl'], + 'lr': lang_info['lr'], 'ie': "utf8", 'oe': "utf8", - 'ceid' : "%s:%s" % (country, language), - 'gl' : country, - }) + 'gl': lang_info['country'], + }) + ('&ceid=%s' % ceid) # ceid includes a ':' character which must not be urlencoded - if params['safesearch']: - query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) - - params['url'] = query_url logger.debug("query_url --> %s", query_url) + params['url'] = query_url - # en-US,en;q=0.8,en;q=0.5 - params['headers']['Accept-Language'] = ( - lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5' - ) - logger.debug("HTTP header Accept-Language --> %s", - params['headers']['Accept-Language']) + logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language']) + params['headers']['Accept-Language'] = lang_info['Accept-Language'] params['headers']['Accept'] = ( 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' ) - # hl=en redirect to hl=en-US / en-CA ... - params['soft_max_redirects'] = 1 - - #params['google_subdomain'] = subdomain - return params @@ -123,9 +127,6 @@ def response(resp): detect_google_sorry(resp) - # which subdomain ? - # subdomain = resp.search_params.get('google_subdomain') - # convert the text to dom dom = html.fromstring(resp.text) |