From 249989955497cd048fa3312d115971282983b269 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Sun, 4 Dec 2022 22:57:22 +0100
Subject: [mod] Google: reverse engineered & upgrade to data_type: traits_v1

Partial reverse engineering of the Google engines, including improved
language and region handling based on the engine.traits_v1 data.

Whenever possible, the implementations of the Google engines make use of
the async REST APIs.  get_lang_info() has been generalized to a
get_google_info() function; in particular, the region handling has been
improved by adding the cr parameter.

searx/data/engine_traits.json

  Add data type "traits_v1", generated by the fetch_traits() functions from:

  - Google (WEB),
  - Google images,
  - Google news,
  - Google scholar and
  - Google videos

  and remove data of the obsolete data type "supported_languages".

  A traits.custom type that maps region codes to *supported_domains* is
  fetched from https://www.google.com/supported_domains

searx/autocomplete.py:

  Reverse engineered autocomplete from Google WEB.  Supports Google's
  languages and subdomains.  The old API suggestqueries.google.com/complete
  has been replaced by the async REST API:

      https://{subdomain}/complete/search?{args}

searx/engines/google.py

  Reverse engineering and extensive testing ..

  - fetch_traits(): fetch languages & regions from Google properties.
  - always use the async REST API (formerly known as 'use_mobile_ui')
  - use *supported_domains* from traits
  - improved the result list by fetching './/div[@data-content-feature]'
    and parsing the type of the various *content features* --> thumbnails
    are added

searx/engines/google_images.py

  Reverse engineering and extensive testing ..

  - fetch_traits(): fetch languages & regions from Google properties.
  - use *supported_domains* from traits
  - if it exists, a freshness_date is added to the result
  - issue 1864: the result list has been improved a lot (due to the new cr
    parameter)

searx/engines/google_news.py

  Reverse engineering and extensive testing ..

  - fetch_traits(): fetch languages & regions from Google properties.
    *supported_domains* is not needed, but a ceid list has been added.
  - different region handling compared to Google WEB
  - fixed for various languages & regions (due to the new ceid parameter) /
    avoids the CONSENT page
  - Google News no longer supports time ranges
  - the result list has been fixed: XPath of pub_date and pub_origin

searx/engines/google_videos.py

  - fetch_traits(): fetch languages & regions from Google properties.
  - use *supported_domains* from traits
  - add paging support
  - implement an async request ('asearch': 'arc' & 'async':
    'use_ac:true,_fmt:html'), see the sketch below
  - simplified code (thanks to the '_fmt:html' request)
  - issue 1359: fixed XPath of the video length data
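  For illustration, such an async request might be composed as in the
  following minimal sketch.  This is not code from this patch: only the
  'asearch' & 'async' arguments come from the list above; the subdomain
  fallback and the 'tbm=vid' parameter are assumptions::

      from urllib.parse import urlencode

      # Illustration only: the subdomain would normally be looked up in the
      # *supported_domains* traits; 'www.google.com' is just a fallback.
      subdomain = 'www.google.com'
      args = urlencode({
          'q': 'searx',                      # search terms
          'tbm': 'vid',                      # video vertical (assumption)
          'asearch': 'arc',                  # async request
          'async': 'use_ac:true,_fmt:html',  # response as plain HTML fragments
      })
      query_url = 'https://' + subdomain + '/search?' + args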
searx/engines/google_scholar.py

  - use *supported_domains* from traits
  - request(): include patents & citations
  - response(): fixed CAPTCHA detection (Scholar has its own CAPTCHA
    manager)
  - hardened the XPath used to iterate over results
  - fixed the XPath of pub_type (the class has been changed from gs_ct1 to
    gs_cgt2)
  - issue 1769 fixed: the new request implementation is no longer
    incompatible

Signed-off-by: Markus Heiser
---
 searx/engines/google_videos.py | 115 ++++++++++++-----------------------
 1 file changed, 32 insertions(+), 83 deletions(-)
 (limited to 'searx/engines/google_videos.py')

diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
index 5ab29f9ff..985189df5 100644
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""This is the implementation of the google videos engine.
+"""This is the implementation of the Google Videos engine.
 
 .. admonition:: Content-Security-Policy (CSP)
 
@@ -14,9 +14,8 @@
 
 """
 
-# pylint: disable=invalid-name
+from typing import TYPE_CHECKING
 
-import re
 from urllib.parse import urlencode
 
 from lxml import html
 
@@ -27,20 +26,22 @@ from searx.utils import (
     extract_text,
 )
 
+from searx.engines.google import fetch_traits  # pylint: disable=unused-import
 from searx.engines.google import (
-    get_lang_info,
+    get_google_info,
     time_range_dict,
     filter_mapping,
-    g_section_with_header,
-    title_xpath,
     suggestion_xpath,
     detect_google_sorry,
 )
+from searx.enginelib.traits import EngineTraits
 
-# pylint: disable=unused-import
-from searx.engines.google import supported_languages_url, _fetch_supported_languages, fetch_traits
+if TYPE_CHECKING:
+    import logging
 
-# pylint: enable=unused-import
+    logger: logging.Logger
+
+traits: EngineTraits
 
 # about
 about = {
@@ -55,70 +56,32 @@ about = {
 
 # engine dependent config
 categories = ['videos', 'web']
-paging = False
+paging = True
 language_support = True
-use_locale_domain = True
 time_range_support = True
 safesearch = True
-send_accept_language_header = True
-
-RE_CACHE = {}
-
-
-def _re(regexpr):
-    """returns compiled regular expression"""
-    RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
-    return RE_CACHE[regexpr]
-
-
-def scrap_out_thumbs_src(dom):
-    ret_val = {}
-    thumb_name = 'dimg_'
-    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
-        _script = script.text
-        # "dimg_35":"https://i.ytimg.c....",
-        _dimurl = _re("s='([^']*)").findall(_script)
-        for k, v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)').findall(_script):
-            v = v.replace(r'\u003d', '=')
-            v = v.replace(r'\u0026', '&')
-            ret_val[k] = v
-    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
-    return ret_val
-
-
-def scrap_out_thumbs(dom):
-    """Scrap out thumbnail data from