author     Markus Heiser <markus.heiser@darmarit.de>   2022-12-04 22:57:22 +0100
committer  Markus Heiser <markus.heiser@darmarit.de>   2023-03-24 10:37:42 +0100
commit     249989955497cd048fa3312d115971282983b269 (patch)
tree       9c25d0499f301dd75e95c2283e6940f3f97a52da /searx/engines/google_videos.py
parent     c80e82a855fd388c6080066da892b9723d6037c9 (diff)
[mod] Google: reverse engineered & upgrade to data_type: traits_v1
Partial reverse engineering of the Google engines, including improved language
and region handling based on the engine.traits_v1 data.
Wherever possible, the implementations of the Google engines make use of the
async REST APIs.  get_lang_info() has been generalized into a get_google_info()
function; in particular, the region handling has been improved by adding the cr
parameter.
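
A minimal sketch of how an engine builds its request with the generalized
helper, following the google_videos.py diff further below.  The keys carried in
google_info['params'] (e.g. 'hl', 'lr' and the new 'cr') are assumptions:

    # Sketch only -- mirrors the google_videos.py diff at the end of this page.
    from urllib.parse import urlencode

    from searx.enginelib.traits import EngineTraits
    from searx.engines.google import get_google_info

    traits: EngineTraits  # filled by the engine loader from engine_traits.json


    def request(query, params):
        google_info = get_google_info(params, traits)  # replaces the old get_lang_info()
        params['url'] = (
            'https://' + google_info['subdomain'] + '/search?'
            + urlencode({'q': query, **google_info['params']})
        )
        params['cookies'] = google_info['cookies']
        params['headers'].update(google_info['headers'])
        return params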
searx/data/engine_traits.json
Add data type "traits_v1" generated by the fetch_traits() functions from:
- Google (WEB),
- Google images,
- Google news,
- Google scholar and
- Google videos
and remove the data of the obsolete data type "supported_languages".
A traits.custom type that maps region codes to *supported_domains* is fetched
from https://www.google.com/supported_domains
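
For illustration, one engine entry of such a traits_v1 record might look
roughly like the sketch below; apart from the data_type marker and
custom['supported_domains'] (fetched from the URL above), the concrete keys and
values are assumptions:

    # Assumed, illustrative shape of one entry in searx/data/engine_traits.json;
    # the real file is generated by the fetch_traits() functions.
    google_traits = {
        "data_type": "traits_v1",
        "languages": {"de": "lang_de", "fr": "lang_fr"},   # assumed mapping
        "regions": {"de-DE": "DE", "fr-CA": "CA"},         # assumed mapping
        "custom": {
            # region code -> Google domain, from https://www.google.com/supported_domains
            "supported_domains": {"DE": "www.google.de", "CA": "www.google.ca"},
        },
    }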
searx/autocomplete.py
Reverse engineered the autocomplete of Google WEB.  It supports Google's languages and
subdomains. The old API suggestqueries.google.com/complete has been replaced
by the async REST API: https://{subdomain}/complete/search?{args}
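
A minimal sketch of building such a request URL; only the URL template comes
from the text above, the 'hl' argument and the default subdomain are
assumptions:

    # Sketch only -- query arguments besides 'q' are assumed.
    from urllib.parse import urlencode


    def google_complete_url(query, subdomain='www.google.com', lang='en'):
        args = urlencode({'q': query, 'hl': lang})
        return 'https://{subdomain}/complete/search?{args}'.format(subdomain=subdomain, args=args)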
searx/engines/google.py
Reverse engineering and extensive testing ..
- fetch_traits(): Fetch languages & regions from Google properties.
- always use the async REST API (formerly known as 'use_mobile_ui')
- use *supported_domains* from traits
- improved the result list by fetching './/div[@data-content-feature]'
and parsing the type of the various *content features*; thumbnails are now
added (see the sketch after this list)
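
A sketch of what the content-feature parsing could look like; the assumption
that one of the feature blocks carries the thumbnail image is not taken from
the diff:

    # Sketch only -- layout of the data-content-feature blocks is assumed.
    from searx.utils import eval_xpath_list, eval_xpath_getindex


    def extract_thumbnail(result_node):
        for feature in eval_xpath_list(result_node, './/div[@data-content-feature]'):
            img_src = eval_xpath_getindex(feature, './/img/@src', 0, default=None)
            if img_src:
                return img_src
        return None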
searx/engines/google_images.py
Reverse engineering and extensive testing ..
- fetch_traits(): Fetch languages & regions from Google properties.
- use *supported_domains* from traits
- if it exists, the freshness_date is added to the result
- issue 1864: result list has been improved a lot (due to the new cr parameter)
searx/engines/google_news.py
Reverse engineering and extensive testing ..
- fetch_traits(): Fetch languages & regions from Google properties.
*supported_domains* is not needed but a ceid list has been added.
- different region handling compared to Google WEB
- fixed for various languages & regions (due to the new ceid parameter, see
the sketch after this list) / avoids the CONSENT page
- Google News no longer supports time ranges
- result list has been fixed: XPath of pub_date and pub_origin
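
A minimal sketch of composing the Google News arguments; the '<REGION>:<language>'
shape of ceid is an assumption, as are the other argument names:

    # Sketch only -- the ceid format and the hl/gl arguments are assumed.
    from urllib.parse import urlencode


    def news_args(query, region='US', lang='en'):
        ceid = '{}:{}'.format(region, lang)  # assumed format, e.g. 'US:en'
        return urlencode({'q': query, 'ceid': ceid, 'hl': lang, 'gl': region})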
searx/engines/google_videos.py
- fetch_traits(): Fetch languages & regions from Google properties.
- use *supported_domains* from traits
- add paging support
- implement an async request ('asearch': 'arc' & 'async':
'use_ac:true,_fmt:html')
- simplified code (thanks to '_fmt:html' request)
- issue 1359: fixed XPath of the video length data
searx/engines/google_scholar.py
- fetch_traits(): Fetch languages & regions from Google properties.
- use *supported_domains* from traits
- request(): include patents & citations (see the sketch after this list)
- response(): fixed CAPTCHA detection (Scholar has its own CAPTCHA manager)
- hardened the XPath used to iterate over results
- fixed XPath of pub_type (the class has been changed from gs_ct1 to gs_cgt2)
- issue 1769 fixed: the new request implementation is no longer incompatible
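
A sketch of Scholar request arguments including patents & citations; the
as_sdt/as_vis values are assumptions about Scholar's URL interface, not taken
from the diff:

    # Sketch only -- parameter values are assumed.
    from urllib.parse import urlencode


    def scholar_args(query, pageno=1):
        return urlencode({
            'q': query,
            'start': (pageno - 1) * 10,
            'as_sdt': '2007',  # assumed: include patents
            'as_vis': '0',     # assumed: include citations
        })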
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/engines/google_videos.py')
-rw-r--r--   searx/engines/google_videos.py   115
1 file changed, 32 insertions, 83 deletions
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
index 5ab29f9ff..985189df5 100644
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""This is the implementation of the google videos engine.
+"""This is the implementation of the Google Videos engine.
 
 .. admonition:: Content-Security-Policy (CSP)
 
@@ -14,9 +14,8 @@
 
 """
 
-# pylint: disable=invalid-name
+from typing import TYPE_CHECKING
 
-import re
 from urllib.parse import urlencode
 
 from lxml import html
@@ -27,20 +26,22 @@ from searx.utils import (
     extract_text,
 )
 
+from searx.engines.google import fetch_traits  # pylint: disable=unused-import
 from searx.engines.google import (
-    get_lang_info,
+    get_google_info,
     time_range_dict,
     filter_mapping,
-    g_section_with_header,
-    title_xpath,
     suggestion_xpath,
     detect_google_sorry,
 )
+from searx.enginelib.traits import EngineTraits
 
-# pylint: disable=unused-import
-from searx.engines.google import supported_languages_url, _fetch_supported_languages, fetch_traits
+if TYPE_CHECKING:
+    import logging
 
-# pylint: enable=unused-import
+    logger: logging.Logger
+
+traits: EngineTraits
 
 # about
 about = {
@@ -55,70 +56,32 @@ about = {
 
 # engine dependent config
 categories = ['videos', 'web']
-paging = False
+paging = True
 language_support = True
-use_locale_domain = True
 time_range_support = True
 safesearch = True
-send_accept_language_header = True
-
-RE_CACHE = {}
-
-
-def _re(regexpr):
-    """returns compiled regular expression"""
-    RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
-    return RE_CACHE[regexpr]
-
-
-def scrap_out_thumbs_src(dom):
-    ret_val = {}
-    thumb_name = 'dimg_'
-    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
-        _script = script.text
-        # "dimg_35":"https://i.ytimg.c....",
-        _dimurl = _re("s='([^']*)").findall(_script)
-        for k, v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)').findall(_script):
-            v = v.replace(r'\u003d', '=')
-            v = v.replace(r'\u0026', '&')
-            ret_val[k] = v
-    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
-    return ret_val
-
-
-def scrap_out_thumbs(dom):
-    """Scrap out thumbnail data from <script> tags."""
-    ret_val = {}
-    thumb_name = 'dimg_'
-
-    for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
-        _script = script.text
-
-        # var s='data:image/jpeg;base64, ...'
-        _imgdata = _re("s='([^']*)").findall(_script)
-        if not _imgdata:
-            continue
-
-        # var ii=['dimg_17']
-        for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
-            # At least the equal sign in the URL needs to be decoded
-            ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
-
-    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
-    return ret_val
 
 
 def request(query, params):
     """Google-Video search request"""
 
-    lang_info = get_lang_info(params, supported_languages, language_aliases, False)
+    google_info = get_google_info(params, traits)
 
     query_url = (
         'https://'
-        + lang_info['subdomain']
+        + google_info['subdomain']
         + '/search'
         + "?"
-        + urlencode({'q': query, 'tbm': "vid", **lang_info['params'], 'ie': "utf8", 'oe': "utf8"})
+        + urlencode(
+            {
+                'q': query,
+                'tbm': "vid",
+                'start': 10 * params['pageno'],
+                **google_info['params'],
+                'asearch': 'arc',
+                'async': 'use_ac:true,_fmt:html',
+            }
+        )
     )
 
     if params['time_range'] in time_range_dict:
@@ -127,9 +90,8 @@ def request(query, params):
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
     params['url'] = query_url
 
-    params['cookies']['CONSENT'] = "YES+"
-    params['headers'].update(lang_info['headers'])
-    params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+    params['cookies'] = google_info['cookies']
+    params['headers'].update(google_info['headers'])
     return params
 
 
@@ -141,43 +103,30 @@ def response(resp):
     # convert the text to dom
     dom = html.fromstring(resp.text)
 
-    vidthumb_imgdata = scrap_out_thumbs(dom)
-    thumbs_src = scrap_out_thumbs_src(dom)
-    logger.debug(str(thumbs_src))
 
     # parse results
     for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
 
-        # ignore google *sections*
-        if extract_text(eval_xpath(result, g_section_with_header)):
-            logger.debug("ignoring <g-section-with-header>")
-            continue
-
-        # ingnore articles without an image id / e.g. news articles
-        img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None)
-        if img_id is None:
-            logger.error("no img_id found in item %s (news article?)", len(results) + 1)
+        img_src = eval_xpath_getindex(result, './/img/@src', 0, None)
+        if img_src is None:
             continue
 
-        img_src = vidthumb_imgdata.get(img_id, None)
-        if not img_src:
-            img_src = thumbs_src.get(img_id, "")
+        title = extract_text(eval_xpath_getindex(result, './/a/h3[1]', 0))
+        url = eval_xpath_getindex(result, './/a/h3[1]/../@href', 0)
 
-        title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
-        url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0)
-        length = extract_text(eval_xpath(result, './/div[contains(@class, "P7xzyf")]/span/span'))
         c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)
        content = extract_text(c_node)
-        pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]'))
+        pub_info = extract_text(eval_xpath(result, './/div[@class="P7xzyf"]'))
+        length = extract_text(eval_xpath(result, './/div[@class="J1mWY"]'))
 
         results.append(
             {
                 'url': url,
                 'title': title,
                 'content': content,
-                'length': length,
                 'author': pub_info,
                 'thumbnail': img_src,
+                'length': length,
                 'template': 'videos.html',
             }
         )