author     Markus Heiser <markus.heiser@darmarit.de>   2021-01-22 17:16:46 +0100
committer  Markus Heiser <markus.heiser@darmarit.de>   2021-01-24 09:39:30 +0100
commit     89b3050b5c406f795dd25d24f182cf173ad42774 (patch)
tree       0c59f73e97658a0596411f40ccaf5666fc1287e5 /searx/engines/google_videos.py
parent     f4a17acb7a4fa82c5cb629f4eaad11ef528f89e4 (diff)
[fix] revision of the google-videos engine
This revision is based on the methods developed in the revision of the google engine
(see commit 410c2f9).
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
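
The shared request-building method from the google engine boils down to deriving the
query URL and the HTTP Accept-Language header from a single language/country pair. A
minimal standalone sketch of that construction (the literal values stand in for what
get_lang_country() returns; they are illustrative only and not part of this commit)::

    from urllib.parse import urlencode

    # illustrative values; in the engine they come from get_lang_country()
    language, country, lang_country = 'en', 'US', 'en-US'
    subdomain = 'www.google.com'   # looked up via google_domains[country]

    query_url = 'https://' + subdomain + '/search?' + urlencode({
        'q': 'searx',
        'tbm': 'vid',              # select the videos vertical
        'hl': lang_country,        # UI language
        'lr': 'lang_' + language,  # restrict results to this language
        'ie': 'utf8',
        'oe': 'utf8',
    })

    # mirrors the hl/lr pair: en-US,en;q=0.8,en;q=0.5
    accept_language = '%s,%s;q=0.8,%s;q=0.5' % (lang_country, language, language)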
Diffstat (limited to 'searx/engines/google_videos.py')
-rw-r--r--  searx/engines/google_videos.py  248
1 file changed, 186 insertions(+), 62 deletions(-)
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
index 61e01ca7b..486ba7ccd 100644
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -1,13 +1,58 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """
- Google (Videos)
+Google (Videos)
+
+For detailed description of the *REST-full* API see: `Query Parameter
+Definitions`_.  Not all parameters can be applied.
+
+.. admonition:: Content-Security-Policy (CSP)
+
+   This engine needs to allow images from the `data URLs`_ (prefixed with the
+   ``data:`` scheme)::
+
+       Header set Content-Security-Policy "img-src 'self' data: ;"
+
+.. _Query Parameter Definitions:
+   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+.. _data URLs:
+   https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
+
 """
 
-from datetime import date, timedelta
-from urllib.parse import urlencode
-from lxml import html
-from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
+# pylint: disable=invalid-name, missing-function-docstring
+
 import re
+from urllib.parse import urlencode, urlparse
+from lxml import html
+
+from searx import logger
+from searx.exceptions import SearxEngineCaptchaException
+from searx.utils import (
+    eval_xpath,
+    eval_xpath_list,
+    extract_text,
+)
+
+from searx.engines.google import (
+    get_lang_country,
+    google_domains,
+    time_range_dict,
+    filter_mapping,
+    results_xpath,
+    g_section_with_header,
+    title_xpath,
+    href_xpath,
+    content_xpath,
+    suggestion_xpath,
+    spelling_suggestion_xpath,
+)
+
+# pylint: disable=unused-import
+from searx.engines.google import (
+    supported_languages_url,
+    _fetch_supported_languages,
+)
+# pylint: enable=unused-import
 
 # about
 about = {
@@ -17,83 +62,162 @@
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
+    "template": 'video.html',
+    "parse": ('url', 'title', 'content', 'thumbnail')
 }
 
+logger = logger.getChild('google video')
+
 # engine dependent config
+
 categories = ['videos']
-paging = True
-safesearch = True
+paging = False
+language_support = True
+use_locale_domain = True
 time_range_support = True
-number_of_results = 10
+safesearch = True
 
-search_url = 'https://www.google.com/search'\
-    '?q={query}'\
-    '&tbm=vid'\
-    '&{search_options}'
-time_range_attr = "qdr:{range}"
-time_range_custom_attr = "cdr:1,cd_min:{start},cd_max{end}"
-time_range_dict = {'day': 'd',
-                   'week': 'w',
-                   'month': 'm'}
+RE_CACHE = {}
 
+def _re(regexpr):
+    """returns compiled regular expression"""
+    RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
+    return RE_CACHE[regexpr]
 
-# do search-request
-def request(query, params):
-    search_options = {
-        'ijn': params['pageno'] - 1,
-        'start': (params['pageno'] - 1) * number_of_results
-    }
 
+def scrap_out_thumbs(dom):
+    """Scrap out thumbnail data from <script> tags.
+    """
+    ret_val = dict()
+    thumb_name = 'vidthumb'
 
-    if params['time_range'] in time_range_dict:
-        search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']])
-    elif params['time_range'] == 'year':
-        now = date.today()
-        then = now - timedelta(days=365)
-        start = then.strftime('%m/%d/%Y')
-        end = now.strftime('%m/%d/%Y')
-        search_options['tbs'] = time_range_custom_attr.format(start=start, end=end)
+    for script in eval_xpath(dom, '//script[contains(., "_setImagesSrc")]'):
+        _script = script.text
+
+        # var s='data:image/jpeg;base64, ...'
+        _imgdata = _re("s='([^']*)").findall(_script)
+        if not _imgdata:
+            continue
+
+        # var ii=['vidthumb4','vidthumb7']
+        for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
+            # At least the equal sign in the URL needs to be decoded
+            ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
+
+    # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
+    for script in eval_xpath(dom, '//script[contains(., "google.ldi={")]'):
+        _script = script.text
+        for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall(_script):
+            match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
+            if match:
+                # At least the equal sign in the URL needs to be decoded
+                ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=")
 
-    if safesearch and params['safesearch']:
-        search_options['safe'] = 'on'
+    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
+    return ret_val
 
-    params['url'] = search_url.format(query=urlencode({'q': query}),
-                                      search_options=urlencode(search_options))
 
+def request(query, params):
+    """Google-Video search request"""
+
+    language, country, lang_country = get_lang_country(
+        # pylint: disable=undefined-variable
+        params, supported_languages, language_aliases
+    )
+    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')
+
+    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+        'q': query,
+        'tbm': "vid",
+        'hl': lang_country,
+        'lr': "lang_" + language,
+        'ie': "utf8",
+        'oe': "utf8",
+    })
+
+    if params['time_range'] in time_range_dict:
+        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
+    if params['safesearch']:
+        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
+
+    params['url'] = query_url
+    logger.debug("query_url --> %s", query_url)
+
+    # en-US,en;q=0.8,en;q=0.5
+    params['headers']['Accept-Language'] = (
+        "%s,%s;q=0.8,%s;q=0.5" % (lang_country, language, language))
+    logger.debug(
+        "HTTP Accept-Language --> %s", params['headers']['Accept-Language'])
+    params['headers']['Accept'] = (
+        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+    )
+    # params['google_subdomain'] = subdomain
     return params
 
 
-# get response from search-request
 def response(resp):
+    """Get response from google's search request"""
     results = []
 
+    # detect google sorry
+    resp_url = urlparse(resp.url)
+    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
+        raise SearxEngineCaptchaException()
+
+    if resp_url.path.startswith('/sorry'):
+        raise SearxEngineCaptchaException()
+
+    # which subdomain ?
+    # subdomain = resp.search_params.get('google_subdomain')
+
+    # convert the text to dom
     dom = html.fromstring(resp.text)
+    vidthumb_imgdata = scrap_out_thumbs(dom)
 
     # parse results
-    for result in eval_xpath_list(dom, '//div[@class="g"]'):
-
-        title = extract_text(eval_xpath(result, './/h3'))
-        url = eval_xpath_getindex(result, './/div[@class="r"]/a/@href', 0)
-        content = extract_text(eval_xpath(result, './/span[@class="st"]'))
-
-        # get thumbnails
-        script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
-        ids = result.xpath('.//div[@class="s"]//img/@id')
-        if len(ids) > 0:
-            thumbnails_data = \
-                re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0],
-                           script)
-            tmp = []
-            if len(thumbnails_data) != 0:
-                tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
-            thumbnail = ''
-            if len(tmp) != 0:
-                thumbnail = tmp[-1]
-
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content,
-                        'thumbnail': thumbnail,
-                        'template': 'videos.html'})
+    for result in eval_xpath_list(dom, results_xpath):
+
+        # google *sections*
+        if extract_text(eval_xpath(result, g_section_with_header)):
+            logger.debug("ignoring <g-section-with-header>")
+            continue
+
+        title = extract_text(eval_xpath(result, title_xpath)[0])
+        url = eval_xpath(result, href_xpath)[0]
+        c_node = eval_xpath(result, content_xpath)[0]
+
+        # <img id="vidthumb1" ...>
+        img_id = eval_xpath(c_node, './div[1]//a/g-img/img/@id')
+        if not img_id:
+            continue
+        img_id = img_id[0]
+        img_src = vidthumb_imgdata.get(img_id, None)
+        if not img_src:
+            logger.error("no vidthumb imgdata for: %s", img_id)
+            img_src = eval_xpath(c_node, './div[1]//a/g-img/img/@src')[0]
+
+        duration = extract_text(eval_xpath(c_node, './div[1]//a/span'))
+        content = extract_text(eval_xpath(c_node, './div[2]/span'))
+        pub_info = extract_text(eval_xpath(c_node, './div[2]/div'))
+
+        if len(duration) > 3:
+            content = duration + " - " + content
+        if pub_info:
+            content = content + " (%s)" % pub_info
+
+        results.append({
+            'url': url,
+            'title': title,
+            'content': content,
+            'thumbnail': img_src,
+            'template': 'videos.html',
+        })
+
+    # parse suggestion
+    for suggestion in eval_xpath(dom, suggestion_xpath):
+        # append suggestion
+        results.append({'suggestion': extract_text(suggestion)})
+
+    for correction in eval_xpath(dom, spelling_suggestion_xpath):
+        results.append({'correction': extract_text(correction)})
 
     return results
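
For reference, the _setImagesSrc scraping performed by the new scrap_out_thumbs()
can be exercised standalone. A minimal sketch, assuming a page where Google inlines
thumbnails as base64 data: URLs and assigns them via _setImagesSrc (the sample
markup below is fabricated for illustration; real result pages embed many such
script blocks, and the second google.ldi JSON variant is omitted for brevity)::

    import re
    from lxml import html

    SAMPLE = """
    <html><body>
    <script>var s='data:image/jpeg;base64,/9j/4AAQSkZJRg\\x3d\\x3d';
    var ii=['vidthumb4'];_setImagesSrc(ii,s);</script>
    <img id="vidthumb4">
    </body></html>
    """

    def scrap_out_thumbs(dom):
        """Map image ids like 'vidthumb4' to their base64 data: URLs."""
        ret_val = {}
        for script in dom.xpath('//script[contains(., "_setImagesSrc")]'):
            _script = script.text
            # var s='data:image/jpeg;base64, ...'
            _imgdata = re.findall(r"s='([^']*)", _script)
            if not _imgdata:
                continue
            # var ii=['vidthumb4','vidthumb7']
            for _vidthumb in re.findall(r"(vidthumb\d+)", _script):
                # the escaped equal signs of the base64 padding must be decoded
                ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
        return ret_val

    dom = html.fromstring(SAMPLE)
    print(scrap_out_thumbs(dom))
    # {'vidthumb4': 'data:image/jpeg;base64,/9j/4AAQSkZJRg=='}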