author    Markus Heiser <markus.heiser@darmarit.de>    2021-01-22 17:16:46 +0100
committer Markus Heiser <markus.heiser@darmarit.de>    2021-01-24 09:39:30 +0100
commit    89b3050b5c406f795dd25d24f182cf173ad42774 (patch)
tree      0c59f73e97658a0596411f40ccaf5666fc1287e5 /searx
parent    f4a17acb7a4fa82c5cb629f4eaad11ef528f89e4 (diff)
download  searxng-89b3050b5c406f795dd25d24f182cf173ad42774.tar.gz
          searxng-89b3050b5c406f795dd25d24f182cf173ad42774.zip
[fix] revise of the google-Video engine

This revision is based on the methods developed in the revision of the
google engine (see commit 410c2f9).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
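The core method this commit adopts: Google inlines video thumbnails as base64-encoded ``data:`` URLs inside <script> blocks (hence the CSP note in the new module docstring) and assigns them to <img id="vidthumbN"> nodes. A minimal standalone sketch of that extraction; the script text below is illustrative, not a captured Google response:

    import re

    # illustrative <script> payload of the _setImagesSrc() form
    script = r"var s='data:image/jpeg;base64,/9j/4AAQ\x3d\x3d';var ii=['vidthumb4','vidthumb7'];_setImagesSrc(ii,s);"

    img_data = re.findall(r"s='([^']*)", script)   # the inlined data: URL
    ids = re.findall(r"(vidthumb\d+)", script)     # the <img> ids it belongs to

    # at least the escaped equal signs need decoding
    thumbs = {_id: img_data[0].replace(r"\x3d", "=") for _id in ids}
    print(thumbs['vidthumb4'])  # data:image/jpeg;base64,/9j/4AAQ==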
Diffstat (limited to 'searx')
-rw-r--r--    searx/engines/google_videos.py    248
1 file changed, 186 insertions(+), 62 deletions(-)
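A condensed sketch of the URL construction that the revised request() (in the diff below) performs; the parameter names follow the diff, while the google_domains entry shown here is illustrative (the real table lives in searx.engines.google):

    from urllib.parse import urlencode

    google_domains = {'DE': 'google.de'}   # excerpt, illustrative
    country, language, lang_country = 'DE', 'de', 'de-DE'

    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')
    query_url = 'https://' + subdomain + '/search?' + urlencode({
        'q': 'searx',
        'tbm': 'vid',                      # select the video vertical
        'hl': lang_country,
        'lr': 'lang_' + language,
        'ie': 'utf8',
        'oe': 'utf8',
    })
    query_url += '&' + urlencode({'tbs': 'qdr:w'})   # optional: limit to last week
    print(query_url)
    # https://www.google.de/search?q=searx&tbm=vid&hl=de-DE&lr=lang_de&ie=utf8&oe=utf8&tbs=qdr%3Aw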
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
index 61e01ca7b..486ba7ccd 100644
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -1,13 +1,58 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
- Google (Videos)
+Google (Videos)
+
+For a detailed description of the *REST-full* API see: `Query Parameter
+Definitions`_. Not all parameters can be applied.
+
+.. admonition:: Content-Security-Policy (CSP)
+
+ This engine needs to allow images from the `data URLs`_ (prefixed with the
+   ``data:`` scheme)::
+
+ Header set Content-Security-Policy "img-src 'self' data: ;"
+
+.. _Query Parameter Definitions:
+ https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+.. _data URLs:
+ https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
+
"""
-from datetime import date, timedelta
-from urllib.parse import urlencode
-from lxml import html
-from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
+# pylint: disable=invalid-name, missing-function-docstring
+
import re
+from urllib.parse import urlencode, urlparse
+from lxml import html
+
+from searx import logger
+from searx.exceptions import SearxEngineCaptchaException
+from searx.utils import (
+ eval_xpath,
+ eval_xpath_list,
+ extract_text,
+)
+
+from searx.engines.google import (
+ get_lang_country,
+ google_domains,
+ time_range_dict,
+ filter_mapping,
+ results_xpath,
+ g_section_with_header,
+ title_xpath,
+ href_xpath,
+ content_xpath,
+ suggestion_xpath,
+ spelling_suggestion_xpath
+)
+
+# pylint: disable=unused-import
+from searx.engines.google import (
+    supported_languages_url,
+    _fetch_supported_languages,
+)
+# pylint: enable=unused-import
# about
about = {
@@ -17,83 +62,162 @@ about = {
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
+ "template": 'video.html',
+ "parse": ('url', 'title', 'content', 'thumbnail')
}
+logger = logger.getChild('google video')
+
# engine dependent config
+
categories = ['videos']
-paging = True
-safesearch = True
+paging = False
+language_support = True
+use_locale_domain = True
time_range_support = True
-number_of_results = 10
+safesearch = True
-search_url = 'https://www.google.com/search'\
- '?q={query}'\
- '&tbm=vid'\
- '&{search_options}'
-time_range_attr = "qdr:{range}"
-time_range_custom_attr = "cdr:1,cd_min:{start},cd_max{end}"
-time_range_dict = {'day': 'd',
- 'week': 'w',
- 'month': 'm'}
+RE_CACHE = {}
+def _re(regexpr):
+ """returns compiled regular expression"""
+    if regexpr not in RE_CACHE:
+        RE_CACHE[regexpr] = re.compile(regexpr)
+ return RE_CACHE[regexpr]
-# do search-request
-def request(query, params):
- search_options = {
- 'ijn': params['pageno'] - 1,
- 'start': (params['pageno'] - 1) * number_of_results
- }
+def scrap_out_thumbs(dom):
+    """Scrape out thumbnail data from <script> tags."""
+ ret_val = dict()
+ thumb_name = 'vidthumb'
- if params['time_range'] in time_range_dict:
- search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']])
- elif params['time_range'] == 'year':
- now = date.today()
- then = now - timedelta(days=365)
- start = then.strftime('%m/%d/%Y')
- end = now.strftime('%m/%d/%Y')
- search_options['tbs'] = time_range_custom_attr.format(start=start, end=end)
+ for script in eval_xpath(dom, '//script[contains(., "_setImagesSrc")]'):
+ _script = script.text
+
+ # var s='data:image/jpeg;base64, ...'
+        _imgdata = _re("s='([^']*)").findall(_script)
+ if not _imgdata:
+ continue
+
+ # var ii=['vidthumb4','vidthumb7']
+ for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
+ # At least the equal sign in the URL needs to be decoded
+ ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
+
+ # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
+ for script in eval_xpath(dom, '//script[contains(., "google.ldi={")]'):
+ _script = script.text
+        for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall(_script):
+ match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
+ if match:
+ # At least the equal sign in the URL needs to be decoded
+ ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=")
- if safesearch and params['safesearch']:
- search_options['safe'] = 'on'
+ logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
+ return ret_val
- params['url'] = search_url.format(query=urlencode({'q': query}),
- search_options=urlencode(search_options))
+def request(query, params):
+ """Google-Video search request"""
+
+ language, country, lang_country = get_lang_country(
+ # pylint: disable=undefined-variable
+ params, supported_languages, language_aliases
+ )
+ subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')
+
+    query_url = 'https://' + subdomain + '/search?' + urlencode({
+ 'q': query,
+ 'tbm': "vid",
+ 'hl': lang_country,
+ 'lr': "lang_" + language,
+ 'ie': "utf8",
+ 'oe': "utf8",
+ })
+
+ if params['time_range'] in time_range_dict:
+ query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
+ if params['safesearch']:
+ query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
+
+ params['url'] = query_url
+ logger.debug("query_url --> %s", query_url)
+
+ # en-US,en;q=0.8,en;q=0.5
+ params['headers']['Accept-Language'] = (
+ "%s,%s;q=0.8,%s;q=0.5" % (lang_country, language, language))
+ logger.debug(
+ "HTTP Accept-Language --> %s", params['headers']['Accept-Language'])
+ params['headers']['Accept'] = (
+ 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+ )
+ #params['google_subdomain'] = subdomain
return params
-# get response from search-request
def response(resp):
+ """Get response from google's search request"""
results = []
+    # detect google sorry (CAPTCHA redirect); '/sorry/IndexRedirect' is
+    # covered by the startswith() test
+    resp_url = urlparse(resp.url)
+    if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
+        raise SearxEngineCaptchaException()
+
+ # which subdomain ?
+ # subdomain = resp.search_params.get('google_subdomain')
+
+ # convert the text to dom
dom = html.fromstring(resp.text)
+ vidthumb_imgdata = scrap_out_thumbs(dom)
# parse results
- for result in eval_xpath_list(dom, '//div[@class="g"]'):
-
- title = extract_text(eval_xpath(result, './/h3'))
- url = eval_xpath_getindex(result, './/div[@class="r"]/a/@href', 0)
- content = extract_text(eval_xpath(result, './/span[@class="st"]'))
-
- # get thumbnails
- script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
- ids = result.xpath('.//div[@class="s"]//img/@id')
- if len(ids) > 0:
- thumbnails_data = \
- re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0],
- script)
- tmp = []
- if len(thumbnails_data) != 0:
- tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
- thumbnail = ''
- if len(tmp) != 0:
- thumbnail = tmp[-1]
-
- # append result
- results.append({'url': url,
- 'title': title,
- 'content': content,
- 'thumbnail': thumbnail,
- 'template': 'videos.html'})
+ for result in eval_xpath_list(dom, results_xpath):
+
+ # google *sections*
+ if extract_text(eval_xpath(result, g_section_with_header)):
+            logger.debug("ignoring <g-section-with-header>")
+ continue
+
+ title = extract_text(eval_xpath(result, title_xpath)[0])
+ url = eval_xpath(result, href_xpath)[0]
+ c_node = eval_xpath(result, content_xpath)[0]
+
+ # <img id="vidthumb1" ...>
+ img_id = eval_xpath(c_node, './div[1]//a/g-img/img/@id')
+ if not img_id:
+ continue
+ img_id = img_id[0]
+ img_src = vidthumb_imgdata.get(img_id, None)
+ if not img_src:
+            logger.error("no vidthumb imgdata for: %s", img_id)
+ img_src = eval_xpath(c_node, './div[1]//a/g-img/img/@src')[0]
+
+ duration = extract_text(eval_xpath(c_node, './div[1]//a/span'))
+ content = extract_text(eval_xpath(c_node, './div[2]/span'))
+ pub_info = extract_text(eval_xpath(c_node, './div[2]/div'))
+
+ if len(duration) > 3:
+ content = duration + " - " + content
+ if pub_info:
+ content = content + " (%s)" % pub_info
+
+ results.append({
+ 'url': url,
+ 'title': title,
+ 'content': content,
+ 'thumbnail': img_src,
+ 'template': 'videos.html',
+ })
+
+ # parse suggestion
+ for suggestion in eval_xpath(dom, suggestion_xpath):
+ # append suggestion
+ results.append({'suggestion': extract_text(suggestion)})
+
+ for correction in eval_xpath(dom, spelling_suggestion_xpath):
+ results.append({'correction': extract_text(correction)})
return results
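One closing note on the RE_CACHE/_re helper added above: it memoizes compiled patterns keyed by the expression string. A functionally equivalent sketch built on functools.lru_cache (an equivalence illustration, not what the commit uses):

    import re
    from functools import lru_cache

    @lru_cache(maxsize=None)
    def _re(regexpr):
        """returns compiled regular expression"""
        return re.compile(regexpr)

    assert _re(r"vidthumb\d+") is _re(r"vidthumb\d+")   # compiled once, reused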