diff options
-rw-r--r-- | searx/engines/google.py | 37 | ||||
-rw-r--r-- | searx/engines/google_images.py | 42 |
2 files changed, 43 insertions, 36 deletions
diff --git a/searx/engines/google.py b/searx/engines/google.py index 24d46b97c..093ad6bd7 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -99,9 +99,9 @@ time_range_dict = { # Filter results. 0: None, 1: Moderate, 2: Strict filter_mapping = { - 0 : 'off', - 1 : 'medium', - 2 : 'high' + 0: 'off', + 1: 'medium', + 2: 'high' } # specific xpath variables @@ -111,7 +111,7 @@ filter_mapping = { results_xpath = '//div[@class="g"]' # google *sections* are no usual *results*, we ignore them -g_section_with_header='./g-section-with-header' +g_section_with_header = './g-section-with-header' # the title is a h3 tag relative to the result group title_xpath = './/h3[1]' @@ -131,6 +131,7 @@ suggestion_xpath = '//div[contains(@class, "card-section")]//a' # *spelling suggestions*, we use them anyway. spelling_suggestion_xpath = '//div[@class="med"]/p/a' + def extract_text_from_dom(result, xpath): """returns extract_text on the first result selected by the xpath or None""" r = eval_xpath(result, xpath) @@ -138,6 +139,7 @@ def extract_text_from_dom(result, xpath): return extract_text(r[0]) return None + def get_lang_country(params, lang_list, custom_aliases): """Returns a tuple with *langauage* on its first and *country* on its second position.""" @@ -159,6 +161,7 @@ def get_lang_country(params, lang_list, custom_aliases): return language, country, lang_country + def request(query, params): """Google search request""" @@ -170,7 +173,7 @@ def request(query, params): subdomain = 'www.' + google_domains.get(country.upper(), 'google.com') # https://www.google.de/search?q=corona&hl=de-DE&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium - query_url = 'https://'+ subdomain + '/search' + "?" + urlencode({ + query_url = 'https://' + subdomain + '/search' + "?" + urlencode({ 'q': query, 'hl': lang_country, 'lr': "lang_" + language, @@ -190,16 +193,17 @@ def request(query, params): # en-US,en;q=0.8,en;q=0.5 params['headers']['Accept-Language'] = ( lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5' - ) + ) logger.debug("HTTP header Accept-Language --> %s", params['headers']['Accept-Language']) params['headers']['Accept'] = ( 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' - ) - #params['google_subdomain'] = subdomain + ) + # params['google_subdomain'] = subdomain return params + def response(resp): """Get response from google's search request""" results = [] @@ -249,16 +253,16 @@ def response(resp): url = eval_xpath(result, href_xpath)[0] content = extract_text_from_dom(result, content_xpath) results.append({ - 'url': url, - 'title': title, - 'content': content - }) + 'url': url, + 'title': title, + 'content': content + }) except Exception as e: # pylint: disable=broad-except logger.error(e, exc_info=True) - #from lxml import etree - #logger.debug(etree.tostring(result, pretty_print=True)) - #import pdb - #pdb.set_trace() + # from lxml import etree + # logger.debug(etree.tostring(result, pretty_print=True)) + # import pdb + # pdb.set_trace() continue # parse suggestion @@ -272,6 +276,7 @@ def response(resp): # return results return results + # get supported languages from their site def _fetch_supported_languages(resp): ret_val = {} diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index 75264eb9c..6ec242b53 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -33,15 +33,15 @@ from searx.engines.xpath import extract_text # pylint: disable=unused-import from searx.engines.google import ( - supported_languages_url - , _fetch_supported_languages + supported_languages_url, + _fetch_supported_languages, ) # pylint: enable=unused-import from searx.engines.google import ( - get_lang_country - , google_domains - , time_range_dict + get_lang_country, + google_domains, + time_range_dict, ) logger = logger.getChild('google images') @@ -56,11 +56,12 @@ time_range_support = True safesearch = True filter_mapping = { - 0 : 'images', - 1 : 'active', - 2 : 'active' + 0: 'images', + 1: 'active', + 2: 'active' } + def scrap_out_thumbs(dom): """Scrap out thumbnail data from <script> tags. """ @@ -68,13 +69,14 @@ def scrap_out_thumbs(dom): for script in eval_xpath(dom, '//script[contains(., "_setImgSrc(")]'): _script = script.text # _setImgSrc('0','data:image\/jpeg;base64,\/9j\/4AAQSkZJR ....'); - _thumb_no, _img_data = _script[len("_setImgSrc("):-2].split(",",1) - _thumb_no = _thumb_no.replace("'","") - _img_data = _img_data.replace("'","") + _thumb_no, _img_data = _script[len("_setImgSrc("):-2].split(",", 1) + _thumb_no = _thumb_no.replace("'", "") + _img_data = _img_data.replace("'", "") _img_data = _img_data.replace(r"\/", r"/") ret_val[_thumb_no] = _img_data.replace(r"\x3d", "=") return ret_val + def request(query, params): """Google-Video search request""" @@ -84,10 +86,10 @@ def request(query, params): ) subdomain = 'www.' + google_domains.get(country.upper(), 'google.com') - query_url = 'https://'+ subdomain + '/search' + "?" + urlencode({ - 'q': query, + query_url = 'https://' + subdomain + '/search' + "?" + urlencode({ + 'q': query, 'tbm': "isch", - 'hl': lang_country, + 'hl': lang_country, 'lr': "lang_" + language, 'ie': "utf8", 'oe': "utf8", @@ -108,8 +110,8 @@ def request(query, params): "HTTP Accept-Language --> %s", params['headers']['Accept-Language']) params['headers']['Accept'] = ( 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' - ) - #params['google_subdomain'] = subdomain + ) + # params['google_subdomain'] = subdomain return params @@ -196,10 +198,10 @@ def response(resp): }) except Exception as e: # pylint: disable=broad-except logger.error(e, exc_info=True) - #from lxml import etree - #logger.debug(etree.tostring(img_node, pretty_print=True)) - #import pdb - #pdb.set_trace() + # from lxml import etree + # logger.debug(etree.tostring(img_node, pretty_print=True)) + # import pdb + # pdb.set_trace() continue return results |