diff options
author | Bnyro <bnyro@tutanota.com> | 2024-11-25 17:00:52 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-11-25 17:00:52 +0100 |
commit | 66f6495a2253e1c5ec9abae35640ce2a951250e2 (patch) | |
tree | 61a09706feffdea1f732067681391dbb84fcb0fb | |
parent | 5bf3fbc93b93cd415e1460877051854c76ae0a28 (diff) | |
download | searxng-66f6495a2253e1c5ec9abae35640ce2a951250e2.tar.gz searxng-66f6495a2253e1c5ec9abae35640ce2a951250e2.zip |
[fix] duckduckgo extra: crashes and returns no results
-rw-r--r-- | searx/engines/duckduckgo.py | 98 | ||||
-rw-r--r-- | searx/engines/duckduckgo_extra.py | 18 |
2 files changed, 66 insertions, 50 deletions
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 82b37f9d1..d6c5be8f4 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -1,12 +1,14 @@ # SPDX-License-Identifier: AGPL-3.0-or-later """ -DuckDuckGo Lite -~~~~~~~~~~~~~~~ +DuckDuckGo WEB +~~~~~~~~~~~~~~ """ +from __future__ import annotations + from typing import TYPE_CHECKING import re -from urllib.parse import urlencode +from urllib.parse import urlencode, quote_plus import json import babel import lxml.html @@ -18,12 +20,12 @@ from searx import ( ) from searx.utils import ( eval_xpath, + extr, extract_text, ) from searx.network import get # see https://github.com/searxng/searxng/issues/762 from searx import redisdb from searx.enginelib.traits import EngineTraits -from searx.utils import extr from searx.exceptions import SearxEngineCaptchaException if TYPE_CHECKING: @@ -60,42 +62,30 @@ form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'} __CACHE = [] -def _cache_key(data: dict): - return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{data['q']}//{data['kl']}") +def _cache_key(query: str, region: str): + return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{query}//{region}") -def cache_vqd(data: dict, value): +def cache_vqd(query: str, region: str, value: str): """Caches a ``vqd`` value from a query.""" c = redisdb.client() if c: - logger.debug("cache vqd value: %s", value) - c.set(_cache_key(data), value, ex=600) + logger.debug("VALKEY cache vqd value: %s (%s)", value, region) + c.set(_cache_key(query, region), value, ex=600) else: - logger.debug("MEM cache vqd value: %s", value) + logger.debug("MEM cache vqd value: %s (%s)", value, region) if len(__CACHE) > 100: # cache vqd from last 100 queries __CACHE.pop(0) - __CACHE.append((_cache_key(data), value)) - + __CACHE.append((_cache_key(query, region), value)) -def get_vqd(data): - """Returns the ``vqd`` that fits to the *query* (``data`` from HTTP POST). - DDG's bot detection is sensitive to the ``vqd`` value. For some search terms - (such as extremely long search terms that are often sent by bots), no ``vqd`` - value can be determined. +def get_vqd(query: str, region: str, force_request: bool = False): + """Returns the ``vqd`` that fits to the *query*. - If SearXNG cannot determine a ``vqd`` value, then no request should go out - to DDG: - - A request with a wrong ``vqd`` value leads to DDG temporarily putting - SearXNG's IP on a block list. - - Requests from IPs in this block list run into timeouts. - - Not sure, but it seems the block list is a sliding window: to get my IP rid - from the bot list I had to cool down my IP for 1h (send no requests from - that IP to DDG). + :param query: The query term + :param region: DDG's region code + :param force_request: force a request to get a vqd value from DDG TL;DR; the ``vqd`` value is needed to pass DDG's bot protection and is used by all request to DDG: @@ -106,23 +96,46 @@ def get_vqd(data): - DuckDuckGo Videos: ``https://duckduckgo.com/v.js??q=...&vqd=...`` - DuckDuckGo News: ``https://duckduckgo.com/news.js??q=...&vqd=...`` + DDG's bot detection is sensitive to the ``vqd`` value. For some search terms + (such as extremely long search terms that are often sent by bots), no ``vqd`` + value can be determined. + + If SearXNG cannot determine a ``vqd`` value, then no request should go out + to DDG. + + .. attention:: + + A request with a wrong ``vqd`` value leads to DDG temporarily putting + SearXNG's IP on a block list. + + Requests from IPs in this block list run into timeouts. Not sure, but it + seems the block list is a sliding window: to get my IP rid from the bot list + I had to cool down my IP for 1h (send no requests from that IP to DDG). """ + key = _cache_key(query, region) - key = _cache_key(data) - value = None c = redisdb.client() if c: value = c.get(key) if value or value == b'': - value = value.decode('utf-8') + value = value.decode('utf-8') # type: ignore logger.debug("re-use CACHED vqd value: %s", value) return value - else: - for k, value in __CACHE: - if k == key: - logger.debug("MEM re-use CACHED vqd value: %s", value) + for k, value in __CACHE: + if k == key: + logger.debug("MEM re-use CACHED vqd value: %s", value) + return value + + if force_request: + resp = get(f'https://duckduckgo.com/?q={quote_plus(query)}') + if resp.status_code == 200: # type: ignore + value = extr(resp.text, 'vqd="', '"') # type: ignore + if value: + logger.debug("vqd value from DDG request: %s", value) + cache_vqd(query, region, value) return value + return None @@ -251,7 +264,7 @@ def request(query, params): for x in query.split() ] ) - eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) + eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore if eng_region == "wt-wt": # https://html.duckduckgo.com/html sets an empty value for "all". eng_region = "" @@ -310,10 +323,7 @@ def request(query, params): params['data']['v'] = form_data.get('v', 'l') params['headers']['Referer'] = url - # from here on no more params['data'] shuld be set, since this dict is - # needed to get a vqd value from the cache .. - - vqd = get_vqd(params['data']) + vqd = get_vqd(query, eng_region, force_request=False) # Certain conditions must be met in order to call up one of the # following pages ... @@ -362,7 +372,7 @@ def response(resp): form = form[0] form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0] - cache_vqd(resp.search_params["data"], form_vqd) + cache_vqd(resp.search_params['data']['q'], resp.search_params['data']['kl'], form_vqd) # just select "web-result" and ignore results of class "result--ad result--ad--small" for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'): @@ -379,7 +389,7 @@ def response(resp): results.append(item) zero_click_info_xpath = '//div[@id="zero_click_abstract"]' - zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip() + zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip() # type: ignore if zero_click and ( "Your IP address is" not in zero_click @@ -432,7 +442,7 @@ def fetch_traits(engine_traits: EngineTraits): if not resp.ok: # type: ignore print("ERROR: response from DuckDuckGo is not OK.") - js_code = extr(resp.text, 'regions:', ',snippetLengths') + js_code = extr(resp.text, 'regions:', ',snippetLengths') # type: ignore regions = json.loads(js_code) for eng_tag, name in regions.items(): @@ -466,7 +476,7 @@ def fetch_traits(engine_traits: EngineTraits): engine_traits.custom['lang_region'] = {} - js_code = extr(resp.text, 'languages:', ',regions') + js_code = extr(resp.text, 'languages:', ',regions') # type: ignore languages = js_variable_to_python(js_code) for eng_lang, name in languages.items(): diff --git a/searx/engines/duckduckgo_extra.py b/searx/engines/duckduckgo_extra.py index b30574d6c..6b0d4b98c 100644 --- a/searx/engines/duckduckgo_extra.py +++ b/searx/engines/duckduckgo_extra.py @@ -4,16 +4,15 @@ DuckDuckGo Extra (images, videos, news) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ """ +from __future__ import annotations + from datetime import datetime from typing import TYPE_CHECKING from urllib.parse import urlencode from searx.utils import get_embeded_stream_url from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import -from searx.engines.duckduckgo import ( - get_ddg_lang, - get_vqd, -) +from searx.engines.duckduckgo import get_ddg_lang, get_vqd from searx.enginelib.traits import EngineTraits if TYPE_CHECKING: @@ -48,15 +47,16 @@ search_path_map = {'images': 'i', 'videos': 'v', 'news': 'news'} def request(query, params): + eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore # request needs a vqd argument - vqd = get_vqd(query) + vqd = get_vqd(query, eng_region, force_request=True) + if not vqd: # some search terms do not have results and therefore no vqd value params['url'] = None return params - eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) eng_lang = get_ddg_lang(traits, params['searxng_locale']) args = { @@ -86,6 +86,12 @@ def request(query, params): params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}' + # sending these two headers prevents rate limiting for the query + params['headers'] = { + 'Referer': 'https://duckduckgo.com/', + 'X-Requested-With': 'XMLHttpRequest', + } + return params |