diff options
Diffstat (limited to 'searx')
-rw-r--r-- | searx/autocomplete.py | 24 | ||||
-rw-r--r-- | searx/data/engine_traits.json | 507 | ||||
-rw-r--r-- | searx/engines/duckduckgo.py | 343 | ||||
-rw-r--r-- | searx/engines/duckduckgo_definitions.py | 36 | ||||
-rw-r--r-- | searx/engines/duckduckgo_images.py | 113 | ||||
-rw-r--r-- | searx/engines/duckduckgo_weather.py | 35 | ||||
-rw-r--r-- | searx/sxng_locales.py | 5 |
7 files changed, 687 insertions, 376 deletions
diff --git a/searx/autocomplete.py b/searx/autocomplete.py index acea05c32..4eabd880f 100644 --- a/searx/autocomplete.py +++ b/searx/autocomplete.py @@ -61,14 +61,24 @@ def dbpedia(query, _lang): return results -def duckduckgo(query, _lang): - # duckduckgo autocompleter - url = 'https://ac.duckduckgo.com/ac/?{0}&type=list' +def duckduckgo(query, sxng_locale): + """Autocomplete from DuckDuckGo. Supports DuckDuckGo's languages""" - resp = loads(get(url.format(urlencode(dict(q=query)))).text) - if len(resp) > 1: - return resp[1] - return [] + traits = engines['duckduckgo'].traits + args = { + 'q': query, + 'kl': traits.get_region(sxng_locale, traits.all_locale), + } + + url = 'https://duckduckgo.com/ac/?type=list&' + urlencode(args) + resp = get(url) + + ret_val = [] + if resp.ok: + j = resp.json() + if len(j) > 1: + ret_val = j[1] + return ret_val def google(query, lang): diff --git a/searx/data/engine_traits.json b/searx/data/engine_traits.json index 09d7ab740..174a42a76 100644 --- a/searx/data/engine_traits.json +++ b/searx/data/engine_traits.json @@ -471,11 +471,106 @@ "zh_TW" ] }, - "ddg definitions": { + "duckduckgo": { "all_locale": "wt-wt", - "custom": {}, - "data_type": "supported_languages", - "languages": {}, + "custom": { + "lang_region": { + "ar-DZ": "ar_DZ", + "ar-JO": "ar_JO", + "ar-SA": "ar_SA", + "bn-IN": "bn_IN", + "de-CH": "de_CH", + "en-AU": "en_AU", + "en-CA": "en_CA", + "en-GB": "en_GB", + "es-AR": "es_AR", + "es-CL": "es_CL", + "es-CO": "es_CO", + "es-CR": "es_CR", + "es-EC": "es_EC", + "es-MX": "es_MX", + "es-PE": "es_PE", + "es-UY": "es_UY", + "es-VE": "es_VE", + "fr-BE": "fr_BE", + "fr-CA": "fr_CA", + "fr-CH": "fr_CH", + "nl-BE": "nl_BE", + "pt-BR": "pt_BR" + } + }, + "data_type": "traits_v1", + "languages": { + "af": "af_ZA", + "ar": "ar_EG", + "ast": "ast_ES", + "az_Latn": "az_AZ", + "be": "be_BY", + "bg": "bg_BG", + "bn": "bn_BD", + "br": "br_FR", + "bs_Latn": "bs_BA", + "ca": "ca_ES", + "cs": "cs_CZ", + "cy": "cy_GB", + "da": "da_DK", + 
"de": "de_DE", + "el": "el_GR", + "en": "en_US", + "eo": "eo_XX", + "es": "es_ES", + "et": "et_EE", + "eu": "eu_ES", + "fa": "fa_IR", + "fi": "fi_FI", + "fil": "tl_PH", + "fr": "fr_FR", + "ga": "ga_IE", + "gd": "gd_GB", + "gl": "gl_ES", + "he": "he_IL", + "hi": "hi_IN", + "hr": "hr_HR", + "hu": "hu_HU", + "hy": "hy_AM", + "id": "id_ID", + "is": "is_IS", + "it": "it_IT", + "ja": "ja_JP", + "kab": "kab_DZ", + "kn": "kn_IN", + "ko": "ko_KR", + "ku": "ku", + "kw": "kw_GB", + "lt": "lt_LT", + "lv": "lv_LV", + "ml": "ml_IN", + "mr": "mr_IN", + "ms": "ms_MY", + "nb": "nb_NO", + "nl": "nl_NL", + "nn": "nn_NO", + "pl": "pl_PL", + "pt": "pt_PT", + "ro": "ro_RO", + "ru": "ru_RU", + "sc": "sc_IT", + "si": "si_LK", + "sk": "sk_SK", + "sl": "sl_SI", + "sq": "sq_AL", + "sr_Cyrl": "sr_RS", + "sv": "sv_SE", + "ta": "ta_IN", + "te": "te_IN", + "th": "th_TH", + "tr": "tr_TR", + "uk": "uk_UA", + "ur": "ur_PK", + "vi": "vi_VN", + "zh_Hans": "zh_CN", + "zh_Hant": "zh_TW" + }, "regions": { "ar-SA": "xa-ar", "bg-BG": "bg-bg", @@ -539,77 +634,108 @@ "zh-HK": "hk-tzh", "zh-TW": "tw-tzh" }, - "supported_languages": [ - "ar-XA", - "bg-BG", - "ca-CT", - "ca-ES", - "cs-CZ", - "da-DK", - "de-AT", - "de-CH", - "de-DE", - "el-GR", - "en-AU", - "en-CA", - "en-ID", - "en-IE", - "en-IL", - "en-IN", - "en-MY", - "en-NZ", - "en-PH", - "en-PK", - "en-SG", - "en-TH", - "en-UK", - "en-US", - "en-VN", - "en-ZA", - "es-AR", - "es-CL", - "es-CO", - "es-ES", - "es-MX", - "es-PE", - "es-US", - "et-EE", - "fi-FI", - "fr-BE", - "fr-CA", - "fr-CH", - "fr-FR", - "hr-HR", - "hu-HU", - "it-IT", - "jp-JP", - "kr-KR", - "lt-LT", - "lv-LV", - "nl-BE", - "nl-NL", - "no-NO", - "pl-PL", - "pt-BR", - "pt-PT", - "ro-RO", - "ru-RU", - "sk-SK", - "sl-SL", - "sv-SE", - "tr-TR", - "tzh-HK", - "tzh-TW", - "uk-UA", - "wt-WT", - "zh-CN" - ] + "supported_languages": {} }, - "duckduckgo": { + "duckduckgo images": { "all_locale": "wt-wt", - "custom": {}, - "data_type": "supported_languages", - "languages": {}, + "custom": { + 
"lang_region": { + "ar-DZ": "ar_DZ", + "ar-JO": "ar_JO", + "ar-SA": "ar_SA", + "bn-IN": "bn_IN", + "de-CH": "de_CH", + "en-AU": "en_AU", + "en-CA": "en_CA", + "en-GB": "en_GB", + "es-AR": "es_AR", + "es-CL": "es_CL", + "es-CO": "es_CO", + "es-CR": "es_CR", + "es-EC": "es_EC", + "es-MX": "es_MX", + "es-PE": "es_PE", + "es-UY": "es_UY", + "es-VE": "es_VE", + "fr-BE": "fr_BE", + "fr-CA": "fr_CA", + "fr-CH": "fr_CH", + "nl-BE": "nl_BE", + "pt-BR": "pt_BR" + } + }, + "data_type": "traits_v1", + "languages": { + "af": "af_ZA", + "ar": "ar_EG", + "ast": "ast_ES", + "az_Latn": "az_AZ", + "be": "be_BY", + "bg": "bg_BG", + "bn": "bn_BD", + "br": "br_FR", + "bs_Latn": "bs_BA", + "ca": "ca_ES", + "cs": "cs_CZ", + "cy": "cy_GB", + "da": "da_DK", + "de": "de_DE", + "el": "el_GR", + "en": "en_US", + "eo": "eo_XX", + "es": "es_ES", + "et": "et_EE", + "eu": "eu_ES", + "fa": "fa_IR", + "fi": "fi_FI", + "fil": "tl_PH", + "fr": "fr_FR", + "ga": "ga_IE", + "gd": "gd_GB", + "gl": "gl_ES", + "he": "he_IL", + "hi": "hi_IN", + "hr": "hr_HR", + "hu": "hu_HU", + "hy": "hy_AM", + "id": "id_ID", + "is": "is_IS", + "it": "it_IT", + "ja": "ja_JP", + "kab": "kab_DZ", + "kn": "kn_IN", + "ko": "ko_KR", + "ku": "ku", + "kw": "kw_GB", + "lt": "lt_LT", + "lv": "lv_LV", + "ml": "ml_IN", + "mr": "mr_IN", + "ms": "ms_MY", + "nb": "nb_NO", + "nl": "nl_NL", + "nn": "nn_NO", + "pl": "pl_PL", + "pt": "pt_PT", + "ro": "ro_RO", + "ru": "ru_RU", + "sc": "sc_IT", + "si": "si_LK", + "sk": "sk_SK", + "sl": "sl_SI", + "sq": "sq_AL", + "sr_Cyrl": "sr_RS", + "sv": "sv_SE", + "ta": "ta_IN", + "te": "te_IN", + "th": "th_TH", + "tr": "tr_TR", + "uk": "uk_UA", + "ur": "ur_PK", + "vi": "vi_VN", + "zh_Hans": "zh_CN", + "zh_Hant": "zh_TW" + }, "regions": { "ar-SA": "xa-ar", "bg-BG": "bg-bg", @@ -673,77 +799,108 @@ "zh-HK": "hk-tzh", "zh-TW": "tw-tzh" }, - "supported_languages": [ - "ar-XA", - "bg-BG", - "ca-CT", - "ca-ES", - "cs-CZ", - "da-DK", - "de-AT", - "de-CH", - "de-DE", - "el-GR", - "en-AU", - "en-CA", - "en-ID", - 
"en-IE", - "en-IL", - "en-IN", - "en-MY", - "en-NZ", - "en-PH", - "en-PK", - "en-SG", - "en-TH", - "en-UK", - "en-US", - "en-VN", - "en-ZA", - "es-AR", - "es-CL", - "es-CO", - "es-ES", - "es-MX", - "es-PE", - "es-US", - "et-EE", - "fi-FI", - "fr-BE", - "fr-CA", - "fr-CH", - "fr-FR", - "hr-HR", - "hu-HU", - "it-IT", - "jp-JP", - "kr-KR", - "lt-LT", - "lv-LV", - "nl-BE", - "nl-NL", - "no-NO", - "pl-PL", - "pt-BR", - "pt-PT", - "ro-RO", - "ru-RU", - "sk-SK", - "sl-SL", - "sv-SE", - "tr-TR", - "tzh-HK", - "tzh-TW", - "uk-UA", - "wt-WT", - "zh-CN" - ] + "supported_languages": {} }, - "duckduckgo images": { + "duckduckgo weather": { "all_locale": "wt-wt", - "custom": {}, - "data_type": "supported_languages", - "languages": {}, + "custom": { + "lang_region": { + "ar-DZ": "ar_DZ", + "ar-JO": "ar_JO", + "ar-SA": "ar_SA", + "bn-IN": "bn_IN", + "de-CH": "de_CH", + "en-AU": "en_AU", + "en-CA": "en_CA", + "en-GB": "en_GB", + "es-AR": "es_AR", + "es-CL": "es_CL", + "es-CO": "es_CO", + "es-CR": "es_CR", + "es-EC": "es_EC", + "es-MX": "es_MX", + "es-PE": "es_PE", + "es-UY": "es_UY", + "es-VE": "es_VE", + "fr-BE": "fr_BE", + "fr-CA": "fr_CA", + "fr-CH": "fr_CH", + "nl-BE": "nl_BE", + "pt-BR": "pt_BR" + } + }, + "data_type": "traits_v1", + "languages": { + "af": "af_ZA", + "ar": "ar_EG", + "ast": "ast_ES", + "az_Latn": "az_AZ", + "be": "be_BY", + "bg": "bg_BG", + "bn": "bn_BD", + "br": "br_FR", + "bs_Latn": "bs_BA", + "ca": "ca_ES", + "cs": "cs_CZ", + "cy": "cy_GB", + "da": "da_DK", + "de": "de_DE", + "el": "el_GR", + "en": "en_US", + "eo": "eo_XX", + "es": "es_ES", + "et": "et_EE", + "eu": "eu_ES", + "fa": "fa_IR", + "fi": "fi_FI", + "fil": "tl_PH", + "fr": "fr_FR", + "ga": "ga_IE", + "gd": "gd_GB", + "gl": "gl_ES", + "he": "he_IL", + "hi": "hi_IN", + "hr": "hr_HR", + "hu": "hu_HU", + "hy": "hy_AM", + "id": "id_ID", + "is": "is_IS", + "it": "it_IT", + "ja": "ja_JP", + "kab": "kab_DZ", + "kn": "kn_IN", + "ko": "ko_KR", + "ku": "ku", + "kw": "kw_GB", + "lt": "lt_LT", + "lv": "lv_LV", 
+ "ml": "ml_IN", + "mr": "mr_IN", + "ms": "ms_MY", + "nb": "nb_NO", + "nl": "nl_NL", + "nn": "nn_NO", + "pl": "pl_PL", + "pt": "pt_PT", + "ro": "ro_RO", + "ru": "ru_RU", + "sc": "sc_IT", + "si": "si_LK", + "sk": "sk_SK", + "sl": "sl_SI", + "sq": "sq_AL", + "sr_Cyrl": "sr_RS", + "sv": "sv_SE", + "ta": "ta_IN", + "te": "te_IN", + "th": "th_TH", + "tr": "tr_TR", + "uk": "uk_UA", + "ur": "ur_PK", + "vi": "vi_VN", + "zh_Hans": "zh_CN", + "zh_Hant": "zh_TW" + }, "regions": { "ar-SA": "xa-ar", "bg-BG": "bg-bg", @@ -807,71 +964,7 @@ "zh-HK": "hk-tzh", "zh-TW": "tw-tzh" }, - "supported_languages": [ - "ar-XA", - "bg-BG", - "ca-CT", - "ca-ES", - "cs-CZ", - "da-DK", - "de-AT", - "de-CH", - "de-DE", - "el-GR", - "en-AU", - "en-CA", - "en-ID", - "en-IE", - "en-IL", - "en-IN", - "en-MY", - "en-NZ", - "en-PH", - "en-PK", - "en-SG", - "en-TH", - "en-UK", - "en-US", - "en-VN", - "en-ZA", - "es-AR", - "es-CL", - "es-CO", - "es-ES", - "es-MX", - "es-PE", - "es-US", - "et-EE", - "fi-FI", - "fr-BE", - "fr-CA", - "fr-CH", - "fr-FR", - "hr-HR", - "hu-HU", - "it-IT", - "jp-JP", - "kr-KR", - "lt-LT", - "lv-LV", - "nl-BE", - "nl-NL", - "no-NO", - "pl-PL", - "pt-BR", - "pt-PT", - "ro-RO", - "ru-RU", - "sk-SK", - "sl-SL", - "sv-SE", - "tr-TR", - "tzh-HK", - "tzh-TW", - "uk-UA", - "wt-WT", - "zh-CN" - ] + "supported_languages": {} }, "google": { "all_locale": "ZZ", diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index cb47122ae..85e977bdb 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -1,73 +1,207 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""DuckDuckGo Lite +""" +DuckDuckGo Lite +~~~~~~~~~~~~~~~ """ +from typing import TYPE_CHECKING +from urllib.parse import urlencode import json -from lxml import html +import babel +import lxml.html +from searx import ( + network, + locales, + redislib, +) +from searx import redisdb from searx.utils import ( - dict_subset, eval_xpath, eval_xpath_getindex, extract_text, - 
match_language, ) -from searx import network from searx.enginelib.traits import EngineTraits +from searx.exceptions import SearxEngineAPIException + +if TYPE_CHECKING: + import logging + + logger: logging.Logger traits: EngineTraits -# about about = { "website": 'https://lite.duckduckgo.com/lite/', "wikidata_id": 'Q12805', - "official_api_documentation": 'https://duckduckgo.com/api', "use_official_api": False, "require_api_key": False, "results": 'HTML', } +send_accept_language_header = True +"""DuckDuckGo-Lite tries to guess user's prefered language from the HTTP +``Accept-Language``. Optional the user can select a region filter (but not a +language). +""" + # engine dependent config categories = ['general', 'web'] paging = True -supported_languages_url = 'https://duckduckgo.com/util/u588.js' time_range_support = True -send_accept_language_header = True +safesearch = True # user can't select but the results are filtered -language_aliases = { - 'ar-SA': 'ar-XA', - 'es-419': 'es-XL', - 'ja': 'jp-JP', - 'ko': 'kr-KR', - 'sl-SI': 'sl-SL', - 'zh-TW': 'tzh-TW', - 'zh-HK': 'tzh-HK', -} +url = 'https://lite.duckduckgo.com/lite/' +# url_ping = 'https://duckduckgo.com/t/sl_l' time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} +form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'} -# search-url -url = 'https://lite.duckduckgo.com/lite/' -url_ping = 'https://duckduckgo.com/t/sl_l' -# match query's language to a region code that duckduckgo will accept -def get_region_code(lang, lang_list=None): - if lang == 'all': - return None +def cache_vqd(query, value): + """Caches a ``vqd`` value from a query. 
+ + The vqd value depends on the query string and is needed for the follow up + pages or the images loaded by an XMLHttpRequest: + + - DuckDuckGo Web: `https://links.duckduckgo.com/d.js?q=...&vqd=...` + - DuckDuckGo Images: `https://duckduckgo.com/i.js?q=...&vqd=...` + + """ + c = redisdb.client() + if c: + logger.debug("cache vqd value: %s", value) + key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query) + c.set(key, value, ex=600) + + +def get_vqd(query, headers): + """Returns the ``vqd`` that fits to the *query*. If there is no ``vqd`` cached + (:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the + response. + + """ + value = None + c = redisdb.client() + if c: + key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query) + value = c.get(key) + if value: + value = value.decode('utf-8') + logger.debug("re-use cached vqd value: %s", value) + return value - lang_code = match_language(lang, lang_list or [], language_aliases, 'wt-WT') - lang_parts = lang_code.split('-') + query_url = 'https://duckduckgo.com/?{query}&iar=images'.format(query=urlencode({'q': query})) + res = network.get(query_url, headers=headers) + content = res.text + if content.find('vqd=\'') == -1: + raise SearxEngineAPIException('Request failed') + value = content[content.find('vqd=\'') + 5 :] + value = value[: value.find('\'')] + logger.debug("new vqd value: %s", value) + cache_vqd(query, value) + return value - # country code goes first - return lang_parts[1].lower() + '-' + lang_parts[0].lower() + +def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'): + """Get DuckDuckGo's language identifier from SearXNG's locale. + + DuckDuckGo defines its languages by region codes (see + :py:obj:`fetch_traits`). + + To get region and language of a DDG service use: + + .. 
code: python + + eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) + eng_lang = get_ddg_lang(traits, params['searxng_locale']) + + It might confuse, but the ``l`` value of the cookie is what SearXNG calls + the *region*: + + .. code:: python + + # !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'} + params['cookies']['ad'] = eng_lang + params['cookies']['ah'] = eng_region + params['cookies']['l'] = eng_region + + .. hint:: + + `DDG-lite <https://lite.duckduckgo.com/lite>`__ does not offer a language + selection to the user, only a region can be selected by the user + (``eng_region`` from the example above). DDG-lite stores the selected + region in a cookie:: + + params['cookies']['kl'] = eng_region # 'ar-es' + + """ + return eng_traits.custom['lang_region'].get(sxng_locale, eng_traits.get_language(sxng_locale, default)) + + +ddg_reg_map = { + 'tw-tzh': 'zh_TW', + 'hk-tzh': 'zh_HK', + 'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES + 'es-ca': 'ca_ES', + 'id-en': 'id_ID', + 'no-no': 'nb_NO', + 'jp-jp': 'ja_JP', + 'kr-kr': 'ko_KR', + 'xa-ar': 'ar_SA', + 'sl-sl': 'sl_SI', + 'th-en': 'th_TH', + 'vn-en': 'vi_VN', +} + +ddg_lang_map = { + # use ar --> ar_EG (Egypt's arabic) + "ar_DZ": 'lang_region', + "ar_JO": 'lang_region', + "ar_SA": 'lang_region', + # use bn --> bn_BD + 'bn_IN': 'lang_region', + # use de --> de_DE + 'de_CH': 'lang_region', + # use en --> en_US, + 'en_AU': 'lang_region', + 'en_CA': 'lang_region', + 'en_GB': 'lang_region', + # Esperanto + 'eo_XX': 'eo', + # use es --> es_ES, + 'es_AR': 'lang_region', + 'es_CL': 'lang_region', + 'es_CO': 'lang_region', + 'es_CR': 'lang_region', + 'es_EC': 'lang_region', + 'es_MX': 'lang_region', + 'es_PE': 'lang_region', + 'es_UY': 'lang_region', + 'es_VE': 'lang_region', + # use fr --> rf_FR + 'fr_CA': 'lang_region', + 'fr_CH': 'lang_region', + 'fr_BE': 'lang_region', + # use nl --> nl_NL + 'nl_BE': 'lang_region', + # use pt --> pt_PT + 'pt_BR': 'lang_region', + # skip these 
languages + 'od_IN': 'skip', + 'io_XX': 'skip', + 'tokipona_XX': 'skip', +} def request(query, params): + eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) + # eng_lang = get_ddg_lang(traits, params['searxng_locale']) + params['url'] = url params['method'] = 'POST' - params['data']['q'] = query # The API is not documented, so we do some reverse engineering and emulate @@ -90,23 +224,19 @@ def request(query, params): params['data']['s'] = offset params['data']['dc'] = offset + 1 + # request needs a vqd argument + params['data']['vqd'] = get_vqd(query, params["headers"]) + # initial page does not have additional data in the input form if params['pageno'] > 1: - # request the second page (and more pages) needs 'o' and 'api' arguments - params['data']['o'] = 'json' - params['data']['api'] = 'd.js' - # initial page does not have additional data in the input form - if params['pageno'] > 2: - # request the third page (and more pages) some more arguments - params['data']['nextParams'] = '' - params['data']['v'] = '' - params['data']['vqd'] = '' + params['data']['o'] = form_data.get('o', 'json') + params['data']['api'] = form_data.get('api', 'd.js') + params['data']['nextParams'] = form_data.get('nextParams', '') + params['data']['v'] = form_data.get('v', 'l') - region_code = get_region_code(params['language'], supported_languages) - if region_code: - params['data']['kl'] = region_code - params['cookies']['kl'] = region_code + params['data']['kl'] = eng_region + params['cookies']['kl'] = eng_region params['data']['df'] = '' if params['time_range'] in time_range_dict: @@ -118,26 +248,40 @@ def request(query, params): return params -# get response from search-request def response(resp): - headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie']) - network.get(url_ping, headers=headers_ping) - if resp.status_code == 303: return [] results = [] - doc = html.fromstring(resp.text) + doc = 
lxml.html.fromstring(resp.text) result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') - if not len(result_table) >= 3: + + if len(result_table) == 2: + # some locales (at least China) does not have a "next page" button and + # the layout of the HTML tables is different. + result_table = result_table[1] + elif not len(result_table) >= 3: # no more results return [] - result_table = result_table[2] + else: + result_table = result_table[2] + # update form data from response + form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..') + if len(form): + + form = form[0] + form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0] + form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0] + form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0] + logger.debug('form_data: %s', form_data) + + value = eval_xpath(form, '//input[@name="vqd"]/@value')[0] + query = resp.search_params['data']['q'] + cache_vqd(query, value) tr_rows = eval_xpath(result_table, './/tr') - # In the last <tr> is the form of the 'previous/next page' links tr_rows = tr_rows[:-1] @@ -174,32 +318,35 @@ def response(resp): return results -# get supported languages from their site -def _fetch_supported_languages(resp): - - # response is a js file with regions as an embedded object - response_page = resp.text - response_page = response_page[response_page.find('regions:{') + 8 :] - response_page = response_page[: response_page.find('}') + 1] - - regions_json = json.loads(response_page) - supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys()) +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages & regions from DuckDuckGo. - return list(supported_languages) + SearXNG's ``all`` locale maps DuckDuckGo's "Alle regions" (``wt-wt``). 
+ DuckDuckGo's language "Browsers preferred language" (``wt_WT``) makes no + sense in a SearXNG request since SearXNG's ``all`` will not add a + ``Accept-Language`` HTTP header. The value in ``engine_traits.all_locale`` + is ``wt-wt`` (the region). + Besides regions DuckDuckGo also defines its languages by region codes. For + example these are the English languages in DuckDuckGo: -def fetch_traits(engine_traits: EngineTraits): - """Fetch regions from DuckDuckGo.""" - # pylint: disable=import-outside-toplevel + - en_US + - en_AU + - en_CA + - en_GB - engine_traits.data_type = 'supported_languages' # deprecated + The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from + SearXNG's locale. - import babel - from searx.locales import region_tag + """ + # pylint: disable=too-many-branches, too-many-statements + # fetch regions engine_traits.all_locale = 'wt-wt' - resp = network.get('https://duckduckgo.com/util/u588.js') + # updated from u588 to u661 / should be updated automatically? 
+ resp = network.get('https://duckduckgo.com/util/u661.js') + if not resp.ok: print("ERROR: response from DuckDuckGo is not OK.") @@ -208,28 +355,13 @@ def fetch_traits(engine_traits: EngineTraits): pos = js_code.find('}') + 1 regions = json.loads(js_code[:pos]) - reg_map = { - 'tw-tzh': 'zh_TW', - 'hk-tzh': 'zh_HK', - 'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES - 'es-ca': 'ca_ES', - 'id-en': 'id_ID', - 'no-no': 'nb_NO', - 'jp-jp': 'ja_JP', - 'kr-kr': 'ko_KR', - 'xa-ar': 'ar_SA', - 'sl-sl': 'sl_SI', - 'th-en': 'th_TH', - 'vn-en': 'vi_VN', - } - for eng_tag, name in regions.items(): if eng_tag == 'wt-wt': engine_traits.all_locale = 'wt-wt' continue - region = reg_map.get(eng_tag) + region = ddg_reg_map.get(eng_tag) if region == 'skip': continue @@ -238,7 +370,7 @@ def fetch_traits(engine_traits: EngineTraits): region = eng_lang + '_' + eng_territory.upper() try: - sxng_tag = region_tag(babel.Locale.parse(region)) + sxng_tag = locales.region_tag(babel.Locale.parse(region)) except babel.UnknownLocaleError: print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region)) continue @@ -249,3 +381,42 @@ def fetch_traits(engine_traits: EngineTraits): print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) continue engine_traits.regions[sxng_tag] = eng_tag + + # fetch languages + + engine_traits.custom['lang_region'] = {} + + pos = resp.text.find('languages:{') + 10 + js_code = resp.text[pos:] + pos = js_code.find('}') + 1 + js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"') + languages = json.loads(js_code) + + for eng_lang, name in languages.items(): + + if eng_lang == 'wt_WT': + continue + + babel_tag = ddg_lang_map.get(eng_lang, eng_lang) + if babel_tag == 'skip': + continue + + try: + + if babel_tag == 'lang_region': + sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang)) + engine_traits.custom['lang_region'][sxng_tag] = eng_lang + continue + + sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag)) + + 
except babel.UnknownLocaleError: + print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang)) + continue + + conflict = engine_traits.languages.get(sxng_tag) + if conflict: + if conflict != eng_lang: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang)) + continue + engine_traits.languages[sxng_tag] = eng_lang diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 8b42799be..39fed87e7 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -1,23 +1,33 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""DuckDuckGo (Instant Answer API) +""" +DuckDuckGo Instant Answer API +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented but from +reverse engineering we can see that some services (e.g. instant answers) still +in use from the DDG search engine. + +As far we can say the *instant answers* API does not support languages, or at +least we could not find out how language support should work. It seems that +most of the features are based on English terms. 
""" -import json +from typing import TYPE_CHECKING + from urllib.parse import urlencode, urlparse, urljoin from lxml import html from searx.data import WIKIDATA_UNITS -from searx.engines.duckduckgo import language_aliases -from searx.engines.duckduckgo import ( # pylint: disable=unused-import - fetch_traits, - _fetch_supported_languages, - supported_languages_url, -) -from searx.utils import extract_text, html_to_text, match_language, get_string_replaces_function +from searx.utils import extract_text, html_to_text, get_string_replaces_function from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom +if TYPE_CHECKING: + import logging + + logger: logging.Logger + # about about = { "website": 'https://duckduckgo.com/', @@ -38,7 +48,7 @@ replace_http_by_https = get_string_replaces_function({'http:': 'https:'}) def is_broken_text(text): - """duckduckgo may return something like "<a href="xxxx">http://somewhere Related website<a/>" + """duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>`` The href URL is broken, the "Related website" may contains some HTML. 
@@ -63,8 +73,6 @@ def result_to_text(text, htmlResult): def request(query, params): params['url'] = URL.format(query=urlencode({'q': query})) - language = match_language(params['language'], supported_languages, language_aliases) - language = language.split('-')[0] return params @@ -72,7 +80,7 @@ def response(resp): # pylint: disable=too-many-locals, too-many-branches, too-many-statements results = [] - search_res = json.loads(resp.text) + search_res = resp.json() # search_res.get('Entity') possible values (not exhaustive) : # * continent / country / department / location / waterfall @@ -236,7 +244,7 @@ def unit_to_str(unit): def area_to_str(area): - """parse {'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}""" + """parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``""" unit = unit_to_str(area.get('unit')) if unit is not None: try: diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py index 927bc6cff..d8a6f1340 100644 --- a/searx/engines/duckduckgo_images.py +++ b/searx/engines/duckduckgo_images.py @@ -1,27 +1,30 @@ # SPDX-License-Identifier: AGPL-3.0-or-later """ - DuckDuckGo (Images) +DuckDuckGo Images +~~~~~~~~~~~~~~~~~ """ -from json import loads +from typing import TYPE_CHECKING from urllib.parse import urlencode -from searx.exceptions import SearxEngineAPIException -from searx.engines.duckduckgo import get_region_code -from searx.engines.duckduckgo import ( # pylint: disable=unused-import - fetch_traits, - _fetch_supported_languages, - supported_languages_url, + +from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import +from searx.engines.duckduckgo import ( + get_ddg_lang, + get_vqd, ) -from searx.network import get +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits # about about = { "website": 'https://duckduckgo.com/', "wikidata_id": 'Q12805', - 
"official_api_documentation": { - 'url': 'https://duckduckgo.com/api', - 'comment': 'but images are not supported', - }, "use_official_api": False, "require_api_key": False, "results": 'JSON (site requires js to get images)', @@ -33,70 +36,64 @@ paging = True safesearch = True send_accept_language_header = True -# search-url -images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}' -site_url = 'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images' +safesearch_cookies = {0: '-2', 1: None, 2: '1'} +safesearch_args = {0: '1', 1: None, 2: '1'} -# run query in site to get vqd number needed for requesting images -# TODO: find a way to get this number without an extra request (is it a hash of the query?) -def get_vqd(query, headers): - query_url = site_url.format(query=urlencode({'q': query})) - res = get(query_url, headers=headers) - content = res.text - if content.find('vqd=\'') == -1: - raise SearxEngineAPIException('Request failed') - vqd = content[content.find('vqd=\'') + 5 :] - vqd = vqd[: vqd.find('\'')] - return vqd +def request(query, params): + eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) + eng_lang = get_ddg_lang(traits, params['searxng_locale']) -# do search-request -def request(query, params): - # to avoid running actual external requests when testing - if 'is_test' not in params: - vqd = get_vqd(query, params['headers']) - else: - vqd = '12345' + args = { + 'q': query, + 'o': 'json', + # 'u': 'bing', + 'l': eng_region, + 'vqd': get_vqd(query, params["headers"]), + } - offset = (params['pageno'] - 1) * 50 + if params['pageno'] > 1: + args['s'] = (params['pageno'] - 1) * 100 - safesearch = params['safesearch'] - 1 + params['cookies']['ad'] = eng_lang # zh_CN + params['cookies']['ah'] = eng_region # "us-en,de-de" + params['cookies']['l'] = eng_region # "hk-tzh" + logger.debug("cookies: %s", params['cookies']) - region_code = get_region_code(params['language'], 
lang_list=supported_languages) - if region_code: - params['url'] = images_url.format( - query=urlencode({'q': query, 'l': region_code}), offset=offset, safesearch=safesearch, vqd=vqd - ) - else: - params['url'] = images_url.format(query=urlencode({'q': query}), offset=offset, safesearch=safesearch, vqd=vqd) + safe_search = safesearch_cookies.get(params['safesearch']) + if safe_search is not None: + params['cookies']['p'] = safe_search # "-2", "1" + safe_search = safesearch_args.get(params['safesearch']) + if safe_search is not None: + args['p'] = safe_search # "-1", "1" + + args = urlencode(args) + params['url'] = 'https://duckduckgo.com/i.js?{args}&f={f}'.format(args=args, f=',,,,,') + + params['headers']['Accept'] = 'application/json, text/javascript, */*; q=0.01' + params['headers']['Referer'] = 'https://duckduckgo.com/' + params['headers']['X-Requested-With'] = 'XMLHttpRequest' + logger.debug("headers: %s", params['headers']) return params -# get response from search-request def response(resp): results = [] + res_json = resp.json() - content = resp.text - res_json = loads(content) - - # parse results for result in res_json['results']: - title = result['title'] - url = result['url'] - thumbnail = result['thumbnail'] - image = result['image'] - - # append result results.append( { 'template': 'images.html', - 'title': title, + 'title': result['title'], 'content': '', - 'thumbnail_src': thumbnail, - 'img_src': image, - 'url': url, + 'thumbnail_src': result['thumbnail'], + 'img_src': result['image'], + 'url': result['url'], + 'img_format': '%s x %s' % (result['width'], result['height']), + 'source': result['source'], } ) diff --git a/searx/engines/duckduckgo_weather.py b/searx/engines/duckduckgo_weather.py index 0540cbcb5..4f0ce1b49 100644 --- a/searx/engines/duckduckgo_weather.py +++ b/searx/engines/duckduckgo_weather.py @@ -1,13 +1,29 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""DuckDuckGo Weather""" +""" +DuckDuckGo Weather 
+~~~~~~~~~~~~~~~~~~ +""" +from typing import TYPE_CHECKING from json import loads from urllib.parse import quote from datetime import datetime from flask_babel import gettext +from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import +from searx.engines.duckduckgo import get_ddg_lang +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + + about = { "website": 'https://duckduckgo.com/', "wikidata_id": 'Q12805', @@ -17,9 +33,11 @@ about = { "results": "JSON", } -categories = ["others"] +send_accept_language_header = True -url = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}" +# engine dependent config +categories = ["others"] +URL = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}" def generate_condition_table(condition): @@ -72,8 +90,17 @@ def generate_day_table(day): def request(query, params): - params["url"] = url.format(query=quote(query), lang=params['language'].split('-')[0]) + eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) + eng_lang = get_ddg_lang(traits, params['searxng_locale']) + + # !ddw paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'} + params['cookies']['ad'] = eng_lang + params['cookies']['ah'] = eng_region + params['cookies']['l'] = eng_region + logger.debug("cookies: %s", params['cookies']) + + params["url"] = URL.format(query=quote(query), lang=eng_lang.split('_')[0]) return params diff --git a/searx/sxng_locales.py b/searx/sxng_locales.py index 4a722ef71..b6ae85848 100644 --- a/searx/sxng_locales.py +++ b/searx/sxng_locales.py @@ -49,10 +49,13 @@ sxng_locales = ( ('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'), ('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'), ('he', 'עברית', '', 'Hebrew', '\U0001f1ee\U0001f1f7'), + ('hi', 'हिन्दी', '', 'Hindi', '\U0001f310'), + ('hr', 'Hrvatski', '', 'Croatian', '\U0001f310'), ('hu', 'Magyar', '', 'Hungarian', 
'\U0001f310'), ('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'), ('id', 'Indonesia', '', 'Indonesian', '\U0001f310'), ('id-ID', 'Indonesia', 'Indonesia', 'Indonesian', '\U0001f1ee\U0001f1e9'), + ('is', 'Íslenska', '', 'Icelandic', '\U0001f310'), ('it', 'Italiano', '', 'Italian', '\U0001f310'), ('it-CH', 'Italiano', 'Svizzera', 'Italian', '\U0001f1e8\U0001f1ed'), ('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'), @@ -84,6 +87,8 @@ sxng_locales = ( ('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'), ('tr', 'Türkçe', '', 'Turkish', '\U0001f310'), ('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'), + ('uk', 'Українська', '', 'Ukrainian', '\U0001f310'), + ('vi', 'Tiếng Việt', '', 'Vietnamese', '\U0001f310'), ('zh', '中文', '', 'Chinese', '\U0001f310'), ('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'), ('zh-HK', '中文', '中國香港', 'Chinese', '\U0001f1ed\U0001f1f0'), |