searx/engines/duckduckgo.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""DuckDuckGo Lite
"""

import json
from lxml import html

from searx.utils import (
    dict_subset,
    eval_xpath,
    eval_xpath_getindex,
    extract_text,
    match_language,
)
from searx import network
from searx.enginelib.traits import EngineTraits

traits: EngineTraits

# about
about = {
    "website": 'https://lite.duckduckgo.com/lite/',
    "wikidata_id": 'Q12805',
    "official_api_documentation": 'https://duckduckgo.com/api',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['general', 'web']
paging = True
supported_languages_url = 'https://duckduckgo.com/util/u588.js'
time_range_support = True
send_accept_language_header = True

language_aliases = {
    'ar-SA': 'ar-XA',
    'es-419': 'es-XL',
    'ja': 'jp-JP',
    'ko': 'kr-KR',
    'sl-SI': 'sl-SL',
    'zh-TW': 'tzh-TW',
    'zh-HK': 'tzh-HK',
}

time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}

# search-url
url = 'https://lite.duckduckgo.com/lite/'
url_ping = 'https://duckduckgo.com/t/sl_l'

# match query's language to a region code that duckduckgo will accept
def get_region_code(lang, lang_list=None):
    if lang == 'all':
        return None

    lang_code = match_language(lang, lang_list or [], language_aliases, 'wt-WT')
    lang_parts = lang_code.split('-')

    # country code goes first
    return lang_parts[1].lower() + '-' + lang_parts[0].lower()


def request(query, params):

    params['url'] = url
    params['method'] = 'POST'

    params['data']['q'] = query

    # The API is not documented, so we do some reverse engineering and emulate
    # what https://lite.duckduckgo.com/lite/ does when you press "next Page"
    # link again and again ..

    params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
    params['headers']['Referer'] = 'https://google.com/'

    # initial page does not have an offset
    if params['pageno'] == 2:
        # second page does have an offset of 30
        offset = (params['pageno'] - 1) * 30
        params['data']['s'] = offset
        params['data']['dc'] = offset + 1

    elif params['pageno'] > 2:
        # third and following pages do have an offset of 30 + n*50
        offset = 30 + (params['pageno'] - 2) * 50
        params['data']['s'] = offset
        params['data']['dc'] = offset + 1

    # initial page does not have additional data in the input form
    if params['pageno'] > 1:
        # request the second page (and more pages) needs 'o' and 'api' arguments
        params['data']['o'] = 'json'
        params['data']['api'] = 'd.js'

    # initial page does not have additional data in the input form
    if params['pageno'] > 2:
        # request the third page (and more pages) some more arguments
        params['data']['nextParams'] = ''
        params['data']['v'] = ''
        params['data']['vqd'] = ''

    region_code = get_region_code(params['language'], supported_languages)
    if region_code:
        params['data']['kl'] = region_code
        params['cookies']['kl'] = region_code

    params['data']['df'] = ''
    if params['time_range'] in time_range_dict:
        params['data']['df'] = time_range_dict[params['time_range']]
        params['cookies']['df'] = time_range_dict[params['time_range']]

    logger.debug("param data: %s", params['data'])
    logger.debug("param cookies: %s", params['cookies'])
    return params


# get response from search-request
def response(resp):

    headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie'])
    network.get(url_ping, headers=headers_ping)

    if resp.status_code == 303:
        return []

    results = []
    doc = html.fromstring(resp.text)

    result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
    if not len(result_table) >= 3:
        # no more results
        return []
    result_table = result_table[2]

    tr_rows = eval_xpath(result_table, './/tr')

    # In the last <tr> is the form of the 'previous/next page' links
    tr_rows = tr_rows[:-1]

    len_tr_rows = len(tr_rows)
    offset = 0

    while len_tr_rows >= offset + 4:

        # assemble table rows we need to scrap
        tr_title = tr_rows[offset]
        tr_content = tr_rows[offset + 1]
        offset += 4

        # ignore sponsored Adds <tr class="result-sponsored">
        if tr_content.get('class') == 'result-sponsored':
            continue

        a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None)
        if a_tag is None:
            continue

        td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None)
        if td_content is None:
            continue

        results.append(
            {
                'title': a_tag.text_content(),
                'content': extract_text(td_content),
                'url': a_tag.get('href'),
            }
        )

    return results


# get supported languages from their site
def _fetch_supported_languages(resp):

    # response is a js file with regions as an embedded object
    response_page = resp.text
    response_page = response_page[response_page.find('regions:{') + 8 :]
    response_page = response_page[: response_page.find('}') + 1]

    regions_json = json.loads(response_page)
    supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())

    return list(supported_languages)


def fetch_traits(engine_traits: EngineTraits):
    """Fetch regions from DuckDuckGo."""
    # pylint: disable=import-outside-toplevel

    engine_traits.data_type = 'supported_languages'  # deprecated

    import babel
    from searx.locales import region_tag

    engine_traits.all_locale = 'wt-wt'

    resp = network.get('https://duckduckgo.com/util/u588.js')
    if not resp.ok:
        print("ERROR: response from DuckDuckGo is not OK.")

    pos = resp.text.find('regions:{') + 8
    js_code = resp.text[pos:]
    pos = js_code.find('}') + 1
    regions = json.loads(js_code[:pos])

    reg_map = {
        'tw-tzh': 'zh_TW',
        'hk-tzh': 'zh_HK',
        'ct-ca': 'skip',  # ct-ca and es-ca both map to ca_ES
        'es-ca': 'ca_ES',
        'id-en': 'id_ID',
        'no-no': 'nb_NO',
        'jp-jp': 'ja_JP',
        'kr-kr': 'ko_KR',
        'xa-ar': 'ar_SA',
        'sl-sl': 'sl_SI',
        'th-en': 'th_TH',
        'vn-en': 'vi_VN',
    }

    for eng_tag, name in regions.items():

        if eng_tag == 'wt-wt':
            engine_traits.all_locale = 'wt-wt'
            continue

        region = reg_map.get(eng_tag)
        if region == 'skip':
            continue

        if not region:
            eng_territory, eng_lang = eng_tag.split('-')
            region = eng_lang + '_' + eng_territory.upper()

        try:
            sxng_tag = region_tag(babel.Locale.parse(region))
        except babel.UnknownLocaleError:
            print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region))
            continue

        conflict = engine_traits.regions.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.regions[sxng_tag] = eng_tag