summaryrefslogtreecommitdiff
path: root/searxng_extra/update/update_external_bangs.py
blob: 1d367a887e492cd0e750780add8d89e2227f315c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Update :origin:`searx/data/external_bangs.json` using the duckduckgo bangs
from :py:obj:`BANGS_URL`.

- :origin:`CI Update data ... <.github/workflows/data-update.yml>`

"""

import json
import httpx

from searx.external_bang import LEAF_KEY
from searx.data import data_dir

# Path of the generated JSON file (inside ``searx.data.data_dir``); it is
# (re)written by ``main()``.
DATA_FILE = data_dir / 'external_bangs.json'

# URL the bang definitions are downloaded from.
BANGS_URL = 'https://duckduckgo.com/bang.js'
"""JSON file which contains the bangs."""

# URL scheme prefixes *including* the trailing colon.  ``parse_ddg_bangs()``
# uses them to detect the protocol of a bang URL and to strip ``https:`` so
# that only the ``//host/...`` part is stored.
HTTPS_COLON = 'https:'
HTTP_COLON = 'http:'


def main():
    """Fetch the bang definitions and regenerate :py:obj:`DATA_FILE`.

    The JSON document at :py:obj:`BANGS_URL` is downloaded, converted into a
    trie by :py:func:`parse_ddg_bangs` and written to :py:obj:`DATA_FILE` as
    an object with the keys ``version`` and ``trie``.

    An HTTP error status raises (``raise_for_status``) before the data file is
    opened for writing.
    """
    print(f'fetch bangs from {BANGS_URL}')
    resp = httpx.get(BANGS_URL)
    resp.raise_for_status()

    # the trie is fully built *before* the output file is opened (truncated)
    bang_definitions = json.loads(resp.content.decode())
    payload = {'version': 0, 'trie': parse_ddg_bangs(bang_definitions)}

    with DATA_FILE.open('w', encoding="utf8") as out_file:
        json.dump(payload, out_file, indent=4, sort_keys=True, ensure_ascii=False)


def merge_when_no_leaf(node):
    """Collapse chains of trie nodes that carry no leaf (in place).

    Consider ``A -> B -> C`` where ``B`` is a child of ``A`` and ``C`` is a
    child of ``B``.  When ``B`` has no ``<LEAF_KEY>`` entry, ``B`` is removed
    from ``A`` and every ``C`` is attached directly to ``A`` under the
    concatenated key ``B + C``.  For example (5 nodes)::

      d -> d -> g -> <LEAF_KEY> (ddg)
        -> i -> g -> <LEAF_KEY> (dig)

    becomes (3 nodes)::

      d -> dg -> <LEAF_KEY>
        -> ig -> <LEAF_KEY>

    :param node: trie node to minimize; anything that is not a ``dict`` is
        ignored.  The dict is modified in place and nothing is returned.
    """
    if not isinstance(node, dict):
        return

    collapsed = False

    # iterate over a snapshot of the keys, ``node`` is modified in the loop
    for prefix in [k for k in node if k != LEAF_KEY]:
        child = node[prefix]
        suffixes = list(child.keys())

        if LEAF_KEY in suffixes:
            # ``child`` terminates a bang: keep it and minimize below it
            merge_when_no_leaf(child)
            continue

        # ``child`` is a pure pass-through node: lift its children into ``node``
        for suffix in suffixes:
            grandchild = child[suffix]
            node[prefix + suffix] = grandchild
            merge_when_no_leaf(grandchild)
        del node[prefix]
        collapsed = True

    if collapsed:
        # the lifted keys may themselves be mergeable: run another pass
        merge_when_no_leaf(node)


def optimize_leaf(parent, parent_key, node):
    """Replace nodes that only hold a leaf by the leaf value itself (in place).

    A ``dict`` whose single entry is ``<LEAF_KEY>`` is replaced in its parent
    by the value stored under ``<LEAF_KEY>``.  Any other ``dict`` is walked
    recursively.

    :param parent: dict that contains ``node``, or ``None`` for the root (the
        root itself is never replaced).
    :param parent_key: key under which ``node`` is stored in ``parent``.
    :param node: the trie node to inspect; non-dict values are ignored.
    """
    if not isinstance(node, dict):
        return

    only_a_leaf = parent is not None and len(node) == 1 and LEAF_KEY in node
    if only_a_leaf:
        parent[parent_key] = node[LEAF_KEY]
        return

    # The recursive call may rebind ``node[child_key]``; replacing the value of
    # an existing key does not disturb the iteration.
    for child_key, child in node.items():
        optimize_leaf(node, child_key, child)


def parse_ddg_bangs(ddg_bangs):
    """Convert the downloaded bang definitions into an optimized trie.

    Each item of ``ddg_bangs`` is read through the keys ``u`` (URL template),
    ``r`` (rank, only read when a new definition is created) and ``t`` (bang
    name).  Items whose URL does not contain the ``{{{s}}}`` placeholder are
    skipped.

    The value stored for a bang is ``<url> + chr(1) + <rank>`` where, in
    ``<url>``, the placeholder is replaced by ``chr(2)`` and a leading
    ``https:`` is removed (``https://example.com`` is stored as
    ``//example.com``).  A ``http://`` URL whose ``https://`` twin has already
    been seen reuses the definition of that twin.

    :param ddg_bangs: iterable of bang definitions (mappings).
    :return: the trie as nested dicts, after :py:func:`merge_when_no_leaf`
        and :py:func:`optimize_leaf` have been applied.
    """
    https_prefix = HTTPS_COLON + '//'
    http_prefix = HTTP_COLON + '//'

    trie_root = {}
    known_urls = {}  # normalized URL --> definition string

    for entry in ddg_bangs:
        url = entry['u']
        if '{{{s}}}' not in url:
            # ignore invalid bang
            continue
        url = url.replace('{{{s}}}', chr(2))

        # only for the https protocol: drop the scheme, keep "//example.com"
        if url.startswith(https_prefix):
            url = url[len(HTTPS_COLON) :]

        # for a http:// URL, look for the same URL already registered via https://
        https_twin = url[len(HTTP_COLON) :] if url.startswith(http_prefix) else None
        if https_twin is not None and https_twin in known_urls:
            definition = known_urls[https_twin]
        else:
            # new URL (or a URL seen before: the first definition is kept)
            definition = known_urls.setdefault(url, url + chr(1) + str(entry['r']))
        known_urls[url] = definition

        # walk / create the path of the bang name, one node per character
        cursor = trie_root
        for letter in entry['t']:
            cursor = cursor.setdefault(letter, {})
        # the first definition registered for a bang name wins
        cursor.setdefault(LEAF_KEY, definition)

    # optimize the trie
    merge_when_no_leaf(trie_root)
    optimize_leaf(None, None, trie_root)

    return trie_root


# Run the update only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()