# SPDX-License-Identifier: AGPL-3.0-or-later
"""This module implements the Wikipedia engine.  Some of this implementations
are shared by other engines:

- :ref:`wikidata engine`

The list of supported languages is :py:obj:`fetched <fetch_wikimedia_traits>` from
the article linked by :py:obj:`list_of_wikipedias`.

Unlike traditional search engines, Wikipedia does not offer one site for all
languages; instead there is a separate Wikipedia for each supported language.
Some of these Wikipedias have a LanguageConverter_ enabled
(:py:obj:`rest_v1_summary_url`).

A LanguageConverter_ (LC) is a system based on language variants that
automatically converts the content of a page into a different variant. A variant
is mostly the same language in a different script.

- `Wikipedias in multiple writing systems`_
- `Automatic conversion between traditional and simplified Chinese characters`_

PR-2554_:
  The Wikipedia link returned by the API is still the same in all cases
  (`https://zh.wikipedia.org/wiki/出租車`_), but if your browser's
  ``Accept-Language`` header is set to any of ``zh``, ``zh-CN``, ``zh-TW``,
  ``zh-HK`` or another variant, Wikipedia's LC automatically returns the
  desired script in its web page.

  - You can test the API here: https://reqbin.com/gesg2kvx

.. _https://zh.wikipedia.org/wiki/出租車:
   https://zh.wikipedia.org/wiki/%E5%87%BA%E7%A7%9F%E8%BB%8A

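A minimal sketch of the same request outside of SearXNG, using only the
standard library (the URL and the ``Accept-Language`` value are the ones from
the example above):

.. code:: python

   import urllib.request

   req = urllib.request.Request(
       'https://zh.wikipedia.org/api/rest_v1/page/summary/%E5%87%BA%E7%A7%9F%E8%BB%8A',
       headers={'Accept-Language': 'zh-TW'},
   )
   with urllib.request.urlopen(req) as resp:
       print(resp.read().decode('utf-8'))  # summary in the zh-TW variant
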
To support Wikipedia's LanguageConverter_, a SearXNG request to Wikipedia uses
:py:obj:`get_wiki_params` and :py:obj:`wiki_lc_locale_variants` in the
:py:obj:`fetch_wikimedia_traits` function.

To test in SearXNG, query for ``!wp 出租車`` with each of the available Chinese
options:

- ``!wp 出租車 :zh``    should show 出租車
- ``!wp 出租車 :zh-CN`` should show 出租车
- ``!wp 出租車 :zh-TW`` should show 計程車
- ``!wp 出租車 :zh-HK`` should show 的士
- ``!wp 出租車 :zh-SG`` should show 德士

.. _LanguageConverter:
   https://www.mediawiki.org/wiki/Writing_systems#LanguageConverter
.. _Wikipedias in multiple writing systems:
   https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems
.. _Automatic conversion between traditional and simplified Chinese characters:
   https://en.wikipedia.org/wiki/Chinese_Wikipedia#Automatic_conversion_between_traditional_and_simplified_Chinese_characters
.. _PR-2554: https://github.com/searx/searx/pull/2554

"""

import urllib.parse
import babel

from lxml import html

from searx import utils
from searx import network as _network
from searx import locales
from searx.enginelib.traits import EngineTraits

traits: EngineTraits

# about
about = {
    "website": 'https://www.wikipedia.org/',
    "wikidata_id": 'Q52',
    "official_api_documentation": 'https://en.wikipedia.org/api/',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

display_type = ["infobox"]
"""A list of display types composed from ``infobox`` and ``list``.  The latter
one will add a hit to the result list.  The first one will show a hit in the
info box.  Both values can be set, or one of the two can be set."""
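# A hypothetical configuration that shows hits both in the info box and in the
# result list:
#
#   display_type = ["infobox", "list"]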

send_accept_language_header = True
"""The HTTP ``Accept-Language`` header is needed for wikis where
LanguageConverter_ is enabled."""

list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
"""`List of all wikipedias <https://meta.wikimedia.org/wiki/List_of_Wikipedias>`_
"""

wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
"""The *editing depth* of Wikipedia is one of several possible rough indicators
of the encyclopedia's collaborative quality, showing how frequently its articles
are updated.  The measurement of depth was introduced after some limitations of
the classic measurement of article count were realized.
"""

rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
"""
`wikipedia rest_v1 summary API`_:
  The summary response includes an extract of the first paragraph of the page in
  plain text and HTML as well as the type of page. This is useful for page
  previews (fka. Hovercards, aka. Popups) on the web and link previews in the
  apps.

HTTP ``Accept-Language`` header (:py:obj:`send_accept_language_header`):
  The desired language variant code for wikis where LanguageConverter_ is
  enabled.

.. _wikipedia rest_v1 summary API:
   https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_

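A sketch of how the template expands (the title is URL-encoded; netloc and
title are the ones from the LanguageConverter example in the module
docstring):

.. code:: python

   import urllib.parse

   rest_v1_summary_url.format(
       wiki_netloc='zh.wikipedia.org',
       title=urllib.parse.quote('出租車'),
   )
   # --> 'https://zh.wikipedia.org/api/rest_v1/page/summary/%E5%87%BA%E7%A7%9F%E8%BB%8A'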
"""

wiki_lc_locale_variants = {
    "zh": (
        "zh-CN",
        "zh-HK",
        "zh-MO",
        "zh-MY",
        "zh-SG",
        "zh-TW",
    ),
    "zh-classical": ("zh-classical",),
}
"""Mapping rule of the LanguageConverter_ to map a language and its variants to
a Locale (used in the HTTP ``Accept-Language`` header). For example see `LC
Chinese`_.

.. _LC Chinese:
   https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems#Chinese
"""

wikipedia_script_variants = {
    "zh": (
        "zh_Hant",
        "zh_Hans",
    )
}
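"""Mapping rule to map a language and its script variants (``zh_Hans`` and
``zh_Hant``) to a Wikipedia language; compare :py:obj:`wiki_lc_locale_variants`,
which maps the region variants."""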


def get_wiki_params(sxng_locale, eng_traits):
    """Returns the Wikipedia language tag and the netloc that fits to the
    ``sxng_locale``.  To support LanguageConverter_ this function rates a locale
    (region) higher than a language (compare :py:obj:`wiki_lc_locale_variants`).

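    A rough usage sketch (assuming the traits have been fetched by
    :py:obj:`fetch_wikimedia_traits`; ``zh-TW`` is a region variant of the
    ``zh`` wiki and the variant itself is transported in the
    ``Accept-Language`` header):

    .. code:: python

       eng_tag, wiki_netloc = get_wiki_params('zh-TW', traits)
       # eng_tag     --> 'zh'
       # wiki_netloc --> 'zh.wikipedia.org'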
    """
    eng_tag = eng_traits.get_region(sxng_locale, eng_traits.get_language(sxng_locale, 'en'))
    wiki_netloc = eng_traits.custom['wiki_netloc'].get(eng_tag, 'en.wikipedia.org')
    return eng_tag, wiki_netloc


def request(query, params):
    """Assemble a request (`wikipedia rest_v1 summary API`_)."""
    if query.islower():
        # The summary API is title based; title-case an all lower-case query
        # to increase the chance of matching a page title.
        query = query.title()

    _eng_tag, wiki_netloc = get_wiki_params(params['searxng_locale'], traits)
    title = urllib.parse.quote(query)
    params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)

    params['raise_for_httperror'] = False
    params['soft_max_redirects'] = 2

    return params


# get response from search-request
def response(resp):

    results = []
    if resp.status_code == 404:
        return []
    if resp.status_code == 400:
        try:
            api_result = resp.json()
        except Exception:  # pylint: disable=broad-except
            pass
        else:
            if (
                api_result['type'] == 'https://mediawiki.org/wiki/HyperSwitch/errors/bad_request'
                and api_result['detail'] == 'title-invalid-characters'
            ):
                return []

    _network.raise_for_httperror(resp)

    api_result = resp.json()
    title = utils.html_to_text(api_result.get('titles', {}).get('display') or api_result.get('title'))
    wikipedia_link = api_result['content_urls']['desktop']['page']

    if "list" in display_type or api_result.get('type') != 'standard':
        # show the item in the result list if 'list' is in the display options
        # or if it is an item that can't be displayed in an infobox
        results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})

    if "infobox" in display_type:
        if api_result.get('type') == 'standard':
            results.append(
                {
                    'infobox': title,
                    'id': wikipedia_link,
                    'content': api_result.get('extract', ''),
                    'img_src': api_result.get('thumbnail', {}).get('source'),
                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
                }
            )

    return results

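# A heavily reduced, illustrative sketch of the summary API response fields the
# response() function above relies on (the field names are the ones accessed in
# the code, the values are made up):
#
#   {
#       "type": "standard",
#       "titles": {"display": "Example"},
#       "description": "short description shown in the result list",
#       "extract": "first paragraph, shown in the infobox",
#       "thumbnail": {"source": "https://upload.wikimedia.org/..."},
#       "content_urls": {"desktop": {"page": "https://en.wikipedia.org/wiki/Example"}},
#   }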

# Nonstandard language codes
#
# These Wikipedias use language codes that do not conform to the ISO 639
# standard (which is how wiki subdomains are chosen nowadays).

lang_map = locales.LOCALE_BEST_MATCH.copy()
lang_map.update(
    {
        'be-tarask': 'bel',
        'ak': 'aka',
        'als': 'gsw',
        'bat-smg': 'sgs',
        'cbk-zam': 'cbk',
        'fiu-vro': 'vro',
        'map-bms': 'map',
        'no': 'nb-NO',
        'nrm': 'nrf',
        'roa-rup': 'rup',
        'nds-nl': 'nds',
        # 'simple': invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
        'zh-min-nan': 'nan',
        'zh-yue': 'yue',
        'an': 'arg',
    }
)


def fetch_traits(engine_traits: EngineTraits):
    fetch_wikimedia_traits(engine_traits)
    print("WIKIPEDIA_LANGUAGES: %s" % len(engine_traits.custom['WIKIPEDIA_LANGUAGES']))


def fetch_wikimedia_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia.  Not all languages from the
    :py:obj:`list_of_wikipedias` are supported by SearXNG locales, only those
    known from :py:obj:`searx.locales.LOCALE_NAMES` or those with a minimal
    :py:obj:`editing depth <wikipedia_article_depth>`.

    The netloc of each language's Wikipedia is mapped in a
    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
    (``wiki_netloc``).  Here is a reduced example:

    .. code:: python

       traits.custom['wiki_netloc'] = {
           "en": "en.wikipedia.org",
           ..
           "gsw": "als.wikipedia.org",
           ..
           "zh": "zh.wikipedia.org",
           "zh-classical": "zh-classical.wikipedia.org"
       }
    """
    # pylint: disable=too-many-branches
    engine_traits.custom['wiki_netloc'] = {}
    engine_traits.custom['WIKIPEDIA_LANGUAGES'] = []

    # insert aliases to map a script or region variant to its Wikipedia language

    for eng_tag, sxng_tag_list in wikipedia_script_variants.items():
        for sxng_tag in sxng_tag_list:
            engine_traits.languages[sxng_tag] = eng_tag
    for eng_tag, sxng_tag_list in wiki_lc_locale_variants.items():
        for sxng_tag in sxng_tag_list:
            engine_traits.regions[sxng_tag] = eng_tag

    resp = _network.get(list_of_wikipedias)
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")

    dom = html.fromstring(resp.text)
    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):

        cols = row.xpath('./td')
        if not cols:
            continue
        cols = [c.text_content().strip() for c in cols]

        depth = float(cols[11].replace('-', '0').replace(',', ''))
        articles = int(cols[4].replace(',', ''))

        eng_tag = cols[3]
        wiki_url = row.xpath('./td[4]/a/@href')[0]
        wiki_url = urllib.parse.urlparse(wiki_url)

        try:
            sxng_tag = locales.language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
        except babel.UnknownLocaleError:
            # print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
            continue
        finally:
            engine_traits.custom['WIKIPEDIA_LANGUAGES'].append(eng_tag)

        if sxng_tag not in locales.LOCALE_NAMES:

            if articles < 10000:
                # exclude languages with too few articles
                continue

            if int(depth) < 20:
                # Rough indicator of a Wikipedia’s quality, showing how
                # frequently its articles are updated.
                continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue

        engine_traits.languages[sxng_tag] = eng_tag
        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc

    engine_traits.custom['WIKIPEDIA_LANGUAGES'].sort()