summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarit.de>2022-10-08 16:22:26 +0200
committerMarkus Heiser <markus.heiser@darmarit.de>2023-03-24 10:37:42 +0100
commit7daf4f95efb2c8b37f682d42e470bb78ce464f19 (patch)
treefa6fc1cd4b3a527748c0b4af0c7d94c8aacbba01
parentf78f9083836be851c224b4334b53b9686835e300 (diff)
downloadsearxng-7daf4f95efb2c8b37f682d42e470bb78ce464f19.tar.gz
searxng-7daf4f95efb2c8b37f682d42e470bb78ce464f19.zip
[mod] Wikipedia: fetch engine traits (data_type: supported_languages)
Implements a fetch_traits function for the Wikipedia engines. .. note:: Does not include migration of the request method from 'supported_languages' to 'traits' (EngineTraits) object! Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
-rw-r--r--searx/data/engine_traits.json222
-rw-r--r--searx/engines/wikidata.py1
-rw-r--r--searx/engines/wikipedia.py174
3 files changed, 391 insertions, 6 deletions
diff --git a/searx/data/engine_traits.json b/searx/data/engine_traits.json
index e5f85bf38..c97134f8b 100644
--- a/searx/data/engine_traits.json
+++ b/searx/data/engine_traits.json
@@ -5121,7 +5121,116 @@
"all_locale": null,
"custom": {},
"data_type": "supported_languages",
- "languages": {},
+ "languages": {
+ "af": "af",
+ "ak": "tw",
+ "am": "am",
+ "ar": "ar",
+ "as": "as",
+ "az": "az",
+ "be": "be",
+ "bg": "bg",
+ "bn": "bn",
+ "bo": "bo",
+ "bs": "bs",
+ "ca": "ca",
+ "chr": "chr",
+ "ckb": "ckb",
+ "cs": "cs",
+ "da": "da",
+ "de": "de",
+ "dsb": "dsb",
+ "el": "el",
+ "en": "en",
+ "es": "es",
+ "et": "et",
+ "fa": "fa",
+ "fi": "fi",
+ "fil": "tl",
+ "fo": "fo",
+ "fr": "fr",
+ "fur": "fur",
+ "fy": "fy",
+ "gl": "gl",
+ "gsw": "als",
+ "gu": "gu",
+ "gv": "gv",
+ "haw": "haw",
+ "he": "he",
+ "hi": "hi",
+ "hsb": "hsb",
+ "hu": "hu",
+ "hy": "hy",
+ "id": "id",
+ "is": "is",
+ "it": "it",
+ "ja": "ja",
+ "jv": "jv",
+ "ka": "ka",
+ "km": "km",
+ "kn": "kn",
+ "ko": "ko",
+ "ks": "ks",
+ "ksh": "ksh",
+ "kw": "kw",
+ "lb": "lb",
+ "lg": "lg",
+ "ln": "ln",
+ "lo": "lo",
+ "lt": "lt",
+ "lv": "lv",
+ "mai": "mai",
+ "mk": "mk",
+ "ml": "ml",
+ "mn": "mn",
+ "mr": "mr",
+ "ms": "ms",
+ "mt": "mt",
+ "nds": "nds-nl",
+ "ne": "ne",
+ "no": "no",
+ "om": "om",
+ "or": "or",
+ "os": "os",
+ "pa": "pa",
+ "pl": "pl",
+ "ps": "ps",
+ "pt": "pt",
+ "qu": "qu",
+ "rm": "rm",
+ "ro": "ro",
+ "ru": "ru",
+ "rw": "rw",
+ "sa": "sa",
+ "sah": "sah",
+ "sd": "sd",
+ "se": "se",
+ "shi": "shi",
+ "si": "si",
+ "sk": "sk",
+ "sl": "sl",
+ "smn": "smn",
+ "so": "so",
+ "sq": "sq",
+ "sr": "sr",
+ "ta": "ta",
+ "te": "te",
+ "th": "th",
+ "tk": "tk",
+ "to": "to",
+ "tr": "tr",
+ "ug": "ug",
+ "uk": "uk",
+ "ur": "ur",
+ "uz": "uz",
+ "vi": "vi",
+ "wo": "wo",
+ "xh": "xh",
+ "yi": "yi",
+ "zh": "zh",
+ "zh_Hans": "zh",
+ "zh_Hant": "zh-classical"
+ },
"regions": {},
"supported_languages": {
"ab": {
@@ -6402,7 +6511,116 @@
"all_locale": null,
"custom": {},
"data_type": "supported_languages",
- "languages": {},
+ "languages": {
+ "af": "af",
+ "ak": "tw",
+ "am": "am",
+ "ar": "ar",
+ "as": "as",
+ "az": "az",
+ "be": "be",
+ "bg": "bg",
+ "bn": "bn",
+ "bo": "bo",
+ "bs": "bs",
+ "ca": "ca",
+ "chr": "chr",
+ "ckb": "ckb",
+ "cs": "cs",
+ "da": "da",
+ "de": "de",
+ "dsb": "dsb",
+ "el": "el",
+ "en": "en",
+ "es": "es",
+ "et": "et",
+ "fa": "fa",
+ "fi": "fi",
+ "fil": "tl",
+ "fo": "fo",
+ "fr": "fr",
+ "fur": "fur",
+ "fy": "fy",
+ "gl": "gl",
+ "gsw": "als",
+ "gu": "gu",
+ "gv": "gv",
+ "haw": "haw",
+ "he": "he",
+ "hi": "hi",
+ "hsb": "hsb",
+ "hu": "hu",
+ "hy": "hy",
+ "id": "id",
+ "is": "is",
+ "it": "it",
+ "ja": "ja",
+ "jv": "jv",
+ "ka": "ka",
+ "km": "km",
+ "kn": "kn",
+ "ko": "ko",
+ "ks": "ks",
+ "ksh": "ksh",
+ "kw": "kw",
+ "lb": "lb",
+ "lg": "lg",
+ "ln": "ln",
+ "lo": "lo",
+ "lt": "lt",
+ "lv": "lv",
+ "mai": "mai",
+ "mk": "mk",
+ "ml": "ml",
+ "mn": "mn",
+ "mr": "mr",
+ "ms": "ms",
+ "mt": "mt",
+ "nds": "nds-nl",
+ "ne": "ne",
+ "no": "no",
+ "om": "om",
+ "or": "or",
+ "os": "os",
+ "pa": "pa",
+ "pl": "pl",
+ "ps": "ps",
+ "pt": "pt",
+ "qu": "qu",
+ "rm": "rm",
+ "ro": "ro",
+ "ru": "ru",
+ "rw": "rw",
+ "sa": "sa",
+ "sah": "sah",
+ "sd": "sd",
+ "se": "se",
+ "shi": "shi",
+ "si": "si",
+ "sk": "sk",
+ "sl": "sl",
+ "smn": "smn",
+ "so": "so",
+ "sq": "sq",
+ "sr": "sr",
+ "ta": "ta",
+ "te": "te",
+ "th": "th",
+ "tk": "tk",
+ "to": "to",
+ "tr": "tr",
+ "ug": "ug",
+ "uk": "uk",
+ "ur": "ur",
+ "uz": "uz",
+ "vi": "vi",
+ "wo": "wo",
+ "xh": "xh",
+ "yi": "yi",
+ "zh": "zh",
+ "zh_Hans": "zh",
+ "zh_Hant": "zh-classical"
+ },
"regions": {},
"supported_languages": {
"ab": {
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index 8d3b0839a..a38600978 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -16,6 +16,7 @@ from searx.network import post, get
from searx.utils import match_language, searx_useragent, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.engines.wikipedia import ( # pylint: disable=unused-import
+ fetch_traits,
_fetch_supported_languages,
supported_languages_url,
)
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index ca841e8b3..4d5474e17 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -5,9 +5,12 @@
from urllib.parse import quote
from json import loads
-from lxml.html import fromstring
+from lxml import html
from searx.utils import match_language, searx_useragent
-from searx.network import raise_for_httperror
+from searx import network
+from searx.enginelib.traits import EngineTraits
+
+engine_traits: EngineTraits
# about
about = {
@@ -68,7 +71,7 @@ def response(resp):
):
return []
- raise_for_httperror(resp)
+ network.raise_for_httperror(resp)
results = []
api_result = loads(resp.text)
@@ -98,7 +101,7 @@ def response(resp):
# get supported languages from their site
def _fetch_supported_languages(resp):
supported_languages = {}
- dom = fromstring(resp.text)
+ dom = html.fromstring(resp.text)
tables = dom.xpath('//table[contains(@class,"sortable")]')
for table in tables:
# exclude header row
@@ -114,3 +117,166 @@ def _fetch_supported_languages(resp):
supported_languages[code] = {"name": name, "english_name": english_name}
return supported_languages
+
+
+# Nonstandard language codes
+#
+# These Wikipedias use language codes that do not conform to the ISO 639
+# standard (which is how wiki subdomains are chosen nowadays).
+
+lang_map = {
+ 'be-tarask': 'bel',
+ 'ak': 'aka',
+ 'als': 'gsw',
+ 'bat-smg': 'sgs',
+ 'cbk-zam': 'cbk',
+ 'fiu-vro': 'vro',
+ 'map-bms': 'map',
+ 'nrm': 'nrf',
+ 'roa-rup': 'rup',
+ 'nds-nl': 'nds',
+ # 'roa-tara': invented code used for the Tarantino Wikipedia (roa is the standard code for the large family of Romance languages that the Tarantino dialect falls within)
+ # 'simple': invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
+ 'zh-classical': 'zh_Hant',
+ 'zh-min-nan': 'nan',
+ 'zh-yue': 'yue',
+ 'an': 'arg',
+}
+
+unknown_langs = [
+ 'ab', # Abkhazian
+ 'alt', # Southern Altai
+ 'an', # Aragonese
+ 'ang', # Anglo-Saxon
+ 'arc', # Aramaic
+ 'ary', # Moroccan Arabic
+ 'av', # Avar
+ 'ba', # Bashkir
+ 'be-tarask',
+ 'bar', # Bavarian
+ 'bcl', # Central Bicolano
+ 'bh', # Bhojpuri
+ 'bi', # Bislama
+ 'bjn', # Banjar
+ 'blk', # Pa'O
+ 'bpy', # Bishnupriya Manipuri
+ 'bxr', # Buryat
+ 'cbk-zam', # Zamboanga Chavacano
+ 'co', # Corsican
+ 'cu', # Old Church Slavonic
+ 'dty', # Doteli
+ 'dv', # Divehi
+ 'ext', # Extremaduran
+ 'fj', # Fijian
+ 'frp', # Franco-Provençal
+ 'gan', # Gan
+ 'gom', # Goan Konkani
+ 'hif', # Fiji Hindi
+ 'ilo', # Ilokano
+ 'inh', # Ingush
+ 'jbo', # Lojban
+ 'kaa', # Karakalpak
+ 'kbd', # Kabardian Circassian
+ 'kg', # Kongo
+ 'koi', # Komi-Permyak
+ 'krc', # Karachay-Balkar
+ 'kv', # Komi
+ 'lad', # Ladino
+ 'lbe', # Lak
+ 'lez', # Lezgian
+ 'li', # Limburgish
+ 'ltg', # Latgalian
+ 'mdf', # Moksha
+ 'mnw', # Mon
+ 'mwl', # Mirandese
+ 'myv', # Erzya
+ 'na', # Nauruan
+ 'nah', # Nahuatl
+ 'nov', # Novial
+ 'nrm', # Norman
+ 'pag', # Pangasinan
+ 'pam', # Kapampangan
+ 'pap', # Papiamentu
+ 'pdc', # Pennsylvania German
+ 'pfl', # Palatinate German
+ 'roa-rup', # Aromanian
+ 'sco', # Scots
+ 'sco', # Scots (https://sco.wikipedia.org) is not known by babel, Scottish Gaelic (https://gd.wikipedia.org) is known by babel
+ 'sh', # Serbo-Croatian
+ 'simple', # simple english is not known as a natural language different from english (babel)
+ 'sm', # Samoan
+ 'srn', # Sranan
+ 'stq', # Saterland Frisian
+ 'szy', # Sakizaya
+ 'tcy', # Tulu
+ 'tet', # Tetum
+ 'tpi', # Tok Pisin
+ 'trv', # Seediq
+ 'ty', # Tahitian
+ 'tyv', # Tuvan
+ 'udm', # Udmurt
+ 'vep', # Vepsian
+ 'vls', # West Flemish
+ 'vo', # Volapük
+ 'wa', # Walloon
+ 'xal', # Kalmyk
+]
+
+
+def fetch_traits(engine_traits: EngineTraits):
+ """Fetch languages from Wikipedia"""
+ # pylint: disable=import-outside-toplevel
+
+ engine_traits.data_type = 'supported_languages' # deprecated
+
+ import babel
+ from searx.locales import language_tag
+
+ resp = network.get('https://meta.wikimedia.org/wiki/List_of_Wikipedias')
+ if not resp.ok:
+ print("ERROR: response from Wikipedia is not OK.")
+
+ dom = html.fromstring(resp.text)
+ for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):
+
+ cols = row.xpath('./td')
+ if not cols:
+ continue
+
+ cols = [c.text_content().strip() for c in cols]
+ articles = int(cols[4].replace(',', '').replace('-', '0'))
+ users = int(cols[8].replace(',', '').replace('-', '0'))
+ depth = cols[11].strip('-')
+
+ if articles < 1000:
+ # exclude languages with too few articles
+ continue
+
+ # depth: rough indicator of a Wikipedia’s quality, showing how
+ # frequently its articles are updated.
+ if depth == '':
+ if users < 1000:
+ # depth is not calculated --> require at least 1000 registered users
+ continue
+ elif int(depth) < 20:
+ continue
+
+ eng_tag = cols[3]
+
+ if eng_tag in unknown_langs:
+ continue
+
+ try:
+ sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag)))
+ except babel.UnknownLocaleError:
+ print("ERROR: %s -> %s is unknown by babel" % (cols[1], eng_tag))
+ continue
+
+ conflict = engine_traits.languages.get(sxng_tag)
+ if conflict:
+ if conflict != eng_tag:
+ print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
+ continue
+ engine_traits.languages[sxng_tag] = eng_tag
+
+ engine_traits.languages['zh_Hans'] = 'zh'