diff options
author | Markus Heiser <markus.heiser@darmarit.de> | 2023-04-04 15:17:12 +0200 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarit.de> | 2023-04-15 16:03:59 +0200 |
commit | 27369ebec23b39e9e61b175e6d23fdf2448da6b7 (patch) | |
tree | a8aa3754b7195c42c452b5c3c8a537217f4af1ac | |
parent | 0adfed195e64c334117576f059b844e28e0d0d34 (diff) | |
download | searxng-27369ebec23b39e9e61b175e6d23fdf2448da6b7.tar.gz searxng-27369ebec23b39e9e61b175e6d23fdf2448da6b7.zip |
[fix] searxng_extra/update/update_engine_descriptions.py (part 1)
Follow up of #2269
The script to update the descriptions of the engines no longer works since
PR #2269 was merged.
searx/engines/wikipedia.py
==========================
1. There was a misuse of zh-classical.wikipedia.org:
- `zh-classical` is dedicated to classical Chinese [1], which is not
traditional Chinese [2].
- zh.wikipedia.org has LanguageConverter enabled [3] and dynamically
shows simplified or traditional Chinese according to the
HTTP Accept-Language header.
2. The update_engine_descriptions.py needs a list of all wikipedias. The
implementation from #2269 included only a reduced list:
- https://meta.wikimedia.org/wiki/Wikipedia_article_depth
- https://meta.wikimedia.org/wiki/List_of_Wikipedias
searxng_extra/update/update_engine_descriptions.py
==================================================
Before PR #2269 there was a match_language() function that did an approximation
using various methods. With PR #2269 there are only the types in the data model
of the languages, which can be recognized by babel. The approximation methods,
which are needed (only here) in the determination of the descriptions, must be
replaced by other methods.
[1] https://en.wikipedia.org/wiki/Classical_Chinese
[2] https://en.wikipedia.org/wiki/Traditional_Chinese_characters
[3] https://www.mediawiki.org/wiki/Writing_systems#LanguageConverter
Closes: https://github.com/searxng/searxng/issues/2330
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
-rw-r--r-- | searx/enginelib/traits.py | 4 | ||||
-rw-r--r-- | searx/engines/wikidata.py | 35 | ||||
-rw-r--r-- | searx/engines/wikipedia.py | 241 | ||||
-rw-r--r-- | searx/search/processors/online.py | 1 | ||||
-rwxr-xr-x | searxng_extra/update/update_engine_descriptions.py | 108 |
5 files changed, 263 insertions, 126 deletions
diff --git a/searx/enginelib/traits.py b/searx/enginelib/traits.py index df7851594..ae27d46f1 100644 --- a/searx/enginelib/traits.py +++ b/searx/enginelib/traits.py @@ -13,7 +13,7 @@ used. from __future__ import annotations import json import dataclasses -from typing import Dict, Union, Callable, Optional, TYPE_CHECKING +from typing import Dict, Iterable, Union, Callable, Optional, TYPE_CHECKING from typing_extensions import Literal, Self from searx import locales @@ -81,7 +81,7 @@ class EngineTraits: """Data type, default is 'traits_v1'. """ - custom: Dict[str, Dict] = dataclasses.field(default_factory=dict) + custom: Dict[str, Union[Dict[str, Dict], Iterable[str]]] = dataclasses.field(default_factory=dict) """A place to store engine's custom traits, not related to the SearXNG core """ diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 6ea77f092..b9de67ef9 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -18,7 +18,10 @@ from searx.data import WIKIDATA_UNITS from searx.network import post, get from searx.utils import searx_useragent, get_string_replaces_function from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom -from searx.engines.wikipedia import fetch_traits as _fetch_traits +from searx.engines.wikipedia import ( + fetch_wikimedia_traits, + get_wiki_params, +) from searx.enginelib.traits import EngineTraits if TYPE_CHECKING: @@ -165,17 +168,15 @@ def request(query, params): # wikidata does not support zh-classical (zh_Hans) / zh-TW, zh-HK and zh-CN # mapped to zh - sxng_lang = params['searxng_locale'].split('-')[0] - language = traits.get_language(sxng_lang, 'en') - - query, attributes = get_query(query, language) - logger.debug("request --> language %s // len(attributes): %s", language, len(attributes)) + eng_tag, _wiki_netloc = get_wiki_params(params['searxng_locale'], traits) + query, attributes = get_query(query, eng_tag) + logger.debug("request --> language %s // 
len(attributes): %s", eng_tag, len(attributes)) params['method'] = 'POST' params['url'] = SPARQL_ENDPOINT_URL params['data'] = {'query': query} params['headers'] = get_headers() - params['language'] = language + params['language'] = eng_tag params['attributes'] = attributes return params @@ -769,12 +770,16 @@ def init(engine_settings=None): # pylint: disable=unused-argument def fetch_traits(engine_traits: EngineTraits): - """Use languages evaluated from :py:obj:`wikipedia.fetch_traits - <searx.engines.wikipedia.fetch_traits>` except zh-classical (zh_Hans) what - is not supported by wikidata.""" - - _fetch_traits(engine_traits) - # wikidata does not support zh-classical (zh_Hans) - engine_traits.languages.pop('zh_Hans') - # wikidata does not have net-locations for the languages + """Uses languages evaluated from :py:obj:`wikipedia.fetch_wikimedia_traits + <searx.engines.wikipedia.fetch_wikimedia_traits>` and removes + + - ``traits.custom['wiki_netloc']``: wikidata does not have net-locations for + the languages and the list of all + + - ``traits.custom['WIKIPEDIA_LANGUAGES']``: not used in the wikipedia engine + + """ + + fetch_wikimedia_traits(engine_traits) engine_traits.custom['wiki_netloc'] = {} + engine_traits.custom['WIKIPEDIA_LANGUAGES'] = [] diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 9d2d30afa..98b3d6f9e 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -5,10 +5,54 @@ are shared by other engines: - :ref:`wikidata engine` -The list of supported languages is fetched from the article linked by -:py:obj:`wikipedia_article_depth`. Unlike traditional search engines, wikipedia -does not support one Wikipedia for all the languages, but there is one Wikipedia -for every language (:py:obj:`fetch_traits`). +The list of supported languages is :py:obj:`fetched <fetch_wikimedia_traits>` from +the article linked by :py:obj:`list_of_wikipedias`. 
+ +Unlike traditional search engines, wikipedia does not support one Wikipedia for +all languages, but there is one Wikipedia for each supported language. Some of +these Wikipedias have a LanguageConverter_ enabled +(:py:obj:`rest_v1_summary_url`). + +A LanguageConverter_ (LC) is a system based on language variants that +automatically converts the content of a page into a different variant. A variant +is mostly the same language in a different script. + +- `Wikipedias in multiple writing systems`_ +- `Automatic conversion between traditional and simplified Chinese characters`_ + +PR-2554_: + The Wikipedia link returned by the API is still the same in all cases + (`https://zh.wikipedia.org/wiki/出租車`_) but if your browser's + ``Accept-Language`` is set to any of ``zh``, ``zh-CN``, ``zh-TW``, ``zh-HK`` + or .. Wikipedia's LC automatically returns the desired script in their + web-page. + + - You can test the API here: https://reqbin.com/gesg2kvx + +.. _https://zh.wikipedia.org/wiki/出租車: + https://zh.wikipedia.org/wiki/%E5%87%BA%E7%A7%9F%E8%BB%8A + +To support Wikipedia's LanguageConverter_, a SearXNG request to Wikipedia uses +:py:obj:`get_wiki_params` and :py:obj:`wiki_lc_locale_variants' in the +:py:obj:`fetch_wikimedia_traits` function. + +To test in SearXNG, query for ``!wp 出租車`` with each of the available Chinese +options: + +- ``!wp 出租車 :zh`` should show 出租車 +- ``!wp 出租車 :zh-CN`` should show 出租车 +- ``!wp 出租車 :zh-TW`` should show 計程車 +- ``!wp 出租車 :zh-HK`` should show 的士 +- ``!wp 出租車 :zh-SG`` should show 德士 + +.. _LanguageConverter: + https://www.mediawiki.org/wiki/Writing_systems#LanguageConverter +.. _Wikipedias in multiple writing systems: + https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems +.. _Automatic conversion between traditional and simplified Chinese characters: + https://en.wikipedia.org/wiki/Chinese_Wikipedia#Automatic_conversion_between_traditional_and_simplified_Chinese_characters +.. 
_PR-2554: https://github.com/searx/searx/pull/2554 + """ import urllib.parse @@ -16,8 +60,9 @@ import babel from lxml import html +from searx import utils from searx import network -from searx.locales import language_tag +from searx import locales from searx.enginelib.traits import EngineTraits traits: EngineTraits @@ -33,6 +78,12 @@ about = { } send_accept_language_header = True +"""The HTTP ``Accept-Language`` header is needed for wikis where +LanguageConverter_ is enabled.""" + +list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' +"""`List of all wikipedias <https://meta.wikimedia.org/wiki/List_of_Wikipedias>`_ +""" wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth' """The *editing depth* of Wikipedia is one of several possible rough indicators @@ -41,29 +92,68 @@ are updated. The measurement of depth was introduced after some limitations of the classic measurement of article count were realized. """ -# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日 rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}' -"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of -the first paragraph of the page in plain text and HTML as well as the type of -page. This is useful for page previews (fka. Hovercards, aka. Popups) on the web -and link previews in the apps. +""" +`wikipedia rest_v1 summary API`_: + The summary response includes an extract of the first paragraph of the page in + plain text and HTML as well as the type of page. This is useful for page + previews (fka. Hovercards, aka. Popups) on the web and link previews in the + apps. + +HTTP ``Accept-Language`` header (:py:obj:`send_accept_language_header`): + The desired language variant code for wikis where LanguageConverter_ is + enabled. -.. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_ +.. 
_wikipedia rest_v1 summary API: + https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_ """ +wiki_lc_locale_variants = { + "zh": ( + "zh-CN", + "zh-HK", + "zh-MO", + "zh-MY", + "zh-SG", + "zh-TW", + ), + "zh-classical": ("zh-classical",), +} +"""Mapping rule of the LanguageConverter_ to map a language and its variants to +a Locale (used in the HTTP ``Accept-Language`` header). For example see `LC +Chinese`_. + +.. _LC Chinese: + https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems#Chinese +""" + +wikipedia_script_variants = { + "zh": ( + "zh_Hant", + "zh_Hans", + ) +} + + +def get_wiki_params(sxng_locale, eng_traits): + """Returns the Wikipedia language tag and the netloc that fits to the + ``sxng_locale``. To support LanguageConverter_ this function rates a locale + (region) higher than a language (compare :py:obj:`wiki_lc_locale_variants`). + + """ + eng_tag = eng_traits.get_region(sxng_locale, eng_traits.get_language(sxng_locale, 'en')) + wiki_netloc = eng_traits.custom['wiki_netloc'].get(eng_tag, 'en.wikipedia.org') + return eng_tag, wiki_netloc + def request(query, params): """Assemble a request (`wikipedia rest_v1 summary API`_).""" if query.islower(): query = query.title() - engine_language = traits.get_language(params['searxng_locale'], 'en') - wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'https://en.wikipedia.org/wiki/') + _eng_tag, wiki_netloc = get_wiki_params(params['searxng_locale'], traits) title = urllib.parse.quote(query) - - # '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/ - # '!wikipedia 日 :zh' --> https://zh.wikipedia.org/ params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title) params['raise_for_httperror'] = False @@ -93,7 +183,7 @@ def response(resp): network.raise_for_httperror(resp) api_result = resp.json() - title = api_result['title'] + title = utils.html_to_text(api_result.get('titles', {}).get('display') or api_result.get('title')) 
wikipedia_link = api_result['content_urls']['desktop']['page'] results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')}) @@ -116,44 +206,38 @@ def response(resp): # These Wikipedias use language codes that do not conform to the ISO 639 # standard (which is how wiki subdomains are chosen nowadays). -lang_map = { - 'be-tarask': 'bel', - 'ak': 'aka', - 'als': 'gsw', - 'bat-smg': 'sgs', - 'cbk-zam': 'cbk', - 'fiu-vro': 'vro', - 'map-bms': 'map', - 'nrm': 'nrf', - 'roa-rup': 'rup', - 'nds-nl': 'nds', - #'simple: – invented code used for the Simple English Wikipedia (not the official IETF code en-simple) - 'zh-min-nan': 'nan', - 'zh-yue': 'yue', - 'an': 'arg', - 'zh-classical': 'zh-Hant', # babel maps classical to zh-Hans (for whatever reason) -} - -unknown_langs = [ - 'an', # Aragonese - 'ba', # Bashkir - 'bar', # Bavarian - 'bcl', # Central Bicolano - 'be-tarask', # Belarusian variant / Belarusian is already covered by 'be' - 'bpy', # Bishnupriya Manipuri is unknown by babel - 'hif', # Fiji Hindi - 'ilo', # Ilokano - 'li', # Limburgish - 'sco', # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel - 'sh', # Serbo-Croatian - 'simple', # simple english is not know as a natural language different to english (babel) - 'vo', # Volapük - 'wa', # Walloon -] +lang_map = locales.LOCALE_BEST_MATCH.copy() +lang_map.update( + { + 'be-tarask': 'bel', + 'ak': 'aka', + 'als': 'gsw', + 'bat-smg': 'sgs', + 'cbk-zam': 'cbk', + 'fiu-vro': 'vro', + 'map-bms': 'map', + 'no': 'nb-NO', + 'nrm': 'nrf', + 'roa-rup': 'rup', + 'nds-nl': 'nds', + #'simple: – invented code used for the Simple English Wikipedia (not the official IETF code en-simple) + 'zh-min-nan': 'nan', + 'zh-yue': 'yue', + 'an': 'arg', + } +) def fetch_traits(engine_traits: EngineTraits): - """Fetch languages from Wikipedia. 
+ fetch_wikimedia_traits(engine_traits) + print("WIKIPEDIA_LANGUAGES: %s" % len(engine_traits.custom['WIKIPEDIA_LANGUAGES'])) + + +def fetch_wikimedia_traits(engine_traits: EngineTraits): + """Fetch languages from Wikipedia. Not all languages from the + :py:obj:`list_of_wikipedias` are supported by SearXNG locales, only those + known from :py:obj:`searx.locales.LOCALE_NAMES` or those with a minimal + :py:obj:`editing depth <wikipedia_article_depth>`. The location of the Wikipedia address of a language is mapped in a :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>` @@ -169,15 +253,21 @@ def fetch_traits(engine_traits: EngineTraits): "zh": "zh.wikipedia.org", "zh-classical": "zh-classical.wikipedia.org" } - """ - + # pylint: disable=too-many-branches engine_traits.custom['wiki_netloc'] = {} + engine_traits.custom['WIKIPEDIA_LANGUAGES'] = [] + + # insert alias to map from a script or region to a wikipedia variant - # insert alias to map from a region like zh-CN to a language zh_Hans - engine_traits.languages['zh_Hans'] = 'zh' + for eng_tag, sxng_tag_list in wikipedia_script_variants.items(): + for sxng_tag in sxng_tag_list: + engine_traits.languages[sxng_tag] = eng_tag + for eng_tag, sxng_tag_list in wiki_lc_locale_variants.items(): + for sxng_tag in sxng_tag_list: + engine_traits.regions[sxng_tag] = eng_tag - resp = network.get(wikipedia_article_depth) + resp = network.get(list_of_wikipedias) if not resp.ok: print("ERROR: response from Wikipedia is not OK.") @@ -189,30 +279,31 @@ def fetch_traits(engine_traits: EngineTraits): continue cols = [c.text_content().strip() for c in cols] - depth = float(cols[3].replace('-', '0').replace(',', '')) + depth = float(cols[11].replace('-', '0').replace(',', '')) articles = int(cols[4].replace(',', '').replace(',', '')) - if articles < 10000: - # exclude languages with too few articles - continue - - if int(depth) < 20: - # Rough indicator of a Wikipedia’s quality, showing how frequently - # its articles are 
updated. - continue - - eng_tag = cols[2] - wiki_url = row.xpath('./td[3]/a/@href')[0] + eng_tag = cols[3] + wiki_url = row.xpath('./td[4]/a/@href')[0] wiki_url = urllib.parse.urlparse(wiki_url) - if eng_tag in unknown_langs: - continue - try: - sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-')) + sxng_tag = locales.language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-')) except babel.UnknownLocaleError: - print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag)) + # print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag)) continue + finally: + engine_traits.custom['WIKIPEDIA_LANGUAGES'].append(eng_tag) + + if sxng_tag not in locales.LOCALE_NAMES: + + if articles < 10000: + # exclude languages with too few articles + continue + + if int(depth) < 20: + # Rough indicator of a Wikipedia’s quality, showing how + # frequently its articles are updated. + continue conflict = engine_traits.languages.get(sxng_tag) if conflict: @@ -222,3 +313,5 @@ def fetch_traits(engine_traits: EngineTraits): engine_traits.languages[sxng_tag] = eng_tag engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc + + engine_traits.custom['WIKIPEDIA_LANGUAGES'].sort() diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py index 697533d8c..7b2ec856b 100644 --- a/searx/search/processors/online.py +++ b/searx/search/processors/online.py @@ -75,6 +75,7 @@ class OnlineProcessor(EngineProcessor): ) params['headers']['Accept-Language'] = ac_lang + self.logger.debug('HTTP Accept-Language: %s', params['headers'].get('Accept-Language', '')) return params def _send_http_request(self, params): diff --git a/searxng_extra/update/update_engine_descriptions.py b/searxng_extra/update/update_engine_descriptions.py index 66bc303db..737b99775 100755 --- a/searxng_extra/update/update_engine_descriptions.py +++ b/searxng_extra/update/update_engine_descriptions.py @@ -70,7 +70,7 @@ SKIP_ENGINE_SOURCE = [ ] LANGUAGES = 
LOCALE_NAMES.keys() -WIKIPEDIA_LANGUAGES = {'language': 'wikipedia_language'} +WIKIPEDIA_LANGUAGES = {} # {'<sxng-ui-lang>': '<wikipedia_language>'} LANGUAGES_SPARQL = '' IDS = None @@ -103,7 +103,7 @@ def update_description(engine_name, lang, description, source, replace=True): def get_wikipedia_summary(lang, pageid): - params = {'language': lang.replace('_', '-'), 'headers': {}} + params = {'searxng_locale': lang.replace('_', '-'), 'headers': {}} searx.engines.engines['wikipedia'].request(pageid, params) try: response = searx.network.get(params['url'], headers=params['headers'], timeout=10) @@ -154,11 +154,25 @@ def get_website_description(url, lang1, lang2=None): def initialize(): - global IDS, WIKIPEDIA_LANGUAGES, LANGUAGES_SPARQL + global IDS, LANGUAGES_SPARQL searx.search.initialize() wikipedia_engine = searx.engines.engines['wikipedia'] - WIKIPEDIA_LANGUAGES = {language: wikipedia_engine.url_lang(language.replace('_', '-')) for language in LANGUAGES} - WIKIPEDIA_LANGUAGES['nb_NO'] = 'no' + + locale2lang = {'nl-BE': 'nl'} + for sxng_ui_lang in LANGUAGES: + + sxng_ui_alias = locale2lang.get(sxng_ui_lang, sxng_ui_lang) + wiki_lang = None + + if sxng_ui_alias in wikipedia_engine.traits.custom['WIKIPEDIA_LANGUAGES']: + wiki_lang = sxng_ui_alias + if not wiki_lang: + wiki_lang = wikipedia_engine.traits.get_language(sxng_ui_alias) + if not wiki_lang: + print(f"WIKIPEDIA_LANGUAGES missing {sxng_ui_lang}") + continue + WIKIPEDIA_LANGUAGES[sxng_ui_lang] = wiki_lang + LANGUAGES_SPARQL = ', '.join(f"'{l}'" for l in set(WIKIPEDIA_LANGUAGES.values())) for engine_name, engine in searx.engines.engines.items(): descriptions[engine_name] = {} @@ -170,6 +184,7 @@ def initialize(): def fetch_wikidata_descriptions(): + print('Fetching wikidata descriptions') searx.network.set_timeout_for_thread(60) result = wikidata.send_wikidata_query( SPARQL_DESCRIPTION.replace('%IDS%', IDS).replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL) @@ -178,14 +193,17 @@ def 
fetch_wikidata_descriptions(): for binding in result['results']['bindings']: wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '') wikidata_lang = binding['itemDescription']['xml:lang'] - description = binding['itemDescription']['value'] + desc = binding['itemDescription']['value'] for engine_name in wd_to_engine_name[wikidata_id]: for lang in LANGUAGES: - if WIKIPEDIA_LANGUAGES[lang] == wikidata_lang: - update_description(engine_name, lang, description, 'wikidata') + if WIKIPEDIA_LANGUAGES[lang] != wikidata_lang: + continue + print(f" engine: {engine_name} / wikidata_lang: {wikidata_lang} / len(desc): {len(desc)}") + update_description(engine_name, lang, desc, 'wikidata') def fetch_wikipedia_descriptions(): + print('Fetching wikipedia descriptions') result = wikidata.send_wikidata_query( SPARQL_WIKIPEDIA_ARTICLE.replace('%IDS%', IDS).replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL) ) @@ -196,9 +214,13 @@ def fetch_wikipedia_descriptions(): pageid = binding['name']['value'] for engine_name in wd_to_engine_name[wikidata_id]: for lang in LANGUAGES: - if WIKIPEDIA_LANGUAGES[lang] == wikidata_lang: - description = get_wikipedia_summary(lang, pageid) - update_description(engine_name, lang, description, 'wikipedia') + if WIKIPEDIA_LANGUAGES[lang] != wikidata_lang: + continue + desc = get_wikipedia_summary(lang, pageid) + if not desc: + continue + print(f" engine: {engine_name} / wikidata_lang: {wikidata_lang} / len(desc): {len(desc)}") + update_description(engine_name, lang, desc, 'wikipedia') def normalize_url(url): @@ -209,41 +231,60 @@ def normalize_url(url): def fetch_website_description(engine_name, website): + print(f"- fetch website descr: {engine_name} / {website}") default_lang, default_description = get_website_description(website, None, None) + if default_lang is None or default_description is None: # the front page can't be fetched: skip this engine return - wikipedia_languages_r = {V: K for K, V in WIKIPEDIA_LANGUAGES.items()} 
+ # to specify an order in where the most common languages are in front of the + # language list .. languages = ['en', 'es', 'pt', 'ru', 'tr', 'fr'] languages = languages + [l for l in LANGUAGES if l not in languages] previous_matched_lang = None previous_count = 0 + for lang in languages: - if lang not in descriptions[engine_name]: - fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang]) - if fetched_lang is None or desc is None: - continue - matched_lang = match_locale(fetched_lang, LANGUAGES, fallback=None) - if matched_lang is None: - fetched_wikipedia_lang = match_locale(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None) - matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang) - if matched_lang is not None: - update_description(engine_name, matched_lang, desc, website, replace=False) - # check if desc changed with the different lang values - if matched_lang == previous_matched_lang: - previous_count += 1 - if previous_count == 6: - # the website has returned the same description for 6 different languages in Accept-Language header - # stop now - break - else: - previous_matched_lang = matched_lang - previous_count = 0 + + if lang in descriptions[engine_name]: + continue + + fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang]) + if fetched_lang is None or desc is None: + continue + + # check if desc changed with the different lang values + + if fetched_lang == previous_matched_lang: + previous_count += 1 + if previous_count == 6: + # the website has returned the same description for 6 different languages in Accept-Language header + # stop now + break + else: + previous_matched_lang = fetched_lang + previous_count = 0 + + # Don't trust in the value of fetched_lang, some websites return + # for some inappropriate values, by example bing-images:: + # + # requested lang: zh-Hans-CN / fetched lang: ceb / desc: 查看根据您的兴趣量身定制的提要 + # + # The lang ceb is "Cebuano" but the description is 
given in zh-Hans-CN + + print( + f" engine: {engine_name:20} / requested lang:{lang:7}" + f" / fetched lang: {fetched_lang:7} / len(desc): {len(desc)}" + ) + + matched_lang = match_locale(fetched_lang, LANGUAGES, fallback=lang) + update_description(engine_name, matched_lang, desc, website, replace=False) def fetch_website_descriptions(): + print('Fetching website descriptions') for engine_name, engine in searx.engines.engines.items(): website = getattr(engine, "about", {}).get('website') if website is None and hasattr(engine, "search_url"): @@ -289,11 +330,8 @@ def get_output(): def main(): initialize() - print('Fetching wikidata descriptions') fetch_wikidata_descriptions() - print('Fetching wikipedia descriptions') fetch_wikipedia_descriptions() - print('Fetching website descriptions') fetch_website_descriptions() output = get_output() |