summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- searx/locales.py 108
-rw-r--r-- searx/utils.py 88
-rwxr-xr-x searx/webapp.py 30
-rwxr-xr-x searxng_extra/update/update_engine_descriptions.py 8
-rw-r--r-- tests/unit/test_locales.py 111
-rw-r--r-- tests/unit/test_utils.py 33
6 files changed, 240 insertions, 138 deletions
diff --git a/searx/locales.py b/searx/locales.py
index a4560aab7..ffa5e731c 100644
--- a/searx/locales.py
+++ b/searx/locales.py
@@ -4,7 +4,7 @@
"""Initialize :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`.
"""
-from typing import Set
+from typing import Set, Optional, List
import os
import pathlib
@@ -177,6 +177,17 @@ def language_tag(locale: babel.Locale) -> str:
return sxng_lang
+def get_locale(locale_tag: str) -> Optional[babel.Locale]:
+ """Returns a :py:obj:`babel.Locale` object parsed from argument
+ ``locale_tag``"""
+ try:
+ locale = babel.Locale.parse(locale_tag, sep='-')
+ return locale
+
+ except babel.core.UnknownLocaleError:
+ return None
+
+
def get_offical_locales(
territory: str, languages=None, regional: bool = False, de_facto: bool = True
) -> Set[babel.Locale]:
@@ -363,3 +374,98 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
engine_locale = default
return default
+
+
+def match_locale(searxng_locale: str, locale_tag_list: List[str], fallback: Optional[str] = None) -> Optional[str]:
+ """Return tag from ``locale_tag_list`` that best fits to ``searxng_locale``.
+
+ :param str searxng_locale: SearXNG's internal representation of locale (de,
+ de-DE, fr-BE, zh, zh-CN, zh-TW ..).
+
+ :param list locale_tag_list: The list of locale tags to select from
+
+ :param str fallback: fallback locale tag (if unset --> ``None``)
+
+ The rules to find a match are implemented in :py:obj:`get_engine_locale`,
+ the ``engine_locales`` is built up by :py:obj:`build_engine_locales`.
+
+ .. hint::
+
+ The *SearXNG locale* string and the members of ``locale_tag_list`` have to
+ be known by babel! The :py:obj:`ADDITIONAL_TRANSLATIONS` are used in the
+ UI and are not known by babel --> will be ignored.
+ """
+
+ # searxng_locale = 'es'
+ # locale_tag_list = ['es-AR', 'es-ES', 'es-MX']
+
+ if not searxng_locale:
+ return fallback
+
+ locale = get_locale(searxng_locale)
+ if locale is None:
+ return fallback
+
+ # normalize to a SearXNG locale that can be passed to get_engine_locale
+
+ searxng_locale = language_tag(locale)
+ if locale.territory:
+ searxng_locale = region_tag(locale)
+
+ # clean up locale_tag_list
+
+ tag_list = []
+ for tag in locale_tag_list:
+ if tag in ('all', 'auto') or tag in ADDITIONAL_TRANSLATIONS:
+ continue
+ tag_list.append(tag)
+
+ # emulate fetch_traits
+ engine_locales = build_engine_locales(tag_list)
+ return get_engine_locale(searxng_locale, engine_locales, default=fallback)
+
+
+def build_engine_locales(tag_list: List[str]):
+ """From a list of locale tags a dictionary is built that can be passed by
+ argument ``engine_locales`` to :py:obj:`get_engine_locale`. This function
+ is mainly used by :py:obj:`match_locale` and is similar to what the
+ ``fetch_traits(..)`` function of engines does.
+
+ If there are territory codes in the ``tag_list`` that have a *script code*
+ additional keys are added to the returned dictionary.
+
+ .. code:: python
+
+ >>> import locales
+ >>> engine_locales = locales.build_engine_locales(['en', 'en-US', 'zh', 'zh-CN', 'zh-TW'])
+ >>> engine_locales
+ {
+ 'en': 'en', 'en-US': 'en-US',
+ 'zh': 'zh', 'zh-CN': 'zh-CN', 'zh_Hans': 'zh-CN',
+ 'zh-TW': 'zh-TW', 'zh_Hant': 'zh-TW'
+ }
+ >>> get_engine_locale('zh-Hans', engine_locales)
+ 'zh-CN'
+
+ This function is a good example to understand the language/region model
+ of SearXNG:
+
+ SearXNG only distinguishes between **search languages** and **search
+ regions**, by adding the *script-tags*, languages with *script-tags* can
+ be assigned to the **regions** that SearXNG supports.
+
+ """
+ engine_locales = {}
+
+ for tag in tag_list:
+ locale = get_locale(tag)
+ if locale is None:
+ logger.warn("build_engine_locales: skip locale tag %s / unknown by babel", tag)
+ continue
+ if locale.territory:
+ engine_locales[region_tag(locale)] = tag
+ if locale.script:
+ engine_locales[language_tag(locale)] = tag
+ else:
+ engine_locales[language_tag(locale)] = tag
+ return engine_locales
diff --git a/searx/utils.py b/searx/utils.py
index f7a71b649..161983011 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -18,8 +18,6 @@ from urllib.parse import urljoin, urlparse
from lxml import html
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
-from babel.core import get_global
-
from searx import settings
from searx.data import USER_AGENTS, data_dir
@@ -365,92 +363,6 @@ def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]:
return None
-def _get_lang_to_lc_dict(lang_list: List[str]) -> Dict[str, str]:
- key = str(lang_list)
- value = _LANG_TO_LC_CACHE.get(key, None)
- if value is None:
- value = {}
- for lang in lang_list:
- value.setdefault(lang.split('-')[0], lang)
- _LANG_TO_LC_CACHE[key] = value
- return value
-
-
-# babel's get_global contains all sorts of miscellaneous locale and territory related data
-# see get_global in: https://github.com/python-babel/babel/blob/master/babel/core.py
-def _get_from_babel(lang_code: str, key):
- match = get_global(key).get(lang_code.replace('-', '_'))
- # for some keys, such as territory_aliases, match may be a list
- if isinstance(match, str):
- return match.replace('_', '-')
- return match
-
-
-def _match_language(lang_code: str, lang_list=[], custom_aliases={}) -> Optional[str]: # pylint: disable=W0102
- """auxiliary function to match lang_code in lang_list"""
- # replace language code with a custom alias if necessary
- if lang_code in custom_aliases:
- lang_code = custom_aliases[lang_code]
-
- if lang_code in lang_list:
- return lang_code
-
- # try to get the most likely country for this language
- subtags = _get_from_babel(lang_code, 'likely_subtags')
- if subtags:
- if subtags in lang_list:
- return subtags
- subtag_parts = subtags.split('-')
- new_code = subtag_parts[0] + '-' + subtag_parts[-1]
- if new_code in custom_aliases:
- new_code = custom_aliases[new_code]
- if new_code in lang_list:
- return new_code
-
- # try to get the any supported country for this language
- return _get_lang_to_lc_dict(lang_list).get(lang_code)
-
-
-def match_language( # pylint: disable=W0102
- locale_code, lang_list=[], custom_aliases={}, fallback: Optional[str] = 'en-US'
-) -> Optional[str]:
- """get the language code from lang_list that best matches locale_code"""
- # try to get language from given locale_code
- language = _match_language(locale_code, lang_list, custom_aliases)
- if language:
- return language
-
- locale_parts = locale_code.split('-')
- lang_code = locale_parts[0]
-
- # if locale_code has script, try matching without it
- if len(locale_parts) > 2:
- language = _match_language(lang_code + '-' + locale_parts[-1], lang_list, custom_aliases)
- if language:
- return language
-
- # try to get language using an equivalent country code
- if len(locale_parts) > 1:
- country_alias = _get_from_babel(locale_parts[-1], 'territory_aliases')
- if country_alias:
- language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
- if language:
- return language
-
- # try to get language using an equivalent language code
- alias = _get_from_babel(lang_code, 'language_aliases')
- if alias:
- language = _match_language(alias, lang_list, custom_aliases)
- if language:
- return language
-
- if lang_code != locale_code:
- # try to get language from given language without giving the country
- language = _match_language(lang_code, lang_list, custom_aliases)
-
- return language or fallback
-
-
def load_module(filename: str, module_dir: str) -> types.ModuleType:
modname = splitext(filename)[0]
modpath = join(module_dir, filename)
diff --git a/searx/webapp.py b/searx/webapp.py
index bc2a50784..4ed6c2eb7 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -89,7 +89,6 @@ from searx.utils import (
html_to_text,
gen_useragent,
dict_subset,
- match_language,
)
from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH
from searx.query import RawTextQuery
@@ -117,6 +116,7 @@ from searx.locales import (
RTL_LOCALES,
localeselector,
locales_initialize,
+ match_locale,
)
# renaming names from searx imports ...
@@ -227,7 +227,7 @@ def _get_browser_language(req, lang_list):
if '-' in lang:
lang_parts = lang.split('-')
lang = "{}-{}".format(lang_parts[0], lang_parts[-1].upper())
- locale = match_language(lang, lang_list, fallback=None)
+ locale = match_locale(lang, lang_list, fallback=None)
if locale is not None:
return locale
return 'en'
@@ -407,7 +407,7 @@ def get_client_settings():
def render(template_name: str, **kwargs):
-
+ # pylint: disable=too-many-statements
kwargs['client_settings'] = str(
base64.b64encode(
bytes(
@@ -445,10 +445,13 @@ def render(template_name: str, **kwargs):
if locale in RTL_LOCALES and 'rtl' not in kwargs:
kwargs['rtl'] = True
+
if 'current_language' not in kwargs:
- kwargs['current_language'] = match_language(
- request.preferences.get_value('language'), settings['search']['languages']
- )
+ _locale = request.preferences.get_value('language')
+ if _locale in ('auto', 'all'):
+ kwargs['current_language'] = _locale
+ else:
+ kwargs['current_language'] = match_locale(_locale, settings['search']['languages'])
# values from settings
kwargs['search_formats'] = [x for x in settings['search']['formats'] if x != 'html']
@@ -810,6 +813,13 @@ def search():
)
)
+ if search_query.lang in ('auto', 'all'):
+ current_language = search_query.lang
+ else:
+ current_language = match_locale(
+ search_query.lang, settings['search']['languages'], fallback=request.preferences.get_value("language")
+ )
+
# search_query.lang contains the user choice (all, auto, en, ...)
# when the user choice is "auto", search.search_query.lang contains the detected language
# otherwise it is equals to search_query.lang
@@ -832,12 +842,8 @@ def search():
result_container.unresponsive_engines
),
current_locale = request.preferences.get_value("locale"),
- current_language = match_language(
- search_query.lang,
- settings['search']['languages'],
- fallback=request.preferences.get_value("language")
- ),
- search_language = match_language(
+ current_language = current_language,
+ search_language = match_locale(
search.search_query.lang,
settings['search']['languages'],
fallback=request.preferences.get_value("language")
diff --git a/searxng_extra/update/update_engine_descriptions.py b/searxng_extra/update/update_engine_descriptions.py
index 6052bf084..66bc303db 100755
--- a/searxng_extra/update/update_engine_descriptions.py
+++ b/searxng_extra/update/update_engine_descriptions.py
@@ -18,8 +18,8 @@ from os.path import join
from lxml.html import fromstring
from searx.engines import wikidata, set_loggers
-from searx.utils import extract_text, match_language
-from searx.locales import LOCALE_NAMES, locales_initialize
+from searx.utils import extract_text
+from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
from searx import searx_dir
from searx.utils import gen_useragent, detect_language
import searx.search
@@ -225,9 +225,9 @@ def fetch_website_description(engine_name, website):
fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang])
if fetched_lang is None or desc is None:
continue
- matched_lang = match_language(fetched_lang, LANGUAGES, fallback=None)
+ matched_lang = match_locale(fetched_lang, LANGUAGES, fallback=None)
if matched_lang is None:
- fetched_wikipedia_lang = match_language(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None)
+ fetched_wikipedia_lang = match_locale(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None)
matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang)
if matched_lang is not None:
update_description(engine_name, matched_lang, desc, website, replace=False)
diff --git a/tests/unit/test_locales.py b/tests/unit/test_locales.py
new file mode 100644
index 000000000..61561c17b
--- /dev/null
+++ b/tests/unit/test_locales.py
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Test some code from module :py:obj:`searx.locales`"""
+
+from searx import locales
+from searx.sxng_locales import sxng_locales
+from tests import SearxTestCase
+
+
+class TestLocales(SearxTestCase):
+ """Implemented tests:
+
+ - :py:obj:`searx.locales.match_locale`
+ """
+
+ def test_match_locale(self):
+
+ locale_tag_list = [x[0] for x in sxng_locales]
+
+ # Test SearXNG search languages
+
+ self.assertEqual(locales.match_locale('de', locale_tag_list), 'de')
+ self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr')
+ self.assertEqual(locales.match_locale('zh', locale_tag_list), 'zh')
+
+ # Test SearXNG search regions
+
+ self.assertEqual(locales.match_locale('ca-es', locale_tag_list), 'ca-ES')
+ self.assertEqual(locales.match_locale('de-at', locale_tag_list), 'de-AT')
+ self.assertEqual(locales.match_locale('de-de', locale_tag_list), 'de-DE')
+ self.assertEqual(locales.match_locale('en-UK', locale_tag_list), 'en-GB')
+ self.assertEqual(locales.match_locale('fr-be', locale_tag_list), 'fr-BE')
+ self.assertEqual(locales.match_locale('fr-be', locale_tag_list), 'fr-BE')
+ self.assertEqual(locales.match_locale('fr-ca', locale_tag_list), 'fr-CA')
+ self.assertEqual(locales.match_locale('fr-ch', locale_tag_list), 'fr-CH')
+ self.assertEqual(locales.match_locale('zh-cn', locale_tag_list), 'zh-CN')
+ self.assertEqual(locales.match_locale('zh-tw', locale_tag_list), 'zh-TW')
+ self.assertEqual(locales.match_locale('zh-hk', locale_tag_list), 'zh-HK')
+
+ # Test language script code
+
+ self.assertEqual(locales.match_locale('zh-hans', locale_tag_list), 'zh-CN')
+ self.assertEqual(locales.match_locale('zh-hans-cn', locale_tag_list), 'zh-CN')
+ self.assertEqual(locales.match_locale('zh-hant', locale_tag_list), 'zh-TW')
+ self.assertEqual(locales.match_locale('zh-hant-tw', locale_tag_list), 'zh-TW')
+
+ # Test individual locale lists
+
+ self.assertEqual(locales.match_locale('es', [], fallback='fallback'), 'fallback')
+
+ self.assertEqual(locales.match_locale('de', ['de-CH', 'de-DE']), 'de-DE')
+ self.assertEqual(locales.match_locale('de', ['de-CH', 'de-DE']), 'de-DE')
+ self.assertEqual(locales.match_locale('es', ['ES']), 'ES')
+ self.assertEqual(locales.match_locale('es', ['es-AR', 'es-ES', 'es-MX']), 'es-ES')
+ self.assertEqual(locales.match_locale('es-AR', ['es-AR', 'es-ES', 'es-MX']), 'es-AR')
+ self.assertEqual(locales.match_locale('es-CO', ['es-AR', 'es-ES']), 'es-ES')
+ self.assertEqual(locales.match_locale('es-CO', ['es-AR']), 'es-AR')
+
+ # Tests from the commit message of 9ae409a05a
+
+ # Assumption:
+ # A. When a user selects a language the results should be optimized according to
+ # the selected language.
+ #
+ # B. When user selects a language and a territory the results should be
+ # optimized with first priority on territory and second on language.
+
+ # Assume we have an engine that supports the following locales:
+ locale_tag_list = ['zh-CN', 'zh-HK', 'nl-BE', 'fr-CA']
+
+ # Examples (Assumption A.)
+ # ------------------------
+
+ # A user selects region 'zh-TW' which should end in zh_HK.
+ # hint: CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')
+ self.assertEqual(locales.match_locale('zh-TW', locale_tag_list), 'zh-HK')
+
+ # A user selects only the language 'zh' which should end in CN
+ self.assertEqual(locales.match_locale('zh', locale_tag_list), 'zh-CN')
+
+ # A user selects only the language 'fr' which should end in fr_CA
+ self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-CA')
+
+ # The difference in priority on the territory is best shown with an
+ # engine that supports the following locales:
+ locale_tag_list = ['fr-FR', 'fr-CA', 'en-GB', 'nl-BE']
+
+ # A user selects only a language
+ self.assertEqual(locales.match_locale('en', locale_tag_list), 'en-GB')
+
+ # hint: the engine supports fr_FR and fr_CA; since no territory is given,
+ # fr_FR takes priority ..
+ self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-FR')
+
+ # Examples (Assumption B.)
+ # ------------------------
+
+ # A user selects region 'fr-BE' which should end in nl-BE
+ self.assertEqual(locales.match_locale('fr-BE', locale_tag_list), 'nl-BE')
+
+ # If the user selects a language and there are two locales like the
+ # following:
+
+ locale_tag_list = ['fr-BE', 'fr-CH']
+
+ # The get_engine_locale selects the locale by looking at the "population
+ # percent" and this percentage is higher in BE (68%)
+ # compared to CH (21%)
+
+ self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-BE')
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 6f51f1ee3..2ad4593a1 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -87,39 +87,6 @@ class TestUtils(SearxTestCase):
html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
self.assertEqual(utils.html_to_text(html), "Lorem ipsum")
- def test_match_language(self):
- self.assertEqual(utils.match_language('es', ['es']), 'es')
- self.assertEqual(utils.match_language('es', [], fallback='fallback'), 'fallback')
- self.assertEqual(utils.match_language('ja', ['jp'], {'ja': 'jp'}), 'jp')
-
- # handle script tags
- self.assertEqual(utils.match_language('zh-CN', ['zh-Hans-CN', 'zh-Hant-TW']), 'zh-Hans-CN')
- self.assertEqual(utils.match_language('zh-TW', ['zh-Hans-CN', 'zh-Hant-TW']), 'zh-Hant-TW')
- self.assertEqual(utils.match_language('zh-Hans-CN', ['zh-CN', 'zh-TW']), 'zh-CN')
- self.assertEqual(utils.match_language('zh-Hant-TW', ['zh-CN', 'zh-TW']), 'zh-TW')
- self.assertEqual(utils.match_language('zh-Hans', ['zh-CN', 'zh-TW', 'zh-HK']), 'zh-CN')
- self.assertEqual(utils.match_language('zh-Hant', ['zh-CN', 'zh-TW', 'zh-HK']), 'zh-TW')
-
- aliases = {'en-GB': 'en-UK', 'he': 'iw'}
-
- # guess country
- self.assertEqual(utils.match_language('de-DE', ['de']), 'de')
- self.assertEqual(utils.match_language('de', ['de-DE']), 'de-DE')
- self.assertEqual(utils.match_language('es-CO', ['es-AR', 'es-ES', 'es-MX']), 'es-ES')
- self.assertEqual(utils.match_language('es-CO', ['es-MX']), 'es-MX')
- self.assertEqual(utils.match_language('en-UK', ['en-AU', 'en-GB', 'en-US']), 'en-GB')
- self.assertEqual(utils.match_language('en-GB', ['en-AU', 'en-UK', 'en-US'], aliases), 'en-UK')
-
- # language aliases
- self.assertEqual(utils.match_language('iw', ['he']), 'he')
- self.assertEqual(utils.match_language('he', ['iw'], aliases), 'iw')
- self.assertEqual(utils.match_language('iw-IL', ['he']), 'he')
- self.assertEqual(utils.match_language('he-IL', ['iw'], aliases), 'iw')
- self.assertEqual(utils.match_language('iw', ['he-IL']), 'he-IL')
- self.assertEqual(utils.match_language('he', ['iw-IL'], aliases), 'iw-IL')
- self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL')
- self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL')
-
def test_ecma_unscape(self):
self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')