diff options
author | Alexandre Flament <alex@al-f.net> | 2022-12-16 20:28:57 +0000 |
---|---|---|
committer | Alexandre Flament <alex@al-f.net> | 2023-02-17 15:17:36 +0000 |
commit | 6748e8e2d5eff3c2202b2a714afb5534b1573101 (patch) | |
tree | 57f6fcf3d5b9bb5ee3b4a03aaf8aac2a53ee7106 /searx | |
parent | 54389a29feb3feea5a868f7b3b83c9718fb71014 (diff) | |
download | searxng-6748e8e2d5eff3c2202b2a714afb5534b1573101.tar.gz searxng-6748e8e2d5eff3c2202b2a714afb5534b1573101.zip |
Add "Auto-detected" as a language.
When the user chooses "Auto-detected", the choice persists for subsequent queries.
The detected language is displayed.
For example "Auto-detected (en)":
* the language of the next query will be auto-detected
* for the current query, the detected language is English.
This replaces the autodetect_search_language plugin.
Diffstat (limited to 'searx')
-rw-r--r-- | searx/plugins/autodetect_search_language.py | 97 | ||||
-rw-r--r-- | searx/preferences.py | 2 | ||||
-rw-r--r-- | searx/query.py | 2 | ||||
-rw-r--r-- | searx/search/__init__.py | 44 | ||||
-rw-r--r-- | searx/search/models.py | 13 | ||||
-rw-r--r-- | searx/settings_defaults.py | 2 | ||||
-rw-r--r-- | searx/templates/simple/filters/languages.html | 4 | ||||
-rw-r--r-- | searx/templates/simple/preferences.html | 5 | ||||
-rw-r--r-- | searx/utils.py | 53 | ||||
-rw-r--r-- | searx/webadapter.py | 2 | ||||
-rwxr-xr-x | searx/webapp.py | 8 |
11 files changed, 125 insertions, 107 deletions
diff --git a/searx/plugins/autodetect_search_language.py b/searx/plugins/autodetect_search_language.py deleted file mode 100644 index 026ca9b6f..000000000 --- a/searx/plugins/autodetect_search_language.py +++ /dev/null @@ -1,97 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Plugin to detect the search language from the search query. - -The language detection is done by using the fastText_ library (`python -fasttext`_). fastText_ distributes the `language identification model`_, for -reference: - -- `FastText.zip: Compressing text classification models`_ -- `Bag of Tricks for Efficient Text Classification`_ - -The `language identification model`_ support the language codes (ISO-639-3):: - - af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr - ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa - fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io - is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv - mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn - no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd - sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep - vi vls vo wa war wuu xal xmf yi yo yue zh - -The `language identification model`_ is harmonized with the SearXNG's language -(locale) model. General conditions of SearXNG's locale model are: - -a. SearXNG's locale of a query is passed to the - :py:obj:`searx.locales.get_engine_locale` to get a language and/or region - code that is used by an engine. - -b. SearXNG and most of the engines do not support all the languages from - language model and there might be also a discrepancy in the ISO-639-3 and - ISO-639-2 handling (:py:obj:`searx.locales.get_engine_locale`). Further - more, in SearXNG the locales like ``zh-TH`` (``zh-CN``) are mapped to - ``zh_Hant`` (``zh_Hans``). 
- -Conclusion: This plugin does only auto-detect the languages a user can select in -the language menu (:py:obj:`supported_langs`). - -SearXNG's locale of a query comes from (*highest wins*): - -1. The ``Accept-Language`` header from user's HTTP client. -2. The user select a locale in the preferences. -3. The user select a locale from the menu in the query form (e.g. ``:zh-TW``) -4. This plugin is activated in the preferences and the locale (only the language - code / none region code) comes from the fastText's language detection. - -Conclusion: There is a conflict between the language selected by the user and -the language from language detection of this plugin. For example, the user -explicitly selects the German locale via the search syntax to search for a term -that is identified as an English term (try ``:de-DE thermomix``, for example). - -.. hint:: - - To SearXNG maintainers; please take into account: under some circumstances - the auto-detection of the language of this plugin could be detrimental to - users expectations. Its not recommended to activate this plugin by - default. It should always be the user's decision whether to activate this - plugin or not. - -.. _fastText: https://fasttext.cc/ -.. _python fasttext: https://pypi.org/project/fasttext/ -.. _language identification model: https://fasttext.cc/docs/en/language-identification.html -.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759 -.. 
_`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651 - -""" - -from flask_babel import gettext -import babel - -from searx.utils import detect_language -from searx.languages import language_codes - -name = gettext('Autodetect search language') -description = gettext('Automatically detect the query search language and switch to it.') -preference_section = 'general' -default_on = False - -supported_langs = set() -"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`).""" - - -def pre_search(request, search): # pylint: disable=unused-argument - lang = detect_language(search.search_query.query, min_probability=0) - if lang in supported_langs: - search.search_query.lang = lang - try: - search.search_query.locale = babel.Locale.parse(lang) - except babel.core.UnknownLocaleError: - pass - return True - - -def init(app, settings): # pylint: disable=unused-argument - for searxng_locale in language_codes: - supported_langs.add(searxng_locale[0].split('-')[0]) - return True diff --git a/searx/preferences.py b/searx/preferences.py index 3d23c9331..0eac8441c 100644 --- a/searx/preferences.py +++ b/searx/preferences.py @@ -154,7 +154,7 @@ class SearchLanguageSetting(EnumStringSetting): """Available choices may change, so user's value may not be in choices anymore""" def _validate_selection(self, selection): - if selection != '' and not VALID_LANGUAGE_CODE.match(selection): + if selection != '' and selection != 'auto' and not VALID_LANGUAGE_CODE.match(selection): raise ValidationException('Invalid language code: "{0}"'.format(selection)) def parse(self, data: str): diff --git a/searx/query.py b/searx/query.py index b8e1c1275..dbc52ec75 100644 --- a/searx/query.py +++ b/searx/query.py @@ -104,7 +104,7 @@ class LanguageParser(QueryPartParser): break # user may set a valid, yet not selectable language - if VALID_LANGUAGE_CODE.match(value): + if VALID_LANGUAGE_CODE.match(value) or value == 'auto': lang_parts = 
value.split('-') if len(lang_parts) > 1: value = lang_parts[0].lower() + '-' + lang_parts[1].upper() diff --git a/searx/search/__init__.py b/searx/search/__init__.py index 9d337916c..c5f225aa4 100644 --- a/searx/search/__init__.py +++ b/searx/search/__init__.py @@ -3,10 +3,12 @@ # pylint: disable=missing-module-docstring, too-few-public-methods import threading +from copy import copy from timeit import default_timer from uuid import uuid4 import flask +import babel from searx import settings from searx.answerers import ask @@ -20,6 +22,7 @@ from searx.network import initialize as initialize_network, check_network_config from searx.metrics import initialize as initialize_metrics, counter_inc, histogram_observe_time from searx.search.processors import PROCESSORS, initialize as initialize_processors from searx.search.checker import initialize as initialize_checker +from searx.utils import detect_language logger = logger.getChild('search') @@ -37,18 +40,57 @@ def initialize(settings_engines=None, enable_checker=False, check_network=False, initialize_checker() +def replace_auto_language(search_query: SearchQuery): + """ + Do nothing except if `search_query.lang` is "auto". + In this case: + * the value "auto" is replaced by the detected language of the query. + The default value is "all" when no language is detected. + * `search_query.locale` is updated accordingly + + Use :py:obj:`searx.utils.detect_language` with `only_search_languages=True` to keep + only languages supported by the engines. 
+ """ + if search_query.lang != 'auto': + return + + detected_lang = detect_language(search_query.query, threshold=0.0, only_search_languages=True) + if detected_lang is None: + # fallback to 'all' if no language has been detected + search_query.lang = 'all' + search_query.locale = None + return + search_query.lang = detected_lang + try: + search_query.locale = babel.Locale.parse(search_query.lang) + except babel.core.UnknownLocaleError: + search_query.locale = None + + class Search: """Search information container""" __slots__ = "search_query", "result_container", "start_time", "actual_timeout" def __init__(self, search_query: SearchQuery): + """Initialize the Search + + search_query is copied + """ # init vars super().__init__() - self.search_query = search_query self.result_container = ResultContainer() self.start_time = None self.actual_timeout = None + self.search_query = copy(search_query) + self.update_search_query(self.search_query) + + def update_search_query(self, search_query: SearchQuery): + """Update search_query. 
+ + call replace_auto_language to replace the "auto" language + """ + replace_auto_language(search_query) def search_external_bang(self): """ diff --git a/searx/search/models.py b/searx/search/models.py index bbca1cd1d..91e5d5982 100644 --- a/searx/search/models.py +++ b/searx/search/models.py @@ -109,3 +109,16 @@ class SearchQuery: self.external_bang, ) ) + + def __copy__(self): + return SearchQuery( + self.query, + self.engineref_list, + self.lang, + self.safesearch, + self.pageno, + self.time_range, + self.timeout_limit, + self.external_bang, + self.engine_data, + ) diff --git a/searx/settings_defaults.py b/searx/settings_defaults.py index 7baa23cac..6e98076ff 100644 --- a/searx/settings_defaults.py +++ b/searx/settings_defaults.py @@ -18,7 +18,7 @@ searx_dir = abspath(dirname(__file__)) logger = logging.getLogger('searx') OUTPUT_FORMATS = ['html', 'csv', 'json', 'rss'] -LANGUAGE_CODES = ['all'] + list(l[0] for l in languages) +LANGUAGE_CODES = ['all', 'auto'] + list(l[0] for l in languages) SIMPLE_STYLE = ('auto', 'light', 'dark') CATEGORIES_AS_TABS = { 'general': {}, diff --git a/searx/templates/simple/filters/languages.html b/searx/templates/simple/filters/languages.html index e9e4d47ce..54e07e209 100644 --- a/searx/templates/simple/filters/languages.html +++ b/searx/templates/simple/filters/languages.html @@ -1,5 +1,9 @@ <select class="language" id="language" name="language" aria-label="{{ _('Search language') }}">{{- '' -}} <option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option> + <option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}> + {{- _('Auto-detect') -}} + {%- if current_language == 'auto' %} ({{ search_language }}){%- endif -%} + </option> {%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%} <option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}> {% if flag %}{{ 
flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %} diff --git a/searx/templates/simple/preferences.html b/searx/templates/simple/preferences.html index 4aef7f986..9626b04d4 100644 --- a/searx/templates/simple/preferences.html +++ b/searx/templates/simple/preferences.html @@ -116,12 +116,15 @@ <p class="value">{{- '' -}} <select name='language' aria-labelledby="pref_language" aria-describedby="desc_language">{{- '' -}} <option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option> + <option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>{{ _('Auto-detect') }}</option> {%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%} <option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}>{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %}</option> {%- endfor -%} </select>{{- '' -}} </p> - <div class="description" id="desc_language">{{ _('What language do you prefer for search?') }}</div> + <div class="description" id="desc_language"> + {{- _('What language do you prefer for search?') }} {{ _('Choose Auto-detect to let SearXNG detect the language of your query.') -}} + </div> </fieldset> {% endif %} {% if 'autocomplete' not in locked_preferences %} diff --git a/searx/utils.py b/searx/utils.py index cda336035..c3958ae78 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -53,6 +53,9 @@ _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {} _FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None """fasttext model to predict laguage of a search term""" +SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in language_codes]) +"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`).""" + class _NotSetClass: # pylint: disable=too-few-public-methods 
"""Internal class for this module, do not create instance of this class. @@ -637,11 +640,53 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText": return _FASTTEXT_MODEL -def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]: - """https://fasttext.cc/docs/en/language-identification.html""" +def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]: + """Detect the language of the text parameter + + Args: + * text (str): the string whose language is to be detected. + * threshold (float): threshold filters the returned labels by a threshold on probability. + A choice of 0.3 will return labels with at least 0.3 probability. + * only_search_languages (bool): if True, returns only supported SearXNG search languages. + see :py:obj:`searx.languages` + + + Raises: + * ValueError: if text is not a string + + Returns: + * result (str, None): the detected language code or None. See below. + + The language detection is done by using `a fork`_ of the fastText_ library (`python + fasttext`_). fastText_ distributes the `language identification model`_, for + reference: + + - `FastText.zip: Compressing text classification models`_ + - `Bag of Tricks for Efficient Text Classification`_ + + The `language identification model`_ support the language codes (ISO-639-3):: + af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr + ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa + fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io + is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv + mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn + no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd + sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep + vi vls vo wa war wuu xal xmf yi yo yue zh + + .. 
_a fork: https://github.com/searxng/fasttext-predict + .. _fastText: https://fasttext.cc/ + .. _python fasttext: https://pypi.org/project/fasttext/ + .. _language identification model: https://fasttext.cc/docs/en/language-identification.html + .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759 + .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651 + """ if not isinstance(text, str): raise ValueError('text must a str') r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold) - if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability: - return r[0][0].split('__label__')[1] + if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0: + language = r[0][0].split('__label__')[1] + if only_search_languages and language not in SEARCH_LANGUAGE_CODES: + return None + return language return None diff --git a/searx/webadapter.py b/searx/webadapter.py index 00dead2a9..dbcf25058 100644 --- a/searx/webadapter.py +++ b/searx/webadapter.py @@ -63,7 +63,7 @@ def parse_lang(preferences: Preferences, form: Dict[str, str], raw_text_query: R query_lang = preferences.get_value('language') # check language - if not VALID_LANGUAGE_CODE.match(query_lang): + if not VALID_LANGUAGE_CODE.match(query_lang) and query_lang != 'auto': raise SearxParameterException('language', query_lang) return query_lang diff --git a/searx/webapp.py b/searx/webapp.py index 6746f7a00..95c33f704 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -810,6 +810,9 @@ def search(): ) ) + # search_query.lang contains the user choice (all, auto, en, ...) 
+ # when the user choice is "auto", search.search_query.lang contains the detected language + # otherwise it is equals to search_query.lang return render( # fmt: off 'results.html', @@ -834,6 +837,11 @@ def search(): settings['search']['languages'], fallback=request.preferences.get_value("language") ), + search_language = match_language( + search.search_query.lang, + settings['search']['languages'], + fallback=request.preferences.get_value("language") + ), timeout_limit = request.form.get('timeout_limit', None) # fmt: on ) |