summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexandre Flament <alex@al-f.net>2022-12-16 20:28:57 +0000
committerAlexandre Flament <alex@al-f.net>2023-02-17 15:17:36 +0000
commit6748e8e2d5eff3c2202b2a714afb5534b1573101 (patch)
tree57f6fcf3d5b9bb5ee3b4a03aaf8aac2a53ee7106
parent54389a29feb3feea5a868f7b3b83c9718fb71014 (diff)
downloadsearxng-6748e8e2d5eff3c2202b2a714afb5534b1573101.tar.gz
searxng-6748e8e2d5eff3c2202b2a714afb5534b1573101.zip
Add "Auto-detected" as a language.
When the user chooses "Auto-detected", the choice remains for the following queries. The detected language is displayed. For example "Auto-detected (en)": * the next query language is going to be auto detected * for the current query, the detected language is English. This replaces the autodetect_search_language plugin.
-rw-r--r--docs/src/searx.plugins.autodetect_search_language.rst8
-rw-r--r--searx/plugins/autodetect_search_language.py97
-rw-r--r--searx/preferences.py2
-rw-r--r--searx/query.py2
-rw-r--r--searx/search/__init__.py44
-rw-r--r--searx/search/models.py13
-rw-r--r--searx/settings_defaults.py2
-rw-r--r--searx/templates/simple/filters/languages.html4
-rw-r--r--searx/templates/simple/preferences.html5
-rw-r--r--searx/utils.py53
-rw-r--r--searx/webadapter.py2
-rwxr-xr-xsearx/webapp.py8
-rw-r--r--tests/unit/test_query.py11
-rw-r--r--tests/unit/test_search.py7
14 files changed, 143 insertions, 115 deletions
diff --git a/docs/src/searx.plugins.autodetect_search_language.rst b/docs/src/searx.plugins.autodetect_search_language.rst
deleted file mode 100644
index 7b66a6bf3..000000000
--- a/docs/src/searx.plugins.autodetect_search_language.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-.. _autodetect search language:
-
-======================
-Search language plugin
-======================
-
-.. automodule:: searx.plugins.autodetect_search_language
- :members:
diff --git a/searx/plugins/autodetect_search_language.py b/searx/plugins/autodetect_search_language.py
deleted file mode 100644
index 026ca9b6f..000000000
--- a/searx/plugins/autodetect_search_language.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-or-later
-# lint: pylint
-"""Plugin to detect the search language from the search query.
-
-The language detection is done by using the fastText_ library (`python
-fasttext`_). fastText_ distributes the `language identification model`_, for
-reference:
-
-- `FastText.zip: Compressing text classification models`_
-- `Bag of Tricks for Efficient Text Classification`_
-
-The `language identification model`_ support the language codes (ISO-639-3)::
-
- af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
- ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
- fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io
- is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv
- mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn
- no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd
- sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep
- vi vls vo wa war wuu xal xmf yi yo yue zh
-
-The `language identification model`_ is harmonized with the SearXNG's language
-(locale) model. General conditions of SearXNG's locale model are:
-
-a. SearXNG's locale of a query is passed to the
- :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
- code that is used by an engine.
-
-b. SearXNG and most of the engines do not support all the languages from
- language model and there might be also a discrepancy in the ISO-639-3 and
- ISO-639-2 handling (:py:obj:`searx.locales.get_engine_locale`). Further
- more, in SearXNG the locales like ``zh-TH`` (``zh-CN``) are mapped to
- ``zh_Hant`` (``zh_Hans``).
-
-Conclusion: This plugin does only auto-detect the languages a user can select in
-the language menu (:py:obj:`supported_langs`).
-
-SearXNG's locale of a query comes from (*highest wins*):
-
-1. The ``Accept-Language`` header from user's HTTP client.
-2. The user select a locale in the preferences.
-3. The user select a locale from the menu in the query form (e.g. ``:zh-TW``)
-4. This plugin is activated in the preferences and the locale (only the language
- code / none region code) comes from the fastText's language detection.
-
-Conclusion: There is a conflict between the language selected by the user and
-the language from language detection of this plugin. For example, the user
-explicitly selects the German locale via the search syntax to search for a term
-that is identified as an English term (try ``:de-DE thermomix``, for example).
-
-.. hint::
-
- To SearXNG maintainers; please take into account: under some circumstances
- the auto-detection of the language of this plugin could be detrimental to
- users expectations. Its not recommended to activate this plugin by
- default. It should always be the user's decision whether to activate this
- plugin or not.
-
-.. _fastText: https://fasttext.cc/
-.. _python fasttext: https://pypi.org/project/fasttext/
-.. _language identification model: https://fasttext.cc/docs/en/language-identification.html
-.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
-.. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
-
-"""
-
-from flask_babel import gettext
-import babel
-
-from searx.utils import detect_language
-from searx.languages import language_codes
-
-name = gettext('Autodetect search language')
-description = gettext('Automatically detect the query search language and switch to it.')
-preference_section = 'general'
-default_on = False
-
-supported_langs = set()
-"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
-
-
-def pre_search(request, search): # pylint: disable=unused-argument
- lang = detect_language(search.search_query.query, min_probability=0)
- if lang in supported_langs:
- search.search_query.lang = lang
- try:
- search.search_query.locale = babel.Locale.parse(lang)
- except babel.core.UnknownLocaleError:
- pass
- return True
-
-
-def init(app, settings): # pylint: disable=unused-argument
- for searxng_locale in language_codes:
- supported_langs.add(searxng_locale[0].split('-')[0])
- return True
diff --git a/searx/preferences.py b/searx/preferences.py
index 3d23c9331..0eac8441c 100644
--- a/searx/preferences.py
+++ b/searx/preferences.py
@@ -154,7 +154,7 @@ class SearchLanguageSetting(EnumStringSetting):
"""Available choices may change, so user's value may not be in choices anymore"""
def _validate_selection(self, selection):
- if selection != '' and not VALID_LANGUAGE_CODE.match(selection):
+ if selection != '' and selection != 'auto' and not VALID_LANGUAGE_CODE.match(selection):
raise ValidationException('Invalid language code: "{0}"'.format(selection))
def parse(self, data: str):
diff --git a/searx/query.py b/searx/query.py
index b8e1c1275..dbc52ec75 100644
--- a/searx/query.py
+++ b/searx/query.py
@@ -104,7 +104,7 @@ class LanguageParser(QueryPartParser):
break
# user may set a valid, yet not selectable language
- if VALID_LANGUAGE_CODE.match(value):
+ if VALID_LANGUAGE_CODE.match(value) or value == 'auto':
lang_parts = value.split('-')
if len(lang_parts) > 1:
value = lang_parts[0].lower() + '-' + lang_parts[1].upper()
diff --git a/searx/search/__init__.py b/searx/search/__init__.py
index 9d337916c..c5f225aa4 100644
--- a/searx/search/__init__.py
+++ b/searx/search/__init__.py
@@ -3,10 +3,12 @@
# pylint: disable=missing-module-docstring, too-few-public-methods
import threading
+from copy import copy
from timeit import default_timer
from uuid import uuid4
import flask
+import babel
from searx import settings
from searx.answerers import ask
@@ -20,6 +22,7 @@ from searx.network import initialize as initialize_network, check_network_config
from searx.metrics import initialize as initialize_metrics, counter_inc, histogram_observe_time
from searx.search.processors import PROCESSORS, initialize as initialize_processors
from searx.search.checker import initialize as initialize_checker
+from searx.utils import detect_language
logger = logger.getChild('search')
@@ -37,18 +40,57 @@ def initialize(settings_engines=None, enable_checker=False, check_network=False,
initialize_checker()
+def replace_auto_language(search_query: SearchQuery):
+ """
+ Do nothing except if `search_query.lang` is "auto".
+ In this case:
+ * the value "auto" is replaced by the detected language of the query.
+ The default value is "all" when no language is detected.
+ * `search_query.locale` is updated accordingly
+
+ Use :py:obj:`searx.utils.detect_language` with `only_search_languages=True` to keep
+ only languages supported by the engines.
+ """
+ if search_query.lang != 'auto':
+ return
+
+ detected_lang = detect_language(search_query.query, threshold=0.0, only_search_languages=True)
+ if detected_lang is None:
+ # fallback to 'all' if no language has been detected
+ search_query.lang = 'all'
+ search_query.locale = None
+ return
+ search_query.lang = detected_lang
+ try:
+ search_query.locale = babel.Locale.parse(search_query.lang)
+ except babel.core.UnknownLocaleError:
+ search_query.locale = None
+
+
class Search:
"""Search information container"""
__slots__ = "search_query", "result_container", "start_time", "actual_timeout"
def __init__(self, search_query: SearchQuery):
+ """Initialize the Search
+
+ search_query is copied
+ """
# init vars
super().__init__()
- self.search_query = search_query
self.result_container = ResultContainer()
self.start_time = None
self.actual_timeout = None
+ self.search_query = copy(search_query)
+ self.update_search_query(self.search_query)
+
+ def update_search_query(self, search_query: SearchQuery):
+ """Update search_query.
+
+ call replace_auto_language to replace the "auto" language
+ """
+ replace_auto_language(search_query)
def search_external_bang(self):
"""
diff --git a/searx/search/models.py b/searx/search/models.py
index bbca1cd1d..91e5d5982 100644
--- a/searx/search/models.py
+++ b/searx/search/models.py
@@ -109,3 +109,16 @@ class SearchQuery:
self.external_bang,
)
)
+
+ def __copy__(self):
+ return SearchQuery(
+ self.query,
+ self.engineref_list,
+ self.lang,
+ self.safesearch,
+ self.pageno,
+ self.time_range,
+ self.timeout_limit,
+ self.external_bang,
+ self.engine_data,
+ )
diff --git a/searx/settings_defaults.py b/searx/settings_defaults.py
index 7baa23cac..6e98076ff 100644
--- a/searx/settings_defaults.py
+++ b/searx/settings_defaults.py
@@ -18,7 +18,7 @@ searx_dir = abspath(dirname(__file__))
logger = logging.getLogger('searx')
OUTPUT_FORMATS = ['html', 'csv', 'json', 'rss']
-LANGUAGE_CODES = ['all'] + list(l[0] for l in languages)
+LANGUAGE_CODES = ['all', 'auto'] + list(l[0] for l in languages)
SIMPLE_STYLE = ('auto', 'light', 'dark')
CATEGORIES_AS_TABS = {
'general': {},
diff --git a/searx/templates/simple/filters/languages.html b/searx/templates/simple/filters/languages.html
index e9e4d47ce..54e07e209 100644
--- a/searx/templates/simple/filters/languages.html
+++ b/searx/templates/simple/filters/languages.html
@@ -1,5 +1,9 @@
<select class="language" id="language" name="language" aria-label="{{ _('Search language') }}">{{- '' -}}
<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option>
+ <option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>
+ {{- _('Auto-detect') -}}
+ {%- if current_language == 'auto' %} ({{ search_language }}){%- endif -%}
+ </option>
{%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%}
<option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}>
{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %}
diff --git a/searx/templates/simple/preferences.html b/searx/templates/simple/preferences.html
index 4aef7f986..9626b04d4 100644
--- a/searx/templates/simple/preferences.html
+++ b/searx/templates/simple/preferences.html
@@ -116,12 +116,15 @@
<p class="value">{{- '' -}}
<select name='language' aria-labelledby="pref_language" aria-describedby="desc_language">{{- '' -}}
<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option>
+ <option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>{{ _('Auto-detect') }}</option>
{%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%}
<option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}>{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %}</option>
{%- endfor -%}
</select>{{- '' -}}
</p>
- <div class="description" id="desc_language">{{ _('What language do you prefer for search?') }}</div>
+ <div class="description" id="desc_language">
+ {{- _('What language do you prefer for search?') }} {{ _('Choose Auto-detect to let SearXNG detect the language of your query.') -}}
+ </div>
</fieldset>
{% endif %}
{% if 'autocomplete' not in locked_preferences %}
diff --git a/searx/utils.py b/searx/utils.py
index cda336035..c3958ae78 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -53,6 +53,9 @@ _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
_FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None
"""fasttext model to predict laguage of a search term"""
+SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in language_codes])
+"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
+
class _NotSetClass: # pylint: disable=too-few-public-methods
"""Internal class for this module, do not create instance of this class.
@@ -637,11 +640,53 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText":
return _FASTTEXT_MODEL
-def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]:
- """https://fasttext.cc/docs/en/language-identification.html"""
+def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
+ """Detect the language of the text parameter
+
+ Args:
+ * text (str): the string whose language is to be detected.
+ * threshold (float): threshold filters the returned labels by a threshold on probability.
+ A choice of 0.3 will return labels with at least 0.3 probability.
+ * only_search_languages (bool): if True, returns only supported SearXNG search languages.
+ see :py:obj:`searx.languages`
+
+
+ Raises:
+ * ValueError: if text is not a string
+
+ Returns:
+ * result (str, None): the detected language code or None. See below.
+
+ The language detection is done by using `a fork`_ of the fastText_ library (`python
+ fasttext`_). fastText_ distributes the `language identification model`_, for
+ reference:
+
+ - `FastText.zip: Compressing text classification models`_
+ - `Bag of Tricks for Efficient Text Classification`_
+
+ The `language identification model`_ supports the language codes (ISO-639-3)::
+ af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
+ ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
+ fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io
+ is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv
+ mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn
+ no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd
+ sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep
+ vi vls vo wa war wuu xal xmf yi yo yue zh
+
+ .. _a fork: https://github.com/searxng/fasttext-predict
+ .. _fastText: https://fasttext.cc/
+ .. _python fasttext: https://pypi.org/project/fasttext/
+ .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
+ .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
+ .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
+ """
if not isinstance(text, str):
raise ValueError('text must a str')
r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
- if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability:
- return r[0][0].split('__label__')[1]
+ if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:
+ language = r[0][0].split('__label__')[1]
+ if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
+ return None
+ return language
return None
diff --git a/searx/webadapter.py b/searx/webadapter.py
index 00dead2a9..dbcf25058 100644
--- a/searx/webadapter.py
+++ b/searx/webadapter.py
@@ -63,7 +63,7 @@ def parse_lang(preferences: Preferences, form: Dict[str, str], raw_text_query: R
query_lang = preferences.get_value('language')
# check language
- if not VALID_LANGUAGE_CODE.match(query_lang):
+ if not VALID_LANGUAGE_CODE.match(query_lang) and query_lang != 'auto':
raise SearxParameterException('language', query_lang)
return query_lang
diff --git a/searx/webapp.py b/searx/webapp.py
index 6746f7a00..95c33f704 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -810,6 +810,9 @@ def search():
)
)
+ # search_query.lang contains the user choice (all, auto, en, ...)
+ # when the user choice is "auto", search.search_query.lang contains the detected language
+ # otherwise it is equal to search_query.lang
return render(
# fmt: off
'results.html',
@@ -834,6 +837,11 @@ def search():
settings['search']['languages'],
fallback=request.preferences.get_value("language")
),
+ search_language = match_language(
+ search.search_query.lang,
+ settings['search']['languages'],
+ fallback=request.preferences.get_value("language")
+ ),
timeout_limit = request.form.get('timeout_limit', None)
# fmt: on
)
diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py
index db25da8f3..7274a8da5 100644
--- a/tests/unit/test_query.py
+++ b/tests/unit/test_query.py
@@ -91,6 +91,17 @@ class TestLanguageParser(SearxTestCase):
self.assertIn('all', query.languages)
self.assertFalse(query.specific)
+ def test_auto_language_code(self):
+ language = 'auto'
+ query_text = 'una consulta'
+ full_query = ':' + language + ' ' + query_text
+ query = RawTextQuery(full_query, [])
+
+ self.assertEqual(query.getFullQuery(), full_query)
+ self.assertEqual(len(query.query_parts), 1)
+ self.assertIn('auto', query.languages)
+ self.assertFalse(query.specific)
+
def test_invalid_language_code(self):
language = 'not_a_language'
query_text = 'the query'
diff --git a/tests/unit/test_search.py b/tests/unit/test_search.py
index fa16947be..33bf90840 100644
--- a/tests/unit/test_search.py
+++ b/tests/unit/test_search.py
@@ -1,5 +1,7 @@
# -*- coding: utf-8 -*-
+from copy import copy
+
import searx.search
from searx.search import SearchQuery, EngineRef
from searx import settings
@@ -34,6 +36,11 @@ class SearchQueryTestCase(SearxTestCase):
self.assertEqual(s, s)
self.assertNotEqual(s, t)
+ def test_copy(self):
+ s = SearchQuery('test', [EngineRef('bing', 'general')], 'all', 0, 1, None, None, None)
+ t = copy(s)
+ self.assertEqual(s, t)
+
class SearchTestCase(SearxTestCase):
@classmethod