summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarit.de>2022-12-10 16:20:09 +0100
committerMarkus Heiser <markus.heiser@darmarit.de>2022-12-11 11:26:07 +0100
commited901ab18edddcfd566ac37ddb1b429969ccc2ad (patch)
tree2d698ac11b6dd96ba4a88984b0919cbecbe52ef9
parent9925a209503cc41e37a1c032548a5fd8fd8ea362 (diff)
downloadsearxng-ed901ab18edddcfd566ac37ddb1b429969ccc2ad.tar.gz
searxng-ed901ab18edddcfd566ac37ddb1b429969ccc2ad.zip
[mod] improve 'Autodetect search language' plugin
- Add documentation to the plugin - Harmonize FastText language model with SearXNG's language model Resources:: import fasttext # --> +10 MB fasttext.load_model(str(data_dir / 'lid.176.ftz')) # --> +4MB Suggested-by: @dalf - To speed up and simplify the deployment use fasttext-wheel instead of fasttext - Building numpy on the Alpine Linux of docker-images takes ages --> install py3-numpy from Alpine's package manager (apk) - Alpine Linux on docker-images (musl libc) does not support fasttext-wheel (gnu libc) --> patch Dockerfile and build from fasttext: sed -i s/fasttext-wheel/fasttext/ requirements.txt Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
-rw-r--r--Dockerfile3
-rw-r--r--docs/src/searx.plugins.autodetect_search_language.rst8
-rw-r--r--requirements.txt2
-rw-r--r--searx/plugins/autodetect_search_language.py112
-rw-r--r--searx/settings.yml3
5 files changed, 119 insertions, 9 deletions
diff --git a/Dockerfile b/Dockerfile
index 66f58395d..ece20c86b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -36,6 +36,7 @@ RUN apk add --no-cache -t build-dependencies \
su-exec \
python3 \
py3-pip \
+ py3-numpy \
libxml2 \
libxslt \
openssl \
@@ -43,6 +44,8 @@ RUN apk add --no-cache -t build-dependencies \
uwsgi \
uwsgi-python3 \
brotli \
+ && pip3 install --no-cache setuptools wheel \
+ && sed -i s/fasttext-wheel/fasttext/ requirements.txt \
&& pip3 install --no-cache -r requirements.txt \
&& apk del build-dependencies \
&& rm -rf /root/.cache
diff --git a/docs/src/searx.plugins.autodetect_search_language.rst b/docs/src/searx.plugins.autodetect_search_language.rst
new file mode 100644
index 000000000..7b66a6bf3
--- /dev/null
+++ b/docs/src/searx.plugins.autodetect_search_language.rst
@@ -0,0 +1,8 @@
+.. _autodetect search language:
+
+======================
+Search language plugin
+======================
+
+.. automodule:: searx.plugins.autodetect_search_language
+ :members:
diff --git a/requirements.txt b/requirements.txt
index b5efb8d59..de48aea00 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,4 +16,4 @@ setproctitle==1.3.2
redis==4.4.0
markdown-it-py==2.1.0
typing_extensions==4.4.0
-fasttext==0.9.2
+fasttext-wheel==0.9.2
diff --git a/searx/plugins/autodetect_search_language.py b/searx/plugins/autodetect_search_language.py
index 3bcb80098..034668041 100644
--- a/searx/plugins/autodetect_search_language.py
+++ b/searx/plugins/autodetect_search_language.py
@@ -1,19 +1,115 @@
-import fasttext
-import os
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Plugin to detect the search language from the search query.
+
+The language detection is done by using the fastText_ library (`python
+fasttext`_). fastText_ distributes the `language identification model`_, for
+reference:
+
+- `FastText.zip: Compressing text classification models`_
+- `Bag of Tricks for Efficient Text Classification`_
+
+The `language identification model`_ supports the language codes (ISO-639-3)::
+
+ af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
+ ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
+ fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io
+ is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv
+ mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn
+ no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd
+ sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep
+ vi vls vo wa war wuu xal xmf yi yo yue zh
+
+The `language identification model`_ is harmonized with the SearXNG's language
+(locale) model. General conditions of SearXNG's locale model are:
+
+a. SearXNG's locale of a query is passed to the
+ :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
+ code that is used by an engine.
+
+b. SearXNG and most of the engines do not support all the languages from the
+ language model and there might also be a discrepancy in the ISO-639-3 and
+ ISO-639-2 handling (:py:obj:`searx.locales.get_engine_locale`).
+ Furthermore, in SearXNG the locales like ``zh-TW`` (``zh-CN``) are mapped to
+ ``zh_Hant`` (``zh_Hans``).
+
+Conclusion: This plugin does only auto-detect the languages a user can select in
+the language menu (:py:obj:`supported_langs`).
+
+SearXNG's locale of a query comes from (*highest wins*):
+
+1. The ``Accept-Language`` header from user's HTTP client.
+2. The user selects a locale in the preferences.
+3. The user selects a locale from the menu in the query form (e.g. ``:zh-TW``)
+4. This plugin is activated in the preferences and the locale (only the language
+ code / no region code) comes from fastText's language detection.
+
+Conclusion: There is a conflict between the language selected by the user and
+the language from language detection of this plugin. For example, the user
+explicitly selects the German locale via the search syntax to search for a term
+that is identified as an English term (try ``:de-DE thermomix``, for example).
+
+.. hint::
+
+ To SearXNG maintainers; please take into account: under some circumstances
+ the auto-detection of the language of this plugin could be detrimental to
+ users' expectations. It's not recommended to activate this plugin by
+ default. It should always be the user's decision whether to activate this
+ plugin or not.
+
+.. _fastText: https://fasttext.cc/
+.. _python fasttext: https://pypi.org/project/fasttext/
+.. _language identification model: https://fasttext.cc/docs/en/language-identification.html
+.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
+.. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
+
+"""
+
from flask_babel import gettext
+import fasttext
+import babel
+
+from searx.data import data_dir
+from searx.languages import language_codes
+
+# Monkey patch: prevent fasttext from showing a (useless) warning when loading a
+# model.
+fasttext.FastText.eprint = lambda x: None
name = gettext('Autodetect search language')
description = gettext('Automatically detect the query search language and switch to it.')
preference_section = 'general'
default_on = False
+lang_model: fasttext.FastText._FastText = None
+"""fasttext model to predict language of a search term"""
-fasttext.FastText.eprint = lambda x: None
-model = fasttext.load_model(os.path.dirname(os.path.realpath(__file__)) + '/../data/lid.176.ftz')
+supported_langs = set()
+"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
+
+
+def get_model():
+ # lazy load, in order to save memory
+ global lang_model # pylint: disable=global-statement
+ if lang_model is None:
+ lang_model = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
+ return lang_model
+
+
+def pre_search(request, search): # pylint: disable=unused-argument
+ prediction = get_model().predict(search.search_query.query, k=1, threshold=0.3)
+ if prediction:
+ lang = prediction[0][0].split('__label__')[1]
+ if lang in supported_langs:
+ search.search_query.lang = lang
+ try:
+ search.search_query.locale = babel.Locale.parse(lang)
+ except babel.core.UnknownLocaleError:
+ pass
+ return True
-def pre_search(request, search):
- lang = model.predict(search.search_query.query, k=1)
- if lang[1][0] >= 0.3:
- search.search_query.lang = lang[0][0].split('__label__')[1]
+def init(app, settings): # pylint: disable=unused-argument
+ for searxng_locale in language_codes:
+ supported_langs.add(searxng_locale[0].split('-')[0])
return True
diff --git a/searx/settings.yml b/searx/settings.yml
index 9dc2199e5..3e7c2aae4 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -196,6 +196,9 @@ outgoing:
# - 'Open Access DOI rewrite'
# - 'Vim-like hotkeys'
# - 'Tor check plugin'
+# # Read the docs before activating: auto-detection of the language could be
+# # detrimental to users' expectations / users can activate the plugin in the
+# # preferences if they want.
# - 'Autodetect search language'
# Configuration of the "Hostname replace" plugin: