From ed901ab18edddcfd566ac37ddb1b429969ccc2ad Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 10 Dec 2022 16:20:09 +0100 Subject: [mod] improve 'Autodetect search language' plugin - Add documentation to the plugin - Harmonize FastText language model with SearXNG's language model Resources:: import fasttext # --> +10 MB fasttext.load_model(str(data_dir / 'lid.176.ftz')) # --> +4MB Suggested-by: @dalf - To speed up and simplify the deployment use fasttext-wheel instead of fasttext - Building numpy on the Alpine Linux of docker-images takes ages --> install py3-numpy from Alpine's package manager (apk) - Alpine Linux on docker-images (musl libc) does not support fasttext-wheel (gnu libc) --> patch Dockerfile and build from fasttext: sed -i s/fasttext-wheel/fasttext/ requirements.txt Signed-off-by: Markus Heiser --- Dockerfile | 3 + .../searx.plugins.autodetect_search_language.rst | 8 ++ requirements.txt | 2 +- searx/plugins/autodetect_search_language.py | 112 +++++++++++++++++++-- searx/settings.yml | 3 + 5 files changed, 119 insertions(+), 9 deletions(-) create mode 100644 docs/src/searx.plugins.autodetect_search_language.rst diff --git a/Dockerfile b/Dockerfile index 66f58395d..ece20c86b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -36,6 +36,7 @@ RUN apk add --no-cache -t build-dependencies \ su-exec \ python3 \ py3-pip \ + py3-numpy \ libxml2 \ libxslt \ openssl \ @@ -43,6 +44,8 @@ RUN apk add --no-cache -t build-dependencies \ uwsgi \ uwsgi-python3 \ brotli \ + && pip3 install --no-cache setuptools wheel \ + && sed -i s/fasttext-wheel/fasttext/ requirements.txt \ && pip3 install --no-cache -r requirements.txt \ && apk del build-dependencies \ && rm -rf /root/.cache diff --git a/docs/src/searx.plugins.autodetect_search_language.rst b/docs/src/searx.plugins.autodetect_search_language.rst new file mode 100644 index 000000000..7b66a6bf3 --- /dev/null +++ b/docs/src/searx.plugins.autodetect_search_language.rst @@ -0,0 +1,8 @@ +.. 
_autodetect search language: + +====================== +Search language plugin +====================== + +.. automodule:: searx.plugins.autodetect_search_language + :members: diff --git a/requirements.txt b/requirements.txt index b5efb8d59..de48aea00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,4 @@ setproctitle==1.3.2 redis==4.4.0 markdown-it-py==2.1.0 typing_extensions==4.4.0 -fasttext==0.9.2 +fasttext-wheel==0.9.2 diff --git a/searx/plugins/autodetect_search_language.py b/searx/plugins/autodetect_search_language.py index 3bcb80098..034668041 100644 --- a/searx/plugins/autodetect_search_language.py +++ b/searx/plugins/autodetect_search_language.py @@ -1,19 +1,115 @@ -import fasttext -import os +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Plugin to detect the search language from the search query. + +The language detection is done by using the fastText_ library (`python +fasttext`_). fastText_ distributes the `language identification model`_, for +reference: + +- `FastText.zip: Compressing text classification models`_ +- `Bag of Tricks for Efficient Text Classification`_ + +The `language identification model`_ supports the language codes (ISO-639-3):: + + af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr + ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa + fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io + is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv + mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn + no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd + sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep + vi vls vo wa war wuu xal xmf yi yo yue zh + +The `language identification model`_ is harmonized with SearXNG's language +(locale) model. General conditions of SearXNG's locale model are: + +a. 
SearXNG's locale of a query is passed to the + :py:obj:`searx.locales.get_engine_locale` to get a language and/or region + code that is used by an engine. + +b. SearXNG and most of the engines do not support all the languages from + the language model and there might be also a discrepancy in the ISO-639-3 and + ISO-639-2 handling (:py:obj:`searx.locales.get_engine_locale`). + Furthermore, in SearXNG the locales like ``zh-TW`` (``zh-CN``) are mapped to + ``zh_Hant`` (``zh_Hans``). + +Conclusion: This plugin only auto-detects the languages a user can select in +the language menu (:py:obj:`supported_langs`). + +SearXNG's locale of a query comes from (*highest wins*): + +1. The ``Accept-Language`` header from the user's HTTP client. +2. The user selects a locale in the preferences. +3. The user selects a locale from the menu in the query form (e.g. ``:zh-TW``) +4. This plugin is activated in the preferences and the locale (only the language + code / no region code) comes from fastText's language detection. + +Conclusion: There is a conflict between the language selected by the user and +the language from language detection of this plugin. For example, the user +explicitly selects the German locale via the search syntax to search for a term +that is identified as an English term (try ``:de-DE thermomix``, for example). + +.. hint:: + + To SearXNG maintainers; please take into account: under some circumstances + the auto-detection of the language of this plugin could be detrimental to + users' expectations. It's not recommended to activate this plugin by + default. It should always be the user's decision whether to activate this + plugin or not. + +.. _fastText: https://fasttext.cc/ +.. _python fasttext: https://pypi.org/project/fasttext/ +.. _language identification model: https://fasttext.cc/docs/en/language-identification.html +.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759 +.. 
_`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651 + +""" + from flask_babel import gettext +import fasttext +import babel + +from searx.data import data_dir +from searx.languages import language_codes + +# Monkey patch: prevent fasttext from showing a (useless) warning when loading a +# model. +fasttext.FastText.eprint = lambda x: None name = gettext('Autodetect search language') description = gettext('Automatically detect the query search language and switch to it.') preference_section = 'general' default_on = False +lang_model: fasttext.FastText._FastText = None +"""fasttext model to predict language of a search term""" -fasttext.FastText.eprint = lambda x: None -model = fasttext.load_model(os.path.dirname(os.path.realpath(__file__)) + '/../data/lid.176.ftz') +supported_langs = set() +"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`).""" + + +def get_model(): + # lazy load, in order to save memory + global lang_model # pylint: disable=global-statement + if lang_model is None: + lang_model = fasttext.load_model(str(data_dir / 'lid.176.ftz')) + return lang_model + + +def pre_search(request, search): # pylint: disable=unused-argument + prediction = get_model().predict(search.search_query.query, k=1, threshold=0.3) + if prediction: + lang = prediction[0][0].split('__label__')[1] + if lang in supported_langs: + search.search_query.lang = lang + try: + search.search_query.locale = babel.Locale.parse(lang) + except babel.core.UnknownLocaleError: + pass + return True -def pre_search(request, search): - lang = model.predict(search.search_query.query, k=1) - if lang[1][0] >= 0.3: - search.search_query.lang = lang[0][0].split('__label__')[1] +def init(app, settings): # pylint: disable=unused-argument + for searxng_locale in language_codes: + supported_langs.add(searxng_locale[0].split('-')[0]) return True diff --git a/searx/settings.yml b/searx/settings.yml index 9dc2199e5..3e7c2aae4 100644 
--- a/searx/settings.yml +++ b/searx/settings.yml @@ -196,6 +196,9 @@ outgoing: # - 'Open Access DOI rewrite' # - 'Vim-like hotkeys' # - 'Tor check plugin' +# # Read the docs before activating: auto-detection of the language could be +# # detrimental to users' expectations / users can activate the plugin in the +# # preferences if they want. # - 'Autodetect search language' # Configuration of the "Hostname replace" plugin: -- cgit v1.2.3-54-g00ecf