diff options
author | ArtikusHG <artiomoleynic@gmail.com> | 2022-12-11 17:45:47 +0200 |
---|---|---|
committer | ArtikusHG <artiomoleynic@gmail.com> | 2022-12-16 21:07:39 +0200 |
commit | 1f8f8c1e91040fd10bacdc473b8d5f97dda6424e (patch) | |
tree | 30ed501e9a052ae4a9d81af45026af9a1deecfcf /searx | |
parent | a6d870d5cfecfee78dcdd2a67cce318c894da2da (diff) | |
download | searxng-1f8f8c1e91040fd10bacdc473b8d5f97dda6424e.tar.gz searxng-1f8f8c1e91040fd10bacdc473b8d5f97dda6424e.zip |
Replace langdetect with fasttext
Diffstat (limited to 'searx')
-rw-r--r-- | searx/plugins/autodetect_search_language.py | 98 | ||||
-rw-r--r-- | searx/search/checker/impl.py | 16 | ||||
-rw-r--r-- | searx/utils.py | 26 |
3 files changed, 128 insertions, 12 deletions
diff --git a/searx/plugins/autodetect_search_language.py b/searx/plugins/autodetect_search_language.py new file mode 100644 index 000000000..625f85373 --- /dev/null +++ b/searx/plugins/autodetect_search_language.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Plugin to detect the search language from the search query. + +The language detection is done by using the fastText_ library (`python +fasttext`_). fastText_ distributes the `language identification model`_, for +reference: + +- `FastText.zip: Compressing text classification models`_ +- `Bag of Tricks for Efficient Text Classification`_ + +The `language identification model`_ supports the language codes (ISO-639-3):: + + af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr + ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa + fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io + is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv + mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn + no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd + sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep + vi vls vo wa war wuu xal xmf yi yo yue zh + +The `language identification model`_ is harmonized with SearXNG's language +(locale) model. General conditions of SearXNG's locale model are: + +a. SearXNG's locale of a query is passed to the + :py:obj:`searx.locales.get_engine_locale` to get a language and/or region + code that is used by an engine. + +b. SearXNG and most of the engines do not support all the languages from + the language model and there might also be a discrepancy in the ISO-639-3 and + ISO-639-2 handling (:py:obj:`searx.locales.get_engine_locale`). Furthermore, + in SearXNG the locales like ``zh-TW`` (``zh-CN``) are mapped to + ``zh_Hant`` (``zh_Hans``). 
+ +Conclusion: This plugin only auto-detects the languages a user can select in +the language menu (:py:obj:`supported_langs`). + +SearXNG's locale of a query comes from (*highest wins*): + +1. The ``Accept-Language`` header from the user's HTTP client. +2. The user selects a locale in the preferences. +3. The user selects a locale from the menu in the query form (e.g. ``:zh-TW``) +4. This plugin is activated in the preferences and the locale (only the language + code / no region code) comes from fastText's language detection. + +Conclusion: There is a conflict between the language selected by the user and +the language from language detection of this plugin. For example, the user +explicitly selects the German locale via the search syntax to search for a term +that is identified as an English term (try ``:de-DE thermomix``, for example). + +.. hint:: + + To SearXNG maintainers; please take into account: under some circumstances + the auto-detection of the language of this plugin could be detrimental to + users' expectations. It's not recommended to activate this plugin by + default. It should always be the user's decision whether to activate this + plugin or not. + +.. _fastText: https://fasttext.cc/ +.. _python fasttext: https://pypi.org/project/fasttext/ +.. _language identification model: https://fasttext.cc/docs/en/language-identification.html +.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759 +.. 
_`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651 + +""" + +from flask_babel import gettext +import babel + +from searx.utils import detect_language +from searx.languages import language_codes + + +name = gettext('Autodetect search language') +description = gettext('Automatically detect the query search language and switch to it.') +preference_section = 'general' +default_on = False + +supported_langs = set() +"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`).""" + + +def pre_search(request, search): # pylint: disable=unused-argument + lang = detect_language(search.search_query.query, min_probability=0) + if lang in supported_langs: + search.search_query.lang = lang + try: + search.search_query.locale = babel.Locale.parse(lang) + except babel.core.UnknownLocaleError: + pass + return True + + +def init(app, settings): # pylint: disable=unused-argument + for searxng_locale in language_codes: + supported_langs.add(searxng_locale[0].split('-')[0]) + return True diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py index 0705b6822..37f145e1e 100644 --- a/searx/search/checker/impl.py +++ b/searx/search/checker/impl.py @@ -10,12 +10,10 @@ from timeit import default_timer from urllib.parse import urlparse import re -from langdetect import detect_langs -from langdetect.lang_detect_exception import LangDetectException import httpx from searx import network, logger -from searx.utils import gen_useragent +from searx.utils import gen_useragent, detect_language from searx.results import ResultContainer from searx.search.models import SearchQuery, EngineRef from searx.search.processors import EngineProcessor @@ -208,14 +206,10 @@ class ResultContainerTests: self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')') def _add_language(self, text: str) -> typing.Optional[str]: - try: - r = detect_langs(str(text)) # pylint: disable=E1101 - except LangDetectException: 
- return None - - if len(r) > 0 and r[0].prob > 0.95: - self.languages.add(r[0].lang) - self.test_results.add_language(r[0].lang) + langStr = detect_language(text) + if langStr: + self.languages.add(langStr) + self.test_results.add_language(langStr) return None def _check_result(self, result): diff --git a/searx/utils.py b/searx/utils.py index effb9139a..2157a4ce0 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -15,6 +15,7 @@ from os.path import splitext, join from random import choice from html.parser import HTMLParser from urllib.parse import urljoin, urlparse +import fasttext from lxml import html from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult @@ -22,7 +23,7 @@ from babel.core import get_global from searx import settings -from searx.data import USER_AGENTS +from searx.data import USER_AGENTS, data_dir from searx.version import VERSION_TAG from searx.languages import language_codes from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException @@ -50,6 +51,12 @@ _STORAGE_UNIT_VALUE: Dict[str, int] = { _XPATH_CACHE: Dict[str, XPath] = {} _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {} +_FASTTEXT_MODEL: Optional[fasttext.FastText._FastText] = None +"""fasttext model to predict language of a search term""" + +# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model. +fasttext.FastText.eprint = lambda x: None + class _NotSetClass: # pylint: disable=too-few-public-methods """Internal class for this module, do not create instance of this class. 
@@ -621,3 +628,20 @@ def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index: # to record xpath_spec raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found') return default + + +def _get_fasttext_model() -> fasttext.FastText._FastText: + global _FASTTEXT_MODEL # pylint: disable=global-statement + if _FASTTEXT_MODEL is None: + _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz')) + return _FASTTEXT_MODEL + + +def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]: + """https://fasttext.cc/docs/en/language-identification.html""" + if not isinstance(text, str): + raise ValueError('text must a str') + r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold) + if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability: + return r[0][0].split('__label__')[1] + return None |