diff options
author | Alexandre Flament <alex@al-f.net> | 2022-12-16 21:54:07 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-12-16 21:54:07 +0100 |
commit | b92748219504e67b18771b9831ffcebe76cca62f (patch) | |
tree | 61aac219bb9d5d3ce91ce5214171c9170c28502c | |
parent | 2a51c856722df19831c20ba455f74a33180a4ec8 (diff) | |
parent | 735e388cec91097cc95bfffd0e1a5e4c25e595ef (diff) | |
download | searxng-b92748219504e67b18771b9831ffcebe76cca62f.tar.gz searxng-b92748219504e67b18771b9831ffcebe76cca62f.zip |
Merge pull request #2019 from ArtikusHG/fasttext
Replace langdetect with fasttext (followup of #1969)
-rw-r--r-- | requirements.txt | 1 | ||||
-rw-r--r-- | searx/plugins/autodetect_search_language.py | 34 | ||||
-rw-r--r-- | searx/search/checker/impl.py | 16 | ||||
-rw-r--r-- | searx/utils.py | 26 | ||||
-rwxr-xr-x | searxng_extra/update/update_engine_descriptions.py | 16 | ||||
-rw-r--r-- | tests/unit/test_utils.py | 22 |
6 files changed, 61 insertions, 54 deletions
diff --git a/requirements.txt b/requirements.txt index 8ddb32e7a..5c45fff6b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,6 @@ httpx[http2]==0.21.2 Brotli==1.0.9 uvloop==0.17.0 httpx-socks[asyncio]==0.7.2 -langdetect==1.0.9 setproctitle==1.3.2 redis==4.4.0 markdown-it-py==2.1.0 diff --git a/searx/plugins/autodetect_search_language.py b/searx/plugins/autodetect_search_language.py index 034668041..026ca9b6f 100644 --- a/searx/plugins/autodetect_search_language.py +++ b/searx/plugins/autodetect_search_language.py @@ -66,46 +66,28 @@ that is identified as an English term (try ``:de-DE thermomix``, for example). """ from flask_babel import gettext -import fasttext import babel -from searx.data import data_dir +from searx.utils import detect_language from searx.languages import language_codes -# Monkey patch: prevent fasttext from showing a (useless) warning when loading a -# model. -fasttext.FastText.eprint = lambda x: None - name = gettext('Autodetect search language') description = gettext('Automatically detect the query search language and switch to it.') preference_section = 'general' default_on = False -lang_model: fasttext.FastText._FastText = None -"""fasttext model to predict laguage of a search term""" - supported_langs = set() """Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`).""" -def get_model(): - # lazy load, in order to to save memory - global lang_model # pylint: disable=global-statement - if lang_model is None: - lang_model = fasttext.load_model(str(data_dir / 'lid.176.ftz')) - return lang_model - - def pre_search(request, search): # pylint: disable=unused-argument - prediction = get_model().predict(search.search_query.query, k=1, threshold=0.3) - if prediction: - lang = prediction[0][0].split('__label__')[1] - if lang in supported_langs: - search.search_query.lang = lang - try: - search.search_query.locale = babel.Locale.parse(lang) - except babel.core.UnknownLocaleError: - pass + lang = 
detect_language(search.search_query.query, min_probability=0) + if lang in supported_langs: + search.search_query.lang = lang + try: + search.search_query.locale = babel.Locale.parse(lang) + except babel.core.UnknownLocaleError: + pass return True diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py index 0705b6822..37f145e1e 100644 --- a/searx/search/checker/impl.py +++ b/searx/search/checker/impl.py @@ -10,12 +10,10 @@ from timeit import default_timer from urllib.parse import urlparse import re -from langdetect import detect_langs -from langdetect.lang_detect_exception import LangDetectException import httpx from searx import network, logger -from searx.utils import gen_useragent +from searx.utils import gen_useragent, detect_language from searx.results import ResultContainer from searx.search.models import SearchQuery, EngineRef from searx.search.processors import EngineProcessor @@ -208,14 +206,10 @@ class ResultContainerTests: self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')') def _add_language(self, text: str) -> typing.Optional[str]: - try: - r = detect_langs(str(text)) # pylint: disable=E1101 - except LangDetectException: - return None - - if len(r) > 0 and r[0].prob > 0.95: - self.languages.add(r[0].lang) - self.test_results.add_language(r[0].lang) + langStr = detect_language(text) + if langStr: + self.languages.add(langStr) + self.test_results.add_language(langStr) return None def _check_result(self, result): diff --git a/searx/utils.py b/searx/utils.py index effb9139a..2157a4ce0 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -15,6 +15,7 @@ from os.path import splitext, join from random import choice from html.parser import HTMLParser from urllib.parse import urljoin, urlparse +import fasttext from lxml import html from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult @@ -22,7 +23,7 @@ from babel.core import get_global from searx import 
settings -from searx.data import USER_AGENTS +from searx.data import USER_AGENTS, data_dir from searx.version import VERSION_TAG from searx.languages import language_codes from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException @@ -50,6 +51,12 @@ _STORAGE_UNIT_VALUE: Dict[str, int] = { _XPATH_CACHE: Dict[str, XPath] = {} _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {} +_FASTTEXT_MODEL: Optional[fasttext.FastText._FastText] = None +"""fasttext model to predict language of a search term""" + +# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model. +fasttext.FastText.eprint = lambda x: None + class _NotSetClass: # pylint: disable=too-few-public-methods """Internal class for this module, do not create instance of this class. @@ -621,3 +628,20 @@ def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index: # to record xpath_spec raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found') return default + + +def _get_fasttext_model() -> fasttext.FastText._FastText: + global _FASTTEXT_MODEL # pylint: disable=global-statement + if _FASTTEXT_MODEL is None: + _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz')) + return _FASTTEXT_MODEL + + +def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]: + """https://fasttext.cc/docs/en/language-identification.html""" + if not isinstance(text, str): + raise ValueError('text must a str') + r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold) + if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability: + return r[0][0].split('__label__')[1] + return None diff --git a/searxng_extra/update/update_engine_descriptions.py b/searxng_extra/update/update_engine_descriptions.py index f3d6e7fa8..6052bf084 100755 --- a/searxng_extra/update/update_engine_descriptions.py +++ 
b/searxng_extra/update/update_engine_descriptions.py @@ -17,14 +17,11 @@ from os.path import join from lxml.html import fromstring -from langdetect import detect_langs -from langdetect.lang_detect_exception import LangDetectException - from searx.engines import wikidata, set_loggers from searx.utils import extract_text, match_language from searx.locales import LOCALE_NAMES, locales_initialize from searx import searx_dir -from searx.utils import gen_useragent +from searx.utils import gen_useragent, detect_language import searx.search import searx.network @@ -117,17 +114,6 @@ def get_wikipedia_summary(lang, pageid): return None -def detect_language(text): - try: - r = detect_langs(str(text)) # pylint: disable=E1101 - except LangDetectException: - return None - - if len(r) > 0 and r[0].prob > 0.95: - return r[0].lang - return None - - def get_website_description(url, lang1, lang2=None): headers = { 'User-Agent': gen_useragent(), diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 8ac7db479..6f51f1ee3 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -232,3 +232,25 @@ class TestXPathUtils(SearxTestCase): with self.assertRaises(SearxEngineXPathException) as context: utils.eval_xpath_getindex(doc, 'count(//i)', 1) self.assertEqual(context.exception.message, 'the result is not a list') + + def test_detect_language(self): + # make sure newlines are not an issue + # fasttext.predict('') does not accept newlines. 
+ l = utils.detect_language('The quick brown fox jumps over\nthe lazy dog') + self.assertEqual(l, 'en') + + l = utils.detect_language('いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす') + self.assertEqual(l, 'ja') + + l = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.') + self.assertEqual(l, 'tr') + + l = utils.detect_language('') + self.assertIsNone(l) + + # mix languages --> None + l = utils.detect_language('The いろはにほへと Pijamalı') + self.assertIsNone(l) + + with self.assertRaises(ValueError): + utils.detect_language(None) |