summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Alexandre Flament <alex@al-f.net>, 2022-12-16 21:54:07 +0100
committer: GitHub <noreply@github.com>, 2022-12-16 21:54:07 +0100
commit: b92748219504e67b18771b9831ffcebe76cca62f (patch)
tree: 61aac219bb9d5d3ce91ce5214171c9170c28502c
parent: 2a51c856722df19831c20ba455f74a33180a4ec8 (diff)
parent: 735e388cec91097cc95bfffd0e1a5e4c25e595ef (diff)
download: searxng-b92748219504e67b18771b9831ffcebe76cca62f.tar.gz
download: searxng-b92748219504e67b18771b9831ffcebe76cca62f.zip
Merge pull request #2019 from ArtikusHG/fasttext
Replace langdetect with fasttext (followup of #1969)
-rw-r--r-- requirements.txt | 1
-rw-r--r-- searx/plugins/autodetect_search_language.py | 34
-rw-r--r-- searx/search/checker/impl.py | 16
-rw-r--r-- searx/utils.py | 26
-rwxr-xr-x searxng_extra/update/update_engine_descriptions.py | 16
-rw-r--r-- tests/unit/test_utils.py | 22
6 files changed, 61 insertions, 54 deletions
diff --git a/requirements.txt b/requirements.txt
index 8ddb32e7a..5c45fff6b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,7 +11,6 @@ httpx[http2]==0.21.2
Brotli==1.0.9
uvloop==0.17.0
httpx-socks[asyncio]==0.7.2
-langdetect==1.0.9
setproctitle==1.3.2
redis==4.4.0
markdown-it-py==2.1.0
diff --git a/searx/plugins/autodetect_search_language.py b/searx/plugins/autodetect_search_language.py
index 034668041..026ca9b6f 100644
--- a/searx/plugins/autodetect_search_language.py
+++ b/searx/plugins/autodetect_search_language.py
@@ -66,46 +66,28 @@ that is identified as an English term (try ``:de-DE thermomix``, for example).
"""
from flask_babel import gettext
-import fasttext
import babel
-from searx.data import data_dir
+from searx.utils import detect_language
from searx.languages import language_codes
-# Monkey patch: prevent fasttext from showing a (useless) warning when loading a
-# model.
-fasttext.FastText.eprint = lambda x: None
-
name = gettext('Autodetect search language')
description = gettext('Automatically detect the query search language and switch to it.')
preference_section = 'general'
default_on = False
-lang_model: fasttext.FastText._FastText = None
-"""fasttext model to predict laguage of a search term"""
-
supported_langs = set()
"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
-def get_model():
- # lazy load, in order to to save memory
- global lang_model # pylint: disable=global-statement
- if lang_model is None:
- lang_model = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
- return lang_model
-
-
def pre_search(request, search): # pylint: disable=unused-argument
+ """Switch the search language (and, when babel can parse it, the locale) to the
+ language detected in the query, provided it is in ``supported_langs``. Always returns True."""
- prediction = get_model().predict(search.search_query.query, k=1, threshold=0.3)
- if prediction:
- lang = prediction[0][0].split('__label__')[1]
- if lang in supported_langs:
- search.search_query.lang = lang
- try:
- search.search_query.locale = babel.Locale.parse(lang)
- except babel.core.UnknownLocaleError:
- pass
+ # min_probability=0 switches off detect_language's extra probability cut-off, so only
+ # its default threshold (0.3, the value the removed code passed to predict) still applies.
+ lang = detect_language(search.search_query.query, min_probability=0)
+ if lang in supported_langs:
+ search.search_query.lang = lang
+ try:
+ search.search_query.locale = babel.Locale.parse(lang)
+ except babel.core.UnknownLocaleError:
+ # the language is still switched; only the locale is left untouched
+ pass
return True
diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py
index 0705b6822..37f145e1e 100644
--- a/searx/search/checker/impl.py
+++ b/searx/search/checker/impl.py
@@ -10,12 +10,10 @@ from timeit import default_timer
from urllib.parse import urlparse
import re
-from langdetect import detect_langs
-from langdetect.lang_detect_exception import LangDetectException
import httpx
from searx import network, logger
-from searx.utils import gen_useragent
+from searx.utils import gen_useragent, detect_language
from searx.results import ResultContainer
from searx.search.models import SearchQuery, EngineRef
from searx.search.processors import EngineProcessor
@@ -208,14 +206,10 @@ class ResultContainerTests:
self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')
def _add_language(self, text: str) -> typing.Optional[str]:
- try:
- r = detect_langs(str(text)) # pylint: disable=E1101
- except LangDetectException:
- return None
-
- if len(r) > 0 and r[0].prob > 0.95:
- self.languages.add(r[0].lang)
- self.test_results.add_language(r[0].lang)
+ """Record the language detected in ``text``, if any, in ``self.languages`` and the test results."""
+ # detect_language() returns None when it finds no sufficiently probable language
+ # NOTE(review): the method appears to always return None despite its Optional[str] annotation -- confirm
+ lang = detect_language(text)
+ if lang:
+ self.languages.add(lang)
+ self.test_results.add_language(lang)
return None
def _check_result(self, result):
diff --git a/searx/utils.py b/searx/utils.py
index effb9139a..2157a4ce0 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -15,6 +15,7 @@ from os.path import splitext, join
from random import choice
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
+import fasttext
from lxml import html
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@@ -22,7 +23,7 @@ from babel.core import get_global
from searx import settings
-from searx.data import USER_AGENTS
+from searx.data import USER_AGENTS, data_dir
from searx.version import VERSION_TAG
from searx.languages import language_codes
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
@@ -50,6 +51,12 @@ _STORAGE_UNIT_VALUE: Dict[str, int] = {
_XPATH_CACHE: Dict[str, XPath] = {}
_LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
+_FASTTEXT_MODEL: Optional[fasttext.FastText._FastText] = None
+"""fasttext model to predict language of a search term"""
+
+# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
+fasttext.FastText.eprint = lambda x: None
+
class _NotSetClass: # pylint: disable=too-few-public-methods
"""Internal class for this module, do not create instance of this class.
@@ -621,3 +628,20 @@ def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index:
# to record xpath_spec
raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
return default
+
+
+def _get_fasttext_model() -> fasttext.FastText._FastText:
+    """Return the shared fasttext model, loading ``lid.176.ftz`` from ``data_dir`` on first use."""
+    global _FASTTEXT_MODEL  # pylint: disable=global-statement
+    if _FASTTEXT_MODEL is not None:
+        return _FASTTEXT_MODEL
+    # first call: load the model once and keep it in the module-level variable
+    model_path = data_dir / 'lid.176.ftz'
+    _FASTTEXT_MODEL = fasttext.load_model(str(model_path))
+    return _FASTTEXT_MODEL
+
+
+def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]:
+    """Detect the language of ``text`` with the fasttext language identification model.
+
+    See https://fasttext.cc/docs/en/language-identification.html
+
+    :param text: text to analyse; newlines are replaced by spaces before the prediction.
+    :param threshold: forwarded to fasttext's ``predict`` as its ``threshold`` argument.
+    :param min_probability: the top prediction is accepted only when its probability
+        is strictly greater than this value.
+    :return: the predicted label without its ``__label__`` prefix (``'en'`` in the unit
+        test), or ``None`` when there is no prediction or it is not probable enough.
+    :raises ValueError: if ``text`` is not a ``str``.
+    """
+    if not isinstance(text, str):
+        raise ValueError('text must be a str')
+    # the unit test notes that fasttext's predict() does not accept newlines
+    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
+    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability:
+        return r[0][0].split('__label__')[1]
+    return None
diff --git a/searxng_extra/update/update_engine_descriptions.py b/searxng_extra/update/update_engine_descriptions.py
index f3d6e7fa8..6052bf084 100755
--- a/searxng_extra/update/update_engine_descriptions.py
+++ b/searxng_extra/update/update_engine_descriptions.py
@@ -17,14 +17,11 @@ from os.path import join
from lxml.html import fromstring
-from langdetect import detect_langs
-from langdetect.lang_detect_exception import LangDetectException
-
from searx.engines import wikidata, set_loggers
from searx.utils import extract_text, match_language
from searx.locales import LOCALE_NAMES, locales_initialize
from searx import searx_dir
-from searx.utils import gen_useragent
+from searx.utils import gen_useragent, detect_language
import searx.search
import searx.network
@@ -117,17 +114,6 @@ def get_wikipedia_summary(lang, pageid):
return None
-def detect_language(text):
- try:
- r = detect_langs(str(text)) # pylint: disable=E1101
- except LangDetectException:
- return None
-
- if len(r) > 0 and r[0].prob > 0.95:
- return r[0].lang
- return None
-
-
def get_website_description(url, lang1, lang2=None):
headers = {
'User-Agent': gen_useragent(),
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 8ac7db479..6f51f1ee3 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -232,3 +232,25 @@ class TestXPathUtils(SearxTestCase):
with self.assertRaises(SearxEngineXPathException) as context:
utils.eval_xpath_getindex(doc, 'count(//i)', 1)
self.assertEqual(context.exception.message, 'the result is not a list')
+
+    def test_detect_language(self):
+        """``utils.detect_language``: single-language, empty, mixed-language and non-str input."""
+        # make sure newlines are not an issue:
+        # fasttext's predict() does not accept newline characters.
+        lang = utils.detect_language('The quick brown fox jumps over\nthe lazy dog')
+        self.assertEqual(lang, 'en')
+
+        lang = utils.detect_language('いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす')
+        self.assertEqual(lang, 'ja')
+
+        lang = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.')
+        self.assertEqual(lang, 'tr')
+
+        # empty text --> None
+        lang = utils.detect_language('')
+        self.assertIsNone(lang)
+
+        # mixed languages --> None
+        lang = utils.detect_language('The いろはにほへと Pijamalı')
+        self.assertIsNone(lang)
+
+        # a non-str argument is rejected
+        with self.assertRaises(ValueError):
+            utils.detect_language(None)