summary | refs | log | tree | commit | diff
path: root/searx/utils.py
diff options
context:
space:
mode:
author: ArtikusHG <artiomoleynic@gmail.com> 2022-12-11 17:45:47 +0200
committer: ArtikusHG <artiomoleynic@gmail.com> 2022-12-16 21:07:39 +0200
commit: 1f8f8c1e91040fd10bacdc473b8d5f97dda6424e (patch)
tree: 30ed501e9a052ae4a9d81af45026af9a1deecfcf /searx/utils.py
parent: a6d870d5cfecfee78dcdd2a67cce318c894da2da (diff)
download: searxng-1f8f8c1e91040fd10bacdc473b8d5f97dda6424e.tar.gz
searxng-1f8f8c1e91040fd10bacdc473b8d5f97dda6424e.zip
Replace langdetect with fasttext
Diffstat (limited to 'searx/utils.py')
-rw-r--r-- searx/utils.py 26
1 files changed, 25 insertions, 1 deletions
diff --git a/searx/utils.py b/searx/utils.py
index effb9139a..2157a4ce0 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -15,6 +15,7 @@ from os.path import splitext, join
from random import choice
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
+import fasttext
from lxml import html
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@@ -22,7 +23,7 @@ from babel.core import get_global
from searx import settings
-from searx.data import USER_AGENTS
+from searx.data import USER_AGENTS, data_dir
from searx.version import VERSION_TAG
from searx.languages import language_codes
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
@@ -50,6 +51,12 @@ _STORAGE_UNIT_VALUE: Dict[str, int] = {
_XPATH_CACHE: Dict[str, XPath] = {}
_LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
+_FASTTEXT_MODEL: Optional[fasttext.FastText._FastText] = None
+"""fasttext model to predict language of a search term"""
+
+# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
+fasttext.FastText.eprint = lambda x: None
+
class _NotSetClass: # pylint: disable=too-few-public-methods
"""Internal class for this module, do not create instance of this class.
@@ -621,3 +628,20 @@ def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index:
# to record xpath_spec
raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
return default
+
+
+def _get_fasttext_model() -> fasttext.FastText._FastText:
+ global _FASTTEXT_MODEL # pylint: disable=global-statement
+ if _FASTTEXT_MODEL is None:
+ _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
+ return _FASTTEXT_MODEL
+
+
+def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]:
+ """https://fasttext.cc/docs/en/language-identification.html"""
+ if not isinstance(text, str):
+ raise ValueError('text must a str')
+ r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
+ if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability:
+ return r[0][0].split('__label__')[1]
+ return None