summaryrefslogtreecommitdiff
path: root/searx/utils.py
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarit.de>2023-01-30 08:53:48 +0100
committerAlexandre Flament <alex@al-f.net>2023-02-19 10:09:52 +0000
commit0b1444b61eb12a2d23d4c95e2440f24161daaec7 (patch)
tree1537f89be07a1661f5db32809e61c00a566a532e /searx/utils.py
parent6748e8e2d5eff3c2202b2a714afb5534b1573101 (diff)
downloadsearxng-0b1444b61eb12a2d23d4c95e2440f24161daaec7.tar.gz
searxng-0b1444b61eb12a2d23d4c95e2440f24161daaec7.zip
[doc] improved docs of implementations for automatic speech recognition
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/utils.py')
-rw-r--r--searx/utils.py65
1 files changed, 42 insertions, 23 deletions
diff --git a/searx/utils.py b/searx/utils.py
index c3958ae78..e6180906b 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -641,38 +641,56 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText":
def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
- """Detect the language of the text parameter
+ """Detect the language of the ``text`` parameter.
- Args:
- * text (str): the string whose language is to be detected.
- * threshold (float): threshold filters the returned labels by a threshold on probability.
- A choice of 0.3 will return labels with at least 0.3 probability.
- * only_search_languages (bool): if True, returns only supported SearXNG search languages.
- see :py:obj:`searx.languages`
+ :param str text: The string whose language is to be detected.
+ :param float threshold: Threshold filters the returned labels by a threshold
+ on probability. A choice of 0.3 will return labels with at least 0.3
+ probability.
- Raises:
- * ValueError: if text is not a string
+ :param bool only_search_languages: If ``True``, returns only supported
+ SearXNG search languages. See :py:obj:`searx.languages`
- Returns:
- * result (str, None): the detected language code or None. See below.
+ :rtype: str, None
+ :returns:
+ The detected language code or ``None``. See below.
- The language detection is done by using `a fork`_ of the fastText_ library (`python
- fasttext`_). fastText_ distributes the `language identification model`_, for
- reference:
+ :raises ValueError: If ``text`` is not a string.
+
+ The language detection is done by using `a fork`_ of the fastText_ library
+ (`python fasttext`_). fastText_ distributes the `language identification
+ model`_, for reference:
- `FastText.zip: Compressing text classification models`_
- `Bag of Tricks for Efficient Text Classification`_
- The `language identification model`_ support the language codes (ISO-639-3)::
- af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
- ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
- fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io
- is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv
- mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn
- no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd
- sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep
- vi vls vo wa war wuu xal xmf yi yo yue zh
+ The `language identification model`_ supports the language codes
+ (ISO-639-3)::
+
+ af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
+ bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
+ et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
+ id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
+ lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
+ nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
+ rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
+ tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
+
+ By using ``only_search_languages=True`` the `language identification model`_
+ is harmonized with SearXNG's language (locale) model. General
+ conditions of SearXNG's locale model are:
+
+ a. SearXNG's locale of a query is passed to
+ :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
+ code that is used by an engine.
+
+ b. Most of SearXNG's engines do not support all the languages from `language
+ identification model`_ and there is also a discrepancy in the ISO-639-3
+ (fastText) and ISO-639-2 (SearXNG) handling. Furthermore, in SearXNG the
+ locales like ``zh-TW`` (``zh-CN``) are mapped to ``zh_Hant``
+ (``zh_Hans``) while the `language identification model`_ reduces both to
+ ``zh``.
.. _a fork: https://github.com/searxng/fasttext-predict
.. _fastText: https://fasttext.cc/
@@ -680,6 +698,7 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
.. _language identification model: https://fasttext.cc/docs/en/language-identification.html
.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
.. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
+
"""
if not isinstance(text, str):
raise ValueError('text must a str')