diff options
author | Markus Heiser <markus.heiser@darmarIT.de> | 2023-04-25 15:46:26 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-04-25 15:46:26 +0200 |
commit | 45529f51a11c30e484dcce566108681003ea076d (patch) | |
tree | 38cbf0fddd3adcf85d67eceb97045605ba736fc0 | |
parent | 12df30070be65d655611365042cccb7dabc0cd5e (diff) | |
parent | 8adbc4fcecd3c3acb50e719bfe3794615649c793 (diff) | |
download | searxng-45529f51a11c30e484dcce566108681003ea076d.tar.gz searxng-45529f51a11c30e484dcce566108681003ea076d.zip |
Merge pull request #2347 from return42/mod-lang-detection
If language recognition fails use the Accept-Language
-rw-r--r-- | searx/preferences.py | 61 | ||||
-rw-r--r-- | searx/search/__init__.py | 43 | ||||
-rw-r--r-- | searx/settings.yml | 2 | ||||
-rw-r--r-- | searx/webadapter.py | 32 | ||||
-rwxr-xr-x | searx/webapp.py | 39 |
5 files changed, 106 insertions, 71 deletions
diff --git a/searx/preferences.py b/searx/preferences.py index 3da6d5d16..8552305a7 100644 --- a/searx/preferences.py +++ b/searx/preferences.py @@ -8,9 +8,10 @@ from base64 import urlsafe_b64encode, urlsafe_b64decode from zlib import compress, decompress from urllib.parse import parse_qs, urlencode -from typing import Iterable, Dict, List +from typing import Iterable, Dict, List, Optional import flask +import babel from searx import settings, autocomplete from searx.enginelib import Engine @@ -287,10 +288,65 @@ class PluginsSetting(BooleanChoices): return [item[len('plugin_') :] for item in items] +class ClientPref: + """Container to assemble client prefferences and settings.""" + + # hint: searx.webapp.get_client_settings should be moved into this class + + locale: babel.Locale + """Locale prefered by the client.""" + + def __init__(self, locale: Optional[babel.Locale] = None): + self.locale = locale + + @property + def locale_tag(self): + if self.locale is None: + return None + tag = self.locale.language + if self.locale.territory: + tag += '-' + self.locale.territory + return tag + + @classmethod + def from_http_request(cls, http_request: flask.Request): + """Build ClientPref object from HTTP request. + + - `Accept-Language used for locale setting + <https://www.w3.org/International/questions/qa-accept-lang-locales.en>`__ + + """ + al_header = http_request.headers.get("Accept-Language") + if not al_header: + return cls(locale=None) + + pairs = [] + for l in al_header.split(','): + # fmt: off + lang, qvalue = [_.strip() for _ in (l.split(';') + ['q=1',])[:2]] + # fmt: on + try: + qvalue = float(qvalue.split('=')[-1]) + locale = babel.Locale.parse(lang, sep='-') + except (ValueError, babel.core.UnknownLocaleError): + continue + pairs.append((locale, qvalue)) + pairs.sort(reverse=True, key=lambda x: x[1]) + return cls(locale=pairs[0][0]) + + class Preferences: """Validates and saves preferences to cookies""" - def __init__(self, themes: List[str], categories: List[str], engines: Dict[str, Engine], plugins: Iterable[Plugin]): + def __init__( + self, + themes: List[str], + categories: List[str], + engines: Dict[str, Engine], + plugins: Iterable[Plugin], + client: Optional[ClientPref] = None, + ): + super().__init__() self.key_value_settings: Dict[str, Setting] = { @@ -414,6 +470,7 @@ class Preferences: self.engines = EnginesSetting('engines', engines=engines.values()) self.plugins = PluginsSetting('plugins', plugins=plugins) self.tokens = SetSetting('tokens') + self.client = client or ClientPref() self.unknown_params: Dict[str, str] = {} def get_as_url_params(self): diff --git a/searx/search/__init__.py b/searx/search/__init__.py index e5465880c..77121c426 100644 --- a/searx/search/__init__.py +++ b/searx/search/__init__.py @@ -22,7 +22,6 @@ from searx.network import initialize as initialize_network, check_network_config from searx.metrics import initialize as initialize_metrics, counter_inc, histogram_observe_time from searx.search.processors import PROCESSORS, initialize as initialize_processors from searx.search.checker import initialize as initialize_checker -from searx.utils import detect_language logger = logger.getChild('search') @@ -40,57 +39,19 @@ def initialize(settings_engines=None, enable_checker=False, check_network=False, initialize_checker() -def replace_auto_language(search_query: SearchQuery): - """ - Do nothing except if `search_query.lang` is "auto". - In this case: - * the value "auto" is replaced by the detected language of the query. - The default value is "all" when no language is detected. - * `search_query.locale` is updated accordingly - - Use :py:obj:`searx.utils.detect_language` with `only_search_languages=True` to keep - only languages supported by the engines. - """ - if search_query.lang != 'auto': - return - - detected_lang = detect_language(search_query.query, threshold=0.3, only_search_languages=True) - if detected_lang is None: - # fallback to 'all' if no language has been detected - search_query.lang = 'all' - search_query.locale = None - return - search_query.lang = detected_lang - try: - search_query.locale = babel.Locale.parse(search_query.lang) - except babel.core.UnknownLocaleError: - search_query.locale = None - - class Search: """Search information container""" __slots__ = "search_query", "result_container", "start_time", "actual_timeout" def __init__(self, search_query: SearchQuery): - """Initialize the Search - - search_query is copied - """ + """Initialize the Search""" # init vars super().__init__() + self.search_query = search_query self.result_container = ResultContainer() self.start_time = None self.actual_timeout = None - self.search_query = copy(search_query) - self.update_search_query(self.search_query) - - def update_search_query(self, search_query: SearchQuery): - """Update search_query. - - call replace_auto_language to replace the "auto" language - """ - replace_auto_language(search_query) def search_external_bang(self): """ diff --git a/searx/settings.yml b/searx/settings.yml index e63ded9ad..5e593dc40 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -31,7 +31,7 @@ search: autocomplete_min: 4 # Default search language - leave blank to detect from browser information or # use codes from 'languages.py' - default_lang: "" + default_lang: "auto" # Available languages # languages: # - all diff --git a/searx/webadapter.py b/searx/webadapter.py index dbcf25058..121319eeb 100644 --- a/searx/webadapter.py +++ b/searx/webadapter.py @@ -6,6 +6,7 @@ from searx.query import RawTextQuery from searx.engines import categories, engines from searx.search import SearchQuery, EngineRef from searx.preferences import Preferences, is_locked +from searx.utils import detect_language # remove duplicate queries. @@ -214,7 +215,27 @@ def parse_engine_data(form): def get_search_query_from_webapp( preferences: Preferences, form: Dict[str, str] -) -> Tuple[SearchQuery, RawTextQuery, List[EngineRef], List[EngineRef]]: +) -> Tuple[SearchQuery, RawTextQuery, List[EngineRef], List[EngineRef], str]: + """Assemble data from preferences and request.form (from the HTML form) needed + in a search query. + + The returned tuple consits of: + + 1. instance of :py:obj:`searx.search.SearchQuery` + 2. instance of :py:obj:`searx.query.RawTextQuery` + 3. list of :py:obj:`searx.search.EngineRef` instances + 4. string with the *selected locale* of the query + + About language/locale: if the client selects the alias ``auto`` the + ``SearchQuery`` object is build up by the :py:obj:`detected language + <searx.utils.detect_language>`. If language recognition does not have a + match the language preferred by the :py:obj:`Preferences.client` is used. + If client does not have a preference, the default ``all`` is used. + + The *selected locale* in the tuple always represents the selected + language/locale and might differ from the language recognition. + + """ # no text for the query ? if not form.get('q'): raise SearxParameterException('q', '') @@ -229,13 +250,19 @@ def get_search_query_from_webapp( # set query query = raw_text_query.getQuery() query_pageno = parse_pageno(form) - query_lang = parse_lang(preferences, form, raw_text_query) query_safesearch = parse_safesearch(preferences, form) query_time_range = parse_time_range(form) query_timeout = parse_timeout(form, raw_text_query) external_bang = raw_text_query.external_bang engine_data = parse_engine_data(form) + query_lang = parse_lang(preferences, form, raw_text_query) + selected_locale = query_lang + + if query_lang == 'auto': + query_lang = detect_language(query, threshold=0.8, only_search_languages=True) + query_lang = query_lang or preferences.client.locale_tag or 'all' + if not is_locked('categories') and raw_text_query.specific: # if engines are calculated from query, # set categories by using that information @@ -265,4 +292,5 @@ def get_search_query_from_webapp( raw_text_query, query_engineref_list_unknown, query_engineref_list_notoken, + selected_locale, ) diff --git a/searx/webapp.py b/searx/webapp.py index 388b28c38..79255652f 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -84,6 +84,7 @@ from searx.webutils import ( from searx.webadapter import ( get_search_query_from_webapp, get_selected_categories, + parse_lang, ) from searx.utils import ( html_to_text, @@ -96,6 +97,7 @@ from searx.plugins import Plugin, plugins, initialize as plugin_initialize from searx.plugins.oa_doi_rewrite import get_doi_resolver from searx.preferences import ( Preferences, + ClientPref, ValidationException, ) from searx.answerers import ( @@ -221,16 +223,9 @@ babel = Babel(app, locale_selector=get_locale) def _get_browser_language(req, lang_list): - for lang in req.headers.get("Accept-Language", "en").split(","): - if ';' in lang: - lang = lang.split(';')[0] - if '-' in lang: - lang_parts = lang.split('-') - lang = "{}-{}".format(lang_parts[0], lang_parts[-1].upper()) - locale = match_locale(lang, lang_list, fallback=None) - if locale is not None: - return locale - return 'en' + client = ClientPref.from_http_request(req) + locale = match_locale(client.locale_tag, lang_list, fallback='en') + return locale def _get_locale_rfc5646(locale): @@ -446,11 +441,7 @@ def render(template_name: str, **kwargs): kwargs['rtl'] = True if 'current_language' not in kwargs: - _locale = request.preferences.get_value('language') - if _locale in ('auto', 'all'): - kwargs['current_language'] = _locale - else: - kwargs['current_language'] = match_locale(_locale, settings['search']['languages']) + kwargs['current_language'] = parse_lang(request.preferences, {}, RawTextQuery('', [])) # values from settings kwargs['search_formats'] = [x for x in settings['search']['formats'] if x != 'html'] @@ -512,7 +503,10 @@ def pre_request(): request.timings = [] # pylint: disable=assigning-non-slot request.errors = [] # pylint: disable=assigning-non-slot - preferences = Preferences(themes, list(categories.keys()), engines, plugins) # pylint: disable=redefined-outer-name + client_pref = ClientPref.from_http_request(request) + # pylint: disable=redefined-outer-name + preferences = Preferences(themes, list(categories.keys()), engines, plugins, client_pref) + user_agent = request.headers.get('User-Agent', '').lower() if 'webkit' in user_agent and 'android' in user_agent: preferences.key_value_settings['method'].value = 'GET' @@ -681,7 +675,9 @@ def search(): raw_text_query = None result_container = None try: - search_query, raw_text_query, _, _ = get_search_query_from_webapp(request.preferences, request.form) + search_query, raw_text_query, _, _, selected_locale = get_search_query_from_webapp( + request.preferences, request.form + ) # search = Search(search_query) # without plugins search = SearchWithPlugins(search_query, request.user_plugins, request) # pylint: disable=redefined-outer-name @@ -812,13 +808,6 @@ def search(): ) ) - if search_query.lang in ('auto', 'all'): - current_language = search_query.lang - else: - current_language = match_locale( - search_query.lang, settings['search']['languages'], fallback=request.preferences.get_value("language") - ) - # search_query.lang contains the user choice (all, auto, en, ...) # when the user choice is "auto", search.search_query.lang contains the detected language # otherwise it is equals to search_query.lang @@ -841,7 +830,7 @@ def search(): result_container.unresponsive_engines ), current_locale = request.preferences.get_value("locale"), - current_language = current_language, + current_language = selected_locale, search_language = match_locale( search.search_query.lang, settings['search']['languages'], |