diff options
author | Markus Heiser <markus.heiser@darmarit.de> | 2022-10-08 11:32:08 +0200 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarit.de> | 2023-03-24 10:37:42 +0100 |
commit | f78f9083836be851c224b4334b53b9686835e300 (patch) | |
tree | d547df5ce6e817da3c2dd1b89fcb07a6891ae87c /searx/engines | |
parent | dba8977b098b7f32dde78b8d7c27c5df50aacecb (diff) | |
download | searxng-f78f9083836be851c224b4334b53b9686835e300.tar.gz searxng-f78f9083836be851c224b4334b53b9686835e300.zip |
[mod] Google: fetch engine traits (data_type: supported_languages)
Implements a fetch_traits function for the Google engines.
.. note::
Does not include migration of the request method from 'supported_languages'
to the 'traits' (EngineTraits) object!
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/engines')
-rw-r--r-- | searx/engines/google.py | 87 | ||||
-rw-r--r-- | searx/engines/google_images.py | 2 | ||||
-rw-r--r-- | searx/engines/google_news.py | 1 | ||||
-rw-r--r-- | searx/engines/google_scholar.py | 1 | ||||
-rw-r--r-- | searx/engines/google_videos.py | 2 |
5 files changed, 91 insertions, 2 deletions
diff --git a/searx/engines/google.py b/searx/engines/google.py index bdb351432..bee7085ec 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -29,6 +29,9 @@ from urllib.parse import urlencode from lxml import html from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex from searx.exceptions import SearxEngineCaptchaException +from searx.enginelib.traits import EngineTraits + +traits: EngineTraits # about about = { @@ -373,3 +376,87 @@ def _fetch_supported_languages(resp): ret_val[code] = {"name": name} return ret_val + + +skip_countries = [ + # official language of google-country not in google-languages + 'AL', # Albanien (sq) + 'AZ', # Aserbaidschan (az) + 'BD', # Bangladesch (bn) + 'BN', # Brunei Darussalam (ms) + 'BT', # Bhutan (dz) + 'ET', # Äthiopien (am) + 'GE', # Georgien (ka, os) + 'GL', # Grönland (kl) + 'KH', # Kambodscha (km) + 'LA', # Laos (lo) + 'LK', # Sri Lanka (si, ta) + 'ME', # Montenegro (sr) + 'MK', # Nordmazedonien (mk, sq) + 'MM', # Myanmar (my) + 'MN', # Mongolei (mn) + 'MV', # Malediven (dv) // dv_MV is unknown by babel + 'MY', # Malaysia (ms) + 'NP', # Nepal (ne) + 'TJ', # Tadschikistan (tg) + 'TM', # Turkmenistan (tk) + 'UZ', # Usbekistan (uz) +] + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages from Google.""" + # pylint: disable=import-outside-toplevel + + engine_traits.data_type = 'supported_languages' # deprecated + + import babel + import babel.languages + from searx import network + from searx.locales import language_tag, region_tag, get_offical_locales + + resp = network.get('https://www.google.com/preferences') + if not resp.ok: + print("ERROR: response from Google is not OK.") + + dom = html.fromstring(resp.text) + + lang_map = {'no': 'nb'} + + for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'): + + eng_lang = x.get("value").split('_')[-1] + try: + locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-') + 
except babel.UnknownLocaleError: + print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang)) + continue + sxng_lang = language_tag(locale) + + conflict = engine_traits.languages.get(sxng_lang) + if conflict: + if conflict != eng_lang: + print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang)) + continue + engine_traits.languages[sxng_lang] = 'lang_' + eng_lang + + # alias languages + engine_traits.languages['zh'] = 'lang_zh-CN' + + for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'): + eng_country = x.get("value") + + if eng_country in skip_countries: + continue + if eng_country == 'ZZ': + engine_traits.all_locale = 'ZZ' + continue + + sxng_locales = get_offical_locales(eng_country, engine_traits.languages.keys(), regional=True) + + if not sxng_locales: + print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country)) + continue + + for sxng_locale in sxng_locales: + engine_traits.regions[region_tag(sxng_locale)] = 'country' + eng_country diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index 528f8d21d..219f2adee 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -23,7 +23,7 @@ from searx.engines.google import ( ) # pylint: disable=unused-import -from searx.engines.google import supported_languages_url, _fetch_supported_languages +from searx.engines.google import supported_languages_url, _fetch_supported_languages, fetch_traits # pylint: enable=unused-import diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 1ada2d64d..8962af36a 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -28,6 +28,7 @@ from searx.utils import ( # pylint: disable=unused-import from searx.engines.google import ( + fetch_traits, supported_languages_url, _fetch_supported_languages, ) diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py index 
c07cd4cea..38aaf904b 100644 --- a/searx/engines/google_scholar.py +++ b/searx/engines/google_scholar.py @@ -31,6 +31,7 @@ from searx.engines.google import ( # pylint: disable=unused-import from searx.engines.google import ( + fetch_traits, supported_languages_url, _fetch_supported_languages, ) diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index fc574bd48..5ab29f9ff 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -38,7 +38,7 @@ from searx.engines.google import ( ) # pylint: disable=unused-import -from searx.engines.google import supported_languages_url, _fetch_supported_languages +from searx.engines.google import supported_languages_url, _fetch_supported_languages, fetch_traits # pylint: enable=unused-import |