Diffstat (limited to 'utils/fetch_engine_descriptions.py')
-rw-r--r-- | utils/fetch_engine_descriptions.py | 206 |
1 file changed, 206 insertions, 0 deletions
diff --git a/utils/fetch_engine_descriptions.py b/utils/fetch_engine_descriptions.py
new file mode 100644
index 000000000..9ca001d45
--- /dev/null
+++ b/utils/fetch_engine_descriptions.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python
+
+import sys
+import json
+from urllib.parse import quote, urlparse
+from os.path import realpath, dirname
+import cld3
+from lxml.html import fromstring
+
+# set path
+sys.path.append(realpath(dirname(realpath(__file__)) + '/../'))
+
+from searx.engines.wikidata import send_wikidata_query
+from searx.utils import extract_text
+import searx
+import searx.search
+import searx.poolrequests
+
+SPARQL_WIKIPEDIA_ARTICLE = """
+SELECT DISTINCT ?item ?name
+WHERE {
+  VALUES ?item { %IDS% }
+  ?article schema:about ?item ;
+           schema:inLanguage ?lang ;
+           schema:name ?name ;
+           schema:isPartOf [ wikibase:wikiGroup "wikipedia" ] .
+  FILTER(?lang in (%LANGUAGES_SPARQL%)) .
+  FILTER (!CONTAINS(?name, ':')) .
+}
+"""
+
+SPARQL_DESCRIPTION = """
+SELECT DISTINCT ?item ?itemDescription
+WHERE {
+  VALUES ?item { %IDS% }
+  ?item schema:description ?itemDescription .
+  FILTER (lang(?itemDescription) in (%LANGUAGES_SPARQL%))
+}
+ORDER BY ?itemLang
+"""
+
+LANGUAGES = searx.settings['locales'].keys()
+LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES)))
+IDS = None
+
+descriptions = {}
+wd_to_engine_name = {}
+
+
+def normalize_description(description):
+    for c in [chr(c) for c in range(0, 31)]:
+        description = description.replace(c, ' ')
+    description = ' '.join(description.strip().split())
+    return description
+
+
+def update_description(engine_name, lang, description, source, replace=True):
+    if replace or lang not in descriptions[engine_name]:
+        descriptions[engine_name][lang] = [normalize_description(description), source]
+
+
+def get_wikipedia_summary(language, pageid):
+    search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
+    url = search_url.format(title=quote(pageid), language=language)
+    try:
+        response = searx.poolrequests.get(url)
+        response.raise_for_status()
+        api_result = json.loads(response.text)
+        return api_result.get('extract')
+    except Exception:
+        return None
+
+
+def detect_language(text):
+    r = cld3.get_language(str(text))  # pylint: disable=E1101
+    if r is not None and r.probability >= 0.98 and r.is_reliable:
+        return r.language
+    return None
+
+
+def get_website_description(url, lang1, lang2=None):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'DNT': '1',
+        'Upgrade-Insecure-Requests': '1',
+        'Sec-GPC': '1',
+        'Cache-Control': 'max-age=0',
+    }
+    if lang1 is not None:
+        lang_list = [lang1]
+        if lang2 is not None:
+            lang_list.append(lang2)
+        headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8'
+    try:
+        response = searx.poolrequests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+    except Exception:
+        return (None, None)
+
+    try:
+        html = fromstring(response.text)
+    except ValueError:
+        html = fromstring(response.content)
+
+    description = extract_text(html.xpath('/html/head/meta[@name="description"]/@content'))
+    if not description:
+        description = extract_text(html.xpath('/html/head/meta[@property="og:description"]/@content'))
+    if not description:
+        description = extract_text(html.xpath('/html/head/title'))
+    lang = extract_text(html.xpath('/html/@lang'))
+    if not lang and lang1:  # no lang attribute found; lang1 may be None for the default probe
+        lang = lang1
+    lang = detect_language(description) or lang or 'en'
+    lang = lang.split('_')[0]
+    lang = lang.split('-')[0]
+    return (lang, description)
+
+
+def initialize():
+    global descriptions, wd_to_engine_name, IDS
+    searx.search.initialize()
+    for engine_name, engine in searx.engines.engines.items():
+        descriptions[engine_name] = {}
+        wikidata_id = getattr(engine, "about", {}).get('wikidata_id')
+        if wikidata_id is not None:
+            wd_to_engine_name.setdefault(wikidata_id, set()).add(engine_name)
+
+    IDS = ' '.join(list(map(lambda wd_id: 'wd:' + wd_id, wd_to_engine_name.keys())))
+
+
+def fetch_wikidata_descriptions():
+    global IDS
+    result = send_wikidata_query(SPARQL_DESCRIPTION
+                                 .replace('%IDS%', IDS)
+                                 .replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
+    if result is not None:
+        for binding in result['results']['bindings']:
+            wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
+            lang = binding['itemDescription']['xml:lang']
+            description = binding['itemDescription']['value']
+            if ' ' in description:  # skip single-word descriptions (like "website")
+                for engine_name in wd_to_engine_name[wikidata_id]:
+                    update_description(engine_name, lang, description, 'wikidata')
+
+
+def fetch_wikipedia_descriptions():
+    global IDS
+    result = send_wikidata_query(SPARQL_WIKIPEDIA_ARTICLE
+                                 .replace('%IDS%', IDS)
+                                 .replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
+    if result is not None:
+        for binding in result['results']['bindings']:
+            wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
+            lang = binding['name']['xml:lang']
+            pageid = binding['name']['value']
+            description = get_wikipedia_summary(lang, pageid)
+            if description is not None and ' ' in description:
+                for engine_name in wd_to_engine_name[wikidata_id]:
+                    update_description(engine_name, lang, description, 'wikipedia')
+
+
+def normalize_url(url):
+    url = url.replace('{language}', 'en')
+    url = urlparse(url)._replace(path='/', params='', query='', fragment='').geturl()
+    url = url.replace('https://api.', 'https://')
+    return url
+
+
+def fetch_website_description(engine_name, website):
+    default_lang, default_description = get_website_description(website, None, None)
+    if default_lang is None or default_description is None:
+        return
+    if default_lang not in descriptions[engine_name]:
+        descriptions[engine_name][default_lang] = [normalize_description(default_description), website]
+    for request_lang in ('en-US', 'es-US', 'fr-FR', 'zh', 'ja', 'ru', 'ar', 'ko'):
+        if request_lang.split('-')[0] not in descriptions[engine_name]:
+            lang, desc = get_website_description(website, request_lang, request_lang.split('-')[0])
+            if desc is not None and desc != default_description:
+                update_description(engine_name, lang, desc, website, replace=False)
+            else:
+                break
+
+
+def fetch_website_descriptions():
+    for engine_name, engine in searx.engines.engines.items():
+        website = getattr(engine, "about", {}).get('website')
+        if website is None and getattr(engine, "search_url", None):
+            website = normalize_url(getattr(engine, "search_url"))
+        if website is None and getattr(engine, "base_url", None):
+            website = normalize_url(getattr(engine, "base_url"))
+        if website is not None:
+            fetch_website_description(engine_name, website)
+
+
+def main():
+    initialize()
+    fetch_wikidata_descriptions()
+    fetch_wikipedia_descriptions()
+    fetch_website_descriptions()
+
+    sys.stdout.write(json.dumps(descriptions, indent=1, separators=(',', ':'), ensure_ascii=False))
+
+
+if __name__ == "__main__":
+    main()
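Not part of the diff, but for context: the script prints to stdout a JSON mapping of engine name → language code → [description, source]. A minimal consumer sketch, assuming the output was redirected to a file named engine_descriptions.json (the file name and the pick_description helper are illustrative, not part of this change):

import json

# Load the JSON the script printed to stdout (file name is an assumption).
with open('engine_descriptions.json', encoding='utf-8') as f:
    data = json.load(f)

def pick_description(descriptions, engine_name, preferred_lang):
    # Each entry is [description_text, source], keyed by language code;
    # fall back from the full locale to its base language, then English.
    per_engine = descriptions.get(engine_name, {})
    for lang in (preferred_lang, preferred_lang.split('-')[0], 'en'):
        if lang in per_engine:
            return tuple(per_engine[lang])
    return (None, None)

print(pick_description(data, 'wikipedia', 'fr-FR'))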