diff options
author | Marc Abonce Seguin <marc-abonce@mailbox.org> | 2020-09-07 22:05:21 -0700 |
---|---|---|
committer | Alexandre Flament <alex@al-f.net> | 2020-09-10 09:54:30 +0200 |
commit | ab20ca182cc9d3ea12029fb04181377d56b5e814 (patch) | |
tree | d3141d5d123d216759323d36e188bff6b3215d27 /searx/engines/wikipedia.py | |
parent | d0f9778c2a975771750f7a6566a5768745e51847 (diff) | |
download | searxng-ab20ca182cc9d3ea12029fb04181377d56b5e814.tar.gz searxng-ab20ca182cc9d3ea12029fb04181377d56b5e814.zip |
use Wikipedia's REST v1 API
Diffstat (limited to 'searx/engines/wikipedia.py')
-rw-r--r-- | searx/engines/wikipedia.py | 60 |
1 files changed, 18 insertions, 42 deletions
```diff
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index 7c9378dd8..bff24d16b 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -1,7 +1,7 @@
 """
  Wikipedia (Web)
 
- @website https://{language}.wikipedia.org
+ @website https://en.wikipedia.org/api/rest_v1/
  @provide-api yes
 
  @using-api yes
@@ -12,21 +12,11 @@
 
 from json import loads
 from lxml.html import fromstring
-from searx.url_utils import quote, urlencode
-from searx.utils import match_language
+from searx.url_utils import quote
+from searx.utils import match_language, searx_useragent
 
 # search-url
-base_url = u'https://{language}.wikipedia.org/'
-search_url = base_url + u'w/api.php?'\
-    'action=query'\
-    '&format=json'\
-    '&{query}'\
-    '&prop=extracts|pageimages|pageprops'\
-    '&ppprop=disambiguation'\
-    '&exintro'\
-    '&explaintext'\
-    '&pithumbsize=300'\
-    '&redirects'
+search_url = u'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
 supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
 
 
@@ -41,51 +31,37 @@ def url_lang(lang):
 # do search-request
 def request(query, params):
     if query.islower():
-        query = u'{0}|{1}'.format(query.decode('utf-8'), query.decode('utf-8').title()).encode('utf-8')
+        query = query.title()
 
-    params['url'] = search_url.format(query=urlencode({'titles': query}),
+    params['url'] = search_url.format(title=quote(query),
                                       language=url_lang(params['language']))
 
+    params['headers']['User-Agent'] = searx_useragent()
+
     return params
 
 
 # get response from search-request
 def response(resp):
-    results = []
-
-    search_result = loads(resp.text)
-
-    # wikipedia article's unique id
-    # first valid id is assumed to be the requested article
-    if 'pages' not in search_result['query']:
-        return results
-
-    for article_id in search_result['query']['pages']:
-        page = search_result['query']['pages'][article_id]
-        if int(article_id) > 0:
-            break
-
-    if int(article_id) < 0 or 'disambiguation' in page.get('pageprops', {}):
+    if not resp.ok:
         return []
 
-    title = page.get('title')
-
-    image = page.get('thumbnail')
-    if image:
-        image = image.get('source')
+    results = []
+    api_result = loads(resp.text)
 
-    summary = page.get('extract', '').split('\n')[0].replace('()', '')
+    # skip disambiguation pages
+    if api_result['type'] != 'standard':
+        return []
 
-    # link to wikipedia article
-    wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
-        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))
+    title = api_result['title']
+    wikipedia_link = api_result['content_urls']['desktop']['page']
 
     results.append({'url': wikipedia_link, 'title': title})
 
     results.append({'infobox': title,
                     'id': wikipedia_link,
-                    'content': summary,
-                    'img_src': image,
+                    'content': api_result.get('extract', ''),
+                    'img_src': api_result.get('thumbnail', {}).get('source'),
                     'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
 
     return results
```