diff options
author | Markus Heiser <markus.heiser@darmarit.de> | 2023-08-04 16:54:22 +0200 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarIT.de> | 2023-08-04 19:06:50 +0200 |
commit | db522cf76d26e6c871ae5c012787985685e82c17 (patch) | |
tree | 33a80158dbb8212e9bee562b682cff96af9b6a18 /searx | |
parent | 7d8c20c80d86ac47eac86f11533fee038cd48190 (diff) | |
download | searxng-db522cf76d26e6c871ae5c012787985685e82c17.tar.gz searxng-db522cf76d26e6c871ae5c012787985685e82c17.zip |
[mod] engine: wikimedia - improve results, add additional settings & doc
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx')
-rw-r--r-- | searx/engines/mediawiki.py | 175 | ||||
-rw-r--r-- | searx/settings.yml | 21 |
2 files changed, 132 insertions, 64 deletions
diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py index 27ce36e87..6a9ac974a 100644 --- a/searx/engines/mediawiki.py +++ b/searx/engines/mediawiki.py @@ -1,18 +1,59 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by +the `MediaWiki Action API`_. For a `query action`_ all Wikimedia wikis have +endpoints that follow this pattern:: + + https://{base_url}/w/api.php?action=query&list=search&format=json + +.. note:: + + In its actual state, this engine is implemented to parse JSON result + (`format=json`_) from a search query (`list=search`_). If you need other + ``action`` and ``list`` types ask SearXNG developers to extend the + implementation according to your needs. + +.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page +.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query +.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch +.. 
_`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json + +Configuration +============= + +Request: + +- :py:obj:`base_url` +- :py:obj:`search_type` +- :py:obj:`srenablerewrites` +- :py:obj:`srsort` +- :py:obj:`srprop` + +Implementations +=============== + """ - General mediawiki-engine (Web) -""" +from __future__ import annotations +from typing import TYPE_CHECKING -from string import Formatter +from datetime import datetime from urllib.parse import urlencode, quote from searx.utils import html_to_text +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits # about about = { "website": None, "wikidata_id": None, - "official_api_documentation": 'http://www.mediawiki.org/wiki/API:Search', + "official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query', "use_official_api": True, "require_api_key": False, "results": 'JSON', @@ -21,73 +62,119 @@ about = { # engine dependent config categories = ['general'] paging = True -number_of_results = 1 -search_type = 'nearmatch' # possible values: title, text, nearmatch - -# search-url -base_url = 'https://{language}.wikipedia.org/' -search_postfix = ( - 'w/api.php?action=query' - '&list=search' - '&{query}' - '&format=json' - '&sroffset={offset}' - '&srlimit={limit}' - '&srwhat={searchtype}' -) - - -# do search-request -def request(query, params): - offset = (params['pageno'] - 1) * number_of_results +number_of_results = 5 - string_args = dict( - query=urlencode({'srsearch': query}), offset=offset, limit=number_of_results, searchtype=search_type - ) +search_type: str = 'nearmatch' +"""Which type of search to perform. One of the following values: ``nearmatch``, +``text`` or ``title``. - format_strings = list(Formatter().parse(base_url)) +See ``srwhat`` argument in `list=search`_ documentation. 
+""" - if params['language'] == 'all': - language = 'en' - else: - language = params['language'].split('-')[0] +srenablerewrites: bool = True +"""Enable internal query rewriting (Type: boolean). Some search backends can +rewrite the query into another which is thought to provide better results, for +instance by correcting spelling errors. + +See ``srenablerewrites`` argument in `list=search`_ documentation. +""" + +srsort: str = 'relevance' +"""Set the sort order of returned results. One of the following values: +``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``, +``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``, +``none``, ``random``, ``relevance``, ``user_random``. + +See ``srenablerewrites`` argument in `list=search`_ documentation. +""" + +srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet' +"""Which properties to return. + +See ``srprop`` argument in `list=search`_ documentation. +""" + +base_url: str = 'https://{language}.wikipedia.org/' +"""Base URL of the Wikimedia wiki. - # format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)] - if any(x[1] == 'language' for x in format_strings): - string_args['language'] = language +``{language}``: + ISO 639-1 language code (en, de, fr ..) of the search language. +""" + +timestamp_format = '%Y-%m-%dT%H:%M:%SZ' +"""The longhand version of MediaWiki time strings.""" + + +def request(query, params): # write search-language back to params, required in response - params['language'] = language - search_url = base_url + search_postfix + if params['language'] == 'all': + params['language'] = 'en' + else: + params['language'] = params['language'].split('-')[0] + + if base_url.endswith('/'): + api_url = base_url + 'w/api.php?' + else: + api_url = base_url + '/w/api.php?' 
+ api_url = api_url.format(language=params['language']) - params['url'] = search_url.format(**string_args) + offset = (params['pageno'] - 1) * number_of_results + args = { + 'action': 'query', + 'list': 'search', + 'format': 'json', + 'srsearch': query, + 'sroffset': offset, + 'srlimit': number_of_results, + 'srwhat': search_type, + 'srprop': srprop, + 'srsort': srsort, + } + if srenablerewrites: + args['srenablerewrites'] = '1' + + params['url'] = api_url + urlencode(args) return params # get response from search-request def response(resp): - results = [] + results = [] search_results = resp.json() # return empty array if there are no results if not search_results.get('query', {}).get('search'): return [] - # parse results for result in search_results['query']['search']: + if result.get('snippet', '').startswith('#REDIRECT'): continue + + title = result['title'] + sectiontitle = result.get('sectiontitle') + content = html_to_text(result.get('snippet', '')) + metadata = html_to_text(result.get('categorysnippet', '')) + timestamp = result.get('timestamp') + url = ( - base_url.format(language=resp.search_params['language']) - + 'wiki/' - + quote(result['title'].replace(' ', '_').encode()) + base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode()) ) + if sectiontitle: + # in case of sectiontitle create a link to the section in the wiki page + url += '#' + quote(sectiontitle.replace(' ', '_').encode()) + title += ' / ' + sectiontitle + + item = {'url': url, 'title': title, 'content': content, 'metadata': metadata} + + if timestamp: + item['publishedDate'] = datetime.strptime(timestamp, timestamp_format) - # append result - results.append({'url': url, 'title': result['title'], 'content': html_to_text(result.get('snippet', ''))}) + results.append(item) # return results return results diff --git a/searx/settings.yml b/searx/settings.yml index e498a9ea2..bc9f5a29d 100644 --- a/searx/settings.yml +++ b/searx/settings.yml 
@@ -667,11 +667,6 @@ engines: shortcut: fsd categories: [it, software wikis] base_url: https://directory.fsf.org/ - number_of_results: 5 - # what part of a page matches the query string: title, text, nearmatch - # * title - query matches title - # * text - query matches the text of page - # * nearmatch - nearmatch in title search_type: title timeout: 5.0 disabled: true @@ -1449,13 +1444,6 @@ engines: engine: twitter disabled: true - # maybe in a fun category - # - name: uncyclopedia - # engine: mediawiki - # shortcut: unc - # base_url: https://uncyclopedia.wikia.com/ - # number_of_results: 5 - # tmp suspended - too slow, too many errors # - name: urbandictionary # engine : xpath @@ -1534,7 +1522,6 @@ engines: shortcut: wb categories: general base_url: "https://{language}.wikibooks.org/" - number_of_results: 5 search_type: text disabled: true about: @@ -1546,9 +1533,9 @@ engines: shortcut: wn categories: news base_url: "https://{language}.wikinews.org/" - number_of_results: 5 search_type: text disabled: true + srsort: create_timestamp_desc about: website: https://www.wikinews.org/ wikidata_id: Q964 @@ -1558,7 +1545,6 @@ engines: shortcut: wq categories: general base_url: "https://{language}.wikiquote.org/" - number_of_results: 5 search_type: text disabled: true additional_tests: @@ -1572,7 +1558,6 @@ engines: shortcut: ws categories: general base_url: "https://{language}.wikisource.org/" - number_of_results: 5 search_type: text disabled: true about: @@ -1584,7 +1569,6 @@ engines: shortcut: wsp categories: [general, science] base_url: "https://species.wikimedia.org/" - number_of_results: 5 search_type: text disabled: true about: @@ -1596,7 +1580,6 @@ engines: shortcut: wt categories: [dictionaries] base_url: "https://{language}.wiktionary.org/" - number_of_results: 5 search_type: text about: website: https://www.wiktionary.org/ @@ -1607,7 +1590,6 @@ engines: shortcut: wv categories: general base_url: "https://{language}.wikiversity.org/" - number_of_results: 5 
search_type: text disabled: true about: @@ -1619,7 +1601,6 @@ engines: shortcut: wy categories: general base_url: "https://{language}.wikivoyage.org/" - number_of_results: 5 search_type: text disabled: true about: |