author     Markus Heiser <markus.heiser@darmarit.de>   2023-08-04 16:54:22 +0200
committer  Markus Heiser <markus.heiser@darmarIT.de>   2023-08-04 19:06:50 +0200
commit     db522cf76d26e6c871ae5c012787985685e82c17 (patch)
tree       33a80158dbb8212e9bee562b682cff96af9b6a18 /searx
parent     7d8c20c80d86ac47eac86f11533fee038cd48190 (diff)
download   searxng-db522cf76d26e6c871ae5c012787985685e82c17.tar.gz
           searxng-db522cf76d26e6c871ae5c012787985685e82c17.zip
[mod] engine: wikimedia - improve results, add additional settings & doc
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx')
-rw-r--r--  searx/engines/mediawiki.py | 175
-rw-r--r--  searx/settings.yml         |  21
2 files changed, 132 insertions(+), 64 deletions(-)
diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py
index 27ce36e87..6a9ac974a 100644
--- a/searx/engines/mediawiki.py
+++ b/searx/engines/mediawiki.py
@@ -1,18 +1,59 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
+the `MediaWiki Action API`_. For a `query action`_ all Wikimedia wikis have
+endpoints that follow this pattern::
+
+ https://{base_url}/w/api.php?action=query&list=search&format=json
+
+.. note::
+
+ In its current state, this engine only parses JSON results
+ (`format=json`_) of a search query (`list=search`_). If you need other
+ ``action`` and ``list`` types, ask the SearXNG developers to extend the
+ implementation according to your needs.
+
+.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
+.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
+.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
+.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json
+
+Configuration
+=============
+
+Request:
+
+- :py:obj:`base_url`
+- :py:obj:`search_type`
+- :py:obj:`srenablerewrites`
+- :py:obj:`srsort`
+- :py:obj:`srprop`
+
+Implementations
+===============
+
"""
- General mediawiki-engine (Web)
-"""
+from __future__ import annotations
+from typing import TYPE_CHECKING
-from string import Formatter
+from datetime import datetime
from urllib.parse import urlencode, quote
from searx.utils import html_to_text
+from searx.enginelib.traits import EngineTraits
+
+if TYPE_CHECKING:
+ import logging
+
+ logger: logging.Logger
+
+traits: EngineTraits
# about
about = {
"website": None,
"wikidata_id": None,
- "official_api_documentation": 'http://www.mediawiki.org/wiki/API:Search',
+ "official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
@@ -21,73 +62,119 @@ about = {
# engine dependent config
categories = ['general']
paging = True
-number_of_results = 1
-search_type = 'nearmatch' # possible values: title, text, nearmatch
-
-# search-url
-base_url = 'https://{language}.wikipedia.org/'
-search_postfix = (
- 'w/api.php?action=query'
- '&list=search'
- '&{query}'
- '&format=json'
- '&sroffset={offset}'
- '&srlimit={limit}'
- '&srwhat={searchtype}'
-)
-
-
-# do search-request
-def request(query, params):
- offset = (params['pageno'] - 1) * number_of_results
+number_of_results = 5
- string_args = dict(
- query=urlencode({'srsearch': query}), offset=offset, limit=number_of_results, searchtype=search_type
- )
+search_type: str = 'nearmatch'
+"""Which type of search to perform. One of the following values: ``nearmatch``,
+``text`` or ``title``.
- format_strings = list(Formatter().parse(base_url))
+See ``srwhat`` argument in `list=search`_ documentation.
+"""
- if params['language'] == 'all':
- language = 'en'
- else:
- language = params['language'].split('-')[0]
+srenablerewrites: bool = True
+"""Enable internal query rewriting (Type: boolean). Some search backends can
+rewrite the query into another which is thought to provide better results, for
+instance by correcting spelling errors.
+
+See ``srenablerewrites`` argument in `list=search`_ documentation.
+"""
+
+srsort: str = 'relevance'
+"""Set the sort order of returned results. One of the following values:
+``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``,
+``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``,
+``none``, ``random``, ``relevance``, ``user_random``.
+
+See ``srsort`` argument in `list=search`_ documentation.
+"""
+
+srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet'
+"""Which properties to return.
+
+See ``srprop`` argument in `list=search`_ documentation.
+"""
+
+base_url: str = 'https://{language}.wikipedia.org/'
+"""Base URL of the Wikimedia wiki.
- # format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)]
- if any(x[1] == 'language' for x in format_strings):
- string_args['language'] = language
+``{language}``:
+ ISO 639-1 language code (en, de, fr ..) of the search language.
+"""
+
+timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
+"""The longhand version of MediaWiki time strings."""
+
+
+def request(query, params):
# write search-language back to params, required in response
- params['language'] = language
- search_url = base_url + search_postfix
+ if params['language'] == 'all':
+ params['language'] = 'en'
+ else:
+ params['language'] = params['language'].split('-')[0]
+
+ if base_url.endswith('/'):
+ api_url = base_url + 'w/api.php?'
+ else:
+ api_url = base_url + '/w/api.php?'
+ api_url = api_url.format(language=params['language'])
- params['url'] = search_url.format(**string_args)
+ offset = (params['pageno'] - 1) * number_of_results
+ args = {
+ 'action': 'query',
+ 'list': 'search',
+ 'format': 'json',
+ 'srsearch': query,
+ 'sroffset': offset,
+ 'srlimit': number_of_results,
+ 'srwhat': search_type,
+ 'srprop': srprop,
+ 'srsort': srsort,
+ }
+ if srenablerewrites:
+ args['srenablerewrites'] = '1'
+
+ params['url'] = api_url + urlencode(args)
return params
# get response from search-request
def response(resp):
- results = []
+ results = []
search_results = resp.json()
# return empty array if there are no results
if not search_results.get('query', {}).get('search'):
return []
- # parse results
for result in search_results['query']['search']:
+
if result.get('snippet', '').startswith('#REDIRECT'):
continue
+
+ title = result['title']
+ sectiontitle = result.get('sectiontitle')
+ content = html_to_text(result.get('snippet', ''))
+ metadata = html_to_text(result.get('categorysnippet', ''))
+ timestamp = result.get('timestamp')
+
url = (
- base_url.format(language=resp.search_params['language'])
- + 'wiki/'
- + quote(result['title'].replace(' ', '_').encode())
+ base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode())
)
+ if sectiontitle:
+ # in case of sectiontitle create a link to the section in the wiki page
+ url += '#' + quote(sectiontitle.replace(' ', '_').encode())
+ title += ' / ' + sectiontitle
+
+ item = {'url': url, 'title': title, 'content': content, 'metadata': metadata}
+
+ if timestamp:
+ item['publishedDate'] = datetime.strptime(timestamp, timestamp_format)
- # append result
- results.append({'url': url, 'title': result['title'], 'content': html_to_text(result.get('snippet', ''))})
+ results.append(item)
# return results
return results
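
A minimal, self-contained sketch of the query URL the reworked request() above assembles, using the module defaults introduced in this diff; the search term 'tardigrade', the English language and pageno=1 are made-up inputs, so this is illustration only, not part of the commit.

# Sketch only -- values mirror the module defaults from the diff above;
# 'tardigrade' and pageno=1 are invented inputs.
from urllib.parse import urlencode

base_url = 'https://{language}.wikipedia.org/'
number_of_results = 5
pageno = 1  # SearXNG page number (assumed)

api_url = base_url.format(language='en') + 'w/api.php?'
args = {
    'action': 'query',
    'list': 'search',
    'format': 'json',
    'srsearch': 'tardigrade',
    'sroffset': (pageno - 1) * number_of_results,
    'srlimit': number_of_results,
    'srwhat': 'nearmatch',
    'srprop': 'sectiontitle|snippet|timestamp|categorysnippet',
    'srsort': 'relevance',
    'srenablerewrites': '1',
}
print(api_url + urlencode(args))
# https://en.wikipedia.org/w/api.php?action=query&list=search&format=json&srsearch=tardigrade&...
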
diff --git a/searx/settings.yml b/searx/settings.yml
index e498a9ea2..bc9f5a29d 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -667,11 +667,6 @@ engines:
shortcut: fsd
categories: [it, software wikis]
base_url: https://directory.fsf.org/
- number_of_results: 5
- # what part of a page matches the query string: title, text, nearmatch
- # * title - query matches title
- # * text - query matches the text of page
- # * nearmatch - nearmatch in title
search_type: title
timeout: 5.0
disabled: true
@@ -1449,13 +1444,6 @@ engines:
engine: twitter
disabled: true
- # maybe in a fun category
- # - name: uncyclopedia
- # engine: mediawiki
- # shortcut: unc
- # base_url: https://uncyclopedia.wikia.com/
- # number_of_results: 5
-
# tmp suspended - too slow, too many errors
# - name: urbandictionary
# engine : xpath
@@ -1534,7 +1522,6 @@ engines:
shortcut: wb
categories: general
base_url: "https://{language}.wikibooks.org/"
- number_of_results: 5
search_type: text
disabled: true
about:
@@ -1546,9 +1533,9 @@ engines:
shortcut: wn
categories: news
base_url: "https://{language}.wikinews.org/"
- number_of_results: 5
search_type: text
disabled: true
+ srsort: create_timestamp_desc
about:
website: https://www.wikinews.org/
wikidata_id: Q964
@@ -1558,7 +1545,6 @@ engines:
shortcut: wq
categories: general
base_url: "https://{language}.wikiquote.org/"
- number_of_results: 5
search_type: text
disabled: true
additional_tests:
@@ -1572,7 +1558,6 @@ engines:
shortcut: ws
categories: general
base_url: "https://{language}.wikisource.org/"
- number_of_results: 5
search_type: text
disabled: true
about:
@@ -1584,7 +1569,6 @@ engines:
shortcut: wsp
categories: [general, science]
base_url: "https://species.wikimedia.org/"
- number_of_results: 5
search_type: text
disabled: true
about:
@@ -1596,7 +1580,6 @@ engines:
shortcut: wt
categories: [dictionaries]
base_url: "https://{language}.wiktionary.org/"
- number_of_results: 5
search_type: text
about:
website: https://www.wiktionary.org/
@@ -1607,7 +1590,6 @@ engines:
shortcut: wv
categories: general
base_url: "https://{language}.wikiversity.org/"
- number_of_results: 5
search_type: text
disabled: true
about:
@@ -1619,7 +1601,6 @@ engines:
shortcut: wy
categories: general
base_url: "https://{language}.wikivoyage.org/"
- number_of_results: 5
search_type: text
disabled: true
about:
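
To close, a minimal sketch of how the reworked response() in searx/engines/mediawiki.py (first hunk above) maps a single list=search entry onto a SearXNG result item, including the section-anchor URL and publishedDate handling; the payload is invented and html_to_text() is left out for brevity, so this is illustration only, not part of the commit.

# Sketch only -- hypothetical API entry; the real engine additionally runs
# snippet/categorysnippet through searx.utils.html_to_text().
from datetime import datetime
from urllib.parse import quote

base_url = 'https://{language}.wikipedia.org/'
timestamp_format = '%Y-%m-%dT%H:%M:%SZ'

entry = {  # one element of resp.json()['query']['search'] (made up)
    'title': 'Tardigrade',
    'sectiontitle': 'Anatomy',
    'snippet': 'Tardigrades are eight-legged micro-animals ...',
    'categorysnippet': 'Category: Ecdysozoa',
    'timestamp': '2023-08-01T12:34:56Z',
}

title = entry['title']
url = base_url.format(language='en') + 'wiki/' + quote(title.replace(' ', '_').encode())
if entry.get('sectiontitle'):
    # link straight to the matching section, as the engine now does
    url += '#' + quote(entry['sectiontitle'].replace(' ', '_').encode())
    title += ' / ' + entry['sectiontitle']

item = {'url': url, 'title': title, 'content': entry['snippet'], 'metadata': entry['categorysnippet']}
if entry.get('timestamp'):
    item['publishedDate'] = datetime.strptime(entry['timestamp'], timestamp_format)
print(item['url'])
# https://en.wikipedia.org/wiki/Tardigrade#Anatomy
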