summaryrefslogtreecommitdiff
path: root/searx
diff options
context:
space:
mode:
authorAlexandre Flament <alex@al-f.net>2021-01-05 11:24:39 +0100
committerAlexandre Flament <alex@al-f.net>2021-01-12 11:47:17 +0100
commit3a9f513521d006a7939538cce368d7b799e32c30 (patch)
tree238da19ef56643af53b13ef24a330277ed53ae51 /searx
parent6e2872f43625aba71eba019e16f7fbd74743f590 (diff)
downloadsearxng-3a9f513521d006a7939538cce368d7b799e32c30.tar.gz
searxng-3a9f513521d006a7939538cce368d7b799e32c30.zip
[enh] checker: background check
See settings.yml for the options. The SIGUSR1 signal starts the checker. The result is available at /stats/checker.
Diffstat (limited to 'searx')
-rw-r--r--searx/search/__init__.py72
-rw-r--r--searx/search/checker/__init__.py3
-rw-r--r--searx/search/checker/__main__.py30
-rw-r--r--searx/search/checker/background.py106
-rw-r--r--searx/search/checker/impl.py12
-rw-r--r--searx/search/models.py69
-rw-r--r--searx/settings.yml45
-rwxr-xr-xsearx/webapp.py12
8 files changed, 253 insertions, 96 deletions
diff --git a/searx/search/__init__.py b/searx/search/__init__.py
index 7768d21e9..f777e8595 100644
--- a/searx/search/__init__.py
+++ b/searx/search/__init__.py
@@ -28,7 +28,9 @@ from searx.external_bang import get_bang_url
from searx.results import ResultContainer
from searx import logger
from searx.plugins import plugins
+from searx.search.models import EngineRef, SearchQuery
from searx.search.processors import processors, initialize as initialize_processors
+from searx.search.checker import initialize as initialize_checker
logger = logger.getChild('search')
@@ -45,75 +47,11 @@ else:
sys.exit(1)
-def initialize(settings_engines=None):
+def initialize(settings_engines=None, enable_checker=False):
settings_engines = settings_engines or settings['engines']
initialize_processors(settings_engines)
-
-
-class EngineRef:
-
- __slots__ = 'name', 'category'
-
- def __init__(self, name: str, category: str):
- self.name = name
- self.category = category
-
- def __repr__(self):
- return "EngineRef({!r}, {!r})".format(self.name, self.category)
-
- def __eq__(self, other):
- return self.name == other.name and self.category == other.category
-
- def __hash__(self):
- return hash((self.name, self.category))
-
-
-class SearchQuery:
- """container for all the search parameters (query, language, etc...)"""
-
- __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\
- 'timeout_limit', 'external_bang'
-
- def __init__(self,
- query: str,
- engineref_list: typing.List[EngineRef],
- lang: str='all',
- safesearch: int=0,
- pageno: int=1,
- time_range: typing.Optional[str]=None,
- timeout_limit: typing.Optional[float]=None,
- external_bang: typing.Optional[str]=None):
- self.query = query
- self.engineref_list = engineref_list
- self.lang = lang
- self.safesearch = safesearch
- self.pageno = pageno
- self.time_range = time_range
- self.timeout_limit = timeout_limit
- self.external_bang = external_bang
-
- @property
- def categories(self):
- return list(set(map(lambda engineref: engineref.category, self.engineref_list)))
-
- def __repr__(self):
- return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\
- format(self.query, self.engineref_list, self.lang, self.safesearch,
- self.pageno, self.time_range, self.timeout_limit, self.external_bang)
-
- def __eq__(self, other):
- return self.query == other.query\
- and self.engineref_list == other.engineref_list\
- and self.lang == other.lang\
- and self.safesearch == other.safesearch\
- and self.pageno == other.pageno\
- and self.time_range == other.time_range\
- and self.timeout_limit == other.timeout_limit\
- and self.external_bang == other.external_bang
-
- def __hash__(self):
- return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range,
- self.timeout_limit, self.external_bang))
+ if enable_checker:
+ initialize_checker()
class Search:
diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py
index 442d5a09d..85b9178df 100644
--- a/searx/search/checker/__init__.py
+++ b/searx/search/checker/__init__.py
@@ -1 +1,4 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
from .impl import Checker
+from .background import initialize, get_result
diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py
index 2f808237a..37b7e6cda 100644
--- a/searx/search/checker/__main__.py
+++ b/searx/search/checker/__main__.py
@@ -1,9 +1,13 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
import sys
import os
+import argparse
import searx.search
-import searx.search.processors
import searx.search.checker
+from searx.search import processors
+from searx.engines import engine_shortcuts
if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']:
@@ -18,20 +22,24 @@ else:
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", ""
-def iter_processor():
- if len(sys.argv) > 1:
- for name, processor in searx.search.processors.items():
- if name in sys.argv:
+def iter_processor(engine_name_list):
+ if len(engine_name_list) > 0:
+ for name in engine_name_list:
+ name = engine_shortcuts.get(name, name)
+ processor = processors.get(name)
+ if processor is not None:
yield name, processor
+ else:
+ print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Not found ', RESET_SEQ)
else:
for name, processor in searx.search.processors.items():
yield name, processor
-def main():
+def run(engine_name_list):
searx.search.initialize()
broken_urls = []
- for name, processor in iter_processor():
+ for name, processor in iter_processor(engine_name_list):
if sys.stdout.isatty():
print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ)
checker = searx.search.checker.Checker(processor)
@@ -48,5 +56,13 @@ def main():
print('Error fetching', url)
+def main():
+ parser = argparse.ArgumentParser(description='Check searx engines.')
+ parser.add_argument('engine_name_list', metavar='engine name', type=str, nargs='*',
+ help='engines name or shortcut list. Empty for all engines.')
+ args = parser.parse_args()
+ run(args.engine_name_list)
+
+
if __name__ == '__main__':
main()
diff --git a/searx/search/checker/background.py b/searx/search/checker/background.py
new file mode 100644
index 000000000..45188ab38
--- /dev/null
+++ b/searx/search/checker/background.py
@@ -0,0 +1,106 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+import json
+import random
+import time
+import threading
+import os
+import signal
+
+from searx import logger, settings, searx_debug
+from searx.exceptions import SearxSettingsException
+from searx.search.processors import processors
+from searx.search.checker import Checker
+from searx.shared import schedule, storage
+
+
+CHECKER_RESULT = 'CHECKER_RESULT'
+running = threading.Lock()
+
+
+def _get_interval(every, error_msg):
+ if isinstance(every, int):
+ every = (every, every)
+ if not isinstance(every, (tuple, list))\
+ or len(every) != 2\
+ or not isinstance(every[0], int)\
+ or not isinstance(every[1], int):
+ raise SearxSettingsException(error_msg, None)
+ return every
+
+
+def _get_every():
+ every = settings.get('checker', {}).get('scheduling', {}).get('every', (300, 1800))
+ return _get_interval(every, 'checker.scheduling.every is not a int or list')
+
+
+def get_result():
+ serialized_result = storage.get_str('CHECKER_RESULT')
+ if serialized_result is not None:
+ return json.loads(serialized_result)
+
+
+def run():
+ if not running.acquire(blocking=False):
+ return
+ try:
+ logger.info('Starting checker')
+ result = {}
+ for name, processor in processors.items():
+ logger.debug('Checking %s engine', name)
+ checker = Checker(processor)
+ checker.run()
+ if checker.test_results.succesfull:
+ result[name] = {'status': True}
+ else:
+ result[name] = {'status': False, 'errors': checker.test_results.errors}
+
+ storage.set_str('CHECKER_RESULT', json.dumps(result))
+ logger.info('Check done')
+ finally:
+ running.release()
+
+
+def _run_with_delay():
+ every = _get_every()
+ delay = random.randint(0, every[1] - every[0])
+ logger.debug('Start checker in %i seconds', delay)
+ time.sleep(delay)
+ run()
+
+
+def _start_scheduling():
+ every = _get_every()
+ schedule(every[0], _run_with_delay)
+ run()
+
+
+def _signal_handler(signum, frame):
+ t = threading.Thread(target=run)
+ t.daemon = True
+ t.start()
+
+
+def initialize():
+ logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid())
+ signal.signal(signal.SIGUSR1, _signal_handler)
+
+ # special case when debug is activated
+ if searx_debug and settings.get('checker', {}).get('off_when_debug', True):
+ logger.info('debug mode: checker is disabled')
+ return
+
+ # check value of checker.scheduling.every now
+ scheduling = settings.get('checker', {}).get('scheduling', None)
+ if scheduling is None or not scheduling:
+ logger.info('Checker scheduler is disabled')
+ return
+
+ # schedule the first run after a random delay taken from start_after
+ start_after = scheduling.get('start_after', (300, 1800))
+ start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list')
+ delay = random.randint(start_after[0], start_after[1])
+ logger.info('Start checker in %i seconds', delay)
+ t = threading.Timer(delay, _start_scheduling)
+ t.daemon = True
+ t.start()
diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py
index f55b6d0f5..abef5f8e9 100644
--- a/searx/search/checker/impl.py
+++ b/searx/search/checker/impl.py
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
import typing
import types
import functools
@@ -11,7 +13,7 @@ import requests.exceptions
from searx import poolrequests, logger
from searx.results import ResultContainer
-from searx.search import SearchQuery, EngineRef
+from searx.search.models import SearchQuery, EngineRef
from searx.search.processors import EngineProcessor
@@ -240,18 +242,24 @@ class ResultContainerTests:
self._check_infoboxes(self.result_container.infoboxes)
def has_infobox(self):
+ """Check the ResultContainer has at least one infobox"""
if len(self.result_container.infoboxes) == 0:
self._record_error('No infobox')
def has_answer(self):
+ """Check the ResultContainer has at least one answer"""
if len(self.result_container.answers) == 0:
self._record_error('No answer')
def has_language(self, lang):
+ """Check at least one title or content of the results is written in `lang`.
+
+ Detected using pycld3, may not be accurate"""
if lang not in self.languages:
self._record_error(lang + ' not found')
def not_empty(self):
+ """Check the ResultContainer has at least one answer or infobox or result"""
result_types = set()
results = self.result_container.get_ordered_results()
if len(results) > 0:
@@ -267,6 +275,7 @@ class ResultContainerTests:
self._record_error('No result')
def one_title_contains(self, title: str):
+ """Check that at least one title contains `title` (case-insensitive comparison)"""
title = title.lower()
for result in self.result_container.get_ordered_results():
if title in result['title'].lower():
@@ -287,6 +296,7 @@ class CheckerTests:
self.result_container_tests_list = result_container_tests_list
def unique_results(self):
+ """Check the results of each ResultContainer are unique"""
urls_list = [rct.result_urls for rct in self.result_container_tests_list]
if len(urls_list[0]) > 0:
# results on the first page
diff --git a/searx/search/models.py b/searx/search/models.py
new file mode 100644
index 000000000..80ceaa223
--- /dev/null
+++ b/searx/search/models.py
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+import typing
+
+
+class EngineRef:
+
+ __slots__ = 'name', 'category'
+
+ def __init__(self, name: str, category: str):
+ self.name = name
+ self.category = category
+
+ def __repr__(self):
+ return "EngineRef({!r}, {!r})".format(self.name, self.category)
+
+ def __eq__(self, other):
+ return self.name == other.name and self.category == other.category
+
+ def __hash__(self):
+ return hash((self.name, self.category))
+
+
+class SearchQuery:
+ """container for all the search parameters (query, language, etc...)"""
+
+ __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\
+ 'timeout_limit', 'external_bang'
+
+ def __init__(self,
+ query: str,
+ engineref_list: typing.List[EngineRef],
+ lang: str='all',
+ safesearch: int=0,
+ pageno: int=1,
+ time_range: typing.Optional[str]=None,
+ timeout_limit: typing.Optional[float]=None,
+ external_bang: typing.Optional[str]=None):
+ self.query = query
+ self.engineref_list = engineref_list
+ self.lang = lang
+ self.safesearch = safesearch
+ self.pageno = pageno
+ self.time_range = time_range
+ self.timeout_limit = timeout_limit
+ self.external_bang = external_bang
+
+ @property
+ def categories(self):
+ return list(set(map(lambda engineref: engineref.category, self.engineref_list)))
+
+ def __repr__(self):
+ return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\
+ format(self.query, self.engineref_list, self.lang, self.safesearch,
+ self.pageno, self.time_range, self.timeout_limit, self.external_bang)
+
+ def __eq__(self, other):
+ return self.query == other.query\
+ and self.engineref_list == other.engineref_list\
+ and self.lang == other.lang\
+ and self.safesearch == other.safesearch\
+ and self.pageno == other.pageno\
+ and self.time_range == other.time_range\
+ and self.timeout_limit == other.timeout_limit\
+ and self.external_bang == other.external_bang
+
+ def __hash__(self):
+ return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range,
+ self.timeout_limit, self.external_bang))
diff --git a/searx/settings.yml b/searx/settings.yml
index 3094fc7a7..55c9849c1 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -102,24 +102,33 @@ outgoing: # communication with search engines
# - "HTTPS rewrite"
# - ...
-additional_tests:
- rosebud: &test_rosebud
- matrix:
- query: rosebud
- lang: en
- result_container:
- - not_empty
- - [one_title_contains', 'citizen kane']
- test:
- - unique_results
-
-tests:
- infobox: &tests_infobox
- infobox:
- matrix:
- query: ["linux", "new york", "bbc"]
- result_container:
- - has_infobox
+checker:
+ # disable checker when in debug mode
+ off_when_debug: True
+ # scheduling: each value is an interval [min, max] or a single int, in seconds
+ # use "scheduling: False" to disable scheduling
+ scheduling:
+ start_after: [300, 1800] # delay to start the first run of the checker
+ every: [86400, 90000] # how often the checker runs
+ # additional tests: only for the YAML anchors (see the engines section)
+ additional_tests:
+ rosebud: &test_rosebud
+ matrix:
+ query: rosebud
+ lang: en
+ result_container:
+ - not_empty
+ - ['one_title_contains', 'citizen kane']
+ test:
+ - unique_results
+ # tests: only for the YAML anchors (see the engines section)
+ tests:
+ infobox: &tests_infobox
+ infobox:
+ matrix:
+ query: ["linux", "new york", "bbc"]
+ result_container:
+ - has_infobox
engines:
- name: apk mirror
diff --git a/searx/webapp.py b/searx/webapp.py
index 10f4ce78c..985eced18 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -71,7 +71,8 @@ from searx.webadapter import get_search_query_from_webapp, get_selected_categori
from searx.utils import html_to_text, gen_useragent, dict_subset, match_language
from searx.version import VERSION_STRING
from searx.languages import language_codes as languages
-from searx.search import SearchWithPlugins, initialize
+from searx.search import SearchWithPlugins, initialize as search_initialize
+from searx.search.checker import get_result as checker_get_result
from searx.query import RawTextQuery
from searx.autocomplete import searx_bang, backends as autocomplete_backends
from searx.plugins import plugins
@@ -81,7 +82,6 @@ from searx.answerers import answerers
from searx.poolrequests import get_global_proxies
from searx.metrology.error_recorder import errors_per_engines
-
# serve pages with HTTP/1.1
from werkzeug.serving import WSGIRequestHandler
WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0'))
@@ -136,7 +136,7 @@ werkzeug_reloader = flask_run_development or (searx_debug and __name__ == "__mai
# initialize the engines except on the first run of the werkzeug server.
if not werkzeug_reloader\
or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_MAIN") == "true"):
- initialize()
+ search_initialize(enable_checker=True)
babel = Babel(app)
@@ -977,6 +977,12 @@ def stats_errors():
return jsonify(result)
+@app.route('/stats/checker', methods=['GET'])
+def stats_checker():
+ result = checker_get_result()
+ return jsonify(result)
+
+
@app.route('/robots.txt', methods=['GET'])
def robots():
return Response("""User-agent: *