author    Alexandre Flament <alex@al-f.net>  2020-12-09 21:23:20 +0100
committer Alexandre Flament <alex@al-f.net>  2020-12-11 14:37:08 +0100
commit    d703119d3a313a406482b121ee94c6afee3bc307 (patch)
tree      7834dc899b99db4ea3f9f81542e8e029bf5b7d04
parent    033f39bff7b3365256491014140e35aa1e974d4e (diff)
download  searxng-d703119d3a313a406482b121ee94c6afee3bc307.tar.gz
          searxng-d703119d3a313a406482b121ee94c6afee3bc307.zip
[enh] add raise_for_httperror
check HTTP response:

* detect some common CAPTCHA challenges (no solving). In this case the engine is suspended for a long time.
* otherwise raise HTTPError as before

The check is done in poolrequests.py (it was previously done in search.py).

Update qwant, wikipedia and wikidata to use raise_for_httperror instead of raise_for_status.
-rw-r--r--docs/dev/engine_overview.rst26
-rw-r--r--searx/engines/__init__.py8
-rw-r--r--searx/engines/qwant.py27
-rw-r--r--searx/engines/wikidata.py3
-rw-r--r--searx/engines/wikipedia.py4
-rw-r--r--searx/exceptions.py29
-rw-r--r--searx/metrology/error_recorder.py7
-rw-r--r--searx/poolrequests.py11
-rw-r--r--searx/raise_for_httperror.py66
-rw-r--r--searx/search.py36
-rw-r--r--searx/settings.yml18
11 files changed, 179 insertions, 56 deletions
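Before the per-file diffs, a minimal sketch of the two patterns this commit enables. The engine URL is hypothetical; the real engine changes follow below.

from urllib.parse import quote
from searx.raise_for_httperror import raise_for_httperror

def request(query, params):
    # pattern 1: do nothing -- raise_for_httperror defaults to True in
    # default_request_params(), so poolrequests.py checks the response
    params['url'] = 'https://example.org/search?q=' + quote(query)  # hypothetical URL
    return params

def response(resp):
    # pattern 2: an engine that opted out with
    # params['raise_for_httperror'] = False (as wikipedia does below)
    # tolerates some status codes, then runs the check by hand
    if resp.status_code == 404:
        return []
    raise_for_httperror(resp)
    return []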
diff --git a/docs/dev/engine_overview.rst b/docs/dev/engine_overview.rst
index 99726a456..3562ca61a 100644
--- a/docs/dev/engine_overview.rst
+++ b/docs/dev/engine_overview.rst
@@ -134,19 +134,19 @@ The function ``def request(query, params):`` always returns the ``params``
variable. Inside searx, the following parameters can be used to specify a search
request:
-================== =========== ==========================================================================
-argument type information
-================== =========== ==========================================================================
-url string requested url
-method string HTTP request method
-headers set HTTP header information
-data set HTTP data information (parsed if ``method != 'GET'``)
-cookies set HTTP cookies
-verify boolean Performing SSL-Validity check
-max_redirects int maximum redirects, hard limit
-soft_max_redirects int maximum redirects, soft limit. Record an error but don't stop the engine
-raise_for_status bool True by default: raise an exception if the HTTP code of response is >= 300
-================== =========== ==========================================================================
+=================== =========== ==========================================================================
+argument type information
+=================== =========== ==========================================================================
+url string requested url
+method string HTTP request method
+headers set HTTP header information
+data set HTTP data information (parsed if ``method != 'GET'``)
+cookies set HTTP cookies
+verify boolean Performing SSL-Validity check
+max_redirects int maximum redirects, hard limit
+soft_max_redirects int maximum redirects, soft limit. Record an error but don't stop the engine
+raise_for_httperror bool True by default: raise an exception if the HTTP code of response is >= 300
+=================== =========== ==========================================================================
example code
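A minimal request() sketch using the parameters documented in the table above. The engine and base_url are illustrative assumptions, not part of this commit:

from urllib.parse import urlencode

base_url = 'https://example.org/api/search'  # hypothetical

def request(query, params):
    params['url'] = base_url + '?' + urlencode({'q': query})
    params['method'] = 'GET'
    params['headers']['Accept'] = 'application/json'
    params['soft_max_redirects'] = 2
    # opt out of the automatic check; response() must then call
    # searx.raise_for_httperror.raise_for_httperror(resp) itself
    params['raise_for_httperror'] = False
    return params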
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 0b77f2a95..b2a9b25a4 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -281,8 +281,12 @@ def initialize_engines(engine_list):
load_engines(engine_list)
def engine_init(engine_name, init_fn):
- init_fn(get_engine_from_settings(engine_name))
- logger.debug('%s engine: Initialized', engine_name)
+ try:
+ init_fn(get_engine_from_settings(engine_name))
+ except Exception:
+ logger.exception('%s engine: Failed to initialize', engine_name)
+ else:
+ logger.debug('%s engine: Initialized', engine_name)
for engine_name, engine in engines.items():
if hasattr(engine, 'init'):
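With this change, an exception raised by an engine's init hook is logged rather than aborting startup. A hypothetical engine-side hook that could trigger the new except branch:

def init(engine_settings):
    # raising here is now caught and logged by initialize_engines()
    # instead of crashing the whole startup ('api_key' is a
    # hypothetical setting, used only for illustration)
    if not engine_settings.get('api_key'):
        raise ValueError('missing api_key')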
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py
index c909ce11b..b785719d9 100644
--- a/searx/engines/qwant.py
+++ b/searx/engines/qwant.py
@@ -14,6 +14,8 @@ from datetime import datetime
from json import loads
from urllib.parse import urlencode
from searx.utils import html_to_text, match_language
+from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException
+from searx.raise_for_httperror import raise_for_httperror
# engine dependent config
@@ -24,8 +26,7 @@ supported_languages_url = 'https://qwant.com/region'
category_to_keyword = {'general': 'web',
'images': 'images',
- 'news': 'news',
- 'social media': 'social'}
+ 'news': 'news'}
# search-url
url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4'
@@ -51,6 +52,7 @@ def request(query, params):
params['url'] += '&locale=' + language.replace('-', '_').lower()
params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
+ params['raise_for_httperror'] = False
return params
@@ -58,8 +60,20 @@ def request(query, params):
def response(resp):
results = []
+ # According to https://www.qwant.com/js/app.js
+ if resp.status_code == 429:
+ raise SearxEngineCaptchaException()
+
+ # raise for other errors
+ raise_for_httperror(resp)
+
+ # load JSON result
search_results = loads(resp.text)
+ # check for an API error
+ if search_results.get('status') != 'success':
+ raise SearxEngineAPIException('API error ' + str(search_results.get('error', '')))
+
# return empty array if there are no results
if 'data' not in search_results:
return []
@@ -90,15 +104,6 @@ def response(resp):
'thumbnail_src': thumbnail_src,
'img_src': img_src})
- elif category_to_keyword.get(categories[0], '') == 'social':
- published_date = datetime.fromtimestamp(result['date'], None)
- img_src = result.get('img', None)
- results.append({'url': res_url,
- 'title': title,
- 'publishedDate': published_date,
- 'content': content,
- 'img_src': img_src})
-
elif category_to_keyword.get(categories[0], '') == 'news':
published_date = datetime.fromtimestamp(result['date'], None)
media = result.get('media', [])
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index 60d0dc9a0..8d787caac 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -161,9 +161,6 @@ def request(query, params):
def response(resp):
results = []
- if resp.status_code != 200:
- logger.debug('SPARQL endpoint error %s', resp.content.decode())
- resp.raise_for_status()
jsonresponse = loads(resp.content.decode())
language = resp.search_params['language'].lower()
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index 000e1af76..54d75108e 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -14,6 +14,7 @@ from urllib.parse import quote
from json import loads
from lxml.html import fromstring
from searx.utils import match_language, searx_useragent
+from searx.raise_for_httperror import raise_for_httperror
# search-url
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
@@ -37,7 +38,7 @@ def request(query, params):
language=url_lang(params['language']))
params['headers']['User-Agent'] = searx_useragent()
- params['raise_for_status'] = False
+ params['raise_for_httperror'] = False
params['soft_max_redirects'] = 2
return params
@@ -47,6 +48,7 @@ def request(query, params):
def response(resp):
if resp.status_code == 404:
return []
+ raise_for_httperror(resp)
results = []
api_result = loads(resp.text)
diff --git a/searx/exceptions.py b/searx/exceptions.py
index 82c1d76dc..67a282da2 100644
--- a/searx/exceptions.py
+++ b/searx/exceptions.py
@@ -64,8 +64,33 @@ class SearxEngineAPIException(SearxEngineResponseException):
"""The website has returned an application error"""
-class SearxEngineCaptchaException(SearxEngineResponseException):
- """The website has returned a CAPTCHA"""
+class SearxEngineAccessDeniedException(SearxEngineResponseException):
+ """The website is blocking the access"""
+
+ def __init__(self, suspended_time=24 * 3600, message='Access denied'):
+ super().__init__(message + ', suspended_time=' + str(suspended_time))
+ self.suspended_time = suspended_time
+ self.message = message
+
+
+class SearxEngineCaptchaException(SearxEngineAccessDeniedException):
+ """The website has returned a CAPTCHA
+
+ By default, searx stops sending requests to this engine for 1 day.
+ """
+
+ def __init__(self, suspended_time=24 * 3600, message='CAPTCHA'):
+ super().__init__(message=message, suspended_time=suspended_time)
+
+
+class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException):
+ """The website has returned a Too Many Request status code
+
+ By default, searx stops sending requests to this engine for 1 hour.
+ """
+
+ def __init__(self, suspended_time=3600, message='Too many requests'):
+ super().__init__(message=message, suspended_time=suspended_time)
class SearxEngineXPathException(SearxEngineResponseException):
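Because both new exceptions inherit from SearxEngineAccessDeniedException, a caller can catch the specific cases first and read suspended_time off the exception. A sketch of the intended consumption (not part of the commit; see the search.py hunk below for the real call site):

from searx.exceptions import (SearxEngineAccessDeniedException,
                              SearxEngineCaptchaException,
                              SearxEngineTooManyRequestsException)

def classify(run):
    try:
        run()
    except SearxEngineCaptchaException as e:          # most specific first
        return 'CAPTCHA required', e.suspended_time   # default: 24 * 3600
    except SearxEngineTooManyRequestsException as e:
        return 'too many requests', e.suspended_time  # default: 3600
    except SearxEngineAccessDeniedException as e:
        return 'blocked', e.suspended_time            # default: 24 * 3600
    return 'ok', None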
diff --git a/searx/metrology/error_recorder.py b/searx/metrology/error_recorder.py
index 4b67235e1..fee1ef7d6 100644
--- a/searx/metrology/error_recorder.py
+++ b/searx/metrology/error_recorder.py
@@ -4,7 +4,8 @@ import logging
from json import JSONDecodeError
from urllib.parse import urlparse
from requests.exceptions import RequestException
-from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
+from searx.exceptions import (SearxXPathSyntaxException, SearxEngineXPathException, SearxEngineAPIException,
+ SearxEngineAccessDeniedException)
from searx import logger
@@ -100,6 +101,10 @@ def get_messages(exc, filename) -> typing.Tuple:
return (exc.xpath_str, exc.message)
if isinstance(exc, SearxEngineXPathException):
return (exc.xpath_str, exc.message)
+ if isinstance(exc, SearxEngineAPIException):
+ return (str(exc.args[0]), )
+ if isinstance(exc, SearxEngineAccessDeniedException):
+ return (exc.message, )
return ()
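The two new branches turn the exception into the tuple stored with the recorded error; for instance (hypothetical call):

from searx.exceptions import SearxEngineAccessDeniedException

exc = SearxEngineAccessDeniedException(message='Access denied')
# get_messages(exc, filename) now returns ('Access denied',)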
diff --git a/searx/poolrequests.py b/searx/poolrequests.py
index 1eedc84b8..25a6baed9 100644
--- a/searx/poolrequests.py
+++ b/searx/poolrequests.py
@@ -7,6 +7,7 @@ import requests
from searx import settings
from searx import logger
+from searx.raise_for_httperror import raise_for_httperror
logger = logger.getChild('poolrequests')
@@ -156,6 +157,12 @@ def request(method, url, **kwargs):
if timeout is not None:
kwargs['timeout'] = timeout
+ # raise_for_httperror
+ check_for_httperror = True
+ if 'raise_for_httperror' in kwargs:
+ check_for_httperror = kwargs['raise_for_httperror']
+ del kwargs['raise_for_httperror']
+
# do request
response = session.request(method=method, url=url, **kwargs)
@@ -176,6 +183,10 @@ def request(method, url, **kwargs):
if hasattr(threadLocal, 'total_time'):
threadLocal.total_time += time_after_request - time_before_request
+ # raise an exception
+ if check_for_httperror:
+ raise_for_httperror(response)
+
return response
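The kwarg is popped before the session call, so it never reaches requests.Session.request(). A hypothetical caller, assuming the module-level get() wrapper around request():

from searx import poolrequests

# checked: an error response raises (requests.HTTPError, or one of the
# searx exceptions when a CAPTCHA / rate limit / block is detected)
resp = poolrequests.get('https://example.org/api')

# unchecked: the engine inspects resp.status_code itself
resp = poolrequests.get('https://example.org/api', raise_for_httperror=False)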
diff --git a/searx/raise_for_httperror.py b/searx/raise_for_httperror.py
new file mode 100644
index 000000000..bd12df9a9
--- /dev/null
+++ b/searx/raise_for_httperror.py
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+Raise an exception when an HTTP response is an error.
+"""
+from searx.exceptions import (SearxEngineCaptchaException, SearxEngineTooManyRequestsException,
+ SearxEngineAccessDeniedException)
+
+
+def is_cloudflare_challenge(resp):
+ if resp.status_code in [429, 503]:
+ if ('__cf_chl_jschl_tk__=' in resp.text)\
+ or ('/cdn-cgi/challenge-platform/' in resp.text
+ and 'orchestrate/jsch/v1' in resp.text
+ and 'window._cf_chl_enter(' in resp.text):
+ return True
+ if resp.status_code == 403 and '__cf_chl_captcha_tk__=' in resp.text:
+ return True
+ return False
+
+
+def is_cloudflare_firewall(resp):
+ return resp.status_code == 403 and '<span class="cf-error-code">1020</span>' in resp.text
+
+
+def raise_for_cloudflare_captcha(resp):
+ if resp.headers.get('Server', '').startswith('cloudflare'):
+ if is_cloudflare_challenge(resp):
+ # https://support.cloudflare.com/hc/en-us/articles/200170136-Understanding-Cloudflare-Challenge-Passage-Captcha-
+ # suspend for 2 weeks
+ raise SearxEngineCaptchaException(message='Cloudflare CAPTCHA', suspended_time=3600 * 24 * 15)
+
+ if is_cloudflare_firewall(resp):
+ raise SearxEngineAccessDeniedException(message='Cloudflare Firewall', suspended_time=3600 * 24)
+
+
+def raise_for_recaptcha(resp):
+ if resp.status_code == 503 \
+ and '"https://www.google.com/recaptcha/' in resp.text:
+ raise SearxEngineCaptchaException(message='ReCAPTCHA', suspended_time=3600 * 24 * 7)
+
+
+def raise_for_captcha(resp):
+ raise_for_cloudflare_captcha(resp)
+ raise_for_recaptcha(resp)
+
+
+def raise_for_httperror(resp):
+ """Raise exception for an HTTP response is an error.
+
+ Args:
+ resp (requests.Response): Response to check
+
+ Raises:
+ requests.HTTPError: raised by resp.raise_for_status()
+ searx.exceptions.SearxEngineAccessDeniedException: raised when the HTTP status code is 402 or 403.
+ searx.exceptions.SearxEngineTooManyRequestsException: raised when the HTTP status code is 429.
+ searx.exceptions.SearxEngineCaptchaException: raised when a CAPTCHA challenge is detected.
+ """
+ if resp.status_code and resp.status_code >= 400:
+ raise_for_captcha(resp)
+ if resp.status_code in (402, 403):
+ raise SearxEngineAccessDeniedException(message='HTTP error ' + str(resp.status_code),
+ suspended_time=3600 * 24)
+ if resp.status_code == 429:
+ raise SearxEngineTooManyRequestsException()
+ resp.raise_for_status()
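A usage sketch for the new module (the endpoint is hypothetical). Catching SearxEngineAccessDeniedException also covers the CAPTCHA and too-many-requests subclasses, while a plain requests.HTTPError still propagates:

import requests
from searx.exceptions import SearxEngineAccessDeniedException
from searx.raise_for_httperror import raise_for_httperror

resp = requests.get('https://example.org/api')  # hypothetical endpoint
try:
    raise_for_httperror(resp)
except SearxEngineAccessDeniedException as e:
    print('suspend the engine for', e.suspended_time, 'seconds')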
diff --git a/searx/search.py b/searx/search.py
index 8c2ad8d72..220950803 100644
--- a/searx/search.py
+++ b/searx/search.py
@@ -32,7 +32,8 @@ from searx.utils import gen_useragent
from searx.results import ResultContainer
from searx import logger
from searx.plugins import plugins
-from searx.exceptions import SearxEngineCaptchaException
+from searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException,
+ SearxEngineTooManyRequestsException,)
from searx.metrology.error_recorder import record_exception, record_error
@@ -131,6 +132,9 @@ def send_http_request(engine, request_params):
# soft_max_redirects
soft_max_redirects = request_params.get('soft_max_redirects', max_redirects or 0)
+ # raise_for_httperror
+ request_args['raise_for_httperror'] = request_params.get('raise_for_httperror', False)
+
# specific type of request (GET or POST)
if request_params['method'] == 'GET':
req = requests_lib.get
@@ -142,10 +146,6 @@ def send_http_request(engine, request_params):
# send the request
response = req(request_params['url'], **request_args)
- # check HTTP status
- if request_params.get('raise_for_status'):
- response.raise_for_status()
-
# check soft limit of the redirect count
if len(response.history) > soft_max_redirects:
# unexpected redirect : record an error
@@ -191,6 +191,7 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
# suppose everything will be alright
requests_exception = False
+ suspended_time = None
try:
# send requests and parse the results
@@ -240,6 +241,15 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
elif (issubclass(e.__class__, SearxEngineCaptchaException)):
result_container.add_unresponsive_engine(engine_name, 'CAPTCHA required')
logger.exception('engine {0} : CAPTCHA')
+ suspended_time = e.suspended_time # pylint: disable=no-member
+ elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)):
+ result_container.add_unresponsive_engine(engine_name, 'too many requests')
+ logger.exception('engine {0} : Too many requests'.format(engine_name))
+ suspended_time = e.suspended_time # pylint: disable=no-member
+ elif (issubclass(e.__class__, SearxEngineAccessDeniedException)):
+ result_container.add_unresponsive_engine(engine_name, 'blocked')
+ logger.exception('engine {0} : Searx is blocked'.format(engine_name))
+ suspended_time = e.suspended_time # pylint: disable=no-member
else:
result_container.add_unresponsive_engine(engine_name, 'unexpected crash')
# others errors
@@ -248,16 +258,18 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
if getattr(threading.current_thread(), '_timeout', False):
record_error(engine_name, 'Timeout')
- # suspend or not the engine if there are HTTP errors
+ # suspend the engine if there is an HTTP error
+ # or suspended_time is defined
with threading.RLock():
- if requests_exception:
+ if requests_exception or suspended_time:
# update continuous_errors / suspend_end_time
engine.continuous_errors += 1
- engine.suspend_end_time = time() + min(settings['search']['max_ban_time_on_fail'],
- engine.continuous_errors * settings['search']['ban_time_on_fail'])
+ if suspended_time is None:
+ suspended_time = min(settings['search']['max_ban_time_on_fail'],
+ engine.continuous_errors * settings['search']['ban_time_on_fail'])
+ engine.suspend_end_time = time() + suspended_time
else:
- # no HTTP error (perhaps an engine error)
- # anyway, reset the suspend variables
+ # reset the suspend variables
engine.continuous_errors = 0
engine.suspend_end_time = 0
@@ -342,7 +354,7 @@ def default_request_params():
'cookies': {},
'verify': True,
'auth': None,
- 'raise_for_status': True
+ 'raise_for_httperror': True
}
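When suspended_time is not supplied by the exception, the backoff above grows linearly with continuous_errors up to a cap. A worked example with ban_time_on_fail=5 and max_ban_time_on_fail=120, the values assumed here to match the shipped settings.yml:

ban_time_on_fail = 5        # seconds added per consecutive error
max_ban_time_on_fail = 120  # hard cap

for continuous_errors in (1, 2, 10, 30):
    suspended_time = min(max_ban_time_on_fail,
                         continuous_errors * ban_time_on_fail)
    print(continuous_errors, '->', suspended_time)
# 1 -> 5, 2 -> 10, 10 -> 50, 30 -> 120 (capped)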
diff --git a/searx/settings.yml b/searx/settings.yml
index 132bf620b..3ba9b745f 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -647,11 +647,6 @@ engines:
shortcut : qwn
categories : news
- - name : qwant social
- engine : qwant
- shortcut : qws
- categories : social media
-
# - name: library
# engine: recoll
# shortcut: lib
@@ -817,12 +812,13 @@ engines:
# Or you can use the html non-stable engine, activated by default
engine : youtube_noapi
- - name : yggtorrent
- engine : yggtorrent
- shortcut : ygg
- url: https://www2.yggtorrent.si/
- disabled : True
- timeout : 4.0
+ # tmp suspended: Cloudflare CAPTCHA
+ #- name : yggtorrent
+ # engine : yggtorrent
+ # shortcut : ygg
+ # url: https://www2.yggtorrent.si/
+ # disabled : True
+ # timeout : 4.0
- name : dailymotion
engine : dailymotion