summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexandre Flament <alex@al-f.net>2021-03-18 19:59:01 +0100
committerAlexandre Flament <alex@al-f.net>2021-04-10 15:38:33 +0200
commiteaa694fb7d0e47b943bc6d6edb6cb6a40ab2d85e (patch)
tree024786c8a7003be24bbc566cb8c8e734a143f99d
parent111180705b6f3b142732eb6325de1346f6372828 (diff)
downloadsearxng-eaa694fb7d0e47b943bc6d6edb6cb6a40ab2d85e.tar.gz
searxng-eaa694fb7d0e47b943bc6d6edb6cb6a40ab2d85e.zip
[enh] replace requests by httpx
-rw-r--r--requirements.txt7
-rw-r--r--searx/autocomplete.py5
-rw-r--r--searx/engines/dictzone.py2
-rw-r--r--searx/engines/elasticsearch.py3
-rw-r--r--searx/engines/google.py5
-rw-r--r--searx/engines/seznam.py5
-rw-r--r--searx/engines/spotify.py2
-rw-r--r--searx/engines/stackoverflow.py5
-rw-r--r--searx/engines/yacy.py4
-rw-r--r--searx/metrology/error_recorder.py18
-rw-r--r--searx/poolrequests.py513
-rw-r--r--searx/search/checker/impl.py6
-rw-r--r--searx/search/processors/online.py20
-rw-r--r--searx/settings.yml8
-rw-r--r--searx/utils.py2
-rwxr-xr-xsearx/webapp.py93
-rwxr-xr-xsearx_extra/update/update_external_bangs.py6
-rw-r--r--tests/unit/test_poolrequests.py27
18 files changed, 527 insertions, 204 deletions
diff --git a/requirements.txt b/requirements.txt
index bfbcecc51..cc3235d01 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,5 +8,10 @@ lxml==4.6.3
pygments==2.8.0
python-dateutil==2.8.1
pyyaml==5.4.1
-requests[socks]==2.25.1
+httpx[http2]==0.17.1
+Brotli==1.0.9
+uvloop==0.15.2; python_version >= '3.7'
+uvloop==0.14.0; python_version < '3.7'
+httpx-socks[asyncio]==0.3.1
langdetect==1.0.8
+setproctitle==1.2.2
diff --git a/searx/autocomplete.py b/searx/autocomplete.py
index 75992a1d8..1fe3afac7 100644
--- a/searx/autocomplete.py
+++ b/searx/autocomplete.py
@@ -20,7 +20,8 @@ from lxml import etree
from json import loads
from urllib.parse import urlencode
-from requests import RequestException
+from httpx import HTTPError
+
from searx import settings
from searx.poolrequests import get as http_get
@@ -136,5 +137,5 @@ def search_autocomplete(backend_name, query, lang):
try:
return backend(query, lang)
- except (RequestException, SearxEngineResponseException):
+ except (HTTPError, SearxEngineResponseException):
return []
diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py
index 2483c0805..eaa8b6ab4 100644
--- a/searx/engines/dictzone.py
+++ b/searx/engines/dictzone.py
@@ -52,7 +52,7 @@ def response(resp):
to_results.append(to_result.text_content())
results.append({
- 'url': urljoin(resp.url, '?%d' % k),
+ 'url': urljoin(str(resp.url), '?%d' % k),
'title': from_result.text_content(),
'content': '; '.join(to_results)
})
diff --git a/searx/engines/elasticsearch.py b/searx/engines/elasticsearch.py
index da7f98074..db84a5c13 100644
--- a/searx/engines/elasticsearch.py
+++ b/searx/engines/elasticsearch.py
@@ -4,7 +4,6 @@
"""
from json import loads, dumps
-from requests.auth import HTTPBasicAuth
from searx.exceptions import SearxEngineAPIException
@@ -32,7 +31,7 @@ def request(query, params):
return params
if username and password:
- params['auth'] = HTTPBasicAuth(username, password)
+ params['auth'] = (username, password)
params['url'] = search_url
params['method'] = 'GET'
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 8c20029a3..dcb65df57 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -10,7 +10,7 @@ Definitions`_.
# pylint: disable=invalid-name, missing-function-docstring
-from urllib.parse import urlencode, urlparse
+from urllib.parse import urlencode
from lxml import html
from searx import logger
from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
@@ -186,8 +186,7 @@ def get_lang_info(params, lang_list, custom_aliases):
return ret_val
def detect_google_sorry(resp):
- resp_url = urlparse(resp.url)
- if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
+ if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
raise SearxEngineCaptchaException()
diff --git a/searx/engines/seznam.py b/searx/engines/seznam.py
index faceb0550..c058ebb76 100644
--- a/searx/engines/seznam.py
+++ b/searx/engines/seznam.py
@@ -3,7 +3,7 @@
Seznam
"""
-from urllib.parse import urlencode, urlparse
+from urllib.parse import urlencode
from lxml import html
from searx.poolrequests import get
from searx.exceptions import SearxEngineAccessDeniedException
@@ -46,8 +46,7 @@ def request(query, params):
def response(resp):
- resp_url = urlparse(resp.url)
- if resp_url.path.startswith('/verify'):
+ if resp.url.path.startswith('/verify'):
raise SearxEngineAccessDeniedException()
results = []
diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py
index 0ad8bfe32..fbabec0bb 100644
--- a/searx/engines/spotify.py
+++ b/searx/engines/spotify.py
@@ -5,7 +5,7 @@
from json import loads
from urllib.parse import urlencode
-import requests
+import searx.poolrequests as requests
import base64
# about
diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py
index 91eaa68e9..8fc2cdb3a 100644
--- a/searx/engines/stackoverflow.py
+++ b/searx/engines/stackoverflow.py
@@ -3,7 +3,7 @@
Stackoverflow (IT)
"""
-from urllib.parse import urlencode, urljoin, urlparse
+from urllib.parse import urlencode, urljoin
from lxml import html
from searx.utils import extract_text
from searx.exceptions import SearxEngineCaptchaException
@@ -41,8 +41,7 @@ def request(query, params):
# get response from search-request
def response(resp):
- resp_url = urlparse(resp.url)
- if resp_url.path.startswith('/nocaptcha'):
+ if resp.url.path.startswith('/nocaptcha'):
raise SearxEngineCaptchaException()
results = []
diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py
index c194ca451..fbd99c47b 100644
--- a/searx/engines/yacy.py
+++ b/searx/engines/yacy.py
@@ -7,7 +7,7 @@ from json import loads
from dateutil import parser
from urllib.parse import urlencode
-from requests.auth import HTTPDigestAuth
+from httpx import DigestAuth
from searx.utils import html_to_text
@@ -56,7 +56,7 @@ def request(query, params):
search_type=search_type)
if http_digest_auth_user and http_digest_auth_pass:
- params['auth'] = HTTPDigestAuth(http_digest_auth_user, http_digest_auth_pass)
+ params['auth'] = DigestAuth(http_digest_auth_user, http_digest_auth_pass)
# add language tag if specified
if params['language'] != 'all':
diff --git a/searx/metrology/error_recorder.py b/searx/metrology/error_recorder.py
index f533e4e8b..167d1c8aa 100644
--- a/searx/metrology/error_recorder.py
+++ b/searx/metrology/error_recorder.py
@@ -3,7 +3,7 @@ import inspect
import logging
from json import JSONDecodeError
from urllib.parse import urlparse
-from requests.exceptions import RequestException
+from httpx import HTTPError, HTTPStatusError
from searx.exceptions import (SearxXPathSyntaxException, SearxEngineXPathException, SearxEngineAPIException,
SearxEngineAccessDeniedException)
from searx import logger
@@ -60,28 +60,28 @@ def get_trace(traces):
return traces[-1]
-def get_hostname(exc: RequestException) -> typing.Optional[None]:
+def get_hostname(exc: HTTPError) -> typing.Optional[None]:
url = exc.request.url
if url is None and exc.response is not None:
url = exc.response.url
return urlparse(url).netloc
-def get_request_exception_messages(exc: RequestException)\
+def get_request_exception_messages(exc: HTTPError)\
-> typing.Tuple[typing.Optional[str], typing.Optional[str], typing.Optional[str]]:
url = None
status_code = None
reason = None
hostname = None
- if exc.request is not None:
+ if hasattr(exc, 'request') and exc.request is not None:
url = exc.request.url
- if url is None and exc.response is not None:
+ if url is None and hasattr(exc, 'response') and exc.respones is not None:
url = exc.response.url
if url is not None:
- hostname = str(urlparse(url).netloc)
- if exc.response is not None:
+ hostname = url.host
+ if isinstance(exc, HTTPStatusError):
status_code = str(exc.response.status_code)
- reason = exc.response.reason
+ reason = exc.response.reason_phrase
return (status_code, reason, hostname)
@@ -92,7 +92,7 @@ def get_messages(exc, filename) -> typing.Tuple:
return (str(exc), )
if isinstance(exc, ValueError) and 'lxml' in filename:
return (str(exc), )
- if isinstance(exc, RequestException):
+ if isinstance(exc, HTTPError):
return get_request_exception_messages(exc)
if isinstance(exc, SearxXPathSyntaxException):
return (exc.xpath_str, exc.message)
diff --git a/searx/poolrequests.py b/searx/poolrequests.py
index ab327251b..8b365f913 100644
--- a/searx/poolrequests.py
+++ b/searx/poolrequests.py
@@ -1,14 +1,54 @@
+import atexit
import sys
+import threading
+import asyncio
+import logging
+import concurrent.futures
from time import time
from itertools import cycle
-from threading import local
-import requests
+import httpcore
+import httpx
+import h2.exceptions
+from httpx_socks import AsyncProxyTransport
+from python_socks import parse_proxy_url
+import python_socks._errors
from searx import settings
from searx import logger
from searx.raise_for_httperror import raise_for_httperror
+# Optional uvloop (support Python 3.6)
+try:
+ import uvloop
+except ImportError:
+ pass
+else:
+ uvloop.install()
+
+# queue.SimpleQueue: Support Python 3.6
+try:
+ from queue import SimpleQueue
+except ImportError:
+ from queue import Empty
+ from collections import deque
+
+ class SimpleQueue:
+ """Minimal backport of queue.SimpleQueue"""
+
+ def __init__(self):
+ self._queue = deque()
+ self._count = threading.Semaphore(0)
+
+ def put(self, item):
+ self._queue.append(item)
+ self._count.release()
+
+ def get(self, timeout=None):
+ if not self._count.acquire(True, timeout):
+ raise Empty
+ return self._queue.popleft()
+
logger = logger.getChild('poolrequests')
@@ -31,99 +71,63 @@ if not getattr(ssl, "HAS_SNI", False):
sys.exit(1)
-class HTTPAdapterWithConnParams(requests.adapters.HTTPAdapter):
-
- def __init__(self, pool_connections=requests.adapters.DEFAULT_POOLSIZE,
- pool_maxsize=requests.adapters.DEFAULT_POOLSIZE,
- max_retries=requests.adapters.DEFAULT_RETRIES,
- pool_block=requests.adapters.DEFAULT_POOLBLOCK,
- **conn_params):
- if max_retries == requests.adapters.DEFAULT_RETRIES:
- self.max_retries = requests.adapters.Retry(0, read=False)
- else:
- self.max_retries = requests.adapters.Retry.from_int(max_retries)
- self.config = {}
- self.proxy_manager = {}
-
- super().__init__()
-
- self._pool_connections = pool_connections
- self._pool_maxsize = pool_maxsize
- self._pool_block = pool_block
- self._conn_params = conn_params
-
- self.init_poolmanager(pool_connections, pool_maxsize, block=pool_block, **conn_params)
-
- def __setstate__(self, state):
- # Can't handle by adding 'proxy_manager' to self.__attrs__ because
- # because self.poolmanager uses a lambda function, which isn't pickleable.
- self.proxy_manager = {}
- self.config = {}
-
- for attr, value in state.items():
- setattr(self, attr, value)
+LOOP = None
+CLIENTS = dict()
+THREADLOCAL = threading.local()
+LIMITS = httpx.Limits(
+ # Magic number kept from previous code
+ max_connections=settings['outgoing'].get('pool_connections', 100),
+ # Picked from constructor
+ max_keepalive_connections=settings['outgoing'].get('pool_maxsize', 10),
+ #
+ keepalive_expiry=settings['outgoing'].get('keepalive_expiry', 5.0)
+)
+# default parameters for AsyncHTTPTransport
+# see https://github.com/encode/httpx/blob/e05a5372eb6172287458b37447c30f650047e1b8/httpx/_transports/default.py#L108-L121 # noqa
+TRANSPORT_KWARGS = {
+ 'http2': settings['outgoing'].get('http2', False),
+ 'retries': 0,
+ 'trust_env': False,
+ 'backend': 'asyncio'
+}
+# requests compatibility when reading proxy settings from settings.yml
+PROXY_PATTERN_MAPPING = {
+ 'http': 'https://',
+ 'https:': 'https://'
+}
+# default maximum redirect
+# from https://github.com/psf/requests/blob/8c211a96cdbe9fe320d63d9e1ae15c5c07e179f8/requests/models.py#L55
+DEFAULT_REDIRECT_LIMIT = 30
- self.init_poolmanager(self._pool_connections, self._pool_maxsize,
- block=self._pool_block, **self._conn_params)
-
-threadLocal = local()
-connect = settings['outgoing'].get('pool_connections', 100) # Magic number kept from previous code
-maxsize = settings['outgoing'].get('pool_maxsize', requests.adapters.DEFAULT_POOLSIZE) # Picked from constructor
if settings['outgoing'].get('source_ips'):
- http_adapters = cycle(HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize,
- source_address=(source_ip, 0))
- for source_ip in settings['outgoing']['source_ips'])
- https_adapters = cycle(HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize,
- source_address=(source_ip, 0))
- for source_ip in settings['outgoing']['source_ips'])
+ LOCAL_ADDRESS_CYCLE = cycle(settings['outgoing'].get('source_ips'))
else:
- http_adapters = cycle((HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize), ))
- https_adapters = cycle((HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize), ))
-
-
-class SessionSinglePool(requests.Session):
-
- def __init__(self):
- super().__init__()
-
- # reuse the same adapters
- self.adapters.clear()
-
- https_adapter = threadLocal.__dict__.setdefault('https_adapter', next(https_adapters))
- self.mount('https://', https_adapter)
- if get_enable_http_protocol():
- http_adapter = threadLocal.__dict__.setdefault('http_adapter', next(http_adapters))
- self.mount('http://', http_adapter)
-
- def close(self):
- """Call super, but clear adapters since there are managed globaly"""
- self.adapters.clear()
- super().close()
+ LOCAL_ADDRESS_CYCLE = cycle((None, ))
def set_timeout_for_thread(timeout, start_time=None):
- threadLocal.timeout = timeout
- threadLocal.start_time = start_time
+ THREADLOCAL.timeout = timeout
+ THREADLOCAL.start_time = start_time
def set_enable_http_protocol(enable_http):
- threadLocal.enable_http = enable_http
+ THREADLOCAL.enable_http = enable_http
def get_enable_http_protocol():
try:
- return threadLocal.enable_http
+ return THREADLOCAL.enable_http
except AttributeError:
return False
def reset_time_for_thread():
- threadLocal.total_time = 0
+ THREADLOCAL.total_time = 0
def get_time_for_thread():
- return threadLocal.total_time
+ return THREADLOCAL.total_time
def get_proxy_cycles(proxy_settings):
@@ -152,22 +156,197 @@ def get_global_proxies():
return get_proxies(GLOBAL_PROXY_CYCLES)
+async def close_connections_for_url(connection_pool: httpcore.AsyncConnectionPool, url: httpcore._utils.URL):
+ origin = httpcore._utils.url_to_origin(url)
+ logger.debug('Drop connections for %r', origin)
+ connections_to_close = connection_pool._connections_for_origin(origin)
+ for connection in connections_to_close:
+ await connection_pool._remove_from_pool(connection)
+ try:
+ await connection.aclose()
+ except httpcore.NetworkError as e:
+ logger.warning('Error closing an existing connection', exc_info=e)
+
+
+class AsyncHTTPTransportNoHttp(httpcore.AsyncHTTPTransport):
+ """Block HTTP request"""
+
+ async def arequest(self, method, url, headers=None, stream=None, ext=None):
+ raise httpcore.UnsupportedProtocol("HTTP protocol is disabled")
+
+
+class AsyncProxyTransportFixed(AsyncProxyTransport):
+ """Fix httpx_socks.AsyncProxyTransport
+
+ Map python_socks exceptions to httpcore.ProxyError
+
+ Map socket.gaierror to httpcore.ConnectError
+
+ Note: keepalive_expiry is ignored, AsyncProxyTransport should call:
+ * self._keepalive_sweep()
+ * self._response_closed(self, connection)
+
+ Note: AsyncProxyTransport inherit from AsyncConnectionPool
+
+ Note: the API is going to change on httpx 0.18.0
+ see https://github.com/encode/httpx/pull/1522
+ """
+
+ async def arequest(self, method, url, headers=None, stream=None, ext=None):
+ retry = 2
+ while retry > 0:
+ retry -= 1
+ try:
+ return await super().arequest(method, url, headers, stream, ext)
+ except (python_socks._errors.ProxyConnectionError,
+ python_socks._errors.ProxyTimeoutError,
+ python_socks._errors.ProxyError) as e:
+ raise httpcore.ProxyError(e)
+ except OSError as e:
+ # socket.gaierror when DNS resolution fails
+ raise httpcore.NetworkError(e)
+ except httpcore.RemoteProtocolError as e:
+ # in case of httpcore.RemoteProtocolError: Server disconnected
+ await close_connections_for_url(self, url)
+ logger.warning('httpcore.RemoteProtocolError: retry', exc_info=e)
+ # retry
+ except (httpcore.NetworkError, httpcore.ProtocolError) as e:
+ # httpcore.WriteError on HTTP/2 connection leaves a new opened stream
+ # then each new request creates a new stream and raise the same WriteError
+ await close_connections_for_url(self, url)
+ raise e
+
+
+class AsyncHTTPTransportFixed(httpx.AsyncHTTPTransport):
+ """Fix httpx.AsyncHTTPTransport"""
+
+ async def arequest(self, method, url, headers=None, stream=None, ext=None):
+ retry = 2
+ while retry > 0:
+ retry -= 1
+ try:
+ return await super().arequest(method, url, headers, stream, ext)
+ except OSError as e:
+ # socket.gaierror when DNS resolution fails
+ raise httpcore.ConnectError(e)
+ except httpcore.CloseError as e:
+ # httpcore.CloseError: [Errno 104] Connection reset by peer
+ # raised by _keepalive_sweep()
+ # from https://github.com/encode/httpcore/blob/4b662b5c42378a61e54d673b4c949420102379f5/httpcore/_backends/asyncio.py#L198 # noqa
+ await close_connections_for_url(self._pool, url)
+ logger.warning('httpcore.CloseError: retry', exc_info=e)
+ # retry
+ except httpcore.RemoteProtocolError as e:
+ # in case of httpcore.RemoteProtocolError: Server disconnected
+ await close_connections_for_url(self._pool, url)
+ logger.warning('httpcore.RemoteProtocolError: retry', exc_info=e)
+ # retry
+ except (httpcore.ProtocolError, httpcore.NetworkError) as e:
+ await close_connections_for_url(self._pool, url)
+ raise e
+
+
+def get_transport_for_socks_proxy(verify, local_address, proxy_url):
+ global LOOP, LIMITS, TRANSPORT_KWARGS
+ # support socks5h (requests compatibility):
+ # https://requests.readthedocs.io/en/master/user/advanced/#socks
+ # socks5:// hostname is resolved on client side
+ # socks5h:// hostname is resolved on proxy side
+ rdns = False
+ socks5h = 'socks5h://'
+ if proxy_url.startswith(socks5h):
+ proxy_url = 'socks5://' + proxy_url[len(socks5h):]
+ rdns = True
+
+ proxy_type, proxy_host, proxy_port, proxy_username, proxy_password = parse_proxy_url(proxy_url)
+
+ return AsyncProxyTransportFixed(proxy_type=proxy_type, proxy_host=proxy_host, proxy_port=proxy_port,
+ username=proxy_username, password=proxy_password,
+ rdns=rdns,
+ loop=LOOP,
+ verify=verify,
+ local_address=local_address,
+ max_connections=LIMITS.max_connections,
+ max_keepalive_connections=LIMITS.max_keepalive_connections,
+ keepalive_expiry=LIMITS.keepalive_expiry,
+ **TRANSPORT_KWARGS)
+
+
+def get_transport(verify, local_address, proxy_url):
+ global LIMITS
+ return AsyncHTTPTransportFixed(verify=verify,
+ local_address=local_address,
+ limits=LIMITS,
+ proxy=httpx._config.Proxy(proxy_url) if proxy_url else None,
+ **TRANSPORT_KWARGS)
+
+
+def iter_proxies(proxies):
+ # https://www.python-httpx.org/compatibility/#proxy-keys
+ if isinstance(proxies, str):
+ yield 'all://', proxies
+ elif isinstance(proxies, dict):
+ for pattern, proxy_url in proxies.items():
+ pattern = PROXY_PATTERN_MAPPING.get(pattern, pattern)
+ yield pattern, proxy_url
+
+
+def new_client(verify, local_address, proxies, max_redirects, enable_http):
+ # See https://www.python-httpx.org/advanced/#routing
+ mounts = {}
+ for pattern, proxy_url in iter_proxies(proxies):
+ if not enable_http and (pattern == 'http' or pattern.startswith('http://')):
+ continue
+ if proxy_url.startswith('socks4://') \
+ or proxy_url.startswith('socks5://') \
+ or proxy_url.startswith('socks5h://'):
+ mounts[pattern] = get_transport_for_socks_proxy(verify, local_address, proxy_url)
+ else:
+ mounts[pattern] = get_transport(verify, local_address, proxy_url)
+
+ if not enable_http:
+ mounts['http://'] = AsyncHTTPTransportNoHttp()
+
+ transport = get_transport(verify, local_address, None)
+ return httpx.AsyncClient(transport=transport, mounts=mounts, max_redirects=max_redirects)
+
+
+def get_client(verify, local_address, proxies, max_redirects, allow_http):
+ global CLIENTS
+ key = (verify, local_address, repr(proxies), max_redirects, allow_http)
+ if key not in CLIENTS:
+ CLIENTS[key] = new_client(verify, local_address, proxies, max_redirects, allow_http)
+ return CLIENTS[key]
+
+
+async def send_request(method, url, enable_http, kwargs):
+ if isinstance(url, bytes):
+ url = url.decode()
+
+ verify = kwargs.pop('verify', True)
+ local_address = next(LOCAL_ADDRESS_CYCLE)
+ proxies = kwargs.pop('proxies', None) or get_global_proxies()
+ max_redirects = kwargs.pop('max_redirects', DEFAULT_REDIRECT_LIMIT)
+
+ client = get_client(verify, local_address, proxies, max_redirects, enable_http)
+ response = await client.request(method.upper(), url, **kwargs)
+
+ # requests compatibility
+ # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
+ response.ok = not response.is_error
+
+ return response
+
+
def request(method, url, **kwargs):
"""same as requests/requests/api.py request(...)"""
time_before_request = time()
- # session start
- session = SessionSinglePool()
-
- # proxies
- if not kwargs.get('proxies'):
- kwargs['proxies'] = get_global_proxies()
-
# timeout
if 'timeout' in kwargs:
timeout = kwargs['timeout']
else:
- timeout = getattr(threadLocal, 'timeout', None)
+ timeout = getattr(THREADLOCAL, 'timeout', None)
if timeout is not None:
kwargs['timeout'] = timeout
@@ -178,24 +357,23 @@ def request(method, url, **kwargs):
del kwargs['raise_for_httperror']
# do request
- response = session.request(method=method, url=url, **kwargs)
-
- time_after_request = time()
-
- # is there a timeout for this engine ?
- if timeout is not None:
- timeout_overhead = 0.2 # seconds
- # start_time = when the user request started
- start_time = getattr(threadLocal, 'start_time', time_before_request)
- search_duration = time_after_request - start_time
- if search_duration > timeout + timeout_overhead:
- raise requests.exceptions.Timeout(response=response)
-
- # session end
- session.close()
-
- if hasattr(threadLocal, 'total_time'):
- threadLocal.total_time += time_after_request - time_before_request
+ future = asyncio.run_coroutine_threadsafe(send_request(method, url, get_enable_http_protocol(), kwargs), LOOP)
+ try:
+ if timeout:
+ timeout += 0.2 # overhead
+ start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
+ if start_time:
+ timeout -= time() - start_time
+
+ response = future.result(timeout or 120)
+ except concurrent.futures.TimeoutError as e:
+ raise httpx.TimeoutException('Timeout', request=None) from e
+
+ # update total_time.
+ # See get_time_for_thread() and reset_time_for_thread()
+ if hasattr(THREADLOCAL, 'total_time'):
+ time_after_request = time()
+ THREADLOCAL.total_time += time_after_request - time_before_request
# raise an exception
if check_for_httperror:
@@ -204,6 +382,49 @@ def request(method, url, **kwargs):
return response
+async def stream_chunk_to_queue(method, url, q, **kwargs):
+ verify = kwargs.pop('verify', True)
+ local_address = next(LOCAL_ADDRESS_CYCLE)
+ proxies = kwargs.pop('proxies', None) or get_global_proxies()
+ # "30" from requests:
+ # https://github.com/psf/requests/blob/8c211a96cdbe9fe320d63d9e1ae15c5c07e179f8/requests/models.py#L55
+ max_redirects = kwargs.pop('max_redirects', 30)
+ client = get_client(verify, local_address, proxies, max_redirects, True)
+ try:
+ async with client.stream(method, url, **kwargs) as response:
+ q.put(response)
+ async for chunk in response.aiter_bytes(65536):
+ if len(chunk) > 0:
+ q.put(chunk)
+ except (httpx.HTTPError, OSError, h2.exceptions.ProtocolError) as e:
+ q.put(e)
+ finally:
+ q.put(None)
+
+
+def stream(method, url, **kwargs):
+ """Replace httpx.stream.
+
+ Usage:
+ stream = poolrequests.stream(...)
+ response = next(stream)
+ for chunk in stream:
+ ...
+
+ httpx.Client.stream requires to write the httpx.HTTPTransport version of the
+ the httpx.AsyncHTTPTransport declared above.
+ """
+ q = SimpleQueue()
+ future = asyncio.run_coroutine_threadsafe(stream_chunk_to_queue(method, url, q, **kwargs), LOOP)
+ chunk_or_exception = q.get(timeout=60)
+ while chunk_or_exception is not None:
+ if isinstance(chunk_or_exception, Exception):
+ raise chunk_or_exception
+ yield chunk_or_exception
+ chunk_or_exception = q.get(timeout=60)
+ return future.result()
+
+
def get(url, **kwargs):
kwargs.setdefault('allow_redirects', True)
return request('get', url, **kwargs)
@@ -233,3 +454,97 @@ def patch(url, data=None, **kwargs):
def delete(url, **kwargs):
return request('delete', url, **kwargs)
+
+
+def init():
+ # log
+ for logger_name in ('hpack.hpack', 'hpack.table'):
+ logging.getLogger(logger_name).setLevel(logging.WARNING)
+
+ # loop
+ def loop_thread():
+ global LOOP
+ LOOP = asyncio.new_event_loop()
+ LOOP.run_forever()
+
+ th = threading.Thread(
+ target=loop_thread,
+ name='asyncio_loop',
+ daemon=True,
+ )
+ th.start()
+
+
+@atexit.register
+def done():
+ """Close all HTTP client
+
+ Avoid a warning at exit
+ see https://github.com/encode/httpx/blob/1a6e254f72d9fd5694a1c10a28927e193ab4f76b/httpx/_client.py#L1785
+ """
+ global LOOP
+
+ async def close_client(client):
+ try:
+ await client.aclose()
+ except httpx.HTTPError:
+ pass
+
+ async def close_clients():
+ await asyncio.gather(*[close_client(client) for client in CLIENTS.values()], return_exceptions=False)
+ future = asyncio.run_coroutine_threadsafe(close_clients(), LOOP)
+ # wait 3 seconds to close the HTTP clients
+ future.result(3)
+
+
+init()
+
+
+# ## TEMPORARY DEBUG ##
+
+
+def debug_connection(connection):
+ now = LOOP.time()
+ expired = (connection.state == httpcore._async.base.ConnectionState.IDLE
+ and connection.expires_at is not None
+ and now >= connection.expires_at)
+ return connection.info()\
+ + (', connect_failed' if connection.connect_failed else '')\
+ + (', expired' if expired else '')
+
+
+def debug_origin(origin):
+ return origin[0].decode() + '://' + origin[1].decode() + ':' + str(origin[2])
+
+
+def debug_transport(transport):
+ result = {
+ '__class__': str(transport.__class__.__name__)
+ }
+ if isinstance(transport, (httpx.AsyncHTTPTransport, AsyncHTTPTransportFixed)):
+ pool = transport._pool
+ result['__pool_class__'] = str(pool.__class__.__name__)
+ if isinstance(pool, httpcore.AsyncConnectionPool):
+ for origin, connections in pool._connections.items():
+ result[debug_origin(origin)] = [debug_connection(connection) for connection in connections]
+ return result
+ elif isinstance(transport, AsyncProxyTransportFixed):
+ for origin, connections in transport._connections.items():
+ result[debug_origin(origin)] = [debug_connection(connection) for connection in connections]
+ return result
+ return result
+
+
+def debug_asyncclient(client, key=None):
+ result = {}
+ if key:
+ result['__key__'] = [k if isinstance(k, (str, int, float, bool, type(None))) else repr(k) for k in key]
+ result['__default__'] = debug_transport(client._transport)
+ for urlpattern, transport in client._mounts.items():
+ result[urlpattern.pattern] = debug_transport(transport)
+ return result
+
+
+def debug_asyncclients():
+ global CLIENTS
+ return [debug_asyncclient(client, key) for key, client in CLIENTS.items()]
diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py
index ad45440ea..b5fb38a99 100644
--- a/searx/search/checker/impl.py
+++ b/searx/search/checker/impl.py
@@ -11,7 +11,7 @@ from urllib.parse import urlparse
import re
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException
-import requests.exceptions
+import httpx
from searx import poolrequests, logger
from searx.results import ResultContainer
@@ -90,10 +90,10 @@ def _is_url_image(image_url):
if r.headers["content-type"].startswith('image/'):
return True
return False
- except requests.exceptions.Timeout:
+ except httpx.TimeoutException:
logger.error('Timeout for %s: %i', image_url, int(time() - a))
retry -= 1
- except requests.exceptions.RequestException:
+ except httpx.HTTPError:
logger.exception('Exception for %s', image_url)
return False
diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py
index 1fc6444ad..24d8f53e2 100644
--- a/searx/search/processors/online.py
+++ b/searx/search/processors/online.py
@@ -1,10 +1,10 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
-from urllib.parse import urlparse
from time import time
import threading
+import asyncio
-import requests.exceptions
+import httpx
import searx.poolrequests as poolrequests
from searx.engines import settings
@@ -99,8 +99,8 @@ class OnlineProcessor(EngineProcessor):
# unexpected redirect : record an error
# but the engine might still return valid results.
status_code = str(response.status_code or '')
- reason = response.reason or ''
- hostname = str(urlparse(response.url or '').netloc)
+ reason = response.reason_phrase or ''
+ hostname = response.url.host
record_error(self.engine_name,
'{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
(status_code, reason, hostname))
@@ -135,7 +135,7 @@ class OnlineProcessor(EngineProcessor):
poolrequests.set_enable_http_protocol(self.engine.enable_http)
# suppose everything will be alright
- requests_exception = False
+ http_exception = False
suspended_time = None
try:
@@ -169,20 +169,20 @@ class OnlineProcessor(EngineProcessor):
with threading.RLock():
self.engine.stats['errors'] += 1
- if (issubclass(e.__class__, requests.exceptions.Timeout)):
+ if (issubclass(e.__class__, (httpx.TimeoutException, asyncio.TimeoutError))):
result_container.add_unresponsive_engine(self.engine_name, 'HTTP timeout')
# requests timeout (connect or read)
logger.error("engine {0} : HTTP requests timeout"
"(search duration : {1} s, timeout: {2} s) : {3}"
.format(self.engine_name, engine_time, timeout_limit, e.__class__.__name__))
- requests_exception = True
- elif (issubclass(e.__class__, requests.exceptions.RequestException)):
+ http_exception = True
+ elif (issubclass(e.__class__, (httpx.HTTPError, httpx.StreamError))):
result_container.add_unresponsive_engine(self.engine_name, 'HTTP error')
# other requests exception
logger.exception("engine {0} : requests exception"
"(search duration : {1} s, timeout: {2} s) : {3}"
.format(self.engine_name, engine_time, timeout_limit, e))
- requests_exception = True
+ http_exception = True
elif (issubclass(e.__class__, SearxEngineCaptchaException)):
result_container.add_unresponsive_engine(self.engine_name, 'CAPTCHA required')
logger.exception('engine {0} : CAPTCHA'.format(self.engine_name))
@@ -206,7 +206,7 @@ class OnlineProcessor(EngineProcessor):
# suspend the engine if there is an HTTP error
# or suspended_time is defined
with threading.RLock():
- if requests_exception or suspended_time:
+ if http_exception or suspended_time:
# update continuous_errors / suspend_end_time
self.engine.continuous_errors += 1
if suspended_time is None:
diff --git a/searx/settings.yml b/searx/settings.yml
index c289cde5c..742492c3c 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -67,11 +67,13 @@ ui:
# key : !!binary "your_morty_proxy_key"
outgoing: # communication with search engines
- request_timeout : 2.0 # default timeout in seconds, can be override by engine
+ request_timeout : 3.0 # default timeout in seconds, can be override by engine
# max_request_timeout: 10.0 # the maximum timeout in seconds
useragent_suffix : "" # suffix of searx_useragent, could contain informations like an email address to the administrator
- pool_connections : 100 # Number of different hosts
- pool_maxsize : 10 # Number of simultaneous requests by host
+ pool_connections : 100 # The maximum number of concurrent connections that may be established.
+ pool_maxsize : 20 # Allow the connection pool to maintain keep-alive connections below this point.
+ keepalive_expiry: 30.0 # Number of seconds to keep a connection in the pool
+ http2: True # Enable HTTP/2 (experimental)
# uncomment below section if you want to use a proxy
# see https://2.python-requests.org/en/latest/user/advanced/#proxies
# SOCKS proxies are also supported: see https://2.python-requests.org/en/latest/user/advanced/#socks
diff --git a/searx/utils.py b/searx/utils.py
index 3172ad8f3..55a386bd5 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -45,7 +45,7 @@ def searx_useragent():
"""Return the searx User Agent"""
return 'searx/{searx_version} {suffix}'.format(
searx_version=VERSION_STRING,
- suffix=settings['outgoing'].get('useragent_suffix', ''))
+ suffix=settings['outgoing'].get('useragent_suffix', '')).strip()
def gen_useragent(os=None):
diff --git a/searx/webapp.py b/searx/webapp.py
index 072f140ca..1571df8f1 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -26,12 +26,26 @@ if __name__ == '__main__':
from os.path import realpath, dirname
sys.path.append(realpath(dirname(realpath(__file__)) + '/../'))
+# set Unix thread name
+try:
+ import setproctitle
+except ImportError:
+ pass
+else:
+ import threading
+ old_thread_init = threading.Thread.__init__
+
+ def new_thread_init(self, *args, **kwargs):
+ old_thread_init(self, *args, **kwargs)
+ setproctitle.setthreadtitle(self._name)
+ threading.Thread.__init__ = new_thread_init
+
import hashlib
import hmac
import json
import os
-import requests
+import httpx
from searx import logger
logger = logger.getChild('webapp')
@@ -79,7 +93,7 @@ from searx.plugins import plugins
from searx.plugins.oa_doi_rewrite import get_doi_resolver
from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
from searx.answerers import answerers
-from searx.poolrequests import get_global_proxies
+from searx import poolrequests
from searx.answerers import ask
from searx.metrology.error_recorder import errors_per_engines
@@ -890,50 +904,62 @@ def _is_selected_language_supported(engine, preferences):
@app.route('/image_proxy', methods=['GET'])
def image_proxy():
- url = request.args.get('url').encode()
+ url = request.args.get('url')
if not url:
return '', 400
- h = new_hmac(settings['server']['secret_key'], url)
+ h = new_hmac(settings['server']['secret_key'], url.encode())
if h != request.args.get('h'):
return '', 400
- headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
- headers['User-Agent'] = gen_useragent()
-
- resp = requests.get(url,
- stream=True,
- timeout=settings['outgoing']['request_timeout'],
- headers=headers,
- proxies=get_global_proxies())
+ maximum_size = 5 * 1024 * 1024
- if resp.status_code == 304:
- return '', resp.status_code
-
- if resp.status_code != 200:
- logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
- if resp.status_code >= 400:
+ try:
+ headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
+ headers['User-Agent'] = gen_useragent()
+ stream = poolrequests.stream(
+ method='GET',
+ url=url,
+ headers=headers,
+ timeout=settings['outgoing']['request_timeout'],
+ allow_redirects=True,
+ max_redirects=20)
+
+ resp = next(stream)
+ content_length = resp.headers.get('Content-Length')
+ if content_length and content_length.isdigit() and int(content_length) > maximum_size:
+ return 'Max size', 400
+
+ if resp.status_code == 304:
return '', resp.status_code
- return '', 400
- if not resp.headers.get('content-type', '').startswith('image/'):
- logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
- return '', 400
+ if resp.status_code != 200:
+ logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
+ if resp.status_code >= 400:
+ return '', resp.status_code
+ return '', 400
+
+ if not resp.headers.get('content-type', '').startswith('image/'):
+ logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
+ return '', 400
- img = b''
- chunk_counter = 0
+ headers = dict_subset(resp.headers, {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})
- for chunk in resp.iter_content(1024 * 1024):
- chunk_counter += 1
- if chunk_counter > 5:
- return '', 502 # Bad gateway - file is too big (>5M)
- img += chunk
+ total_length = 0
- headers = dict_subset(resp.headers, {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})
+ def forward_chunk():
+ nonlocal total_length
+ for chunk in stream:
+ total_length += len(chunk)
+ if total_length > maximum_size:
+ break
+ yield chunk
- return Response(img, mimetype=resp.headers['content-type'], headers=headers)
+ return Response(forward_chunk(), mimetype=resp.headers['Content-Type'], headers=headers)
+ except httpx.HTTPError:
+ return '', 400
@app.route('/stats', methods=['GET'])
@@ -1083,6 +1109,11 @@ def config():
})
+@app.route('/config/http')
+def config_http():
+ return jsonify(poolrequests.debug_asyncclients())
+
+
@app.errorhandler(404)
def page_not_found(e):
return render('404.html'), 404
diff --git a/searx_extra/update/update_external_bangs.py b/searx_extra/update/update_external_bangs.py
index e9dc0ff1d..e401e460a 100755
--- a/searx_extra/update/update_external_bangs.py
+++ b/searx_extra/update/update_external_bangs.py
@@ -17,7 +17,7 @@ import json
import re
from os.path import join
-import requests
+import httpx
from searx import searx_dir # pylint: disable=E0401 C0413
@@ -30,7 +30,7 @@ HTTP_COLON = 'http:'
def get_bang_url():
- response = requests.get(URL_BV1)
+ response = httpx.get(URL_BV1)
response.raise_for_status()
r = RE_BANG_VERSION.findall(response.text)
@@ -38,7 +38,7 @@ def get_bang_url():
def fetch_ddg_bangs(url):
- response = requests.get(url)
+ response = httpx.get(url)
response.raise_for_status()
return json.loads(response.content.decode())
diff --git a/tests/unit/test_poolrequests.py b/tests/unit/test_poolrequests.py
index b22685fd0..3063ebcbd 100644
--- a/tests/unit/test_poolrequests.py
+++ b/tests/unit/test_poolrequests.py
@@ -1,9 +1,5 @@
-from unittest.mock import patch
-from requests.models import Response
-
from searx.testing import SearxTestCase
-import searx.poolrequests
from searx.poolrequests import get_proxy_cycles, get_proxies
@@ -64,26 +60,3 @@ class TestProxy(SearxTestCase):
'http': 'http://localhost:9092',
'https': 'http://localhost:9093'
})
-
- @patch('searx.poolrequests.get_global_proxies')
- def test_request(self, mock_get_global_proxies):
- method = 'GET'
- url = 'http://localhost'
- custom_proxies = {
- 'https': 'http://localhost:1080'
- }
- global_proxies = {
- 'http': 'http://localhost:9092',
- 'https': 'http://localhost:9093'
- }
- mock_get_global_proxies.return_value = global_proxies
-
- # check the global proxies usage
- with patch.object(searx.poolrequests.SessionSinglePool, 'request', return_value=Response()) as mock_method:
- searx.poolrequests.request(method, url)
- mock_method.assert_called_once_with(method=method, url=url, proxies=global_proxies)
-
- # check if the proxies parameter overrides the global proxies
- with patch.object(searx.poolrequests.SessionSinglePool, 'request', return_value=Response()) as mock_method:
- searx.poolrequests.request(method, url, proxies=custom_proxies)
- mock_method.assert_called_once_with(method=method, url=url, proxies=custom_proxies)