summary | refs | log | tree | commit | diff
path: root/searx
diff options
context:
space:
mode:
author Alexandre Flament <alex@al-f.net> 2022-07-10 18:06:22 +0200
committer GitHub <noreply@github.com> 2022-07-10 18:06:22 +0200
commit 44f2eb50a5566e8e14a1e970f1ed775bf025c1ea (patch)
tree dff03d000826a9d493b443f29d4ee9d0e725f542 /searx
parent 4de4a213a6fa4b1755a3c8bc55cdf79a0c98875d (diff)
parent a1e8af0796d532d529eb9d90f315f79dfbd86b0d (diff)
download searxng-44f2eb50a5566e8e14a1e970f1ed775bf025c1ea.tar.gz
searxng-44f2eb50a5566e8e14a1e970f1ed775bf025c1ea.zip
Merge pull request #1219 from dalf/follow_bing_redirect
bing.py: remove redirection links
Diffstat (limited to 'searx')
-rw-r--r-- searx/engines/bing.py 49
-rw-r--r-- searx/network/__init__.py 128
-rw-r--r-- searx/network/network.py 25
3 files changed, 149 insertions, 53 deletions
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index 4c037de85..3d4ac08bd 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -8,7 +8,8 @@
import re
from urllib.parse import urlencode, urlparse, parse_qs
from lxml import html
-from searx.utils import eval_xpath, extract_text, match_language
+from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language
+from searx.network import multi_requests, Request
about = {
"website": 'https://www.bing.com',
@@ -79,30 +80,48 @@ def response(resp):
dom = html.fromstring(resp.text)
- for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
-
- # IMO //div[@class="sa_cc"] does no longer match
- logger.debug('found //div[@class="sa_cc"] --> %s', result)
-
- link = eval_xpath(result, './/h3/a')[0]
- url = link.attrib.get('href')
- title = extract_text(link)
- content = extract_text(eval_xpath(result, './/p'))
-
- # append result
- results.append({'url': url, 'title': title, 'content': content})
-
# parse results again if nothing is found yet
- for result in eval_xpath(dom, '//li[@class="b_algo"]'):
+
+ url_to_resolve = []
+ url_to_resolve_index = []
+ for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')):
link = eval_xpath(result, './/h2/a')[0]
url = link.attrib.get('href')
title = extract_text(link)
content = extract_text(eval_xpath(result, './/p'))
+ # get the real URL either using the URL shown to user or following the Bing URL
+ if url.startswith('https://www.bing.com/ck/a?'):
+ url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
+ # Bing can shorten the URL either at the end or in the middle of the string
+ if (
+ url_cite.startswith('https://')
+ and '…' not in url_cite
+ and '...' not in url_cite
+ and '›' not in url_cite
+ ):
+ # no need for an additional HTTP request
+ url = url_cite
+ else:
+ # resolve the URL with an additional HTTP request
+ url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
+ url_to_resolve_index.append(i)
+ url = None # remove the result if the HTTP Bing redirect raises an exception
+
# append result
results.append({'url': url, 'title': title, 'content': content})
+ # resolve all Bing redirections in parallel
+ request_list = [
+ Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
+ ]
+ response_list = multi_requests(request_list)
+ for i, redirect_response in enumerate(response_list):
+ if not isinstance(redirect_response, Exception):
+ results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
+
+ # get number_of_results
try:
result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
if "-" in result_len_container:
diff --git a/searx/network/__init__.py b/searx/network/__init__.py
index 06c9f75a4..8622e9731 100644
--- a/searx/network/__init__.py
+++ b/searx/network/__init__.py
@@ -8,7 +8,8 @@ import concurrent.futures
from queue import SimpleQueue
from types import MethodType
from timeit import default_timer
-from typing import Iterable, Tuple
+from typing import Iterable, NamedTuple, Tuple, List, Dict, Union
+from contextlib import contextmanager
import httpx
import anyio
@@ -48,9 +49,23 @@ def get_context_network():
return THREADLOCAL.__dict__.get('network') or get_network()
-def request(method, url, **kwargs):
- """same as requests/requests/api.py request(...)"""
+@contextmanager
+def _record_http_time():
+ # pylint: disable=too-many-branches
time_before_request = default_timer()
+ start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
+ try:
+ yield start_time
+ finally:
+ # update total_time.
+ # See get_time_for_thread() and reset_time_for_thread()
+ if hasattr(THREADLOCAL, 'total_time'):
+ time_after_request = default_timer()
+ THREADLOCAL.total_time += time_after_request - time_before_request
+
+
+def _get_timeout(start_time, kwargs):
+ # pylint: disable=too-many-branches
# timeout (httpx)
if 'timeout' in kwargs:
@@ -65,45 +80,84 @@ def request(method, url, **kwargs):
# adjust actual timeout
timeout += 0.2 # overhead
- start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
if start_time:
timeout -= default_timer() - start_time
- # raise_for_error
- check_for_httperror = True
- if 'raise_for_httperror' in kwargs:
- check_for_httperror = kwargs['raise_for_httperror']
- del kwargs['raise_for_httperror']
+ return timeout
- # requests compatibility
- if isinstance(url, bytes):
- url = url.decode()
- # network
- network = get_context_network()
-
- # do request
- future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
- try:
- response = future.result(timeout)
- except concurrent.futures.TimeoutError as e:
- raise httpx.TimeoutException('Timeout', request=None) from e
-
- # requests compatibility
- # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
- response.ok = not response.is_error
-
- # update total_time.
- # See get_time_for_thread() and reset_time_for_thread()
- if hasattr(THREADLOCAL, 'total_time'):
- time_after_request = default_timer()
- THREADLOCAL.total_time += time_after_request - time_before_request
-
- # raise an exception
- if check_for_httperror:
- raise_for_httperror(response)
-
- return response
+def request(method, url, **kwargs):
+ """same as requests/requests/api.py request(...)"""
+ with _record_http_time() as start_time:
+ network = get_context_network()
+ timeout = _get_timeout(start_time, kwargs)
+ future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
+ try:
+ return future.result(timeout)
+ except concurrent.futures.TimeoutError as e:
+ raise httpx.TimeoutException('Timeout', request=None) from e
+
+
+def multi_requests(request_list: List["Request"]) -> List[Union[httpx.Response, Exception]]:
+ """send multiple HTTP requests in parallel. Wait for all requests to finish."""
+ with _record_http_time() as start_time:
+ # send the requests
+ network = get_context_network()
+ loop = get_loop()
+ future_list = []
+ for request_desc in request_list:
+ timeout = _get_timeout(start_time, request_desc.kwargs)
+ future = asyncio.run_coroutine_threadsafe(
+ network.request(request_desc.method, request_desc.url, **request_desc.kwargs), loop
+ )
+ future_list.append((future, timeout))
+
+ # read the responses
+ responses = []
+ for future, timeout in future_list:
+ try:
+ responses.append(future.result(timeout))
+ except concurrent.futures.TimeoutError:
+ responses.append(httpx.TimeoutException('Timeout', request=None))
+ except Exception as e: # pylint: disable=broad-except
+ responses.append(e)
+ return responses
+
+
+class Request(NamedTuple):
+ """Request description for the multi_requests function"""
+
+ method: str
+ url: str
+ kwargs: Dict[str, str] = {}
+
+ @staticmethod
+ def get(url, **kwargs):
+ return Request('GET', url, kwargs)
+
+ @staticmethod
+ def options(url, **kwargs):
+ return Request('OPTIONS', url, kwargs)
+
+ @staticmethod
+ def head(url, **kwargs):
+ return Request('HEAD', url, kwargs)
+
+ @staticmethod
+ def post(url, **kwargs):
+ return Request('POST', url, kwargs)
+
+ @staticmethod
+ def put(url, **kwargs):
+ return Request('PUT', url, kwargs)
+
+ @staticmethod
+ def patch(url, **kwargs):
+ return Request('PATCH', url, kwargs)
+
+ @staticmethod
+ def delete(url, **kwargs):
+ return Request('DELETE', url, kwargs)
def get(url, **kwargs):
diff --git a/searx/network/network.py b/searx/network/network.py
index 69af3b7c4..677a908bf 100644
--- a/searx/network/network.py
+++ b/searx/network/network.py
@@ -13,6 +13,7 @@ import httpx
from searx import logger, searx_debug
from .client import new_client, get_loop, AsyncHTTPTransportNoHttp
+from .raise_for_httperror import raise_for_httperror
logger = logger.getChild('network')
@@ -226,6 +227,27 @@ class Network:
kwargs['follow_redirects'] = kwargs.pop('allow_redirects')
return kwargs_clients
+ @staticmethod
+ def extract_do_raise_for_httperror(kwargs):
+ do_raise_for_httperror = True
+ if 'raise_for_httperror' in kwargs:
+ do_raise_for_httperror = kwargs['raise_for_httperror']
+ del kwargs['raise_for_httperror']
+ return do_raise_for_httperror
+
+ @staticmethod
+ def patch_response(response, do_raise_for_httperror):
+ if isinstance(response, httpx.Response):
+ # requests compatibility (response is not streamed)
+ # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
+ response.ok = not response.is_error
+
+ # raise an exception
+ if do_raise_for_httperror:
+ raise_for_httperror(response)
+
+ return response
+
def is_valid_response(self, response):
# pylint: disable=too-many-boolean-expressions
if (
@@ -239,6 +261,7 @@ class Network:
async def call_client(self, stream, method, url, **kwargs):
retries = self.retries
was_disconnected = False
+ do_raise_for_httperror = Network.extract_do_raise_for_httperror(kwargs)
kwargs_clients = Network.extract_kwargs_clients(kwargs)
while retries >= 0: # pragma: no cover
client = await self.get_client(**kwargs_clients)
@@ -248,7 +271,7 @@ class Network:
else:
response = await client.request(method, url, **kwargs)
if self.is_valid_response(response) or retries <= 0:
- return response
+ return Network.patch_response(response, do_raise_for_httperror)
except httpx.RemoteProtocolError as e:
if not was_disconnected:
# the server has closed the connection: