diff options
author | Alexandre Flament <alex@al-f.net> | 2022-07-10 18:06:22 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-07-10 18:06:22 +0200 |
commit | 44f2eb50a5566e8e14a1e970f1ed775bf025c1ea (patch) | |
tree | dff03d000826a9d493b443f29d4ee9d0e725f542 /searx | |
parent | 4de4a213a6fa4b1755a3c8bc55cdf79a0c98875d (diff) | |
parent | a1e8af0796d532d529eb9d90f315f79dfbd86b0d (diff) | |
download | searxng-44f2eb50a5566e8e14a1e970f1ed775bf025c1ea.tar.gz searxng-44f2eb50a5566e8e14a1e970f1ed775bf025c1ea.zip |
Merge pull request #1219 from dalf/follow_bing_redirect
bing.py: remove redirection links
Diffstat (limited to 'searx')
-rw-r--r-- | searx/engines/bing.py | 49 | ||||
-rw-r--r-- | searx/network/__init__.py | 128 | ||||
-rw-r--r-- | searx/network/network.py | 25 |
3 files changed, 149 insertions, 53 deletions
diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 4c037de85..3d4ac08bd 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -8,7 +8,8 @@ import re from urllib.parse import urlencode, urlparse, parse_qs from lxml import html -from searx.utils import eval_xpath, extract_text, match_language +from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language +from searx.network import multi_requests, Request about = { "website": 'https://www.bing.com', @@ -79,30 +80,48 @@ def response(resp): dom = html.fromstring(resp.text) - for result in eval_xpath(dom, '//div[@class="sa_cc"]'): - - # IMO //div[@class="sa_cc"] does no longer match - logger.debug('found //div[@class="sa_cc"] --> %s', result) - - link = eval_xpath(result, './/h3/a')[0] - url = link.attrib.get('href') - title = extract_text(link) - content = extract_text(eval_xpath(result, './/p')) - - # append result - results.append({'url': url, 'title': title, 'content': content}) - # parse results again if nothing is found yet - for result in eval_xpath(dom, '//li[@class="b_algo"]'): + + url_to_resolve = [] + url_to_resolve_index = [] + for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')): link = eval_xpath(result, './/h2/a')[0] url = link.attrib.get('href') title = extract_text(link) content = extract_text(eval_xpath(result, './/p')) + # get the real URL either using the URL shown to user or following the Bing URL + if url.startswith('https://www.bing.com/ck/a?'): + url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite')) + # Bing can shorten the URL either at the end or in the middle of the string + if ( + url_cite.startswith('https://') + and '…' not in url_cite + and '...' not in url_cite + and '›' not in url_cite + ): + # no need for an additional HTTP request + url = url_cite + else: + # resolve the URL with an additional HTTP request + url_to_resolve.append(url.replace('&ntb=1', '&ntb=F')) + url_to_resolve_index.append(i) + url = None # remove the result if the HTTP Bing redirect raise an exception + # append result results.append({'url': url, 'title': title, 'content': content}) + # resolve all Bing redirections in parallel + request_list = [ + Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve + ] + response_list = multi_requests(request_list) + for i, redirect_response in enumerate(response_list): + if not isinstance(redirect_response, Exception): + results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location'] + + # get number_of_results try: result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()')) if "-" in result_len_container: diff --git a/searx/network/__init__.py b/searx/network/__init__.py index 06c9f75a4..8622e9731 100644 --- a/searx/network/__init__.py +++ b/searx/network/__init__.py @@ -8,7 +8,8 @@ import concurrent.futures from queue import SimpleQueue from types import MethodType from timeit import default_timer -from typing import Iterable, Tuple +from typing import Iterable, NamedTuple, Tuple, List, Dict, Union +from contextlib import contextmanager import httpx import anyio @@ -48,9 +49,23 @@ def get_context_network(): return THREADLOCAL.__dict__.get('network') or get_network() -def request(method, url, **kwargs): - """same as requests/requests/api.py request(...)""" +@contextmanager +def _record_http_time(): + # pylint: disable=too-many-branches time_before_request = default_timer() + start_time = getattr(THREADLOCAL, 'start_time', time_before_request) + try: + yield start_time + finally: + # update total_time. + # See get_time_for_thread() and reset_time_for_thread() + if hasattr(THREADLOCAL, 'total_time'): + time_after_request = default_timer() + THREADLOCAL.total_time += time_after_request - time_before_request + + +def _get_timeout(start_time, kwargs): + # pylint: disable=too-many-branches # timeout (httpx) if 'timeout' in kwargs: @@ -65,45 +80,84 @@ def request(method, url, **kwargs): # ajdust actual timeout timeout += 0.2 # overhead - start_time = getattr(THREADLOCAL, 'start_time', time_before_request) if start_time: timeout -= default_timer() - start_time - # raise_for_error - check_for_httperror = True - if 'raise_for_httperror' in kwargs: - check_for_httperror = kwargs['raise_for_httperror'] - del kwargs['raise_for_httperror'] + return timeout - # requests compatibility - if isinstance(url, bytes): - url = url.decode() - # network - network = get_context_network() - - # do request - future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop()) - try: - response = future.result(timeout) - except concurrent.futures.TimeoutError as e: - raise httpx.TimeoutException('Timeout', request=None) from e - - # requests compatibility - # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses - response.ok = not response.is_error - - # update total_time. - # See get_time_for_thread() and reset_time_for_thread() - if hasattr(THREADLOCAL, 'total_time'): - time_after_request = default_timer() - THREADLOCAL.total_time += time_after_request - time_before_request - - # raise an exception - if check_for_httperror: - raise_for_httperror(response) - - return response +def request(method, url, **kwargs): + """same as requests/requests/api.py request(...)""" + with _record_http_time() as start_time: + network = get_context_network() + timeout = _get_timeout(start_time, kwargs) + future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop()) + try: + return future.result(timeout) + except concurrent.futures.TimeoutError as e: + raise httpx.TimeoutException('Timeout', request=None) from e + + +def multi_requests(request_list: List["Request"]) -> List[Union[httpx.Response, Exception]]: + """send multiple HTTP requests in parallel. Wait for all requests to finish.""" + with _record_http_time() as start_time: + # send the requests + network = get_context_network() + loop = get_loop() + future_list = [] + for request_desc in request_list: + timeout = _get_timeout(start_time, request_desc.kwargs) + future = asyncio.run_coroutine_threadsafe( + network.request(request_desc.method, request_desc.url, **request_desc.kwargs), loop + ) + future_list.append((future, timeout)) + + # read the responses + responses = [] + for future, timeout in future_list: + try: + responses.append(future.result(timeout)) + except concurrent.futures.TimeoutError: + responses.append(httpx.TimeoutException('Timeout', request=None)) + except Exception as e: # pylint: disable=broad-except + responses.append(e) + return responses + + +class Request(NamedTuple): + """Request description for the multi_requests function""" + + method: str + url: str + kwargs: Dict[str, str] = {} + + @staticmethod + def get(url, **kwargs): + return Request('GET', url, kwargs) + + @staticmethod + def options(url, **kwargs): + return Request('OPTIONS', url, kwargs) + + @staticmethod + def head(url, **kwargs): + return Request('HEAD', url, kwargs) + + @staticmethod + def post(url, **kwargs): + return Request('POST', url, kwargs) + + @staticmethod + def put(url, **kwargs): + return Request('PUT', url, kwargs) + + @staticmethod + def patch(url, **kwargs): + return Request('PATCH', url, kwargs) + + @staticmethod + def delete(url, **kwargs): + return Request('DELETE', url, kwargs) def get(url, **kwargs): diff --git a/searx/network/network.py b/searx/network/network.py index 69af3b7c4..677a908bf 100644 --- a/searx/network/network.py +++ b/searx/network/network.py @@ -13,6 +13,7 @@ import httpx from searx import logger, searx_debug from .client import new_client, get_loop, AsyncHTTPTransportNoHttp +from .raise_for_httperror import raise_for_httperror logger = logger.getChild('network') @@ -226,6 +227,27 @@ class Network: kwargs['follow_redirects'] = kwargs.pop('allow_redirects') return kwargs_clients + @staticmethod + def extract_do_raise_for_httperror(kwargs): + do_raise_for_httperror = True + if 'raise_for_httperror' in kwargs: + do_raise_for_httperror = kwargs['raise_for_httperror'] + del kwargs['raise_for_httperror'] + return do_raise_for_httperror + + @staticmethod + def patch_response(response, do_raise_for_httperror): + if isinstance(response, httpx.Response): + # requests compatibility (response is not streamed) + # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses + response.ok = not response.is_error + + # raise an exception + if do_raise_for_httperror: + raise_for_httperror(response) + + return response + def is_valid_response(self, response): # pylint: disable=too-many-boolean-expressions if ( @@ -239,6 +261,7 @@ class Network: async def call_client(self, stream, method, url, **kwargs): retries = self.retries was_disconnected = False + do_raise_for_httperror = Network.extract_do_raise_for_httperror(kwargs) kwargs_clients = Network.extract_kwargs_clients(kwargs) while retries >= 0: # pragma: no cover client = await self.get_client(**kwargs_clients) @@ -248,7 +271,7 @@ class Network: else: response = await client.request(method, url, **kwargs) if self.is_valid_response(response) or retries <= 0: - return response + return Network.patch_response(response, do_raise_for_httperror) except httpx.RemoteProtocolError as e: if not was_disconnected: # the server has closed the connection: |