author     Alexandre Flament <alex@al-f.net>   2022-05-21 18:24:47 +0200
committer  Alexandre Flament <alex@al-f.net>   2022-07-08 22:02:21 +0200
commit     a1e8af0796d532d529eb9d90f315f79dfbd86b0d (patch)
tree       3bf551b24162b05fe50193dd0c3079ced8698116 /searx
parent     2864a67ce922ecaf3f224859ee1e4a92f0b9f8c7 (diff)
download   searxng-a1e8af0796d532d529eb9d90f315f79dfbd86b0d.tar.gz
           searxng-a1e8af0796d532d529eb9d90f315f79dfbd86b0d.zip
bing.py: resolve bing.com/ck/a redirections
add a new function searx.network.multi_requests to send multiple HTTP requests at once
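A minimal usage sketch of the new helper, in the same style as the bing engine below uses it (the URLs are hypothetical placeholders, not part of this commit):

    from searx.network import multi_requests, Request

    # describe the requests up front; Request.get() builds a ('GET', url, kwargs) NamedTuple
    request_list = [
        Request.get('https://www.bing.com/ck/a?example', allow_redirects=False),
        Request.get('https://example.org/'),
    ]

    # send them in parallel and wait for all of them;
    # each entry is either an httpx.Response or the Exception that was raised
    for response in multi_requests(request_list):
        if not isinstance(response, Exception):
            print(response.status_code, response.headers.get('location'))

Unlike searx.network.request(), multi_requests() does not raise on a failed request: exceptions are returned in the result list, so one broken redirect does not discard the whole result page.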
Diffstat (limited to 'searx')
-rw-r--r--   searx/engines/bing.py         49
-rw-r--r--   searx/network/__init__.py    128
-rw-r--r--   searx/network/network.py      25
3 files changed, 149 insertions, 53 deletions
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index 4c037de85..3d4ac08bd 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -8,7 +8,8 @@
import re
from urllib.parse import urlencode, urlparse, parse_qs
from lxml import html
-from searx.utils import eval_xpath, extract_text, match_language
+from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language
+from searx.network import multi_requests, Request
about = {
"website": 'https://www.bing.com',
@@ -79,30 +80,48 @@ def response(resp):
dom = html.fromstring(resp.text)
- for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
-
- # IMO //div[@class="sa_cc"] does no longer match
- logger.debug('found //div[@class="sa_cc"] --> %s', result)
-
- link = eval_xpath(result, './/h3/a')[0]
- url = link.attrib.get('href')
- title = extract_text(link)
- content = extract_text(eval_xpath(result, './/p'))
-
- # append result
- results.append({'url': url, 'title': title, 'content': content})
-
# parse results again if nothing is found yet
- for result in eval_xpath(dom, '//li[@class="b_algo"]'):
+
+ url_to_resolve = []
+ url_to_resolve_index = []
+ for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')):
link = eval_xpath(result, './/h2/a')[0]
url = link.attrib.get('href')
title = extract_text(link)
content = extract_text(eval_xpath(result, './/p'))
+ # get the real URL either by using the URL shown to the user or by following the Bing redirect URL
+ if url.startswith('https://www.bing.com/ck/a?'):
+ url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
+ # Bing can shorten the URL either at the end or in the middle of the string
+ if (
+ url_cite.startswith('https://')
+ and '…' not in url_cite
+ and '...' not in url_cite
+ and '›' not in url_cite
+ ):
+ # no need for an additional HTTP request
+ url = url_cite
+ else:
+ # resolve the URL with an additional HTTP request
+ url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
+ url_to_resolve_index.append(i)
+ url = None # remove the result if the HTTP Bing redirect raises an exception
+
# append result
results.append({'url': url, 'title': title, 'content': content})
+ # resolve all Bing redirections in parallel
+ request_list = [
+ Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
+ ]
+ response_list = multi_requests(request_list)
+ for i, redirect_response in enumerate(response_list):
+ if not isinstance(redirect_response, Exception):
+ results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
+
+ # get number_of_results
try:
result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
if "-" in result_len_container:
diff --git a/searx/network/__init__.py b/searx/network/__init__.py
index 06c9f75a4..8622e9731 100644
--- a/searx/network/__init__.py
+++ b/searx/network/__init__.py
@@ -8,7 +8,8 @@ import concurrent.futures
from queue import SimpleQueue
from types import MethodType
from timeit import default_timer
-from typing import Iterable, Tuple
+from typing import Iterable, NamedTuple, Tuple, List, Dict, Union
+from contextlib import contextmanager
import httpx
import anyio
@@ -48,9 +49,23 @@ def get_context_network():
return THREADLOCAL.__dict__.get('network') or get_network()
-def request(method, url, **kwargs):
- """same as requests/requests/api.py request(...)"""
+@contextmanager
+def _record_http_time():
+ # pylint: disable=too-many-branches
time_before_request = default_timer()
+ start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
+ try:
+ yield start_time
+ finally:
+ # update total_time.
+ # See get_time_for_thread() and reset_time_for_thread()
+ if hasattr(THREADLOCAL, 'total_time'):
+ time_after_request = default_timer()
+ THREADLOCAL.total_time += time_after_request - time_before_request
+
+
+def _get_timeout(start_time, kwargs):
+ # pylint: disable=too-many-branches
# timeout (httpx)
if 'timeout' in kwargs:
@@ -65,45 +80,84 @@ def request(method, url, **kwargs):
# adjust actual timeout
timeout += 0.2 # overhead
- start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
if start_time:
timeout -= default_timer() - start_time
- # raise_for_error
- check_for_httperror = True
- if 'raise_for_httperror' in kwargs:
- check_for_httperror = kwargs['raise_for_httperror']
- del kwargs['raise_for_httperror']
+ return timeout
- # requests compatibility
- if isinstance(url, bytes):
- url = url.decode()
- # network
- network = get_context_network()
-
- # do request
- future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
- try:
- response = future.result(timeout)
- except concurrent.futures.TimeoutError as e:
- raise httpx.TimeoutException('Timeout', request=None) from e
-
- # requests compatibility
- # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
- response.ok = not response.is_error
-
- # update total_time.
- # See get_time_for_thread() and reset_time_for_thread()
- if hasattr(THREADLOCAL, 'total_time'):
- time_after_request = default_timer()
- THREADLOCAL.total_time += time_after_request - time_before_request
-
- # raise an exception
- if check_for_httperror:
- raise_for_httperror(response)
-
- return response
+def request(method, url, **kwargs):
+ """same as requests/requests/api.py request(...)"""
+ with _record_http_time() as start_time:
+ network = get_context_network()
+ timeout = _get_timeout(start_time, kwargs)
+ future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
+ try:
+ return future.result(timeout)
+ except concurrent.futures.TimeoutError as e:
+ raise httpx.TimeoutException('Timeout', request=None) from e
+
+
+def multi_requests(request_list: List["Request"]) -> List[Union[httpx.Response, Exception]]:
+ """send multiple HTTP requests in parallel. Wait for all requests to finish."""
+ with _record_http_time() as start_time:
+ # send the requests
+ network = get_context_network()
+ loop = get_loop()
+ future_list = []
+ for request_desc in request_list:
+ timeout = _get_timeout(start_time, request_desc.kwargs)
+ future = asyncio.run_coroutine_threadsafe(
+ network.request(request_desc.method, request_desc.url, **request_desc.kwargs), loop
+ )
+ future_list.append((future, timeout))
+
+ # read the responses
+ responses = []
+ for future, timeout in future_list:
+ try:
+ responses.append(future.result(timeout))
+ except concurrent.futures.TimeoutError:
+ responses.append(httpx.TimeoutException('Timeout', request=None))
+ except Exception as e: # pylint: disable=broad-except
+ responses.append(e)
+ return responses
+
+
+class Request(NamedTuple):
+ """Request description for the multi_requests function"""
+
+ method: str
+ url: str
+ kwargs: Dict[str, str] = {}
+
+ @staticmethod
+ def get(url, **kwargs):
+ return Request('GET', url, kwargs)
+
+ @staticmethod
+ def options(url, **kwargs):
+ return Request('OPTIONS', url, kwargs)
+
+ @staticmethod
+ def head(url, **kwargs):
+ return Request('HEAD', url, kwargs)
+
+ @staticmethod
+ def post(url, **kwargs):
+ return Request('POST', url, kwargs)
+
+ @staticmethod
+ def put(url, **kwargs):
+ return Request('PUT', url, kwargs)
+
+ @staticmethod
+ def patch(url, **kwargs):
+ return Request('PATCH', url, kwargs)
+
+ @staticmethod
+ def delete(url, **kwargs):
+ return Request('DELETE', url, kwargs)
def get(url, **kwargs):
diff --git a/searx/network/network.py b/searx/network/network.py
index 69af3b7c4..677a908bf 100644
--- a/searx/network/network.py
+++ b/searx/network/network.py
@@ -13,6 +13,7 @@ import httpx
from searx import logger, searx_debug
from .client import new_client, get_loop, AsyncHTTPTransportNoHttp
+from .raise_for_httperror import raise_for_httperror
logger = logger.getChild('network')
@@ -226,6 +227,27 @@ class Network:
kwargs['follow_redirects'] = kwargs.pop('allow_redirects')
return kwargs_clients
+ @staticmethod
+ def extract_do_raise_for_httperror(kwargs):
+ do_raise_for_httperror = True
+ if 'raise_for_httperror' in kwargs:
+ do_raise_for_httperror = kwargs['raise_for_httperror']
+ del kwargs['raise_for_httperror']
+ return do_raise_for_httperror
+
+ @staticmethod
+ def patch_response(response, do_raise_for_httperror):
+ if isinstance(response, httpx.Response):
+ # requests compatibility (response is not streamed)
+ # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
+ response.ok = not response.is_error
+
+ # raise an exception
+ if do_raise_for_httperror:
+ raise_for_httperror(response)
+
+ return response
+
def is_valid_response(self, response):
# pylint: disable=too-many-boolean-expressions
if (
@@ -239,6 +261,7 @@ class Network:
async def call_client(self, stream, method, url, **kwargs):
retries = self.retries
was_disconnected = False
+ do_raise_for_httperror = Network.extract_do_raise_for_httperror(kwargs)
kwargs_clients = Network.extract_kwargs_clients(kwargs)
while retries >= 0: # pragma: no cover
client = await self.get_client(**kwargs_clients)
@@ -248,7 +271,7 @@ class Network:
else:
response = await client.request(method, url, **kwargs)
if self.is_valid_response(response) or retries <= 0:
- return response
+ return Network.patch_response(response, do_raise_for_httperror)
except httpx.RemoteProtocolError as e:
if not was_disconnected:
# the server has closed the connection: