1 files changed, 34 insertions, 15 deletions
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index 4c037de85..3d4ac08bd 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -8,7 +8,8 @@
 import re
 from urllib.parse import urlencode, urlparse, parse_qs
 from lxml import html
-from searx.utils import eval_xpath, extract_text, match_language
+from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language
+from searx.network import multi_requests, Request
 
 about = {
     "website": 'https://www.bing.com',
@@ -79,30 +80,48 @@ def response(resp):
 
     dom = html.fromstring(resp.text)
 
-    for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
-
-        # IMO //div[@class="sa_cc"] does no longer match
-        logger.debug('found //div[@class="sa_cc"] --> %s', result)
-
-        link = eval_xpath(result, './/h3/a')[0]
-        url = link.attrib.get('href')
-        title = extract_text(link)
-        content = extract_text(eval_xpath(result, './/p'))
-
-        # append result
-        results.append({'url': url, 'title': title, 'content': content})
-
     # parse results again if nothing is found yet
-    for result in eval_xpath(dom, '//li[@class="b_algo"]'):
+
+    url_to_resolve = []
+    url_to_resolve_index = []
+    for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')):
 
         link = eval_xpath(result, './/h2/a')[0]
         url = link.attrib.get('href')
         title = extract_text(link)
         content = extract_text(eval_xpath(result, './/p'))
 
+        # get the real URL either using the URL shown to user or following the Bing URL
+        if url.startswith('https://www.bing.com/ck/a?'):
+            url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
+            # Bing can shorten the URL either at the end or in the middle of the string
+            if (
+                url_cite.startswith('https://')
+                and '…' not in url_cite
+                and '...' not in url_cite
+                and '›' not in url_cite
+            ):
+                # no need for an additional HTTP request
+                url = url_cite
+            else:
+                # resolve the URL with an additional HTTP request
+                url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
+                url_to_resolve_index.append(i)
+                url = None  # remove the result if the HTTP Bing redirect raise an exception
+
         # append result
         results.append({'url': url, 'title': title, 'content': content})
 
+    # resolve all Bing redirections in parallel
+    request_list = [
+        Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
+    ]
+    response_list = multi_requests(request_list)
+    for i, redirect_response in enumerate(response_list):
+        if not isinstance(redirect_response, Exception):
+            results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
+
+    # get number_of_results
     try:
         result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
         if "-" in result_len_container: