diff options
Diffstat (limited to 'searx/engines/bing.py')
-rw-r--r-- | searx/engines/bing.py | 49 |
1 files changed, 34 insertions, 15 deletions
diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 4c037de85..3d4ac08bd 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -8,7 +8,8 @@ import re from urllib.parse import urlencode, urlparse, parse_qs from lxml import html -from searx.utils import eval_xpath, extract_text, match_language +from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language +from searx.network import multi_requests, Request about = { "website": 'https://www.bing.com', @@ -79,30 +80,48 @@ def response(resp): dom = html.fromstring(resp.text) - for result in eval_xpath(dom, '//div[@class="sa_cc"]'): - - # IMO //div[@class="sa_cc"] does no longer match - logger.debug('found //div[@class="sa_cc"] --> %s', result) - - link = eval_xpath(result, './/h3/a')[0] - url = link.attrib.get('href') - title = extract_text(link) - content = extract_text(eval_xpath(result, './/p')) - - # append result - results.append({'url': url, 'title': title, 'content': content}) - # parse results again if nothing is found yet - for result in eval_xpath(dom, '//li[@class="b_algo"]'): + + url_to_resolve = [] + url_to_resolve_index = [] + for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')): link = eval_xpath(result, './/h2/a')[0] url = link.attrib.get('href') title = extract_text(link) content = extract_text(eval_xpath(result, './/p')) + # get the real URL either using the URL shown to user or following the Bing URL + if url.startswith('https://www.bing.com/ck/a?'): + url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite')) + # Bing can shorten the URL either at the end or in the middle of the string + if ( + url_cite.startswith('https://') + and '…' not in url_cite + and '...' not in url_cite + and '›' not in url_cite + ): + # no need for an additional HTTP request + url = url_cite + else: + # resolve the URL with an additional HTTP request + url_to_resolve.append(url.replace('&ntb=1', '&ntb=F')) + url_to_resolve_index.append(i) + url = None # remove the result if the HTTP Bing redirect raise an exception + # append result results.append({'url': url, 'title': title, 'content': content}) + # resolve all Bing redirections in parallel + request_list = [ + Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve + ] + response_list = multi_requests(request_list) + for i, redirect_response in enumerate(response_list): + if not isinstance(redirect_response, Exception): + results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location'] + + # get number_of_results try: result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()')) if "-" in result_len_container: |