author | Ahmad Alkadri <ahmad.alkadri@outlook.com> | 2023-01-08 19:12:52 +0100 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarit.de> | 2023-01-09 15:08:24 +0100 |
commit | 7fc8d72889742399abb0d327e355ab350ddb395b (patch) | |
tree | 557b602a3e5f1e6e6528511d2cf369238916693b /searx/engines | |
parent | a90ed481ed28ca35bdc9e694c355120ec2ccebb2 (diff) | |
[fix] bing: parsing result; check to see if the element contains links
This patch hardens the parsing of the Bing response (both steps are also illustrated in a standalone sketch after the issue links below):
1. To fix [2087], check whether the selected result item contains a link; if it
does not, skip the item and continue with the next one in the result loop.
Increment the result pointer only when a result has actually been added: the
``enumerate`` counter also counts skipped items and is therefore no longer
valid once result items can be skipped.
To test the bugfix use: ``!bi :all cerbot``
2. Limit the XPath selection of result items to the direct child nodes (list
items ``li``) of the ordered list (``ol``).
To test the selector use: ``!bi :en pontiac aztek wiki``
.. in the result list the Wikipedia entry should appear on top,
compare [2068]
[2087] https://github.com/searxng/searxng/issues/2087
[2068] https://github.com/searxng/searxng/issues/2068
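
To see both hardening steps outside the engine, here is a minimal, self-contained sketch using plain lxml instead of the ``searx.utils`` helpers; ``SAMPLE_HTML``, ``parse_results`` and the simplified result dicts are illustrative only, not engine code:

```python
# Minimal sketch of the hardened parsing loop (plain lxml, illustrative names).
from lxml import html

# Hypothetical response body: one real result and one ad-like item without a link.
SAMPLE_HTML = """
<ol id="b_results">
  <li class="b_algo"><h2><a href="https://en.wikipedia.org/wiki/Pontiac_Aztek">Pontiac Aztek - Wikipedia</a></h2></li>
  <li class="b_algo"><h2>sponsored block without an anchor</h2></li>
</ol>
"""

def parse_results(text):
    dom = html.fromstring(text)
    results = []
    i = 0  # counts only results that were actually appended
    # step 2: select only direct <li> children of the results <ol>
    for result in dom.xpath('//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
        links = result.xpath('.//h2/a')
        if not links:
            # step 1: no link in this item -> skip it, leave the counter untouched
            continue
        link = links[0]
        results.append({'url': link.get('href'), 'title': link.text_content()})
        i += 1  # increment only after a result has been added
    return results

print(parse_results(SAMPLE_HTML))
# [{'url': 'https://en.wikipedia.org/wiki/Pontiac_Aztek', 'title': 'Pontiac Aztek - Wikipedia'}]
```

In the engine this counter appears to index the appended results (see ``url_to_resolve_index`` in the diff below), which is why skipped items must not advance it.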
Diffstat (limited to 'searx/engines')
-rw-r--r-- | searx/engines/bing.py | 11 |
1 file changed, 8 insertions, 3 deletions
```diff
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index 5c4681cd8..783c0056a 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -9,7 +9,7 @@
 import re
 from urllib.parse import urlencode, urlparse, parse_qs
 from lxml import html
-from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language
+from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language, eval_xpath_getindex
 from searx.network import multi_requests, Request
 
 about = {
@@ -84,9 +84,12 @@ def response(resp):
     url_to_resolve = []
     url_to_resolve_index = []
-    for i, result in enumerate(eval_xpath_list(dom, '//li[contains(@class, "b_algo")]')):
+    i = 0
+    for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
 
-        link = eval_xpath(result, './/h2/a')[0]
+        link = eval_xpath_getindex(result, './/h2/a', 0, None)
+        if link is None:
+            continue
         url = link.attrib.get('href')
         title = extract_text(link)
@@ -119,6 +122,8 @@ def response(resp):
         # append result
         results.append({'url': url, 'title': title, 'content': content})
+        # increment result pointer for the next iteration in this loop
+        i += 1
 
     # resolve all Bing redirections in parallel
     request_list = [
```
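
The core of step 1 is swapping the bare ``[0]`` index for ``eval_xpath_getindex(result, './/h2/a', 0, None)``: indexing an empty XPath result raises ``IndexError`` and makes the whole ``response()`` call fail, whereas a get-index-with-default returns ``None`` so the loop can skip just that one item. A small illustration (``xpath_getindex`` below is an illustrative stand-in, not ``searx.utils.eval_xpath_getindex`` itself):

```python
# Illustration of why the patch avoids bare [0] indexing on an XPath result.
from lxml import html

# A result item without an <a> inside its <h2>, like the items issue 2087 trips over.
node = html.fromstring('<li class="b_algo"><h2>no anchor here</h2></li>')

try:
    link = node.xpath('.//h2/a')[0]               # old pattern
except IndexError:
    print('old pattern: IndexError would abort the whole response() parsing')

def xpath_getindex(element, xpath, index, default):
    """Illustrative stand-in for an XPath get-index helper with a default."""
    matches = element.xpath(xpath)
    return matches[index] if len(matches) > index else default

link = xpath_getindex(node, './/h2/a', 0, None)   # new pattern
if link is None:
    print('new pattern: got None -> skip this result item and continue the loop')
```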