From 9ee99423fe22550ef566245ef23e2a9e8ee76c27 Mon Sep 17 00:00:00 2001 From: ahmad-alkadri Date: Tue, 3 Jan 2023 22:59:01 +0100 Subject: [fix] Bing-Web engine: XPath to get the wikipedia result Modify the XPath selector to get the wikipedia result plus small fixes. About result content: especially with the Wikipedia result, we'd get several paragraph elements; now only the first paragraph is taken and displayed in the search result. --- searx/engines/bing.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'searx') diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 8d024fed0..5c4681cd8 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -4,6 +4,7 @@ - https://github.com/searx/searx/issues/2019#issuecomment-648227442 """ +# pylint: disable=too-many-branches import re from urllib.parse import urlencode, urlparse, parse_qs @@ -74,7 +75,6 @@ def request(query, params): def response(resp): - results = [] result_len = 0 @@ -84,12 +84,20 @@ def response(resp): url_to_resolve = [] url_to_resolve_index = [] - for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')): + for i, result in enumerate(eval_xpath_list(dom, '//li[contains(@class, "b_algo")]')): link = eval_xpath(result, './/h2/a')[0] url = link.attrib.get('href') title = extract_text(link) - content = extract_text(eval_xpath(result, './/p')) + + # Make sure that the element is free of links and icon spans + content = eval_xpath(result, '(.//p)[1]') + for p in content: + for e in p.xpath('.//a'): + e.getparent().remove(e) + for e in p.xpath('.//span[@class="algoSlug_icon"]'): + e.getparent().remove(e) + content = extract_text(content) # get the real URL either using the URL shown to user or following the Bing URL if url.startswith('https://www.bing.com/ck/a?'): -- cgit v1.2.3-54-g00ecf