summaryrefslogtreecommitdiff
path: root/searx/engines/bing.py
diff options
context:
space:
mode:
author: ahmad-alkadri <ahmad.alkadri@outlook.com> 2023-01-03 22:59:01 +0100
committer: Markus Heiser <markus.heiser@darmarit.de> 2023-01-08 09:11:16 +0100
commit: 9ee99423fe22550ef566245ef23e2a9e8ee76c27 (patch)
tree: a08547dcc00788b5c9d5e160ab318e0dd9e53bee /searx/engines/bing.py
parent: 4e355564d2e11e4e86826a093c1f450b43adce4c (diff)
download: searxng-9ee99423fe22550ef566245ef23e2a9e8ee76c27.tar.gz
searxng-9ee99423fe22550ef566245ef23e2a9e8ee76c27.zip
[fix] Bing-Web engine: XPath to get the wikipedia result
Modify the XPath selector to get the Wikipedia result, plus small fixes. About the result content: a result (especially the Wikipedia result) can contain several paragraph elements; only the first paragraph is now taken and displayed in the search result.
Diffstat (limited to 'searx/engines/bing.py')
-rw-r--r-- searx/engines/bing.py 14
1 files changed, 11 insertions, 3 deletions
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index 8d024fed0..5c4681cd8 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -4,6 +4,7 @@
- https://github.com/searx/searx/issues/2019#issuecomment-648227442
"""
+# pylint: disable=too-many-branches
import re
from urllib.parse import urlencode, urlparse, parse_qs
@@ -74,7 +75,6 @@ def request(query, params):
def response(resp):
-
results = []
result_len = 0
@@ -84,12 +84,20 @@ def response(resp):
url_to_resolve = []
url_to_resolve_index = []
- for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')):
+ for i, result in enumerate(eval_xpath_list(dom, '//li[contains(@class, "b_algo")]')):
link = eval_xpath(result, './/h2/a')[0]
url = link.attrib.get('href')
title = extract_text(link)
- content = extract_text(eval_xpath(result, './/p'))
+
+ # Make sure that the element is free of <a href> links and <span class='algoSlug_icon'>
+ content = eval_xpath(result, '(.//p)[1]')
+ for p in content:
+ for e in p.xpath('.//a'):
+ e.getparent().remove(e)
+ for e in p.xpath('.//span[@class="algoSlug_icon"]'):
+ e.getparent().remove(e)
+ content = extract_text(content)
# get the real URL either using the URL shown to user or following the Bing URL
if url.startswith('https://www.bing.com/ck/a?'):