diff options
author | ahmad-alkadri <ahmad.alkadri@outlook.com> | 2023-01-03 22:59:01 +0100 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarit.de> | 2023-01-08 09:11:16 +0100 |
commit | 9ee99423fe22550ef566245ef23e2a9e8ee76c27 (patch) | |
tree | a08547dcc00788b5c9d5e160ab318e0dd9e53bee /searx/engines/bing.py | |
parent | 4e355564d2e11e4e86826a093c1f450b43adce4c (diff) | |
download | searxng-9ee99423fe22550ef566245ef23e2a9e8ee76c27.tar.gz searxng-9ee99423fe22550ef566245ef23e2a9e8ee76c27.zip |
[fix] Bing-Web engine: XPath to get the wikipedia result
Modify the XPath selector to get the Wikipedia result, plus small fixes.
About the result content: some results, especially the Wikipedia one, contain
several paragraph elements; only the first paragraph is now taken and displayed
in the search result.
Diffstat (limited to 'searx/engines/bing.py')
-rw-r--r-- | searx/engines/bing.py | 14 |
1 files changed, 11 insertions, 3 deletions
```diff
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index 8d024fed0..5c4681cd8 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -4,6 +4,7 @@
 - https://github.com/searx/searx/issues/2019#issuecomment-648227442
 """
+# pylint: disable=too-many-branches
 
 import re
 from urllib.parse import urlencode, urlparse, parse_qs
@@ -74,7 +75,6 @@ def request(query, params):
 
 
 def response(resp):
-
     results = []
     result_len = 0
 
@@ -84,12 +84,20 @@ def response(resp):
 
     url_to_resolve = []
     url_to_resolve_index = []
-    for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')):
+    for i, result in enumerate(eval_xpath_list(dom, '//li[contains(@class, "b_algo")]')):
 
         link = eval_xpath(result, './/h2/a')[0]
         url = link.attrib.get('href')
         title = extract_text(link)
-        content = extract_text(eval_xpath(result, './/p'))
+
+        # Make sure that the element is free of <a href> links and <span class='algoSlug_icon'>
+        content = eval_xpath(result, '(.//p)[1]')
+        for p in content:
+            for e in p.xpath('.//a'):
+                e.getparent().remove(e)
+            for e in p.xpath('.//span[@class="algoSlug_icon"]'):
+                e.getparent().remove(e)
+        content = extract_text(content)
 
         # get the real URL either using the URL shown to user or following the Bing URL
         if url.startswith('https://www.bing.com/ck/a?'):
```