summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarit.de>2024-05-04 08:45:42 +0200
committerMarkus Heiser <markus.heiser@darmarIT.de>2024-05-04 19:45:52 +0200
commitdbed8da284f0e91bef0fb42606ae10241505b364 (patch)
tree98f0b6f41296543cc63b6fa0a70abc5768eba872
parentd577817646b12c9460765117ffcacff2b65bf18c (diff)
downloadsearxng-dbed8da284f0e91bef0fb42606ae10241505b364.tar.gz
searxng-dbed8da284f0e91bef0fb42606ae10241505b364.zip
[fix] startpage engine: XPath expressions adapted for new HTML layout
Startpage has changed its HTML layout: classes like ``w-gl__result__main`` no longer exist, and the structure of the result items has changed slightly. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
-rw-r--r--searx/engines/startpage.py16
1 files changed, 5 insertions, 11 deletions
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index aa594f0dc..d538a22e4 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -142,9 +142,6 @@ search_url = base_url + '/sp/search'
# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] are the direct childs of div[@id="results"]
-results_xpath = '//div[@class="w-gl__result__main"]'
-link_xpath = './/a[@class="w-gl__result-title result-link"]'
-content_xpath = './/p[@class="w-gl__description"]'
search_form_xpath = '//form[@id="search"]'
"""XPath of Startpage's origin search form
@@ -334,8 +331,8 @@ def _response_cat_web(dom):
results = []
# parse results
- for result in eval_xpath(dom, results_xpath):
- links = eval_xpath(result, link_xpath)
+ for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'):
+ links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]')
if not links:
continue
link = links[0]
@@ -349,12 +346,9 @@ def _response_cat_web(dom):
if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
continue
- title = extract_text(link)
-
- if eval_xpath(result, content_xpath):
- content: str = extract_text(eval_xpath(result, content_xpath)) # type: ignore
- else:
- content = ''
+ title = extract_text(eval_xpath(link, 'h2'))
+ content = eval_xpath(result, './/p[contains(@class, "description")]')
+ content = extract_text(content, allow_none=True) or ''
published_date = None