summary | refs | log | tree | commit | diff
path: root/searx/engines/startpage.py
diff options
context:
space:
mode:
author: Dalf <alex@al-f.net> 2014-01-05 14:00:10 +0100
committer: Dalf <alex@al-f.net> 2014-01-05 14:00:10 +0100
commit: a2928e8d8369f129303229d17dfabc896f59441d (patch)
tree: 58ff79295c2a2114dcb07c6be209f1f185555e5f /searx/engines/startpage.py
parent: bf56ec4fb1199ab46feeacb8a045d28b287baf47 (diff)
download: searxng-a2928e8d8369f129303229d17dfabc896f59441d.tar.gz
download: searxng-a2928e8d8369f129303229d17dfabc896f59441d.zip
[fix] startpage engine : characters with diacritic were preceded by whitespace, and cleaner way to parse the result.
Diffstat (limited to 'searx/engines/startpage.py')
-rw-r--r-- searx/engines/startpage.py 11
1 files changed, 5 insertions, 6 deletions
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 061c8158d..87c091e2d 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -19,14 +19,13 @@ def response(resp):
global base_url
results = []
dom = html.fromstring(resp.content)
- for result in dom.xpath('//div[@class="result"]'):
+ # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
+ # not ads : div[@class="result"] are the direct childs of div[@id="results"]
+ for result in dom.xpath('//div[@id="results"]/div[@class="result"]'):
link = result.xpath('.//h3/a')[0]
url = link.attrib.get('href')
parsed_url = urlparse(url)
- # TODO better google link detection
- if parsed_url.netloc.find('www.google.com') >= 0:
- continue
- title = ' '.join(link.xpath('.//text()'))
- content = escape(' '.join(result.xpath('.//p[@class="desc"]//text()')))
+ title = link.text_content()
+ content = result.xpath('./p[@class="desc"]')[0].text_content()
results.append({'url': url, 'title': title, 'content': content})
return results