summary | refs | log | tree | commit | diff
path: root/searx/engines/xpath.py
diff options
context:
space:
mode:
author: asciimoo <asciimoo@gmail.com> 2013-10-26 13:45:43 +0200
committer: asciimoo <asciimoo@gmail.com> 2013-10-26 13:45:43 +0200
commit: 5d764f95cf44ab4c1ba83d7055297e3c4ea48c98 (patch)
tree: 8737eed994cb7a2010950d140f64206d7e82991e /searx/engines/xpath.py
parent: c09d69bd2c2f0f8d37c03f94a2e6a97636fedba1 (diff)
download: searxng-5d764f95cf44ab4c1ba83d7055297e3c4ea48c98.tar.gz
searxng-5d764f95cf44ab4c1ba83d7055297e3c4ea48c98.zip
[enh] xpath engine absolute xpath support
Diffstat (limited to 'searx/engines/xpath.py')
-rw-r--r-- searx/engines/xpath.py 19
1 files changed, 12 insertions, 7 deletions
diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
index 61672b8cf..00fc3fac2 100644
--- a/searx/engines/xpath.py
+++ b/searx/engines/xpath.py
@@ -5,10 +5,10 @@ from cgi import escape
from lxml.etree import _ElementStringResult
search_url = None
-results_xpath = None
url_xpath = None
content_xpath = None
title_xpath = None
+results_xpath = ''
def extract_url(xpath_results):
url = ''
@@ -26,7 +26,7 @@ def extract_url(xpath_results):
else:
url = xpath_results[0].attrib.get('href')
else:
- raise Exception('Cannot handle xpath url resultset')
+ url = xpath_results.attrib.get('href')
if not url.startswith('http://') or not url.startswith('https://'):
url = 'http://'+url
parsed_url = urlparse(url)
@@ -45,10 +45,15 @@ def response(resp):
results = []
dom = html.fromstring(resp.text)
query = resp.search_params['query']
- for result in dom.xpath(results_xpath):
- url = extract_url(result.xpath(url_xpath))
- title = ' '.join(result.xpath(title_xpath))
- content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
- results.append({'url': url, 'title': title, 'content': content})
+ if results_xpath:
+ for result in dom.xpath(results_xpath):
+ url = extract_url(result.xpath(url_xpath))
+ title = ' '.join(result.xpath(title_xpath))
+ content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
+ results.append({'url': url, 'title': title, 'content': content})
+ else:
+ for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)):
+ results.append({'url': url, 'title': title, 'content': content})
+
return results