diff options
author | asciimoo <asciimoo@gmail.com> | 2013-10-26 02:22:20 +0200 |
---|---|---|
committer | asciimoo <asciimoo@gmail.com> | 2013-10-26 02:22:20 +0200 |
commit | badd9885459edf76c1f99f6e65feeb24185e88df (patch) | |
tree | 3bbb992ca89b2d20368a2da4d38aa8bddd3b7b21 /searx/engines/xpath.py | |
parent | 89b68242d3b4258b4144f9723b007ffa538d4475 (diff) | |
download | searxng-badd9885459edf76c1f99f6e65feeb24185e88df.tar.gz searxng-badd9885459edf76c1f99f6e65feeb24185e88df.zip |
[enh] xpath engine added
Diffstat (limited to 'searx/engines/xpath.py')
-rw-r--r-- | searx/engines/xpath.py | 54 |
1 file changed, 54 insertions, 0 deletions
from lxml import html
from urllib import urlencode
from urlparse import urlparse, urljoin
from cgi import escape
from lxml.etree import _ElementStringResult

# Engine configuration; these are injected by the searx engine loader
# from settings before request()/response() are called.
search_url = None
results_xpath = None
url_xpath = None
content_xpath = None
title_xpath = None


def extract_url(xpath_results):
    """Normalize a raw xpath url resultset into an absolute http(s) URL.

    ``xpath_results`` is a list: either of string results (text/attribute
    matches, joined together) or of elements whose ``href`` attribute
    holds the URL.

    Raises ``Exception`` when the resultset is empty, not a list, or the
    resulting URL has no network location.
    """
    url = ''
    parsed_search_url = urlparse(search_url)
    if isinstance(xpath_results, list):
        if not len(xpath_results):
            raise Exception('Empty url resultset')
        if isinstance(xpath_results[0], _ElementStringResult):
            url = ''.join(xpath_results)
            if url.startswith('//'):
                # protocol-relative url: prepend "scheme:".  The colon was
                # missing before, producing e.g. "https//example.com".
                url = parsed_search_url.scheme + ':' + url
            elif url.startswith('/'):
                # path-relative url: resolve against the engine's search url
                url = urljoin(search_url, url)
            # TODO: handle other relative forms (e.g. "path/to/page")
        else:
            url = xpath_results[0].attrib.get('href')
    else:
        raise Exception('Cannot handle xpath url resultset')
    # Prefix a default scheme only when NEITHER scheme is present.
    # The original used "or", which is always true, so https urls
    # were mangled into "http://https://...".
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'http://' + url
    parsed_url = urlparse(url)
    if not parsed_url.netloc:
        raise Exception('Cannot parse url')
    return url


def request(query, params):
    """Fill ``params['url']`` with the configured search url for *query*.

    The query is url-encoded; ``[2:]`` strips the leading "q=" produced
    by urlencode.  The encoded query is also stashed in ``params`` so
    response() can highlight it later.
    """
    query = urlencode({'q': query})[2:]
    params['url'] = search_url.format(query=query)
    params['query'] = query
    return params


def response(resp):
    """Parse the engine's HTML response into a list of result dicts.

    Each dict has 'url', 'title' and 'content' keys; the query term is
    wrapped in <b> tags inside the (already escaped) content.
    """
    results = []
    dom = html.fromstring(resp.text)
    query = resp.search_params['query']
    for result in dom.xpath(results_xpath):
        url = extract_url(result.xpath(url_xpath))
        title = ' '.join(result.xpath(title_xpath))
        # Escape first, then mark the query; note the stored query is the
        # url-encoded form, so multi-word queries may not match verbatim
        # content — TODO confirm intended highlighting behavior.
        content = escape(' '.join(result.xpath(content_xpath)))
        content = content.replace(query, '<b>{0}</b>'.format(query))
        results.append({'url': url, 'title': title, 'content': content})

    return results