summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author asciimoo <asciimoo@gmail.com> 2013-11-09 18:39:20 +0100
committer asciimoo <asciimoo@gmail.com> 2013-11-09 18:39:20 +0100
commit 17bf00ee42583910e45794e1438a2bab459225ad (patch)
tree f1fc78646676c40c39cda1de9b69cf5dc39ff263
parent 14a53e343085083d0ddf271adff2d13ca1c94f99 (diff)
download searxng-17bf00ee42583910e45794e1438a2bab459225ad.tar.gz
searxng-17bf00ee42583910e45794e1438a2bab459225ad.zip
[enh] removing result html tags
-rw-r--r-- searx/engines/duckduckgo.py 3
-rw-r--r-- searx/engines/startpage.py 4
-rw-r--r-- searx/engines/twitter.py 3
-rw-r--r-- searx/engines/xpath.py 3
4 files changed, 7 insertions, 6 deletions
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index 33f56f469..d591854a5 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -1,5 +1,6 @@
from json import loads
from urllib import urlencode
+from searx.utils import html_to_text
url = 'https://duckduckgo.com/'
search_url = url + 'd.js?{query}&l=us-en&p=1&s=0'
@@ -16,7 +17,7 @@ def response(resp):
if not r.get('t'):
continue
results.append({'title': r['t']
- ,'content': r['a']
+ ,'content': html_to_text(r['a'])
,'url': r['u']
})
return results
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 47273e6e7..061c8158d 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -1,4 +1,4 @@
-from urllib import quote
+from urllib import urlencode
from lxml import html
from urlparse import urlparse
from cgi import escape
@@ -8,7 +8,7 @@ search_url = base_url+'do/search'
def request(query, params):
global search_url
- query = quote(query.replace(' ', '+'), safe='+')
+ query = urlencode({'q': query})[2:]
params['url'] = search_url
params['method'] = 'POST'
params['data'] = {'query': query}
diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py
index d0a0aef17..f9d9e26ad 100644
--- a/searx/engines/twitter.py
+++ b/searx/engines/twitter.py
@@ -1,6 +1,7 @@
from urlparse import urljoin
from urllib import urlencode
from lxml import html
+from cgi import escape
categories = ['social media']
@@ -21,6 +22,6 @@ def response(resp):
link = tweet.xpath('.//small[@class="time"]//a')[0]
url = urljoin(base_url, link.attrib.get('href'))
title = ''.join(tweet.xpath('.//span[@class="username js-action-profile-name"]//text()'))
- content = ''.join(map(html.tostring, tweet.xpath('.//p[@class="js-tweet-text tweet-text"]//*')))
+ content = escape(''.join(tweet.xpath('.//p[@class="js-tweet-text tweet-text"]//text()')))
results.append({'url': url, 'title': title, 'content': content})
return results
diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
index 8c2e04d5c..2743dc2a0 100644
--- a/searx/engines/xpath.py
+++ b/searx/engines/xpath.py
@@ -46,12 +46,11 @@ def request(query, params):
def response(resp):
results = []
dom = html.fromstring(resp.text)
- query = resp.search_params['query']
if results_xpath:
for result in dom.xpath(results_xpath):
url = extract_url(result.xpath(url_xpath))
title = ' '.join(result.xpath(title_xpath))
- content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
+ content = escape(' '.join(result.xpath(content_xpath)))
results.append({'url': url, 'title': title, 'content': content})
else:
for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)):