author     Adam Tauber <adam.tauber@balabit.com>  2015-02-02 09:36:43 +0100
committer  Adam Tauber <adam.tauber@balabit.com>  2015-02-02 09:36:43 +0100
commit     7f865356f9a6c1b40d0c668c59b3d081de618bac (patch)
tree       60e9acb27577968a41136c04f248c24871e83860 /searx/engines
parent     03137eebd9fdfaa57452cb364c1bc9f31b243f67 (diff)
parent     5a16077455ef9e821a2b5f5f7e975be8a37ce83d (diff)
Merge branch 'unit-tests' of https://github.com/Cqoicebordel/searx into Cqoicebordel-unit-tests
Conflicts: searx/tests/test_engines.py
Diffstat (limited to 'searx/engines')
-rw-r--r--  searx/engines/bing.py              9
-rw-r--r--  searx/engines/bing_images.py       5
-rw-r--r--  searx/engines/bing_news.py        31
-rw-r--r--  searx/engines/btdigg.py           11
-rw-r--r--  searx/engines/deviantart.py        7
-rw-r--r--  searx/engines/digg.py              2
-rw-r--r--  searx/engines/flickr_noapi.py      2
-rw-r--r--  searx/engines/google_images.py     6
-rw-r--r--  searx/engines/google_news.py       6
-rw-r--r--  searx/engines/kickass.py           8
-rw-r--r--  searx/engines/piratebay.py        12
-rw-r--r--  searx/engines/searchcode_code.py   2
-rw-r--r--  searx/engines/searchcode_doc.py    2
-rw-r--r--  searx/engines/stackoverflow.py     8
-rw-r--r--  searx/engines/vimeo.py             3
-rw-r--r--  searx/engines/www500px.py         11
-rw-r--r--  searx/engines/xpath.py             4
-rw-r--r--  searx/engines/youtube.py           4
18 files changed, 66 insertions(+), 67 deletions(-)
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index 5de461cfe..f9c323d05 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -14,6 +14,7 @@
from urllib import urlencode
from cgi import escape
from lxml import html
+from searx.engines.xpath import extract_text
# engine dependent config
categories = ['general']
@@ -55,8 +56,8 @@ def response(resp):
for result in dom.xpath('//div[@class="sa_cc"]'):
link = result.xpath('.//h3/a')[0]
url = link.attrib.get('href')
- title = ' '.join(link.xpath('.//text()'))
- content = escape(' '.join(result.xpath('.//p//text()')))
+ title = extract_text(link)
+ content = escape(extract_text(result.xpath('.//p')))
# append result
results.append({'url': url,
@@ -71,8 +72,8 @@ def response(resp):
for result in dom.xpath('//li[@class="b_algo"]'):
link = result.xpath('.//h2/a')[0]
url = link.attrib.get('href')
- title = ' '.join(link.xpath('.//text()'))
- content = escape(' '.join(result.xpath('.//p//text()')))
+ title = extract_text(link)
+ content = escape(extract_text(result.xpath('.//p')))
# append result
results.append({'url': url,
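The recurring change in this merge: the ad-hoc ' '.join(node.xpath('.//text()')) pattern is replaced by the shared extract_text helper (defined in searx/engines/xpath.py, amended further below). For a single element, extract_text reduces to roughly text_content() plus strip(), which avoids the stray whitespace the join pattern keeps; a minimal illustration (markup is made up for the example):

    from lxml import html

    link = html.fromstring('<a> Example <b>result</b> title</a>')
    # old pattern: every text node joined, doubled/stray spaces kept
    old_title = ' '.join(link.xpath('.//text()'))   # ' Example  result  title'
    # roughly what extract_text does for a single element
    new_title = link.text_content().strip()         # 'Example result title'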
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index 9ae498427..9d1c22f5a 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -33,7 +33,10 @@ def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
# required for cookie
- language = 'en-US'
+ if params['language'] == 'all':
+ language = 'en-US'
+ else:
+ language = params['language'].replace('_', '-')
search_path = search_string.format(
query=urlencode({'q': query}),
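searx stores locales as xx_XX and uses 'all' for "no preference", while the Bing cookie expects IETF-style xx-XX, hence the mapping added above. As a stand-alone sketch (function name hypothetical):

    def bing_cookie_language(param_language):
        # 'all' means no preference; fall back to en-US for the cookie
        if param_language == 'all':
            return 'en-US'
        return param_language.replace('_', '-')

    assert bing_cookie_language('all') == 'en-US'
    assert bing_cookie_language('fr_FR') == 'fr-FR'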
diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py
index 789a23b89..e6adb2644 100644
--- a/searx/engines/bing_news.py
+++ b/searx/engines/bing_news.py
@@ -15,6 +15,7 @@ from lxml import html
from datetime import datetime, timedelta
from dateutil import parser
import re
+from searx.engines.xpath import extract_text
# engine dependent config
categories = ['news']
@@ -42,6 +43,7 @@ def request(query, params):
params['cookies']['_FP'] = "ui=en-US"
params['url'] = base_url + search_path
+
return params
@@ -55,44 +57,35 @@ def response(resp):
for result in dom.xpath('//div[@class="sn_r"]'):
link = result.xpath('.//div[@class="newstitle"]/a')[0]
url = link.attrib.get('href')
- title = ' '.join(link.xpath('.//text()'))
- contentXPath = result.xpath('.//div[@class="sn_txt"]/div'
- '//span[@class="sn_snip"]//text()')
- if contentXPath is not None:
- content = escape(' '.join(contentXPath))
+ title = extract_text(link)
+ contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]')
+ content = escape(extract_text(contentXPath))
# parse publishedDate
publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
'//span[contains(@class,"sn_ST")]'
- '//span[contains(@class,"sn_tm")]'
- '//text()')
- if publishedDateXPath is not None:
- publishedDate = escape(' '.join(publishedDateXPath))
+ '//span[contains(@class,"sn_tm")]')
+
+ publishedDate = escape(extract_text(publishedDateXPath))
if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
timeNumbers = re.findall(r'\d+', publishedDate)
- publishedDate = datetime.now()\
- - timedelta(minutes=int(timeNumbers[0]))
+ publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0]))
elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
timeNumbers = re.findall(r'\d+', publishedDate)
- publishedDate = datetime.now()\
- - timedelta(hours=int(timeNumbers[0]))
- elif re.match("^[0-9]+ hour(s|),"
- " [0-9]+ minute(s|) ago$", publishedDate):
+ publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0]))
+ elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
timeNumbers = re.findall(r'\d+', publishedDate)
publishedDate = datetime.now()\
- timedelta(hours=int(timeNumbers[0]))\
- timedelta(minutes=int(timeNumbers[1]))
elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
timeNumbers = re.findall(r'\d+', publishedDate)
- publishedDate = datetime.now()\
- - timedelta(days=int(timeNumbers[0]))
+ publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0]))
else:
try:
- # FIXME use params['language'] to parse either mm/dd or dd/mm
publishedDate = parser.parse(publishedDate, dayfirst=False)
except TypeError:
- # FIXME
publishedDate = datetime.now()
# append result
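The date handling above normalizes Bing News' relative timestamps ("5 minutes ago", "2 hours, 10 minutes ago") into absolute datetimes, falling back to dateutil for anything else. Condensed into one stand-alone helper (name hypothetical), the same logic reads:

    import re
    from datetime import datetime, timedelta
    from dateutil import parser

    def parse_bing_news_date(text):
        numbers = re.findall(r'\d+', text)
        if re.match(r'^[0-9]+ minute(s|) ago$', text):
            return datetime.now() - timedelta(minutes=int(numbers[0]))
        if re.match(r'^[0-9]+ hour(s|) ago$', text):
            return datetime.now() - timedelta(hours=int(numbers[0]))
        if re.match(r'^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$', text):
            return datetime.now() - timedelta(hours=int(numbers[0]),
                                              minutes=int(numbers[1]))
        if re.match(r'^[0-9]+ day(s|) ago$', text):
            return datetime.now() - timedelta(days=int(numbers[0]))
        try:
            # absolute dates; dayfirst handling is still locale-naive here
            return parser.parse(text, dayfirst=False)
        except TypeError:
            return datetime.now()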
diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py
index 973ede9ac..944250628 100644
--- a/searx/engines/btdigg.py
+++ b/searx/engines/btdigg.py
@@ -23,11 +23,6 @@ paging = True
url = 'https://btdigg.org'
search_url = url + '/search?q={search_term}&p={pageno}'
-# specific xpath variables
-magnet_xpath = './/a[@title="Torrent magnet link"]'
-torrent_xpath = './/a[@title="Download torrent file"]'
-content_xpath = './/span[@class="font11px lightgrey block"]'
-
# do search-request
def request(query, params):
@@ -52,8 +47,8 @@ def response(resp):
# parse results
for result in search_res:
link = result.xpath('.//td[@class="torrent_name"]//a')[0]
- href = urljoin(url, link.attrib['href'])
- title = escape(extract_text(link.xpath('.//text()')))
+ href = urljoin(url, link.attrib.get('href'))
+ title = escape(extract_text(link))
content = escape(extract_text(result.xpath('.//pre[@class="snippet"]')[0]))
content = "<br />".join(content.split("\n"))
@@ -81,7 +76,7 @@ def response(resp):
filesize = int(filesize * 1024 * 1024 * 1024)
elif filesize_multiplier == 'MB':
filesize = int(filesize * 1024 * 1024)
- elif filesize_multiplier == 'kb':
+ elif filesize_multiplier == 'KB':
filesize = int(filesize * 1024)
except:
filesize = None
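The multiplier fix matters because btdigg prints sizes with an upper-case 'KB' suffix, so the old lower-case comparison meant kilobyte results were never converted. The 1024-based conversion as a stand-alone sketch (any TB branch above the visible hunk is omitted); the same fix is applied to kickass.py below:

    def to_bytes(filesize, filesize_multiplier):
        # binary multiples, matching the engine's conversion
        if filesize_multiplier == 'GB':
            return int(filesize * 1024 * 1024 * 1024)
        elif filesize_multiplier == 'MB':
            return int(filesize * 1024 * 1024)
        elif filesize_multiplier == 'KB':
            return int(filesize * 1024)
        return None

    assert to_bytes(1.5, 'KB') == 1536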
diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py
index 6284cf598..4198e8c76 100644
--- a/searx/engines/deviantart.py
+++ b/searx/engines/deviantart.py
@@ -14,6 +14,7 @@ from urllib import urlencode
from urlparse import urljoin
from lxml import html
import re
+from searx.engines.xpath import extract_text
# engine dependent config
categories = ['images']
@@ -50,9 +51,9 @@ def response(resp):
for result in dom.xpath('//div[contains(@class, "tt-a tt-fh")]'):
link = result.xpath('.//a[contains(@class, "thumb")]')[0]
url = urljoin(base_url, link.attrib.get('href'))
- title_links = result.xpath('.//span[@class="details"]//a[contains(@class, "t")]') # noqa
- title = ''.join(title_links[0].xpath('.//text()'))
- thumbnail_src = link.xpath('.//img')[0].attrib['src']
+ title_links = result.xpath('.//span[@class="details"]//a[contains(@class, "t")]')
+ title = extract_text(title_links[0])
+ thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
img_src = regex.sub('/', thumbnail_src)
# append result
diff --git a/searx/engines/digg.py b/searx/engines/digg.py
index 8c457d6b9..1b5f2c8e4 100644
--- a/searx/engines/digg.py
+++ b/searx/engines/digg.py
@@ -44,7 +44,7 @@ def response(resp):
search_result = loads(resp.text)
- if search_result['html'] == '':
+ if 'html' not in search_result or search_result['html'] == '':
return results
dom = html.fromstring(search_result['html'])
diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py
index 9b28ded13..3a83fdc65 100644
--- a/searx/engines/flickr_noapi.py
+++ b/searx/engines/flickr_noapi.py
@@ -21,7 +21,7 @@ logger = logger.getChild('flickr-noapi')
categories = ['images']
url = 'https://secure.flickr.com/'
-search_url = url+'search/?{query}&page={page}'
+search_url = url + 'search/?{query}&page={page}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
regex = re.compile(r"\"search-photos-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL)
image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's')
diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py
index cc62a4fd2..092ae6639 100644
--- a/searx/engines/google_images.py
+++ b/searx/engines/google_images.py
@@ -18,7 +18,7 @@ paging = True
# search-url
url = 'https://ajax.googleapis.com/'
-search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe=off&filter=off&{query}' # noqa
+search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe=off&filter=off&{query}'
# do search-request
@@ -45,14 +45,14 @@ def response(resp):
for result in search_res['responseData']['results']:
href = result['originalContextUrl']
title = result['title']
- if not result['url']:
+ if 'url' not in result:
continue
thumbnail_src = result['tbUrl']
# append result
results.append({'url': href,
'title': title,
- 'content': '',
+ 'content': result['content'],
'thumbnail_src': thumbnail_src,
'img_src': unquote(result['url']),
'template': 'images.html'})
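The guard change is not cosmetic: `if not result['url']` raises KeyError when the API omits the key entirely, while the membership test skips such entries safely. The same guard is added to google_news.py and youtube.py below. Sketch (payload illustrative):

    result = {'title': 'example', 'tbUrl': 'https://example.org/t.jpg'}
    if 'url' not in result:   # safe even though 'url' is missing
        pass                  # skip this result
    # whereas result['url'] would raise KeyError here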
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index eb114f9c9..3e4371b99 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -20,7 +20,7 @@ language_support = True
# engine dependent config
url = 'https://ajax.googleapis.com/'
-search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}' # noqa
+search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={lang}'
# do search-request
@@ -33,7 +33,7 @@ def request(query, params):
params['url'] = search_url.format(offset=offset,
query=urlencode({'q': query}),
- language=language)
+ lang=language)
return params
@@ -52,6 +52,8 @@ def response(resp):
for result in search_res['responseData']['results']:
# parse publishedDate
publishedDate = parser.parse(result['publishedDate'])
+ if 'url' not in result:
+ continue
# append result
results.append({'url': result['unescapedUrl'],
diff --git a/searx/engines/kickass.py b/searx/engines/kickass.py
index ac349283d..8b89e1f47 100644
--- a/searx/engines/kickass.py
+++ b/searx/engines/kickass.py
@@ -13,6 +13,7 @@ from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter
+from searx.engines.xpath import extract_text
# engine dependent config
categories = ['videos', 'music', 'files']
@@ -56,9 +57,8 @@ def response(resp):
for result in search_res[1:]:
link = result.xpath('.//a[@class="cellMainLink"]')[0]
href = urljoin(url, link.attrib['href'])
- title = ' '.join(link.xpath('.//text()'))
- content = escape(html.tostring(result.xpath(content_xpath)[0],
- method="text"))
+ title = extract_text(link)
+ content = escape(extract_text(result.xpath(content_xpath)))
seed = result.xpath('.//td[contains(@class, "green")]/text()')[0]
leech = result.xpath('.//td[contains(@class, "red")]/text()')[0]
filesize = result.xpath('.//td[contains(@class, "nobr")]/text()')[0]
@@ -88,7 +88,7 @@ def response(resp):
filesize = int(filesize * 1024 * 1024 * 1024)
elif filesize_multiplier == 'MB':
filesize = int(filesize * 1024 * 1024)
- elif filesize_multiplier == 'kb':
+ elif filesize_multiplier == 'KB':
filesize = int(filesize * 1024)
except:
filesize = None
diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py
index f6144faa2..207df276c 100644
--- a/searx/engines/piratebay.py
+++ b/searx/engines/piratebay.py
@@ -13,6 +13,7 @@ from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter
+from searx.engines.xpath import extract_text
# engine dependent config
categories = ['videos', 'music', 'files']
@@ -29,7 +30,8 @@ search_types = {'files': '0',
# specific xpath variables
magnet_xpath = './/a[@title="Download this torrent using magnet"]'
-content_xpath = './/font[@class="detDesc"]//text()'
+torrent_xpath = './/a[@title="Download this torrent"]'
+content_xpath = './/font[@class="detDesc"]'
# do search-request
@@ -59,8 +61,8 @@ def response(resp):
for result in search_res[1:]:
link = result.xpath('.//div[@class="detName"]//a')[0]
href = urljoin(url, link.attrib.get('href'))
- title = ' '.join(link.xpath('.//text()'))
- content = escape(' '.join(result.xpath(content_xpath)))
+ title = extract_text(link)
+ content = escape(extract_text(result.xpath(content_xpath)))
seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]
# convert seed to int if possible
@@ -76,6 +78,7 @@ def response(resp):
leech = 0
magnetlink = result.xpath(magnet_xpath)[0]
+ torrentfile = result.xpath(torrent_xpath)[0]
# append result
results.append({'url': href,
@@ -83,7 +86,8 @@ def response(resp):
'content': content,
'seed': seed,
'leech': leech,
- 'magnetlink': magnetlink.attrib['href'],
+ 'magnetlink': magnetlink.attrib.get('href'),
+ 'torrentfile': torrentfile.attrib.get('href'),
'template': 'torrent.html'})
# return results sorted by seeder
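With the second title-matched xpath, each result now carries both the magnet URI and the .torrent link for the torrent.html template. A minimal extraction sketch (markup and URLs are made up for the example):

    from lxml import html

    row = html.fromstring(
        '<div>'
        '<a title="Download this torrent using magnet" href="magnet:?xt=urn:btih:x">m</a>'
        '<a title="Download this torrent" href="//example.org/1.torrent">t</a>'
        '</div>')
    magnet_href = row.xpath('.//a[@title="Download this torrent using magnet"]')[0].attrib.get('href')
    torrent_href = row.xpath('.//a[@title="Download this torrent"]')[0].attrib.get('href')
    # magnet_href == 'magnet:?xt=urn:btih:x'; torrent_href == '//example.org/1.torrent'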
diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py
index 655818da2..f276697b1 100644
--- a/searx/engines/searchcode_code.py
+++ b/searx/engines/searchcode_code.py
@@ -42,7 +42,7 @@ def response(resp):
search_results = loads(resp.text)
# parse results
- for result in search_results['results']:
+ for result in search_results.get('results', []):
href = result['url']
title = "" + result['name'] + " - " + result['filename']
repo = result['repo']
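dict.get with a default lets the loop degrade to zero results instead of raising KeyError when the searchcode API returns a payload without a 'results' key (e.g. an error object). The same guard is applied to searchcode_doc.py below. Sketch (payload illustrative):

    from json import loads

    search_results = loads('{"status": "error"}')
    for result in search_results.get('results', []):
        pass   # loop body never runs; no KeyError raised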
diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py
index b5b7159be..76da8d752 100644
--- a/searx/engines/searchcode_doc.py
+++ b/searx/engines/searchcode_doc.py
@@ -35,7 +35,7 @@ def response(resp):
search_results = loads(resp.text)
# parse results
- for result in search_results['results']:
+ for result in search_results.get('results', []):
href = result['url']
title = "[" + result['type'] + "] " +\
result['namespace'] +\
diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py
index dcbb1890c..78dba9f68 100644
--- a/searx/engines/stackoverflow.py
+++ b/searx/engines/stackoverflow.py
@@ -12,6 +12,7 @@ from urlparse import urljoin
from cgi import escape
from urllib import urlencode
from lxml import html
+from searx.engines.xpath import extract_text
# engine dependent config
categories = ['it']
@@ -24,8 +25,7 @@ search_url = url+'search?{query}&page={pageno}'
# specific xpath variables
results_xpath = '//div[contains(@class,"question-summary")]'
link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a'
-title_xpath = './/text()'
-content_xpath = './/div[@class="excerpt"]//text()'
+content_xpath = './/div[@class="excerpt"]'
# do search-request
@@ -46,8 +46,8 @@ def response(resp):
for result in dom.xpath(results_xpath):
link = result.xpath(link_xpath)[0]
href = urljoin(url, link.attrib.get('href'))
- title = escape(' '.join(link.xpath(title_xpath)))
- content = escape(' '.join(result.xpath(content_xpath)))
+ title = escape(extract_text(link))
+ content = escape(extract_text(result.xpath(content_xpath)))
# append result
results.append({'url': href,
diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py
index 39033c591..7577d12e1 100644
--- a/searx/engines/vimeo.py
+++ b/searx/engines/vimeo.py
@@ -59,8 +59,7 @@ def response(resp):
url = base_url + videoid
title = p.unescape(extract_text(result.xpath(title_xpath)))
thumbnail = extract_text(result.xpath(content_xpath)[0])
- publishedDate = parser.parse(extract_text(
- result.xpath(publishedDate_xpath)[0]))
+ publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
embedded = embedded_url.format(videoid=videoid)
# append result
diff --git a/searx/engines/www500px.py b/searx/engines/www500px.py
index f25678c24..99dba4abf 100644
--- a/searx/engines/www500px.py
+++ b/searx/engines/www500px.py
@@ -15,6 +15,7 @@ from urllib import urlencode
from urlparse import urljoin
from lxml import html
import re
+from searx.engines.xpath import extract_text
# engine dependent config
categories = ['images']
@@ -22,7 +23,7 @@ paging = True
# search-url
base_url = 'https://500px.com'
-search_url = base_url+'/search?search?page={pageno}&type=photos&{query}'
+search_url = base_url + '/search?search?page={pageno}&type=photos&{query}'
# do search-request
@@ -44,11 +45,11 @@ def response(resp):
for result in dom.xpath('//div[@class="photo"]'):
link = result.xpath('.//a')[0]
url = urljoin(base_url, link.attrib.get('href'))
- title = result.xpath('.//div[@class="title"]//text()')[0]
- thumbnail_src = link.xpath('.//img')[0].attrib['src']
+ title = extract_text(result.xpath('.//div[@class="title"]'))
+ thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
# To have a bigger thumbnail, uncomment the next line
- #thumbnail_src = regex.sub('4.jpg', thumbnail_src)
- content = result.xpath('.//div[@class="info"]//text()')[0]
+ # thumbnail_src = regex.sub('4.jpg', thumbnail_src)
+ content = extract_text(result.xpath('.//div[@class="info"]'))
img_src = regex.sub('2048.jpg', thumbnail_src)
# append result
diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
index 72120304e..1a599dc0a 100644
--- a/searx/engines/xpath.py
+++ b/searx/engines/xpath.py
@@ -28,13 +28,13 @@ def extract_text(xpath_results):
result = ''
for e in xpath_results:
result = result + extract_text(e)
- return result
+ return result.strip()
elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
# it's a string
return ''.join(xpath_results)
else:
# it's a element
- return html_to_text(xpath_results.text_content())
+ return html_to_text(xpath_results.text_content()).strip()
def extract_url(xpath_results, search_url):
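extract_text recurses over node lists, passes lxml string results through, and takes text_content() of elements; the added .strip() calls trim leading/trailing whitespace in every branch, which is what lets the engines above drop their manual joins. An approximation of its behavior (the real function also normalizes via searx's html_to_text):

    from lxml import html

    dom = html.fromstring('<div><p> hello <b>world</b> </p></div>')
    # element input: text_content(), stripped
    text = dom.xpath('//p')[0].text_content().strip()              # 'hello world'
    # list input: concatenate per-element text, then strip
    text = ''.join(e.text_content() for e in dom.xpath('//p')).strip()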
diff --git a/searx/engines/youtube.py b/searx/engines/youtube.py
index 59f07c574..1375538a8 100644
--- a/searx/engines/youtube.py
+++ b/searx/engines/youtube.py
@@ -57,7 +57,7 @@ def response(resp):
url = [x['href'] for x in result['link'] if x['type'] == 'text/html']
if not url:
- return
+ continue
# remove tracking
url = url[0].replace('feature=youtube_gdata', '')
@@ -73,7 +73,7 @@ def response(resp):
pubdate = result['published']['$t']
publishedDate = parser.parse(pubdate)
- if result['media$group']['media$thumbnail']:
+ if 'media$thumbnail' in result['media$group']:
thumbnail = result['media$group']['media$thumbnail'][0]['url']
content = result['content']['$t']