summaryrefslogtreecommitdiff
path: root/searx/engines
diff options
context:
space:
mode:
authorApply55gx <Apply55gx@users.noreply.github.com>2017-10-25 10:44:28 +0200
committerGitHub <noreply@github.com>2017-10-25 10:44:28 +0200
commitd800e3fcfa44bc0be7262092815b2d2020a9d9f3 (patch)
tree0bdc64b3e15592e2fdeeaa40f21cbcc8039b7949 /searx/engines
parent18a4e7035f72a3c31239ae0bd1ee67cc2ad354b8 (diff)
parentb34124fd8a6b020136ca9656acdb01afceabe96f (diff)
downloadsearxng-d800e3fcfa44bc0be7262092815b2d2020a9d9f3.tar.gz
searxng-d800e3fcfa44bc0be7262092815b2d2020a9d9f3.zip
Merge pull request #1 from asciimoo/master
-
Diffstat (limited to 'searx/engines')
-rw-r--r--searx/engines/bing_images.py56
-rw-r--r--searx/engines/bing_videos.py5
-rw-r--r--searx/engines/blekko_images.py70
-rw-r--r--searx/engines/digg.py7
-rw-r--r--searx/engines/duckduckgo.py2
-rw-r--r--searx/engines/faroo.py58
-rw-r--r--searx/engines/generalfile.py62
-rw-r--r--searx/engines/gigablast.py8
-rw-r--r--searx/engines/google_news.py4
-rw-r--r--searx/engines/nyaa.py89
-rw-r--r--searx/engines/swisscows.py2
-rw-r--r--searx/engines/tokyotoshokan.py5
-rw-r--r--searx/engines/torrentz.py38
13 files changed, 151 insertions, 255 deletions
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index 6300c94e4..15679056c 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -18,7 +18,6 @@
from lxml import html
from json import loads
import re
-from searx.engines.bing import _fetch_supported_languages, supported_languages_url
from searx.url_utils import urlencode
# engine dependent config
@@ -26,6 +25,8 @@ categories = ['images']
paging = True
safesearch = True
time_range_support = True
+language_support = True
+supported_languages_url = 'https://www.bing.com/account/general'
# search-url
base_url = 'https://www.bing.com/'
@@ -45,23 +46,41 @@ safesearch_types = {2: 'STRICT',
_quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
+# get supported region code
+def get_region_code(lang, lang_list=None):
+ region = None
+ if lang in (lang_list or supported_languages):
+ region = lang
+ elif lang.startswith('no'):
+ region = 'nb-NO'
+ else:
+ # try to get a supported country code with language
+ lang = lang.split('-')[0]
+ for lc in (lang_list or supported_languages):
+ if lang == lc.split('-')[0]:
+ region = lc
+ break
+ if region:
+ return region.lower()
+ else:
+ return 'en-us'
+
+
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
- # required for cookie
- if params['language'] == 'all':
- language = 'en-US'
- else:
- language = params['language']
-
search_path = search_string.format(
query=urlencode({'q': query}),
offset=offset)
+ language = get_region_code(params['language'])
+
params['cookies']['SRCHHPGUSR'] = \
- 'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\
- '&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
+ 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
+
+ params['cookies']['_EDGE_S'] = 'mkt=' + language +\
+ '&ui=' + language + '&F=1'
params['url'] = base_url + search_path
if params['time_range'] in time_range_dict:
@@ -106,3 +125,22 @@ def response(resp):
# return results
return results
+
+
+# get supported languages from their site
+def _fetch_supported_languages(resp):
+ supported_languages = []
+ dom = html.fromstring(resp.text)
+
+ regions_xpath = '//div[@id="region-section-content"]' \
+ + '//ul[@class="b_vList"]/li/a/@href'
+
+ regions = dom.xpath(regions_xpath)
+ for region in regions:
+ code = re.search('setmkt=[^\&]+', region).group()[7:]
+ if code == 'nb-NO':
+ code = 'no-NO'
+
+ supported_languages.append(code)
+
+ return supported_languages
diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py
index 918064c9b..bd91bce37 100644
--- a/searx/engines/bing_videos.py
+++ b/searx/engines/bing_videos.py
@@ -12,6 +12,7 @@
from json import loads
from lxml import html
+from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url, get_region_code
from searx.engines.xpath import extract_text
from searx.url_utils import urlencode
@@ -21,6 +22,7 @@ paging = True
safesearch = True
time_range_support = True
number_of_results = 10
+language_support = True
search_url = 'https://www.bing.com/videos/asyncv2?{query}&async=content&'\
'first={offset}&count={number_of_results}&CW=1366&CH=25&FORM=R5VR5'
@@ -45,7 +47,8 @@ def request(query, params):
'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
# language cookie
- params['cookies']['_EDGE_S'] = 'mkt=' + params['language'].lower() + '&F=1'
+ region = get_region_code(params['language'], lang_list=supported_languages)
+ params['cookies']['_EDGE_S'] = 'mkt=' + region + '&F=1'
# query and paging
params['url'] = search_url.format(query=urlencode({'q': query}),
diff --git a/searx/engines/blekko_images.py b/searx/engines/blekko_images.py
deleted file mode 100644
index f71645634..000000000
--- a/searx/engines/blekko_images.py
+++ /dev/null
@@ -1,70 +0,0 @@
-"""
- Blekko (Images)
-
- @website https://blekko.com
- @provide-api yes (inofficial)
-
- @using-api yes
- @results JSON
- @stable yes
- @parse url, title, img_src
-"""
-
-from json import loads
-from searx.url_utils import urlencode
-
-# engine dependent config
-categories = ['images']
-paging = True
-safesearch = True
-
-# search-url
-base_url = 'https://blekko.com'
-search_url = '/api/images?{query}&c={c}'
-
-# safesearch definitions
-safesearch_types = {2: '1',
- 1: '',
- 0: '0'}
-
-
-# do search-request
-def request(query, params):
- c = (params['pageno'] - 1) * 48
-
- params['url'] = base_url +\
- search_url.format(query=urlencode({'q': query}),
- c=c)
-
- if params['pageno'] != 1:
- params['url'] += '&page={pageno}'.format(pageno=(params['pageno'] - 1))
-
- # let Blekko know we wan't have profiling
- params['cookies']['tag_lesslogging'] = '1'
-
- # parse safesearch argument
- params['cookies']['safesearch'] = safesearch_types.get(params['safesearch'], '')
-
- return params
-
-
-# get response from search-request
-def response(resp):
- results = []
-
- search_results = loads(resp.text)
-
- # return empty array if there are no results
- if not search_results:
- return []
-
- for result in search_results:
- # append result
- results.append({'url': result['page_url'],
- 'title': result['title'],
- 'content': '',
- 'img_src': result['url'],
- 'template': 'images.html'})
-
- # return results
- return results
diff --git a/searx/engines/digg.py b/searx/engines/digg.py
index 606747a4d..4369ccb84 100644
--- a/searx/engines/digg.py
+++ b/searx/engines/digg.py
@@ -10,6 +10,8 @@
@parse url, title, content, publishedDate, thumbnail
"""
+import random
+import string
from dateutil import parser
from json import loads
from lxml import html
@@ -30,12 +32,17 @@ title_xpath = './/h2//a//text()'
content_xpath = './/p//text()'
pubdate_xpath = './/time'
+digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\
+ string.digits + "+_"
+
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10
params['url'] = search_url.format(position=offset,
query=quote_plus(query))
+ params['cookies']['frontend.auid'] = ''.join(random.choice(
+ digg_cookie_chars) for _ in range(22))
return params
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index 407d731f0..921e29f8b 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -134,4 +134,4 @@ def _fetch_supported_languages(resp):
regions_json = loads(response_page)
supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
- return supported_languages
+ return list(supported_languages)
diff --git a/searx/engines/faroo.py b/searx/engines/faroo.py
index e24d1b7dc..7ce3a6ce8 100644
--- a/searx/engines/faroo.py
+++ b/searx/engines/faroo.py
@@ -4,7 +4,7 @@
@website http://www.faroo.com
@provide-api yes (http://www.faroo.com/hp/api/api.html), require API-key
- @using-api yes
+ @using-api no
@results JSON
@stable yes
@parse url, title, content, publishedDate, img_src
@@ -20,18 +20,16 @@ categories = ['general', 'news']
paging = True
language_support = True
number_of_results = 10
-api_key = None
# search-url
url = 'http://www.faroo.com/'
-search_url = url + 'api?{query}'\
- '&start={offset}'\
- '&length={number_of_results}'\
- '&l={language}'\
- '&src={categorie}'\
- '&i=false'\
- '&f=json'\
- '&key={api_key}' # noqa
+search_url = url + 'instant.json?{query}'\
+ '&start={offset}'\
+ '&length={number_of_results}'\
+ '&l={language}'\
+ '&src={categorie}'\
+ '&i=false'\
+ '&c=false'
search_category = {'general': 'web',
'news': 'news'}
@@ -57,21 +55,15 @@ def request(query, params):
number_of_results=number_of_results,
query=urlencode({'q': query}),
language=language,
- categorie=categorie,
- api_key=api_key)
+ categorie=categorie)
- # using searx User-Agent
- params['headers']['User-Agent'] = searx_useragent()
+ params['headers']['Referer'] = url
return params
# get response from search-request
def response(resp):
- # HTTP-Code 401: api-key is not valide
- if resp.status_code == 401:
- raise Exception("API key is not valide")
-
# HTTP-Code 429: rate limit exceeded
if resp.status_code == 429:
raise Exception("rate limit has been exceeded!")
@@ -86,31 +78,19 @@ def response(resp):
# parse results
for result in search_res['results']:
+ publishedDate = None
+ result_json = {'url': result['url'], 'title': result['title'],
+ 'content': result['kwic']}
if result['news']:
- # timestamp (milliseconds since 1970)
- publishedDate = datetime.datetime.fromtimestamp(result['date'] / 1000.0) # noqa
-
- # append news result
- results.append({'url': result['url'],
- 'title': result['title'],
- 'publishedDate': publishedDate,
- 'content': result['kwic']})
-
- else:
- # append general result
- # TODO, publishedDate correct?
- results.append({'url': result['url'],
- 'title': result['title'],
- 'content': result['kwic']})
+ result_json['publishedDate'] = \
+ datetime.datetime.fromtimestamp(result['date'] / 1000.0)
# append image result if image url is set
- # TODO, show results with an image like in faroo
if result['iurl']:
- results.append({'template': 'images.html',
- 'url': result['url'],
- 'title': result['title'],
- 'content': result['kwic'],
- 'img_src': result['iurl']})
+ result_json['template'] = 'videos.html'
+ result_json['thumbnail'] = result['iurl']
+
+ results.append(result_json)
# return results
return results
diff --git a/searx/engines/generalfile.py b/searx/engines/generalfile.py
deleted file mode 100644
index 3bb27444f..000000000
--- a/searx/engines/generalfile.py
+++ /dev/null
@@ -1,62 +0,0 @@
-"""
- General Files (Files)
-
- @website http://www.general-files.org
- @provide-api no (nothing found)
-
- @using-api no (because nothing found)
- @results HTML (using search portal)
- @stable no (HTML can change)
- @parse url, title, content
-
- @todo detect torrents?
-"""
-
-from lxml import html
-
-# engine dependent config
-categories = ['files']
-paging = True
-
-# search-url
-base_url = 'http://www.general-file.com'
-search_url = base_url + '/files-{letter}/{query}/{pageno}'
-
-# specific xpath variables
-result_xpath = '//table[@class="block-file"]'
-title_xpath = './/h2/a//text()'
-url_xpath = './/h2/a/@href'
-content_xpath = './/p//text()'
-
-
-# do search-request
-def request(query, params):
-
- params['url'] = search_url.format(query=query,
- letter=query[0],
- pageno=params['pageno'])
-
- return params
-
-
-# get response from search-request
-def response(resp):
- results = []
-
- dom = html.fromstring(resp.text)
-
- # parse results
- for result in dom.xpath(result_xpath):
- url = result.xpath(url_xpath)[0]
-
- # skip fast download links
- if not url.startswith('/'):
- continue
-
- # append result
- results.append({'url': base_url + url,
- 'title': ''.join(result.xpath(title_xpath)),
- 'content': ''.join(result.xpath(content_xpath))})
-
- # return results
- return results
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index 37933c69b..2bdc97fd1 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -10,6 +10,7 @@
@parse url, title, content
"""
+import random
from json import loads
from time import time
from lxml.html import fromstring
@@ -32,7 +33,8 @@ search_string = 'search?{query}'\
'&qh=0'\
'&qlang={lang}'\
'&ff={safesearch}'\
- '&rxikd={rxikd}' # random number - 9 digits
+ '&rxieu={rxieu}'\
+ '&rand={rxikd}' # current unix timestamp
# specific xpath variables
results_xpath = '//response//result'
@@ -59,10 +61,12 @@ def request(query, params):
else:
safesearch = 0
+ # rxieu is some kind of hash from the search query, but accepts random atm
search_path = search_string.format(query=urlencode({'q': query}),
offset=offset,
number_of_results=number_of_results,
- rxikd=str(time())[:9],
+ rxikd=int(time() * 1000),
+ rxieu=random.randint(1000000000, 9999999999),
lang=language,
safesearch=safesearch)
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index 7344b5289..8881d0dad 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -67,8 +67,8 @@ def response(resp):
for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'):
try:
r = {
- 'url': result.xpath('.//div[@class="_cnc"]//a/@href')[0],
- 'title': ''.join(result.xpath('.//div[@class="_cnc"]//h3//text()')),
+ 'url': result.xpath('.//a[@class="l _PMs"]')[0].attrib.get("href"),
+ 'title': ''.join(result.xpath('.//a[@class="l _PMs"]//text()')),
'content': ''.join(result.xpath('.//div[@class="st"]//text()')),
}
except:
diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py
index 272c712c4..6a8e598c4 100644
--- a/searx/engines/nyaa.py
+++ b/searx/engines/nyaa.py
@@ -1,7 +1,7 @@
"""
- Nyaa.se (Anime Bittorrent tracker)
+ Nyaa.si (Anime Bittorrent tracker)
- @website http://www.nyaa.se/
+ @website http://www.nyaa.si/
@provide-api no
@using-api no
@results HTML
@@ -12,50 +12,25 @@
from lxml import html
from searx.engines.xpath import extract_text
from searx.url_utils import urlencode
+from searx.utils import get_torrent_size, int_or_zero
# engine dependent config
categories = ['files', 'images', 'videos', 'music']
paging = True
# search-url
-base_url = 'http://www.nyaa.se/'
+base_url = 'http://www.nyaa.si/'
search_url = base_url + '?page=search&{query}&offset={offset}'
# xpath queries
-xpath_results = '//table[@class="tlist"]//tr[contains(@class, "tlistrow")]'
-xpath_category = './/td[@class="tlisticon"]/a'
-xpath_title = './/td[@class="tlistname"]/a'
-xpath_torrent_file = './/td[@class="tlistdownload"]/a'
-xpath_filesize = './/td[@class="tlistsize"]/text()'
-xpath_seeds = './/td[@class="tlistsn"]/text()'
-xpath_leeches = './/td[@class="tlistln"]/text()'
-xpath_downloads = './/td[@class="tlistdn"]/text()'
-
-
-# convert a variable to integer or return 0 if it's not a number
-def int_or_zero(num):
- if isinstance(num, list):
- if len(num) < 1:
- return 0
- num = num[0]
- if num.isdigit():
- return int(num)
- return 0
-
-
-# get multiplier to convert torrent size to bytes
-def get_filesize_mul(suffix):
- return {
- 'KB': 1024,
- 'MB': 1024 ** 2,
- 'GB': 1024 ** 3,
- 'TB': 1024 ** 4,
-
- 'KIB': 1024,
- 'MIB': 1024 ** 2,
- 'GIB': 1024 ** 3,
- 'TIB': 1024 ** 4
- }[str(suffix).upper()]
+xpath_results = '//table[contains(@class, "torrent-list")]//tr[not(th)]'
+xpath_category = './/td[1]/a[1]'
+xpath_title = './/td[2]/a[last()]'
+xpath_torrent_links = './/td[3]/a'
+xpath_filesize = './/td[4]/text()'
+xpath_seeds = './/td[6]/text()'
+xpath_leeches = './/td[7]/text()'
+xpath_downloads = './/td[8]/text()'
# do search-request
@@ -72,25 +47,32 @@ def response(resp):
dom = html.fromstring(resp.text)
for result in dom.xpath(xpath_results):
+ # defaults
+ filesize = 0
+ magnet_link = ""
+ torrent_link = ""
+
# category in which our torrent belongs
- category = result.xpath(xpath_category)[0].attrib.get('title')
+ try:
+ category = result.xpath(xpath_category)[0].attrib.get('title')
+ except:
+ pass
# torrent title
page_a = result.xpath(xpath_title)[0]
title = extract_text(page_a)
# link to the page
- href = page_a.attrib.get('href')
+ href = base_url + page_a.attrib.get('href')
- # link to the torrent file
- torrent_link = result.xpath(xpath_torrent_file)[0].attrib.get('href')
-
- # torrent size
- try:
- file_size, suffix = result.xpath(xpath_filesize)[0].split(' ')
- file_size = int(float(file_size) * get_filesize_mul(suffix))
- except:
- file_size = None
+ for link in result.xpath(xpath_torrent_links):
+ url = link.attrib.get('href')
+ if 'magnet' in url:
+ # link to the magnet
+ magnet_link = url
+ else:
+ # link to the torrent file
+ torrent_link = url
# seed count
seed = int_or_zero(result.xpath(xpath_seeds))
@@ -101,6 +83,14 @@ def response(resp):
# torrent downloads count
downloads = int_or_zero(result.xpath(xpath_downloads))
+ # let's try to calculate the torrent size
+ try:
+ filesize_info = result.xpath(xpath_filesize)[0]
+ filesize, filesize_multiplier = filesize_info.split()
+ filesize = get_torrent_size(filesize, filesize_multiplier)
+ except:
+ pass
+
# content string contains all information not included into template
content = 'Category: "{category}". Downloaded {downloads} times.'
content = content.format(category=category, downloads=downloads)
@@ -110,8 +100,9 @@ def response(resp):
'content': content,
'seed': seed,
'leech': leech,
- 'filesize': file_size,
+ 'filesize': filesize,
'torrentfile': torrent_link,
+ 'magnetlink': magnet_link,
'template': 'torrent.html'})
return results
diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py
index e9c13ca24..00346a7d0 100644
--- a/searx/engines/swisscows.py
+++ b/searx/engines/swisscows.py
@@ -118,7 +118,7 @@ def _fetch_supported_languages(resp):
dom = fromstring(resp.text)
options = dom.xpath('//div[@id="regions-popup"]//ul/li/a')
for option in options:
- code = option.xpath('./@data-val')[0]
+ code = option.xpath('./@data-search-language')[0]
if code.startswith('nb-'):
code = code.replace('nb', 'no', 1)
supported_languages.append(code)
diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py
index 9a6b5e57d..773212043 100644
--- a/searx/engines/tokyotoshokan.py
+++ b/searx/engines/tokyotoshokan.py
@@ -14,8 +14,8 @@ import re
from lxml import html
from searx.engines.xpath import extract_text
from datetime import datetime
-from searx.engines.nyaa import int_or_zero, get_filesize_mul
from searx.url_utils import urlencode
+from searx.utils import get_torrent_size, int_or_zero
# engine dependent config
categories = ['files', 'videos', 'music']
@@ -76,8 +76,7 @@ def response(resp):
try:
# ('1.228', 'GB')
groups = size_re.match(item).groups()
- multiplier = get_filesize_mul(groups[1])
- params['filesize'] = int(multiplier * float(groups[0]))
+ params['filesize'] = get_torrent_size(groups[0], groups[1])
except:
pass
elif item.startswith('Date:'):
diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py
index dda56fc22..fd4164a66 100644
--- a/searx/engines/torrentz.py
+++ b/searx/engines/torrentz.py
@@ -1,7 +1,7 @@
"""
- Torrentz.eu (BitTorrent meta-search engine)
+ Torrentz2.eu (BitTorrent meta-search engine)
- @website https://torrentz.eu/
+ @website https://torrentz2.eu/
@provide-api no
@using-api no
@@ -14,24 +14,24 @@
import re
from lxml import html
from datetime import datetime
-from searx.engines.nyaa import int_or_zero, get_filesize_mul
from searx.engines.xpath import extract_text
from searx.url_utils import urlencode
+from searx.utils import get_torrent_size
# engine dependent config
categories = ['files', 'videos', 'music']
paging = True
# search-url
-# https://torrentz.eu/search?f=EXAMPLE&p=6
-base_url = 'https://torrentz.eu/'
+# https://torrentz2.eu/search?f=EXAMPLE&p=6
+base_url = 'https://torrentz2.eu/'
search_url = base_url + 'search?{query}'
# do search-request
def request(query, params):
page = params['pageno'] - 1
- query = urlencode({'q': query, 'p': page})
+ query = urlencode({'f': query, 'p': page})
params['url'] = search_url.format(query=query)
return params
@@ -54,22 +54,29 @@ def response(resp):
# extract url and remove a slash in the beginning
link = links[0].attrib.get('href').lstrip('/')
- seed = result.xpath('./dd/span[@class="u"]/text()')[0].replace(',', '')
- leech = result.xpath('./dd/span[@class="d"]/text()')[0].replace(',', '')
+ seed = 0
+ leech = 0
+ try:
+ seed = int(result.xpath('./dd/span[4]/text()')[0].replace(',', ''))
+ leech = int(result.xpath('./dd/span[5]/text()')[0].replace(',', ''))
+ except:
+ pass
params = {
'url': base_url + link,
'title': title,
- 'seed': int_or_zero(seed),
- 'leech': int_or_zero(leech),
+ 'seed': seed,
+ 'leech': leech,
'template': 'torrent.html'
}
# let's try to calculate the torrent size
try:
- size_str = result.xpath('./dd/span[@class="s"]/text()')[0]
- size, suffix = size_str.split()
- params['filesize'] = int(size) * get_filesize_mul(suffix)
+ filesize_info = result.xpath('./dd/span[3]/text()')[0]
+ filesize, filesize_multiplier = filesize_info.split()
+ filesize = get_torrent_size(filesize, filesize_multiplier)
+
+ params['filesize'] = filesize
except:
pass
@@ -80,9 +87,8 @@ def response(resp):
# extract and convert creation date
try:
- date_str = result.xpath('./dd/span[@class="a"]/span')[0].attrib.get('title')
- # Fri, 25 Mar 2016 16:29:01
- date = datetime.strptime(date_str, '%a, %d %b %Y %H:%M:%S')
+ date_ts = result.xpath('./dd/span[2]')[0].attrib.get('title')
+ date = datetime.fromtimestamp(float(date_ts))
params['publishedDate'] = date
except:
pass