summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Kirill Isakov <ukwt@ya.ru> 2016-03-27 00:49:57 +0600
committer Kirill Isakov <ukwt@ya.ru> 2016-03-27 00:49:57 +0600
commit 547b8a87653d87b8be85710275a66be1bec1e39c (patch)
tree ef07430b8ce5dab281f50792056a43b3e0558add
parent 7fbc12ee4e6aea8a8ad0098deb03054976056371 (diff)
download searxng-547b8a87653d87b8be85710275a66be1bec1e39c.tar.gz
download searxng-547b8a87653d87b8be85710275a66be1bec1e39c.zip
Add Tokyo Toshokan search engine
-rw-r--r-- searx/engines/tokyotoshokan.py 102
-rw-r--r-- searx/settings.yml 6
-rw-r--r-- tests/unit/engines/test_tokyotoshokan.py 110
3 files changed, 218 insertions, 0 deletions
diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py
new file mode 100644
index 000000000..17e8e2191
--- /dev/null
+++ b/searx/engines/tokyotoshokan.py
@@ -0,0 +1,102 @@
+"""
+ Tokyo Toshokan (A BitTorrent Library for Japanese Media)
+
+ @website https://www.tokyotosho.info/
+ @provide-api no
+ @using-api no
+ @results HTML
+ @stable no (HTML can change)
+ @parse url, title, publishedDate, seed, leech,
+ filesize, magnetlink, content
+"""
+
+import re
+from cgi import escape
+from urllib import urlencode
+from lxml import html
+from searx.engines.xpath import extract_text
+from datetime import datetime
+from searx.engines.nyaa import int_or_zero, get_filesize_mul
+
+# engine dependent config
+# NOTE(review): these module-level names are presumably read by the searx
+# engine loader -- confirm against searx/engines/__init__.py
+categories = ['files', 'videos', 'music']
+# request() below forwards params['pageno'] to the site as the `page` argument
+paging = True
+
+# search-url
+# `{query}` is filled in by request() with the url-encoded query string
+base_url = 'https://www.tokyotosho.info/'
+search_url = base_url + 'search.php?{query}'
+
+
+# do search-request
+def request(query, params):
+    """Store the Tokyo Toshokan search URL for *query* in ``params['url']``.
+
+    :param query: search terms, sent to the site as the ``terms`` argument
+    :param params: request parameters; ``params['pageno']`` is read and sent
+        as the ``page`` argument, ``params['url']`` is written
+    :returns: the same ``params`` mapping, updated in place
+    """
+    url_args = {'page': params['pageno'],
+                'terms': query}
+    encoded_args = urlencode(url_args)
+    params['url'] = search_url.format(query=encoded_args)
+    return params
+
+
+# get response from search-request
+def response(resp):
+    """Parse a Tokyo Toshokan result page into a list of torrent results.
+
+    Every torrent occupies two consecutive ``category_0`` table rows: the
+    first one carries the (optional) magnet link and the torrent link, the
+    second one a ``|``-separated description plus the seeder/leecher stats.
+
+    :param resp: response object; only its ``text`` attribute (HTML) is read
+    :returns: list of result dicts using the ``torrent.html`` template.
+        ``url`` and ``title`` are always present; ``magnetlink``,
+        ``filesize``, ``publishedDate``, ``content``, ``seed`` and ``leech``
+        are only present when they could be parsed.  An empty list is
+        returned when no rows are found or the row count is odd, which
+        means the page layout is not the expected one.
+    """
+    results = []
+
+    dom = html.fromstring(resp.text)
+    rows = dom.xpath('//table[@class="listing"]//tr[contains(@class, "category_0")]')
+
+    # check if there are no results or page layout was changed so we cannot parse it
+    # currently there are two rows for each result, so total count must be even
+    if not rows or len(rows) % 2 != 0:
+        return []
+
+    # regular expression for parsing torrent size strings; a raw string keeps
+    # \s and \d from being treated as (invalid) string escapes
+    size_re = re.compile(r'Size:\s*([\d.]+)(TB|GB|MB|KB|B)', re.IGNORECASE)
+
+    # processing the results, two rows at a time
+    for name_row, info_row in zip(rows[::2], rows[1::2]):
+        # parse the first row
+        links = name_row.xpath('./td[@class="desc-top"]/a')
+        if not links:
+            # without any link there is neither url nor title to report
+            continue
+
+        params = {
+            'template': 'torrent.html',
+            'url': links[-1].attrib.get('href'),
+            'title': extract_text(links[-1])
+        }
+        # I have not yet seen any torrents without magnet links, but
+        # it's better to be prepared to stumble upon one some day
+        if len(links) == 2:
+            magnet = links[0].attrib.get('href')
+            # the anchor may lack an href attribute altogether
+            if magnet and magnet.startswith('magnet'):
+                # okay, we have a valid magnet link, let's add it to the result
+                params['magnetlink'] = magnet
+
+        # no more info in the first row, start parsing the second one
+        desc_cells = info_row.xpath('./td[@class="desc-bot"]')
+        desc = extract_text(desc_cells[0]) if desc_cells else ''
+        for item in desc.split('|'):
+            item = item.strip()
+            if item.startswith('Size:'):
+                # ('1.228', 'GB')
+                match = size_re.match(item)
+                if match is None:
+                    # unknown unit or malformed number: leave filesize unset
+                    continue
+                value, unit = match.groups()
+                try:
+                    multiplier = get_filesize_mul(unit)
+                    params['filesize'] = int(multiplier * float(value))
+                except Exception:
+                    # deliberate best effort: get_filesize_mul lives in the
+                    # nyaa engine and may not know every unit matched above,
+                    # and float() rejects strings such as '1.2.3'
+                    pass
+            elif item.startswith('Date:'):
+                try:
+                    # Date: 2016-02-21 21:44 UTC
+                    date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC')
+                    params['publishedDate'] = date
+                except ValueError:
+                    # unparsable date: leave publishedDate unset
+                    pass
+            elif item.startswith('Comment:'):
+                params['content'] = item
+
+        stats = info_row.xpath('./td[@class="stats"]/span')
+        # has the layout not changed yet?
+        if len(stats) == 3:
+            params['seed'] = int_or_zero(extract_text(stats[0]))
+            params['leech'] = int_or_zero(extract_text(stats[1]))
+
+        results.append(params)
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 2dce06fd0..7ecb477b7 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -271,6 +271,12 @@ engines:
shortcut : sw
disabled : True
+ - name : tokyotoshokan
+ engine : tokyotoshokan
+ shortcut : tt
+ timeout : 6.0
+ disabled : True
+
- name : torrentz
engine : torrentz
timeout : 5.0
diff --git a/tests/unit/engines/test_tokyotoshokan.py b/tests/unit/engines/test_tokyotoshokan.py
new file mode 100644
index 000000000..efe7dbfc2
--- /dev/null
+++ b/tests/unit/engines/test_tokyotoshokan.py
@@ -0,0 +1,110 @@
+import mock
+from collections import defaultdict
+from searx.engines import tokyotoshokan
+from searx.testing import SearxTestCase
+from datetime import datetime
+
+
+class TestTokyotoshokanEngine(SearxTestCase):
+    """Unit tests for the ``tokyotoshokan`` engine's request/response hooks."""
+
+    def test_request(self):
+        """The built URL targets tokyotosho.info and contains the query."""
+        query = 'test_query'
+        dic = defaultdict(dict)
+        dic['pageno'] = 1
+        params = tokyotoshokan.request(query, dic)
+        self.assertIn('url', params)
+        self.assertIn(query, params['url'])
+        self.assertIn('tokyotosho.info', params['url'])
+
+    def test_response(self):
+        """Well-formed rows are fully parsed; malformed fields are omitted."""
+        # a page without the listing table yields no results
+        resp = mock.Mock(text='<html></html>')
+        self.assertEqual(tokyotoshokan.response(resp), [])
+
+        html = """
+ <table class="listing">
+ <tbody>
+ <tr class="shade category_0">
+ <td rowspan="2">
+ <a href="/?cat=7"><span class="sprite_cat-raw"></span></a>
+ </td>
+ <td class="desc-top">
+ <a href="magnet:?xt=urn:btih:4c19eb46b5113685fbd2288ed2531b0b">
+ <span class="sprite_magnet"></span>
+ </a>
+ <a rel="nofollow" type="application/x-bittorrent" href="http://www.nyaa.se/f">
+ Koyomimonogatari
+ </a>
+ </td>
+ <td class="web"><a rel="nofollow" href="details.php?id=975700">Details</a></td>
+ </tr>
+ <tr class="shade category_0">
+ <td class="desc-bot">
+ Authorized: <span class="auth_ok">Yes</span>
+ Submitter: <a href="?username=Ohys">Ohys</a> |
+ Size: 10.5MB |
+ Date: 2016-03-26 16:41 UTC |
+ Comment: sample comment
+ </td>
+ <td style="color: #BBB; font-family: monospace" class="stats" align="right">
+ S: <span style="color: red">53</span>
+ L: <span style="color: red">18</span>
+ C: <span style="color: red">0</span>
+ ID: 975700
+ </td>
+ </tr>
+
+ <tr class="category_0">
+ <td rowspan="2">
+ <a href="/?cat=7"><span class="sprite_cat-raw"></span></a>
+ </td>
+ <td class="desc-top">
+ <a rel="nofollow" type="application/x-bittorrent" href="http://google.com/q">
+ Owarimonogatari
+ </a>
+ </td>
+ <td class="web"><a rel="nofollow" href="details.php?id=975700">Details</a></td>
+ </tr>
+ <tr class="category_0">
+ <td class="desc-bot">
+ Submitter: <a href="?username=Ohys">Ohys</a> |
+ Size: 932.84EB |
+ Date: QWERTY-03-26 16:41 UTC
+ </td>
+ <td style="color: #BBB; font-family: monospace" class="stats" align="right">
+ S: <span style="color: red">0</span>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+ """
+
+        resp = mock.Mock(text=html)
+        results = tokyotoshokan.response(resp)
+
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+
+        # testing the first result, which has correct format
+        # and should have all information fields filled
+        r = results[0]
+        self.assertEqual(r['url'], 'http://www.nyaa.se/f')
+        self.assertEqual(r['title'], 'Koyomimonogatari')
+        self.assertEqual(r['magnetlink'], 'magnet:?xt=urn:btih:4c19eb46b5113685fbd2288ed2531b0b')
+        self.assertEqual(r['filesize'], int(1024 * 1024 * 10.5))
+        # plain `3`, not `03`: a leading zero is an octal literal on Python 2
+        # and a syntax error on Python 3
+        self.assertEqual(r['publishedDate'], datetime(2016, 3, 26, 16, 41))
+        self.assertEqual(r['content'], 'Comment: sample comment')
+        self.assertEqual(r['seed'], 53)
+        self.assertEqual(r['leech'], 18)
+
+        # testing the second result, which does not include magnet link,
+        # seed & leech info, and has incorrect size & creation date
+        r = results[1]
+        self.assertEqual(r['url'], 'http://google.com/q')
+        self.assertEqual(r['title'], 'Owarimonogatari')
+
+        self.assertNotIn('magnetlink', r)
+        self.assertNotIn('filesize', r)
+        self.assertNotIn('content', r)
+        self.assertNotIn('publishedDate', r)
+        self.assertNotIn('seed', r)
+        self.assertNotIn('leech', r)