diff options
author | Kirill Isakov <ukwt@ya.ru> | 2016-03-27 00:49:57 +0600 |
---|---|---|
committer | Kirill Isakov <ukwt@ya.ru> | 2016-03-27 00:49:57 +0600 |
commit | 547b8a87653d87b8be85710275a66be1bec1e39c (patch) | |
tree | ef07430b8ce5dab281f50792056a43b3e0558add | |
parent | 7fbc12ee4e6aea8a8ad0098deb03054976056371 (diff) | |
download | searxng-547b8a87653d87b8be85710275a66be1bec1e39c.tar.gz searxng-547b8a87653d87b8be85710275a66be1bec1e39c.zip |
Add Tokyo Toshokan search engine
-rw-r--r-- | searx/engines/tokyotoshokan.py | 126 | ||||
-rw-r--r-- | searx/settings.yml | 6 | ||||
-rw-r--r-- | tests/unit/engines/test_tokyotoshokan.py | 110 |
3 files changed, 242 insertions, 0 deletions
diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py
new file mode 100644
index 000000000..17e8e2191
--- /dev/null
+++ b/searx/engines/tokyotoshokan.py
@@ -0,0 +1,126 @@
+"""
+ Tokyo Toshokan (A BitTorrent Library for Japanese Media)
+
+ @website      https://www.tokyotosho.info/
+ @provide-api  no
+ @using-api    no
+ @results      HTML
+ @stable       no (HTML can change)
+ @parse        url, title, publishedDate, seed, leech,
+               filesize, magnetlink, content
+"""
+
+import re
+from cgi import escape
+from urllib import urlencode
+from lxml import html
+from searx.engines.xpath import extract_text
+from datetime import datetime
+from searx.engines.nyaa import int_or_zero, get_filesize_mul
+
+# engine dependent config
+categories = ['files', 'videos', 'music']
+paging = True
+
+# search-url
+base_url = 'https://www.tokyotosho.info/'
+search_url = base_url + 'search.php?{query}'
+
+
+# do search-request
+def request(query, params):
+    """Build the search URL and store it in ``params['url']``.
+
+    ``params['pageno']`` is sent as ``page`` and ``query`` as ``terms``;
+    the same ``params`` dict is returned.
+    """
+    query = urlencode({'page': params['pageno'],
+                       'terms': query})
+    params['url'] = search_url.format(query=query)
+    return params
+
+
+# get response from search-request
+def response(resp):
+    """Parse a result page into a list of ``torrent.html`` result dicts.
+
+    Each result always has ``url`` and ``title``; ``magnetlink``,
+    ``filesize``, ``publishedDate``, ``content``, ``seed`` and ``leech``
+    are only set when that piece of the page could be parsed.  An empty
+    list is returned when no result rows are found or their count is odd.
+    """
+    results = []
+
+    dom = html.fromstring(resp.text)
+    rows = dom.xpath('//table[@class="listing"]//tr[contains(@class, "category_0")]')
+
+    # check if there are no results or page layout was changed so we cannot parse it
+    # currently there are two rows for each result, so total count must be even
+    if len(rows) == 0 or len(rows) % 2 != 0:
+        return []
+
+    # regular expression for parsing torrent size strings (KB included)
+    size_re = re.compile(r'Size:\s*([\d.]+)(TB|GB|MB|KB|B)', re.IGNORECASE)
+
+    # processing the results, two rows at a time
+    for i in xrange(0, len(rows), 2):
+        # parse the first row
+        name_row = rows[i]
+
+        links = name_row.xpath('./td[@class="desc-top"]/a')
+        if not links:
+            # no torrent link in this pair of rows, nothing to report
+            continue
+        params = {
+            'template': 'torrent.html',
+            'url': links[-1].attrib.get('href'),
+            'title': extract_text(links[-1])
+        }
+        # I have not yet seen any torrents without magnet links, but
+        # it's better to be prepared to stumble upon one some day
+        if len(links) == 2:
+            magnet = links[0].attrib.get('href')
+            # an anchor without href yields None, so test it before use
+            if magnet and magnet.startswith('magnet'):
+                # okay, we have a valid magnet link, let's add it to the result
+                params['magnetlink'] = magnet
+
+        # no more info in the first row, start parsing the second one
+        info_row = rows[i + 1]
+        desc_cells = info_row.xpath('./td[@class="desc-bot"]')
+        # without a description cell only the optional fields are lost
+        desc = extract_text(desc_cells[0]) if desc_cells else ''
+        for item in desc.split('|'):
+            item = item.strip()
+            if item.startswith('Size:'):
+                match = size_re.match(item)
+                if match is not None:
+                    try:
+                        # ('1.228', 'GB')
+                        number, unit = match.groups()
+                        multiplier = get_filesize_mul(unit)
+                        params['filesize'] = int(multiplier * float(number))
+                    except (KeyError, TypeError, ValueError):
+                        # NOTE(review): assumes get_filesize_mul signals an
+                        # unknown unit with KeyError or None - confirm;
+                        # float() raises ValueError for input like "1.2.3"
+                        pass
+            elif item.startswith('Date:'):
+                try:
+                    # Date: 2016-02-21 21:44 UTC
+                    date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC')
+                    params['publishedDate'] = date
+                except ValueError:
+                    # not in the expected format: leave 'publishedDate' unset
+                    pass
+            elif item.startswith('Comment:'):
+                params['content'] = item
+        stats = info_row.xpath('./td[@class="stats"]/span')
+        # has the layout not changed yet?
+        if len(stats) == 3:
+            params['seed'] = int_or_zero(extract_text(stats[0]))
+            params['leech'] = int_or_zero(extract_text(stats[1]))
+
+        results.append(params)
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 2dce06fd0..7ecb477b7 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -271,6 +271,12 @@ engines:
     shortcut : sw
     disabled : True
 
+  - name : tokyotoshokan
+    engine : tokyotoshokan
+    shortcut : tt
+    timeout : 6.0
+    disabled : True
+
   - name : torrentz
     engine : torrentz
     timeout : 5.0
diff --git a/tests/unit/engines/test_tokyotoshokan.py b/tests/unit/engines/test_tokyotoshokan.py
new file mode 100644
index 000000000..efe7dbfc2
--- /dev/null
+++ b/tests/unit/engines/test_tokyotoshokan.py
@@ -0,0 +1,110 @@
+import mock
+from collections import defaultdict
+from searx.engines import tokyotoshokan
+from searx.testing import SearxTestCase
+from datetime import datetime
+
+
+class TestTokyotoshokanEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dic = defaultdict(dict)
+        dic['pageno'] = 1
+        params = tokyotoshokan.request(query, dic)
+        self.assertTrue('url' in params)
+        self.assertTrue(query in params['url'])
+        self.assertTrue('tokyotosho.info' in params['url'])
+
+    def test_response(self):
+        resp = mock.Mock(text='<html></html>')
+        self.assertEqual(tokyotoshokan.response(resp), [])
+
+        html = """
+        <table class="listing">
+          <tbody>
+            <tr class="shade category_0">
+              <td rowspan="2">
+                <a href="/?cat=7"><span class="sprite_cat-raw"></span></a>
+              </td>
+              <td class="desc-top">
+                <a href="magnet:?xt=urn:btih:4c19eb46b5113685fbd2288ed2531b0b">
+                  <span class="sprite_magnet"></span>
+                </a>
+                <a rel="nofollow" type="application/x-bittorrent" href="http://www.nyaa.se/f">
+                  Koyomimonogatari
+                </a>
+              </td>
+              <td class="web"><a rel="nofollow" href="details.php?id=975700">Details</a></td>
+            </tr>
+            <tr class="shade category_0">
+              <td class="desc-bot">
+                Authorized: <span class="auth_ok">Yes</span>
+                Submitter: <a href="?username=Ohys">Ohys</a> |
+                Size: 10.5MB |
+                Date: 2016-03-26 16:41 UTC |
+                Comment: sample comment
+              </td>
+              <td style="color: #BBB; font-family: monospace" class="stats" align="right">
+                S: <span style="color: red">53</span>
+                L: <span style="color: red">18</span>
+                C: <span style="color: red">0</span>
+                ID: 975700
+              </td>
+            </tr>
+
+            <tr class="category_0">
+              <td rowspan="2">
+                <a href="/?cat=7"><span class="sprite_cat-raw"></span></a>
+              </td>
+              <td class="desc-top">
+                <a rel="nofollow" type="application/x-bittorrent" href="http://google.com/q">
+                  Owarimonogatari
+                </a>
+              </td>
+              <td class="web"><a rel="nofollow" href="details.php?id=975700">Details</a></td>
+            </tr>
+            <tr class="category_0">
+              <td class="desc-bot">
+                Submitter: <a href="?username=Ohys">Ohys</a> |
+                Size: 932.84EB |
+                Date: QWERTY-03-26 16:41 UTC
+              </td>
+              <td style="color: #BBB; font-family: monospace" class="stats" align="right">
+                S: <span style="color: red">0</span>
+              </td>
+            </tr>
+          </tbody>
+        </table>
+        """
+
+        resp = mock.Mock(text=html)
+        results = tokyotoshokan.response(resp)
+
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+
+        # testing the first result, which has correct format
+        # and should have all information fields filled
+        r = results[0]
+        self.assertEqual(r['url'], 'http://www.nyaa.se/f')
+        self.assertEqual(r['title'], 'Koyomimonogatari')
+        self.assertEqual(r['magnetlink'], 'magnet:?xt=urn:btih:4c19eb46b5113685fbd2288ed2531b0b')
+        self.assertEqual(r['filesize'], int(1024 * 1024 * 10.5))
+        self.assertEqual(r['publishedDate'], datetime(2016, 3, 26, 16, 41))
+        self.assertEqual(r['content'], 'Comment: sample comment')
+        self.assertEqual(r['seed'], 53)
+        self.assertEqual(r['leech'], 18)
+
+        # testing the second result, which does not include magnet link,
+        # seed & leech info, and has incorrect size & creation date
+        r = results[1]
+        self.assertEqual(r['url'], 'http://google.com/q')
+        self.assertEqual(r['title'], 'Owarimonogatari')
+
+        self.assertFalse('magnetlink' in r)
+        self.assertFalse('filesize' in r)
+        self.assertFalse('content' in r)
+        self.assertFalse('publishedDate' in r)
+        self.assertFalse('seed' in r)
+        self.assertFalse('leech' in r)