diff options
author | Adam Tauber <asciimoo@gmail.com> | 2015-02-01 14:07:34 +0100 |
---|---|---|
committer | Adam Tauber <asciimoo@gmail.com> | 2015-02-01 14:07:34 +0100 |
commit | 03137eebd9fdfaa57452cb364c1bc9f31b243f67 (patch) | |
tree | b95f6f124cb9f2574e0835ec3f182b0d222719e7 | |
parent | 4a20fc202e886eaf7778481c403106e6243f49b7 (diff) | |
parent | a605d0ae698e8a5555935780d83df50b06727f24 (diff) | |
download | searxng-03137eebd9fdfaa57452cb364c1bc9f31b243f67.tar.gz searxng-03137eebd9fdfaa57452cb364c1bc9f31b243f67.zip |
Merge pull request #208 from pointhi/new_engines
add 1x.com engine, improve yacy-engine
-rw-r--r-- | searx/engines/www1x.py | 82 | ||||
-rw-r--r-- | searx/engines/yacy.py | 25 | ||||
-rw-r--r-- | searx/settings.yml | 5 | ||||
-rw-r--r-- | searx/tests/engines/test_www1x.py | 57 | ||||
-rw-r--r-- | searx/tests/test_engines.py | 1 |
5 files changed, 157 insertions, 13 deletions
```diff
diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py
new file mode 100644
index 000000000..a68c105ce
--- /dev/null
+++ b/searx/engines/www1x.py
@@ -0,0 +1,82 @@
+## 1x (Images)
+#
+# @website http://1x.com/
+# @provide-api no
+#
+# @using-api no
+# @results HTML
+# @stable no (HTML can change)
+# @parse url, title, thumbnail, img_src, content
+
+
+from urllib import urlencode
+from urlparse import urljoin
+from lxml import html
+import string
+import re
+
+# engine dependent config
+categories = ['images']
+paging = False
+
+# search-url
+base_url = 'http://1x.com'
+search_url = base_url+'/backend/search.php?{query}'
+
+
+# do search-request
+def request(query, params):
+    params['url'] = search_url.format(query=urlencode({'q': query}))
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    # get links from result-text
+    regex = re.compile('(</a>|<a)')
+    results_parts = re.split(regex, resp.text)
+
+    cur_element = ''
+
+    # iterate over link parts
+    for result_part in results_parts:
+        # processed start and end of link
+        if result_part == '<a':
+            cur_element = result_part
+            continue
+        elif result_part != '</a>':
+            cur_element += result_part
+            continue
+
+        cur_element += result_part
+
+        # fix xml-error
+        cur_element = string.replace(cur_element, '"></a>', '"/></a>')
+
+        dom = html.fromstring(cur_element)
+        link = dom.xpath('//a')[0]
+
+        url = urljoin(base_url, link.attrib.get('href'))
+        title = link.attrib.get('title', '')
+
+        thumbnail_src = urljoin(base_url, link.xpath('.//img')[0].attrib['src'])
+        # TODO: get image with higher resolution
+        img_src = thumbnail_src
+
+        # check if url is showing to a photo
+        if '/photo/' not in url:
+            continue
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'img_src': img_src,
+                        'content': '',
+                        'thumbnail_src': thumbnail_src,
+                        'template': 'images.html'})
+
+    # return results
+    return results
diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py
index 4c4fac7df..17e2a7aab 100644
--- a/searx/engines/yacy.py
+++ b/searx/engines/yacy.py
@@ -68,9 +68,18 @@ def response(resp):
 
     search_results = raw_search_results.get('channels', {})[0].get('items', [])
 
-    if resp.search_params['category'] == 'general':
+    for result in search_results:
+        # parse image results
+        if result.get('image'):
+            # append result
+            results.append({'url': result['url'],
+                            'title': result['title'],
+                            'content': '',
+                            'img_src': result['image'],
+                            'template': 'images.html'})
+
         # parse general results
-        for result in search_results:
+        else:
             publishedDate = parser.parse(result['pubDate'])
 
             # append result
@@ -79,17 +88,7 @@ def response(resp):
                             'content': result['description'],
                             'publishedDate': publishedDate})
 
-    elif resp.search_params['category'] == 'images':
-        # parse image results
-        for result in search_results:
-            # append result
-            results.append({'url': result['url'],
-                            'title': result['title'],
-                            'content': '',
-                            'img_src': result['image'],
-                            'template': 'images.html'})
-
-    #TODO parse video, audio and file results
+        #TODO parse video, audio and file results
 
     # return results
     return results
diff --git a/searx/settings.yml b/searx/settings.yml
index ebae8af62..f4fca8985 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -83,6 +83,11 @@ engines:
     engine : www500px
     shortcut : px
 
+  - name : 1x
+    engine : www1x
+    shortcut : 1x
+    disabled : True
+
   - name : flickr
     categories : images
     shortcut : fl
diff --git a/searx/tests/engines/test_www1x.py b/searx/tests/engines/test_www1x.py
new file mode 100644
index 000000000..ab4f282c1
--- /dev/null
+++ b/searx/tests/engines/test_www1x.py
@@ -0,0 +1,57 @@
+from collections import defaultdict
+import mock
+from searx.engines import www1x
+from searx.testing import SearxTestCase
+
+
+class TestWww1xEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        params = www1x.request(query, defaultdict(dict))
+        self.assertTrue('url' in params)
+        self.assertTrue(query in params['url'])
+        self.assertTrue('1x.com' in params['url'])
+
+    def test_response(self):
+        self.assertRaises(AttributeError, www1x.response, None)
+        self.assertRaises(AttributeError, www1x.response, [])
+        self.assertRaises(AttributeError, www1x.response, '')
+        self.assertRaises(AttributeError, www1x.response, '[]')
+
+        response = mock.Mock(text='<html></html>')
+        self.assertEqual(www1x.response(response), [])
+        html = """
+        <?xml version="1.0" encoding="UTF-8"?><!DOCTYPE characters
+        [
+        <!ELEMENT characters (character*) >
+        <!ELEMENT character (#PCDATA ) >
+
+        <!ENTITY iexcl "¡" >
+        <!ENTITY cent "¢" >
+        <!ENTITY pound "£" >
+        ]
+        ><root><searchresult><![CDATA[<table border="0" cellpadding="0" cellspacing="0" width="100%">
+        <tr>
+        <td style="min-width: 220px;" valign="top">
+        <div style="font-size: 30px; margin: 0px 0px 20px 0px;">Photos</div>
+        <div>
+        <a href="/photo/123456" class="dynamiclink">
+<img border="0" class="searchresult" src="/images/user/testimage-123456.jpg" style="width: 125px; height: 120px;">
+        </a>
+        <a title="sjoerd lammers street photography" href="/member/sjoerdlammers" class="dynamiclink">
+<img border="0" class="searchresult" src="/images/profile/60c48b394c677d2fa4d9e7d263aabf44-square.jpg">
+        </a>
+        </div>
+        </td>
+        </table>
+        ]]></searchresult></root>
+        """
+        response = mock.Mock(text=html)
+        results = www1x.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['url'], 'http://1x.com/photo/123456')
+        self.assertEqual(results[0]['thumbnail_src'], 'http://1x.com/images/user/testimage-123456.jpg')
+        self.assertEqual(results[0]['content'], '')
+        self.assertEqual(results[0]['template'], 'images.html')
diff --git a/searx/tests/test_engines.py b/searx/tests/test_engines.py
index 1ffdbe529..cfd7fa26a 100644
--- a/searx/tests/test_engines.py
+++ b/searx/tests/test_engines.py
@@ -1,2 +1,3 @@
 from searx.tests.engines.test_dummy import * # noqa
 from searx.tests.engines.test_github import * # noqa
+from searx.tests.engines.test_www1x import * # noqa
```