diff options
author | Angristan <11699655+Angristan@users.noreply.github.com> | 2018-08-19 13:30:41 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-08-19 13:30:41 +0200 |
commit | c2da901afab28cc13794511709f70a0c76edc659 (patch) | |
tree | 7ef93394bf5c2a3d854ecd6fe2d04d3a6f7e3b4c | |
parent | b75f1b6cc39a94989a74d52eb0f1267c3e3c665e (diff) | |
parent | 3126660be5e85a18ee386f49104d3bbb158a6386 (diff) | |
download | searxng-c2da901afab28cc13794511709f70a0c76edc659.tar.gz searxng-c2da901afab28cc13794511709f70a0c76edc659.zip |
Merge branch 'master' into patch-2
-rw-r--r-- | searx/answerers/random/answerer.py | 24 | ||||
-rw-r--r-- | searx/engines/duden.py | 76 | ||||
-rw-r--r-- | searx/settings.yml | 5 | ||||
-rw-r--r-- | tests/unit/engines/test_duden.py | 41 |
4 files changed, 143 insertions, 3 deletions
diff --git a/searx/answerers/random/answerer.py b/searx/answerers/random/answerer.py index f2b8bf3e5..b6e8422ad 100644 --- a/searx/answerers/random/answerer.py +++ b/searx/answerers/random/answerer.py @@ -1,6 +1,8 @@ +import hashlib import random import string import sys +import uuid from flask_babel import gettext # required answerer attribute @@ -16,9 +18,13 @@ else: random_string_letters = string.ascii_lowercase + string.digits + string.ascii_uppercase +def random_characters(): + return [random.choice(random_string_letters) + for _ in range(random.randint(8, 32))] + + def random_string(): - return u''.join(random.choice(random_string_letters) - for _ in range(random.randint(8, 32))) + return u''.join(random_characters()) def random_float(): @@ -29,9 +35,21 @@ def random_int(): return unicode(random.randint(-random_int_max, random_int_max)) +def random_sha256(): + m = hashlib.sha256() + m.update(b''.join(random_characters())) + return unicode(m.hexdigest()) + + +def random_uuid(): + return unicode(uuid.uuid4()) + + random_types = {b'string': random_string, b'int': random_int, - b'float': random_float} + b'float': random_float, + b'sha256': random_sha256, + b'uuid': random_uuid} # required answerer function diff --git a/searx/engines/duden.py b/searx/engines/duden.py new file mode 100644 index 000000000..881ff9d9c --- /dev/null +++ b/searx/engines/duden.py @@ -0,0 +1,76 @@ +""" + Duden + @website https://www.duden.de + @provide-api no + @using-api no + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, content +""" + +from lxml import html, etree +import re +from searx.engines.xpath import extract_text +from searx.url_utils import quote +from searx import logger + +categories = ['general'] +paging = True +language_support = False + +# search-url +base_url = 'https://www.duden.de/' +search_url = base_url + 'suchen/dudenonline/{query}?page={offset}' + + +def request(query, params): + '''pre-request callback + params<dict>: + method : POST/GET + headers : {} + data : {} # if method == POST + url : '' + category: 'search category' + pageno : 1 # number of the requested page + ''' + + offset = (params['pageno'] - 1) + params['url'] = search_url.format(offset=offset, query=quote(query)) + return params + + +def response(resp): + '''post-response callback + resp: requests response object + ''' + results = [] + + dom = html.fromstring(resp.text) + + try: + number_of_results_string = re.sub('[^0-9]', '', dom.xpath( + '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0] + ) + + results.append({'number_of_results': int(number_of_results_string)}) + + except: + logger.debug("Couldn't read number of results.") + pass + + for result in dom.xpath('//section[@class="wide" and not(contains(@style,"overflow:hidden"))]'): + try: + logger.debug("running for %s" % str(result)) + link = result.xpath('.//h2/a')[0] + url = link.attrib.get('href') + title = result.xpath('string(.//h2/a)') + content = extract_text(result.xpath('.//p')) + # append result + results.append({'url': url, + 'title': title, + 'content': content}) + except: + logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True)) + continue + + return results diff --git a/searx/settings.yml b/searx/settings.yml index 0c6f642fc..1168ad7a4 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -714,6 +714,11 @@ engines: shortcut : 1337x disabled : True + - name : Duden + engine : duden + shortcut : du + disabled : True + # - name : yacy # engine : yacy # shortcut : ya diff --git a/tests/unit/engines/test_duden.py b/tests/unit/engines/test_duden.py new file mode 100644 index 000000000..d9bbfef8b --- /dev/null +++ b/tests/unit/engines/test_duden.py @@ -0,0 +1,41 @@ +from collections import defaultdict +import mock +from searx.engines import duden +from searx.testing import SearxTestCase +from datetime import datetime + + +class TestDudenEngine(SearxTestCase): + + def test_request(self): + query = 'Haus' + dic = defaultdict(dict) + dic['pageno'] = 1 + params = duden.request(query, dic) + self.assertTrue('url' in params) + self.assertTrue(query in params['url']) + self.assertTrue('duden.de' in params['url']) + + def test_response(self): + resp = mock.Mock(text='<html></html>') + self.assertEqual(duden.response(resp), []) + + html = """ + <section class="wide"> + <h2><a href="https://this.is.the.url/" class="hidden-link"><strong>This is the title</strong> also here</a></h2> + <p>This is the <strong>content</strong></p> + <a href="https://this.is.the.url/">Zum vollständigen Artikel</a> + </section> + """ + + resp = mock.Mock(text=html) + results = duden.response(resp) + + self.assertEqual(len(results), 1) + self.assertEqual(type(results), list) + + # testing result (dictionary entry) + r = results[0] + self.assertEqual(r['url'], 'https://this.is.the.url/') + self.assertEqual(r['title'], 'This is the title also here') + self.assertEqual(r['content'], 'This is the content') |