diff options
author | Angristan <11699655+Angristan@users.noreply.github.com> | 2018-08-19 13:30:41 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-08-19 13:30:41 +0200 |
commit | c2da901afab28cc13794511709f70a0c76edc659 (patch) | |
tree | 7ef93394bf5c2a3d854ecd6fe2d04d3a6f7e3b4c /searx/engines | |
parent | b75f1b6cc39a94989a74d52eb0f1267c3e3c665e (diff) | |
parent | 3126660be5e85a18ee386f49104d3bbb158a6386 (diff) | |
download | searxng-c2da901afab28cc13794511709f70a0c76edc659.tar.gz searxng-c2da901afab28cc13794511709f70a0c76edc659.zip |
Merge branch 'master' into patch-2
Diffstat (limited to 'searx/engines')
-rw-r--r-- | searx/engines/duden.py | 76 |
1 files changed, 76 insertions, 0 deletions
"""
 Duden
 @website     https://www.duden.de
 @provide-api no
 @using-api   no
 @results     HTML (using search portal)
 @stable      no (HTML can change)
 @parse       url, title, content
"""

import re

from lxml import html, etree

from searx import logger
from searx.engines.xpath import extract_text
from searx.url_utils import quote

categories = ['general']
paging = True
language_support = False

# search-url
base_url = 'https://www.duden.de/'
search_url = base_url + 'suchen/dudenonline/{query}?page={offset}'


def request(query, params):
    '''pre-request callback

    Fills in ``params['url']`` with the Duden search URL for ``query``.

    params<dict>:
      method  : POST/GET
      headers : {}
      data    : {} # if method == POST
      url     : ''
      category: 'search category'
      pageno  : 1 # number of the requested page

    Returns the same ``params`` dict with ``url`` set.
    '''

    # the ``page`` URL parameter is ``pageno - 1``
    offset = params['pageno'] - 1
    params['url'] = search_url.format(offset=offset, query=quote(query))
    return params


def response(resp):
    '''post-response callback

    resp: requests response object

    Returns a list of dicts: optionally one ``{'number_of_results': int}``
    entry (when the counter can be read from the page), followed by one
    ``{'url', 'title', 'content'}`` entry per result section that could be
    parsed.  Sections that fail to parse are logged and skipped.
    '''
    results = []

    dom = html.fromstring(resp.text)

    try:
        counter_text = dom.xpath(
            '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0]
        # keep only the digits of the counter text before converting
        number_of_results_string = re.sub(r'[^0-9]', '', counter_text)
        results.append({'number_of_results': int(number_of_results_string)})
    except (IndexError, ValueError):
        # IndexError: the counter node is missing from the page;
        # ValueError: the counter text contains no digits.
        logger.debug("Couldn't read number of results.")

    for result in dom.xpath('//section[@class="wide" and not(contains(@style,"overflow:hidden"))]'):
        try:
            logger.debug("running for %s", result)
            link = result.xpath('.//h2/a')[0]
            url = link.attrib.get('href')
            title = result.xpath('string(.//h2/a)')
            content = extract_text(result.xpath('.//p'))
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})
        except Exception:
            # best effort: one malformed section must not discard the others
            logger.debug('result parse error in:\n%s',
                         etree.tostring(result, pretty_print=True, encoding='unicode'))

    return results