diff options
author | Alexandre Flament <alex@al-f.net> | 2020-12-03 10:20:40 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-12-03 10:20:40 +0100 |
commit | 6b5a57882242f24f867b6aa14b79b514720c6d83 (patch) | |
tree | e7c598da513be497562df8bd2fe33607ac3d8918 | |
parent | 35a76d91161ffdbd22136e431b4d42022f432141 (diff) | |
parent | bef185723affdc549487e50ae521db61110c3383 (diff) | |
download | searxng-6b5a57882242f24f867b6aa14b79b514720c6d83.tar.gz searxng-6b5a57882242f24f867b6aa14b79b514720c6d83.zip |
Merge pull request #2285 from return42/fix-digg
bugfix & refactor digg engine
-rw-r--r-- | Makefile | 16 | ||||
-rw-r--r-- | searx/engines/digg.py | 69 |
2 files changed, 42 insertions, 43 deletions
@@ -212,15 +212,15 @@ gecko.driver:
 PHONY += test test.sh test.pylint test.pep8 test.unit test.coverage test.robot
 test: buildenv test.pylint test.pep8 test.unit gecko.driver test.robot

+PYLINT_FILES=\
+	searx/preferences.py \
+	searx/testing.py \
+	searx/engines/gigablast.py \
+	searx/engines/deviantart.py \
+	searx/engines/digg.py

-# TODO: balance linting with pylint
 test.pylint: pyenvinstall
-	$(call cmd,pylint,\
-		searx/preferences.py \
-		searx/testing.py \
-		searx/engines/gigablast.py \
-		searx/engines/deviantart.py \
-	)
+	$(call cmd,pylint,$(PYLINT_FILES))
 	$(call cmd,pylint,\
 		--disable=$(PYLINT_SEARX_DISABLE_OPTION) \
 		--additional-builtins=$(PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES) \
@@ -249,7 +249,7 @@ test.sh:

 test.pep8: pyenvinstall
 	@echo "TEST pycodestyle (formerly pep8)"
-	$(Q)$(PY_ENV_ACT); pycodestyle --exclude='searx/static, searx/languages.py, searx/engines/gigablast.py, searx/engines/deviantart.py' \
+	$(Q)$(PY_ENV_ACT); pycodestyle --exclude='searx/static, searx/languages.py, $(foreach f,$(PYLINT_FILES),$(f),)' \
         --max-line-length=120 --ignore "E117,E252,E402,E722,E741,W503,W504,W605" searx tests

 test.unit: pyenvinstall
diff --git a/searx/engines/digg.py b/searx/engines/digg.py
index 831d698bc..85f727f0d 100644
--- a/searx/engines/digg.py
+++ b/searx/engines/digg.py
@@ -1,7 +1,7 @@
 """
  Digg (News, Social media)

- @website https://digg.com/
+ @website https://digg.com
  @provide-api no

  @using-api no
@@ -9,59 +9,58 @@
  @stable no (HTML can change)
  @parse url, title, content, publishedDate, thumbnail
 """
+# pylint: disable=missing-function-docstring

-import random
-import string
 from json import loads
 from urllib.parse import urlencode
 from datetime import datetime

+from lxml import html
+
 # engine dependent config
 categories = ['news', 'social media']
 paging = True
+base_url = 'https://digg.com'

 # search-url
-base_url = 'https://digg.com/'
-search_url = base_url + 'api/search/?{query}&from={position}&size=20&format=html'
-
-# specific xpath variables
-results_xpath = '//article'
-link_xpath = './/small[@class="time"]//a'
-title_xpath = './/h2//a//text()'
-content_xpath = './/p//text()'
-pubdate_xpath = './/time'
-
-digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\
-    string.digits + "+_"
-
+search_url = base_url + (
+    '/api/search/'
+    '?{query}'
+    '&from={position}'
+    '&size=20'
+    '&format=html'
+)

-# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 20
-    params['url'] = search_url.format(position=offset,
-                                      query=urlencode({'q': query}))
-    params['cookies']['frontend.auid'] = ''.join(random.choice(
-        digg_cookie_chars) for _ in range(22))
+    params['url'] = search_url.format(
+        query = urlencode({'q': query}),
+        position = offset,
+    )

     return params
-

-# get response from search-request
 def response(resp):
     results = []

-    search_result = loads(resp.text)
-    # parse results
-    for result in search_result['mapped']:
+    for result in loads(resp.text)['mapped']:
+
+        # strip html tags and superfluous quotation marks from content
+        content = html.document_fromstring(
+            result['excerpt']
+        ).text_content()

-        published = datetime.strptime(result['created']['ISO'], "%Y-%m-%d %H:%M:%S")
-        # append result
-        results.append({'url': result['url'],
-                        'title': result['title'],
-                        'content': result['excerpt'],
-                        'template': 'videos.html',
-                        'publishedDate': published,
-                        'thumbnail': result['images']['thumbImage']})
+        # 'created': {'ISO': '2020-10-16T14:09:55Z', ...}
+        published = datetime.strptime(
+            result['created']['ISO'], '%Y-%m-%dT%H:%M:%SZ'
+        )
+        results.append({
+            'url': result['url'],
+            'title': result['title'],
+            'content' : content,
+            'template': 'videos.html',
+            'publishedDate': published,
+            'thumbnail': result['images']['thumbImage'],
+        })

-    # return results
     return results