summaryrefslogtreecommitdiff
path: root/searx/engines/digg.py
blob: 24a932d531f12978a9183b5b048b1e517ea4a659 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""
 Digg (News, Social media)

 @website     https://digg.com/
 @provide-api no

 @using-api   no
 @results     JSON (using search portal)
 @stable      no (HTML can change)
 @parse       url, title, content, publishedDate, thumbnail
"""

import random
import string
from dateutil import parser
from json import loads
from urllib.parse import urlencode
from lxml import html
from datetime import datetime

# engine settings
paging = True
categories = ['news', 'social media']

# search endpoint: "{query}" and "{position}" are placeholders that are
# substituted via str.format() when a request is built
base_url = 'https://digg.com/'
search_url = ''.join((
    base_url,
    'api/search/',
    '?{query}',
    '&from={position}',
    '&size=20',
    '&format=html',
))

# xpath selectors
# NOTE(review): none of these are referenced by request()/response() as
# shown in this file -- confirm whether they are still needed.
results_xpath = '//article'
link_xpath = './/small[@class="time"]//a'
title_xpath = './/h2//a//text()'
content_xpath = './/p//text()'
pubdate_xpath = './/time'

# alphabet the random "frontend.auid" cookie value is drawn from
digg_cookie_chars = ''.join((
    string.ascii_uppercase,
    string.ascii_lowercase,
    string.digits,
    "+_",
))


# build the search request
def request(query, params):
    """Fill in *params* for one page of a Digg search and return it.

    ``params['url']`` is set to the search URL for ``query``, starting at the
    offset that corresponds to ``params['pageno']`` (pages are 20 results
    long). A freshly generated random 22-character ``frontend.auid`` cookie
    is stored in ``params['cookies']``.
    """
    first_result = 20 * (params['pageno'] - 1)
    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(query=encoded_query,
                                      position=first_result)

    # a new random identifier is generated for every request
    auid = ''.join(random.choice(digg_cookie_chars) for _ in range(22))
    params['cookies']['frontend.auid'] = auid

    return params


# turn the search response into result dicts
def response(resp):
    """Convert the JSON search response into a list of result dicts.

    ``resp.text`` is decoded as JSON and every entry under its ``mapped``
    key produces one result tagged with the ``videos.html`` template.
    Missing keys are not tolerated: a ``KeyError`` propagates to the caller.
    """
    payload = loads(resp.text)

    def to_result(entry):
        # the timestamp is parsed first, into a naive datetime
        published_at = datetime.strptime(entry['created']['ISO'],
                                         "%Y-%m-%d %H:%M:%S")
        return {
            'url': entry['url'],
            'title': entry['title'],
            'content': entry['excerpt'],
            'template': 'videos.html',
            'publishedDate': published_at,
            'thumbnail': entry['images']['thumbImage'],
        }

    return [to_result(entry) for entry in payload['mapped']]