1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
#!/usr/bin/env python3
# proxy requests to FT through Tor by default, comment to disable
import socks
import socket
# route ALL socket traffic through a local Tor SOCKS5 listener by
# monkey-patching socket.socket before any HTTP client is imported
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
socket.socket = socks.socksocket
import urllib.parse
import hashlib
import os
from flask import Flask, request, Response, send_from_directory
from lib.request import retrieve
from lib.db import db_base
from lib.parser import update_html, update_css
APP = Flask(__name__, static_url_path='')
# absolute paths so the cache location is independent of the CWD
ABS_PATH = os.path.dirname(os.path.abspath(__file__))
CACHE_PATH = os.path.join(ABS_PATH, 'cache')
# only these hosts may be proxied; anything else gets a 403
WHITELIST = ['www.ft.com', 'media.acast.com', 'next-media-api.ft.com',
'ftalphaville.ft.com']
# URL path marker identifying FT article pages (used to pick the Referer)
CONTENT = '/content/'
@APP.route('/')
@APP.route('/fetch/<path:resource>')
def fetch(resource=None):
    '''
    Proxy a whitelisted FT resource.

    Serves the resource from the local file cache when available,
    otherwise downloads it (through the Tor-patched socket layer),
    rewrites HTML/CSS resource includes to point back at this proxy,
    caches non-HTML/CSS assets, and returns the response.

    :param resource: path portion of the proxied URL; None -> FT homepage
    :returns: a Flask Response, or an ``(error string, status)`` tuple
              for non-whitelisted hosts and failed fetches
    '''
    # return FT homepage if no resource value provided
    if resource:
        # uwsgi decodes characters -> flask, need to split at /fetch/
        url = request.url.split('/fetch/', maxsplit=1)[1]
    else:
        url = 'https://www.ft.com/'
    # refuse requests for resources not in WHITELIST
    url_split = list(urllib.parse.urlsplit(url))
    if url_split[1] not in WHITELIST:
        return 'Error 403: Non-FT resource', 403
    # remove unnecessary key/values from header set
    disabled = ['host', 'accept-encoding', 'accept', 'origin']
    headers = {k: v for (k, v) in request.headers if k.lower() not in disabled}
    # hash url for cache engine, open sqlite connection
    url_sha1 = hashlib.sha1(bytes(url, encoding='utf-8')).hexdigest()
    cache_db = db_base()
    # single try/finally so the sqlite connection is closed on EVERY
    # return path (the old code leaked it on the fetch-error return)
    try:
        # if resource is cached return w/ paired FT-derived content-type
        if resource and cache_db.is_cached(url_sha1):
            content_type = cache_db.get_content_type(url_sha1)
            return send_from_directory(CACHE_PATH, url_sha1,
                                       mimetype=content_type)
        # set referer to g-news only when browsing articles, unnecessary
        # otherwise
        if CONTENT in url_split[2]:
            headers['Referer'] = 'https://news.google.com/'
        else:
            headers['Referer'] = 'https://www.google.com/'
        # use encodings supported by lib/request.py (urllib wrapper)
        headers['Accept-Encoding'] = 'gzip, deflate'
        # fetch remote resource, pass updated set of headers
        resp = retrieve(url=url, headers=headers)
        if resp['data'] is None:
            return 'Error making request: %s' % url, resp['code']
        content_type = resp['meta'].get('content-type')
        prefix = urllib.parse.urljoin(request.host_url, 'fetch/')
        if content_type and 'text/html' in content_type:
            # prefix resource includes w/ proxy fetch() method
            soup = update_html(url, prefix, resp)
            soup = update_css(url, prefix, soup=soup)  # inline CSS
            # serve up our freshly-mutated HTML w/ delectable utf-8 seasoning
            return Response(response=soup.decode('utf-8'),
                            status=resp['code'], content_type=content_type)
        # if resource is CSS file (text/css) parse and update resource
        # includes
        elif content_type and 'text/css' in content_type:
            css = update_css(url, prefix, data=resp['data'].decode('utf-8'))
            return Response(response=css, status=resp['code'],
                            content_type=content_type)
        # cache and return if resource is neither HTML nor CSS
        else:
            # BUGFIX: write into the absolute CACHE_PATH -- the old
            # relative 'cache' path depended on the CWD and disagreed
            # with send_from_directory(CACHE_PATH, ...) used for reads
            with open(os.path.join(CACHE_PATH, url_sha1), 'wb') as f_cache:
                f_cache.write(resp['data'])
            cache_db.cache_add(url_sha1, content_type)
            return Response(response=resp['data'], status=resp['code'],
                            content_type=content_type)
    finally:
        cache_db.close()
if __name__ == '__main__':
    # development entry point; bind to loopback only since this proxies
    # arbitrary whitelisted requests. Port is an int -- passing the
    # string '8085' relied on werkzeug coercing it.
    APP.run(host='127.0.0.1', port=8085, threaded=True)
|