From 449935314000fa6391b989f0b90257d15b5c4ffe Mon Sep 17 00:00:00 2001
From: Jordan
Date: Fri, 15 Nov 2019 23:14:47 -0700
Subject: initial commit

---
 run.py | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100755 run.py

diff --git a/run.py b/run.py
new file mode 100755
index 0000000..791f5f7
--- /dev/null
+++ b/run.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+# proxy requests to FT through Tor by default, comment to disable
+import socks
+import socket
+socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
+socket.socket = socks.socksocket
+
+import urllib.parse
+import hashlib
+import os
+from flask import Flask, request, Response, send_from_directory
+from lib.request import retrieve
+from lib.db import db_base
+from lib.parser import update_html, update_css
+
+APP = Flask(__name__, static_url_path='')
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+CACHE_PATH = os.path.join(ABS_PATH, 'cache')
+
+WHITELIST = ['www.ft.com', 'media.acast.com', 'next-media-api.ft.com']
+CONTENT = '/content/'
+
+@APP.route('/')
+@APP.route('/fetch/<path:resource>')
+def fetch(resource=None):
+    '''
+    Download resource using request headers, return it
+    '''
+    # return FT homepage if no resource value provided
+    if resource:
+        # uwsgi decodes characters -> flask, need to split at /fetch/
+        url = request.url.split('/fetch/', maxsplit=1)[1]
+    else:
+        url = 'https://www.ft.com/'
+
+    # refuse requests for resources not in WHITELIST
+    url_split = list(urllib.parse.urlsplit(url))
+    if url_split[1] not in WHITELIST:
+        return 'Error 403: Non-FT resource', 403
+
+    # remove unnecessary key/values from header set
+    disabled = ['host', 'accept-encoding', 'accept', 'origin']
+    headers = {k: v for (k, v) in request.headers if k.lower() not in disabled}
+
+    # hash url for cache engine, open sqlite connection
+    url_sha1 = hashlib.sha1(bytes(url, encoding='utf-8')).hexdigest()
+    cache_db = db_base()
+
+    # if resource is cached return w/ paired FT-derived content-type
+    if resource and cache_db.is_cached(url_sha1):
+        content_type = cache_db.get_content_type(url_sha1)
+        cache_db.close()
+        return send_from_directory(CACHE_PATH, url_sha1, mimetype=content_type)
+
+    # set referer to g-news only when browsing articles, unnecessary otherwise
+    if CONTENT in url_split[2]:
+        headers['Referer'] = 'https://news.google.com/'
+    else:
+        headers['Referer'] = 'https://www.google.com/'
+
+    # use encodings supported by lib/request.py (urllib wrapper)
+    headers['Accept-Encoding'] = 'gzip, deflate'
+
+    # fetch remote resource, pass updated set of headers
+    resp = retrieve(url=url, headers=headers)
+    if resp['data'] is None:
+        return 'Error making request: %s' % url, resp['code']
+
+    content_type = resp['meta'].get('content-type')
+    prefix = urllib.parse.urljoin(request.host_url, 'fetch/')
+
+    if content_type and 'text/html' in content_type:
+        # prefix resource includes w/ proxy fetch() method
+        soup = update_html(url, prefix, resp)
+        soup = update_css(url, prefix, soup=soup)  # inline CSS
+
+        # serve up our freshly-mutated HTML w/ delectable utf-8 seasoning
+        response = Response(response=soup.decode('utf-8'), status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+    # if resource is CSS file (text/css) parse and update resource includes
+    elif content_type and 'text/css' in content_type:
+        c = update_css(url, prefix, data=resp['data'].decode('utf-8'))
+        response = Response(response=c, status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+    # cache and return if resource is neither HTML nor CSS
+    else:
+        with open(os.path.join(CACHE_PATH, url_sha1), 'wb') as f_cache:
+            f_cache.write(resp['data'])
+        cache_db.cache_add(url_sha1, content_type)
+        response = Response(response=resp['data'], status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+if __name__ == '__main__':
+    APP.run(host='127.0.0.1', port=8085, threaded=True)
--
cgit v1.2.3-54-g00ecf