From 449935314000fa6391b989f0b90257d15b5c4ffe Mon Sep 17 00:00:00 2001
From: Jordan
Date: Fri, 15 Nov 2019 23:14:47 -0700
Subject: initial commit

---
 run.py | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100755 run.py

diff --git a/run.py b/run.py
new file mode 100755
index 0000000..791f5f7
--- /dev/null
+++ b/run.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+# proxy requests to FT through Tor by default, comment to disable
+import socks
+import socket
+socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
+socket.socket = socks.socksocket
+
+import urllib.parse
+import hashlib
+import os
+from flask import Flask, request, Response, send_from_directory
+from lib.request import retrieve
+from lib.db import db_base
+from lib.parser import update_html, update_css
+
+APP = Flask(__name__, static_url_path='')
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+CACHE_PATH = os.path.join(ABS_PATH, 'cache')
+
+WHITELIST = ['www.ft.com', 'media.acast.com', 'next-media-api.ft.com']
+CONTENT = '/content/'
+
+@APP.route('/')
+@APP.route('/fetch/<path:resource>')
+def fetch(resource=None):
+    '''
+    Download resource using request headers, return it
+    '''
+    # return FT homepage if no resource value provided
+    if resource:
+        # uwsgi decodes characters -> flask, need to split at /fetch/
+        url = request.url.split('/fetch/', maxsplit=1)[1]
+    else:
+        url = 'https://www.ft.com/'
+
+    # refuse requests for resources not in WHITELIST
+    url_split = list(urllib.parse.urlsplit(url))
+    if url_split[1] not in WHITELIST:
+        return 'Error 403: Non-FT resource', 403
+
+    # remove unnecessary key/values from header set
+    disabled = ['host', 'accept-encoding', 'accept', 'origin']
+    headers = {k: v for (k, v) in request.headers if k.lower() not in disabled}
+
+    # hash url for cache engine, open sqlite connection
+    url_sha1 = hashlib.sha1(bytes(url, encoding='utf-8')).hexdigest()
+    cache_db = db_base()
+
+    # if resource is cached return w/ paired FT-derived content-type
+    if resource and cache_db.is_cached(url_sha1):
+        content_type = cache_db.get_content_type(url_sha1)
+        cache_db.close()
+        return send_from_directory(CACHE_PATH, url_sha1, mimetype=content_type)
+
+    # set referer to g-news only when browsing articles, unnecessary otherwise
+    if CONTENT in url_split[2]:
+        headers['Referer'] = 'https://news.google.com/'
+    else:
+        headers['Referer'] = 'https://www.google.com/'
+
+    # use encodings supported by lib/request.py (urllib wrapper)
+    headers['Accept-Encoding'] = 'gzip, deflate'
+
+    # fetch remote resource, pass updated set of headers
+    resp = retrieve(url=url, headers=headers)
+    if resp['data'] is None:
+        return 'Error making request: %s' % url, resp['code']
+
+    content_type = resp['meta'].get('content-type')
+    prefix = urllib.parse.urljoin(request.host_url, 'fetch/')
+
+    if content_type and 'text/html' in content_type:
+        # prefix resource includes w/ proxy fetch() method
+        soup = update_html(url, prefix, resp)
+        soup = update_css(url, prefix, soup=soup)  # inline CSS
+
+        # serve up our freshly-mutated HTML w/ delectable utf-8 seasoning
+        response = Response(response=soup.decode('utf-8'), status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+    # if resource is CSS file (text/css) parse and update resource includes
+    elif content_type and 'text/css' in content_type:
+        c = update_css(url, prefix, data=resp['data'].decode('utf-8'))
+        response = Response(response=c, status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+    # cache and return if resource is neither HTML nor CSS
+    else:
+        with open(os.path.join(CACHE_PATH, url_sha1), 'wb') as f_cache:
+            f_cache.write(resp['data'])
+        cache_db.cache_add(url_sha1, content_type)
+        response = Response(response=resp['data'], status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+if __name__ == '__main__':
+    APP.run(host='127.0.0.1', port=8085, threaded=True)
--
cgit v1.2.3-54-g00ecf