From 449935314000fa6391b989f0b90257d15b5c4ffe Mon Sep 17 00:00:00 2001
From: Jordan
Date: Fri, 15 Nov 2019 23:14:47 -0700
Subject: initial commit

---
 .gitignore        |   6 +++
 README            |   8 ++++
 lib/db.py         |  64 ++++++++++++++++++++++++++++++++
 lib/parser.py     | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/request.py    |  70 +++++++++++++++++++++++++++++++++++
 run.py            | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++
 uwsgi.ini.example |   7 ++++
 uwsgi.sh          |   7 ++++
 8 files changed, 374 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README
 create mode 100644 lib/db.py
 create mode 100644 lib/parser.py
 create mode 100644 lib/request.py
 create mode 100755 run.py
 create mode 100644 uwsgi.ini.example
 create mode 100755 uwsgi.sh

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b47c0bb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+*.swp
+*.swo
+__pycache__
+sandbox
+cache
+uwsgi.ini
diff --git a/README b/README
new file mode 100644
index 0000000..e1a2eca
--- /dev/null
+++ b/README
@@ -0,0 +1,8 @@
+ft-proxy: transparent proxy reader for The Financial Times
+
+* bypass paywall (2019-11-09)
+* FT requests routed via Tor by default (SOCKS 9050)
+* minimal static asset caching engine
+* remove JavaScript elements, GDPR popups
+
+ftcleq4k2dd6xf4wdnhnb6bpmeg2q5spxrmevatichmipz6nrvlb5zyd
diff --git a/lib/db.py b/lib/db.py
new file mode 100644
index 0000000..4c99795
--- /dev/null
+++ b/lib/db.py
@@ -0,0 +1,64 @@
+'''
+File: db.py
+
+Library to facilitate database IO
+'''
+
+import os
+import sqlite3
+
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+
+class db_base:
+    def __init__(self):
+        '''
+        Initialize conn/cursor for in-class use
+        '''
+        self.db_path = os.path.join(ABS_PATH, '../cache/cache.db')
+        self.cache_path = os.path.join(ABS_PATH, '../cache')
+        self.conn = self.db_init()
+        self.cursor = self.conn.cursor()
+
+    def db_init(self):
+        '''
+        Initialize database schema if db not found, return conn
+        '''
+        if not os.path.isdir(self.cache_path):
+            os.mkdir(self.cache_path)
+        if not os.path.isfile(self.db_path):
+            conn = sqlite3.connect(self.db_path)
+            print('database not found, initializing...')
+            conn.execute('''CREATE TABLE cache (hash TEXT, content_type
+                            TEXT)''')
+            conn.commit()
+            conn.close()
+        conn = sqlite3.connect(self.db_path)
+        conn.row_factory = sqlite3.Row
+        return conn
+
+    def cache_add(self, _hash, content_type):
+        self.cursor.execute('''INSERT INTO cache(hash, content_type)
+                               VALUES(?,?)''', (_hash, content_type,))
+        self.save()
+
+    def cache_del(self, _hash):
+        self.cursor.execute('DELETE FROM cache WHERE hash=?', (_hash,))
+        self.save()
+
+    def is_cached(self, _hash):
+        self.cursor.execute('''SELECT COUNT(*) FROM cache WHERE
+                               hash=?''', (_hash,))
+        q_count = self.cursor.fetchone()
+        if q_count[0] > 0:
+            return True
+        return False
+
+    def get_content_type(self, _hash):
+        self.cursor.execute('SELECT * FROM cache WHERE hash=?', (_hash,))
+        return self.cursor.fetchall()[0][1]
+
+    def save(self):
+        self.conn.commit()
+
+    def close(self):
+        self.conn.close()
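
The cache table above is only bookkeeping (URL hash paired with a content-type);
the cached bytes themselves are written to files under cache/ by run.py. A
minimal sketch of how db_base is meant to be driven, assuming the repo root as
the working directory; the URL and content-type below are made-up illustration
values:

    import hashlib
    from lib.db import db_base

    url = 'https://www.ft.com/example.css'        # hypothetical resource URL
    url_sha1 = hashlib.sha1(url.encode('utf-8')).hexdigest()

    cache_db = db_base()                          # creates cache/ and cache.db on first run
    if not cache_db.is_cached(url_sha1):
        cache_db.cache_add(url_sha1, 'text/css')  # remember hash -> content-type
    print(cache_db.get_content_type(url_sha1))    # 'text/css'
    cache_db.close()
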
diff --git a/lib/parser.py b/lib/parser.py
new file mode 100644
index 0000000..6d0a1c8
--- /dev/null
+++ b/lib/parser.py
@@ -0,0 +1,109 @@
+import urllib.parse
+import re
+from bs4 import BeautifulSoup
+
+def update_html(url, prefix, resp):
+    '''
+    Find and update HTML attributes which may contain external dependencies
+
+    e.g. an src/href value of "/image.png" becomes
+    "<prefix>https://www.ft.com/image.png"
+    '''
+
+def update_css(url, prefix, data=None, soup=None):
+    '''
+    Find and update CSS directives (url(), @import, src=) which may contain
+    external dependencies, prepending the proxy prefix to each resource
+    '''
+    _url = re.compile(r'(?P<url>url)' +
+                      r'(?P<quote_open>\s*\(\s*["\']?\s*)' +
+                      r'(?P<resource>[^"\']+)' +
+                      r'(?P<quote_close>\s*["\']?\s*\))',
+                      re.MULTILINE | re.IGNORECASE)
+    _import = re.compile(r'(?P<url>@import)' +
+                         r'(?P<quote_open>\s*["\']\s*)' +
+                         r'(?P<resource>[^"\']+)' +
+                         r'(?P<quote_close>\s*["\'])',
+                         re.MULTILINE | re.IGNORECASE)
+    _src = re.compile(r'(?P<url>src\s*=\s*)(?P<quote_open>\s*["\']?\s*)' +
+                      r'(?P<resource>[^"\']+)' +
+                      r'(?P<quote_close>\s*["\']?)',
+                      re.MULTILINE | re.IGNORECASE)
+
+    css_regexes = [_url, _import, _src]
+
+    # re.sub() aforementioned directives, prepend proxy method to resources
+    if data:
+        c = data
+        for reg in css_regexes:
+            c = reg.sub(lambda m: m.group('url') +
+                        m.group('quote_open') +
+                        prefix +
+                        urllib.parse.urljoin(url, m.group('resource')) +
+                        m.group('quote_close'), c)
+        return c
+
+    elif soup:
+        style_tags = soup.findAll('style')
+        for css in style_tags:
+            c = css.string
+            if not c:  # skip empty style tags
+                continue
+            for reg in css_regexes:
+                c = reg.sub(lambda m: m.group('url') +
+                            m.group('quote_open') +
+                            prefix +
+                            urllib.parse.urljoin(url, m.group('resource')) +
+                            m.group('quote_close'), c)
+            css.string = c
+        return soup
diff --git a/lib/request.py b/lib/request.py
new file mode 100644
index 0000000..6cdfa57
--- /dev/null
+++ b/lib/request.py
@@ -0,0 +1,70 @@
+import gzip
+import zlib
+import urllib.request
+import urllib.parse
+from io import BytesIO
+from urllib.error import URLError, HTTPError
+from socket import timeout
+
+TIMEOUT = 10         # seconds to wait before killing connection
+MAX_SIZE = 25000000  # maximum content-length of resource (bytes, 25MB default)
+
+def retrieve(url, headers):
+    '''
+    Makes HTTP request to URL and returns response
+
+    Returns dict containing the following:
+        'url':  URL of resource, updated from :url: as redirects are followed
+        'code': HTTP response code returned by resource
+        'data': downloaded resource as bytes if the request succeeded, else None
+        'meta': response headers from resource (dict)
+    '''
+    try:
+        conn = urllib.request.Request(
+            url,
+            headers=headers
+        )
+
+        request = urllib.request.urlopen(conn, timeout=TIMEOUT)
+        end_url = request.geturl()  # account for redirects
+
+    except HTTPError as err:
+        print('[%s] %s' % (err.code, url))
+        return {'url': url, 'code': err.code, 'data': None, 'meta': None}
+
+    except URLError as err:
+        print('error connecting to url, %s: %s' % (err, url))
+        return {'url': url, 'code': 502, 'data': None, 'meta': None}
+
+    except timeout:
+        print('socket timed out, exceeded %s seconds: %s' % (TIMEOUT, url))
+        return {'url': url, 'code': 408, 'data': None, 'meta': None}
+
+    except Exception as err:
+        print('uncaught exception, %s: %s' % (err, url))
+        return {'url': url, 'code': 500, 'data': None, 'meta': None}
+
+    # fetch headers from resource, lower() them for consistency
+    request_info = dict(request.info())
+    headers = {k.lower(): v for k, v in request_info.items()}
+
+    # ensure size of resource falls within MAX_SIZE before downloading
+    if int(headers.get('content-length', 0)) > MAX_SIZE:
+        print('exceeded MAX_SIZE of %s bytes, skipping: %s' % (MAX_SIZE, url))
+        return {'url': url, 'code': 413, 'data': None, 'meta': None}
+
+    # support gzip and deflate-encoded responses
+    if headers.get('content-encoding') == 'gzip':
+        buff = BytesIO(request.read())
+        gz_f = gzip.GzipFile(fileobj=buff)
+        data = gz_f.read()
+    elif headers.get('content-encoding') == 'deflate':
+        data = zlib.decompress(request.read())
+    else:
+        data = request.read()
+
+    resp_dict = {'url': end_url, 'code': request.getcode(), 'data': data,
+                 'meta': headers}
+
+    return resp_dict
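
To make the CSS rewriting above concrete, here is a small sketch of the data=
branch of update_css, assuming it is run from the repo root; the prefix mirrors
the one run.py builds from request.host_url, and the stylesheet URLs are
made-up illustration values:

    from lib.parser import update_css

    css = '@import "page.css"'   # sample stylesheet fragment
    out = update_css('https://www.ft.com/style/main.css',
                     'http://127.0.0.1:8085/fetch/',
                     data=css)
    # out -> '@import "http://127.0.0.1:8085/fetch/https://www.ft.com/style/page.css"'
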
diff --git a/run.py b/run.py
new file mode 100755
index 0000000..791f5f7
--- /dev/null
+++ b/run.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+# proxy requests to FT through Tor by default, comment out to disable
+import socks
+import socket
+socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
+socket.socket = socks.socksocket
+
+import urllib.parse
+import hashlib
+import os
+from flask import Flask, request, Response, send_from_directory
+from lib.request import retrieve
+from lib.db import db_base
+from lib.parser import update_html, update_css
+
+APP = Flask(__name__, static_url_path='')
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+CACHE_PATH = os.path.join(ABS_PATH, 'cache')
+
+WHITELIST = ['www.ft.com', 'media.acast.com', 'next-media-api.ft.com']
+CONTENT = '/content/'
+
+@APP.route('/')
+@APP.route('/fetch/<path:resource>')
+def fetch(resource=None):
+    '''
+    Download resource using request headers, return it
+    '''
+    # return FT homepage if no resource value provided
+    if resource:
+        # uwsgi decodes characters before they reach flask, so split the
+        # original URL back out at /fetch/
+        url = request.url.split('/fetch/', maxsplit=1)[1]
+    else:
+        url = 'https://www.ft.com/'
+
+    # refuse requests for resources not in WHITELIST
+    url_split = list(urllib.parse.urlsplit(url))
+    if url_split[1] not in WHITELIST:
+        return 'Error 403: Non-FT resource', 403
+
+    # remove unnecessary key/values from header set
+    disabled = ['host', 'accept-encoding', 'accept', 'origin']
+    headers = {k: v for (k, v) in request.headers if k.lower() not in disabled}
+
+    # hash url for cache engine, open sqlite connection
+    url_sha1 = hashlib.sha1(bytes(url, encoding='utf-8')).hexdigest()
+    cache_db = db_base()
+
+    # if resource is cached, return it w/ paired FT-derived content-type
+    if resource and cache_db.is_cached(url_sha1):
+        content_type = cache_db.get_content_type(url_sha1)
+        cache_db.close()
+        return send_from_directory(CACHE_PATH, url_sha1, mimetype=content_type)
+
+    # set referer to g-news only when browsing articles, unnecessary otherwise
+    if CONTENT in url_split[2]:
+        headers['Referer'] = 'https://news.google.com/'
+    else:
+        headers['Referer'] = 'https://www.google.com/'
+
+    # use encodings supported by lib/request.py (urllib wrapper)
+    headers['Accept-Encoding'] = 'gzip, deflate'
+
+    # fetch remote resource, pass updated set of headers
+    resp = retrieve(url=url, headers=headers)
+    if resp['data'] is None:
+        cache_db.close()
+        return 'Error making request: %s' % url, resp['code']
+
+    content_type = resp['meta'].get('content-type')
+    prefix = urllib.parse.urljoin(request.host_url, 'fetch/')
+
+    if content_type and 'text/html' in content_type:
+        # prefix resource includes w/ proxy fetch() method
+        soup = update_html(url, prefix, resp)
+        soup = update_css(url, prefix, soup=soup)  # inline CSS
+
+        # serve up our freshly-mutated HTML w/ delectable utf-8 seasoning
+        response = Response(response=soup.decode('utf-8'), status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+    # if resource is CSS file (text/css) parse and update resource includes
+    elif content_type and 'text/css' in content_type:
+        c = update_css(url, prefix, data=resp['data'].decode('utf-8'))
+        response = Response(response=c, status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+    # cache and return if resource is neither HTML nor CSS
+    else:
+        with open(os.path.join(CACHE_PATH, url_sha1), 'wb') as f_cache:
+            f_cache.write(resp['data'])
+        cache_db.cache_add(url_sha1, content_type)
+        response = Response(response=resp['data'], status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+if __name__ == '__main__':
+    APP.run(host='127.0.0.1', port=8085, threaded=True)
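
To read a page through the proxy, the browser requests /fetch/ followed by the
full FT URL; fetch() checks the host against WHITELIST and derives a SHA-1 key
from the URL for the cache engine. A sketch of that mapping, using a made-up
article URL and the default local host/port:

    import hashlib
    import urllib.parse

    host_url = 'http://127.0.0.1:8085/'         # what request.host_url would be locally
    url = 'https://www.ft.com/content/example'  # hypothetical article URL

    proxied = urllib.parse.urljoin(host_url, 'fetch/') + url
    cache_key = hashlib.sha1(url.encode('utf-8')).hexdigest()

    print(proxied)    # http://127.0.0.1:8085/fetch/https://www.ft.com/content/example
    print(cache_key)  # sha1 key run.py derives for this URL (cache lookups/writes)
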
diff --git a/uwsgi.ini.example b/uwsgi.ini.example
new file mode 100644
index 0000000..93957e0
--- /dev/null
+++ b/uwsgi.ini.example
@@ -0,0 +1,7 @@
+[uwsgi]
+http = 127.0.0.1:8085
+processes = 12
+threads = 4
+wsgi-file = run.py
+callable = APP
+master = true
diff --git a/uwsgi.sh b/uwsgi.sh
new file mode 100755
index 0000000..d5113df
--- /dev/null
+++ b/uwsgi.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+# sandbox dir is virtualenv (python3 -m venv sandbox)
+. sandbox/bin/activate
+
+# uwsgi.ini.example provided as template
+uwsgi --ini uwsgi.ini
--
cgit v1.2.3-54-g00ecf