aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore6
-rw-r--r--README8
-rw-r--r--lib/db.py64
-rw-r--r--lib/parser.py109
-rw-r--r--lib/request.py70
-rwxr-xr-xrun.py103
-rw-r--r--uwsgi.ini.example7
-rwxr-xr-xuwsgi.sh7
8 files changed, 374 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b47c0bb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+*.swp
+*.swo
+__pycache__
+sandbox
+cache
+uwsgi.ini
diff --git a/README b/README
new file mode 100644
index 0000000..e1a2eca
--- /dev/null
+++ b/README
@@ -0,0 +1,8 @@
+ft-proxy: transparent proxy reader for The Financial Times
+
+* bypass paywall (2019-11-09)
+* FT requests routed via Tor by default (socks 9050)
+* minimal static asset caching engine
+* remove Javascript elements, GDPR popups
+
+ftcleq4k2dd6xf4wdnhnb6bpmeg2q5spxrmevatichmipz6nrvlb5zyd
diff --git a/lib/db.py b/lib/db.py
new file mode 100644
index 0000000..4c99795
--- /dev/null
+++ b/lib/db.py
@@ -0,0 +1,64 @@
+'''
+File: db.py
+
+Library to facilitate database IO
+'''
+
+import os
+import sqlite3
+
+# absolute path of the directory containing this module; the cache paths
+# used by db_base are built relative to it
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+
+class db_base:
+    '''
+    sqlite-backed index of cached resources
+
+    Each row of the `cache` table pairs a hash identifying a resource with
+    the content-type that resource should be served with.
+    '''
+
+    def __init__(self):
+        '''
+        Initialize conn/cursor for in-class use
+        '''
+        self.db_path = os.path.join(ABS_PATH, '../cache/cache.db')
+        self.cache_path = os.path.join(ABS_PATH, '../cache')
+        self.conn = self.db_init()
+        self.cursor = self.conn.cursor()
+
+    def db_init(self):
+        '''
+        Initialize database schema if db not found, return conn
+        '''
+        # exist_ok / IF NOT EXISTS: several workers may start at the same
+        # time, so a directory or table created by another one is not an error
+        os.makedirs(self.cache_path, exist_ok=True)
+        if not os.path.isfile(self.db_path):
+            print('database not found, initializing...')
+        conn = sqlite3.connect(self.db_path)
+        conn.execute('''CREATE TABLE IF NOT EXISTS cache (hash TEXT,
+                     content_type TEXT)''')
+        conn.commit()
+        conn.row_factory = sqlite3.Row
+        return conn
+
+    def cache_add(self, _hash, content_type):
+        '''
+        Record :_hash: as cached, paired with :content_type:
+        '''
+        self.cursor.execute('''INSERT INTO cache(hash, content_type)
+                            VALUES(?,?)''', (_hash, content_type,))
+        self.save()
+
+    def cache_del(self, _hash):
+        '''
+        Remove every row recorded for :_hash:
+        '''
+        self.cursor.execute('DELETE FROM cache WHERE hash=?', (_hash,))
+        self.save()
+
+    # original (misspelled) name, kept so existing callers keep working
+    cash_del = cache_del
+
+    def is_cached(self, _hash):
+        '''
+        Return True if at least one row exists for :_hash:, else False
+        '''
+        self.cursor.execute('''SELECT COUNT(*) FROM cache WHERE
+                            hash=?''', (_hash,))
+        return self.cursor.fetchone()[0] > 0
+
+    def get_content_type(self, _hash):
+        '''
+        Return content-type stored for :_hash:, or None if it is not cached
+        '''
+        self.cursor.execute('SELECT content_type FROM cache WHERE hash=?',
+                            (_hash,))
+        row = self.cursor.fetchone()
+        return row[0] if row is not None else None
+
+    def save(self):
+        '''
+        Commit pending changes
+        '''
+        self.conn.commit()
+
+    def close(self):
+        '''
+        Close the underlying sqlite connection
+        '''
+        self.conn.close()
diff --git a/lib/parser.py b/lib/parser.py
new file mode 100644
index 0000000..6d0a1c8
--- /dev/null
+++ b/lib/parser.py
@@ -0,0 +1,109 @@
+import urllib.parse
+import re
+from bs4 import BeautifulSoup
+
+def _remove_all(soup, name, attrs=None):
+    '''
+    Decompose every element matching :name:/:attrs: in :soup:
+    '''
+    # re-query after each removal: decompose() also destroys the element's
+    # children, so nested matches must not be visited a second time
+    tag = soup.find(name, attrs=attrs or {})
+    while tag is not None:
+        tag.decompose()
+        tag = soup.find(name, attrs=attrs or {})
+
+def update_html(url, prefix, resp):
+    '''
+    Find and update HTML attributes which may contain external dependencies
+
+    e.g. <a href="example.com"> -> <a href="http://proxy/fetch/example.com">
+
+    :url: url of the page being rewritten, used to resolve relative links
+    :prefix: proxy method to prepend to join()'d resources
+    :resp: response dict from request.py; only resp['data'] is read here
+    :returns: BeautifulSoup object with unwanted elements removed and the
+              src, href, srcset and background attributes rewritten
+    '''
+    soup = BeautifulSoup(resp['data'], 'html.parser')
+
+    # remove <script> elements--Javascript is evil
+    _remove_all(soup, 'script')
+
+    # remove cookie popup
+    _remove_all(soup, 'div', attrs={'class': 'n-messaging-slot'})
+
+    # remove deprecated, unsupported tags (object, applet, param, embed)
+    for name in ('object', 'applet', 'param', 'embed'):
+        _remove_all(soup, name)
+
+    # find and update src, href, srcset, and background key values
+    for key in ('src', 'href', 'srcset', 'background'):
+        for elem in soup.find_all(attrs={key: True}):
+            update_tag(url, prefix, elem, key)
+    return soup
+
+def update_tag(url, prefix, elem, key):
+    '''
+    Update value of element's key to prefix proxy method
+
+    :url: url of the page the element belongs to, used to resolve
+          relative/absolute (scheme-less) references
+    :prefix: proxy method to prepend to join()'d resources
+    :elem: element supporting .get(key) and item assignment (bs4 Tag)
+    :key: attribute to rewrite (src, href, srcset, background)
+
+    The element is modified in place; nothing is returned.
+    '''
+    # an element's key value can be unpopulated, ignore such cases
+    value = elem.get(key)
+    if not value:
+        return
+
+    # strip extraneous values by space, use first defined resource; done
+    # before urljoin() so padding or descriptors never end up in the url
+    tokens = value.split()
+    if not tokens:
+        return
+    resource = tokens[0]
+
+    # urljoin() w/ root url if url is relative/absolute (no scheme specifier)
+    if not urllib.parse.urlsplit(resource).scheme:
+        resource = urllib.parse.urljoin(url, resource)
+
+    # prepend resource url w/ proxy fetch method
+    elem[key] = prefix + resource
+
+# regex objects to match url(), @import, and src= CSS directives; every
+# pattern exposes the same groups (head, quote_open, resource, quote_close)
+# so a single replacement routine can serve all of them
+_CSS_PATTERNS = [
+    # url("resource") / url('resource')
+    re.compile(r'(?P<head>url\s*\()(?P<quote_open>\s*(?P<q>["\'])\s*)' +
+               r'(?P<resource>[^"\']+?)' +
+               r'(?P<quote_close>\s*(?P=q)\s*\))',
+               re.IGNORECASE),
+    # url(resource): the resource stops at the first ')' so that several
+    # unquoted url() declarations are not swallowed as one match
+    re.compile(r'(?P<head>url\s*\()(?P<quote_open>\s*)' +
+               r'(?P<resource>[^"\'()\s]+)' +
+               r'(?P<quote_close>\s*\))',
+               re.IGNORECASE),
+    # @import "resource"
+    re.compile(r'(?P<head>@import)' +
+               r'(?P<quote_open>\s*["\']\s*)' +
+               r'(?P<resource>[^"\']+)' +
+               r'(?P<quote_close>\s*["\'])',
+               re.IGNORECASE),
+    # src="resource"
+    re.compile(r'(?P<head>src\s*=\s*)(?P<quote_open>\s*["\']?\s*)' +
+               r'(?P<resource>[^"\']+)' +
+               r'(?P<quote_close>\s*["\']?)',
+               re.IGNORECASE),
+]
+
+def _proxy_css_match(url, prefix, match):
+    '''
+    Return the matched directive with its resource routed through the proxy
+    '''
+    resource = match.group('resource')
+    # data: URIs carry their payload inline, there is nothing to fetch
+    if resource.strip().lower().startswith('data:'):
+        return match.group(0)
+    return (match.group('head') + match.group('quote_open') + prefix +
+            urllib.parse.urljoin(url, resource) + match.group('quote_close'))
+
+def _rewrite_css(url, prefix, text):
+    '''
+    Apply every pattern in _CSS_PATTERNS to :text:, return the result
+    '''
+    for reg in _CSS_PATTERNS:
+        text = reg.sub(lambda m: _proxy_css_match(url, prefix, m), text)
+    return text
+
+def update_css(url, prefix, soup=None, data=None):
+    '''
+    Update inline OR included CSS file to prefix proxy method
+
+    :url: url of page from which CSS is included/inline
+    :prefix: proxy method to prepend to join()'d resources
+    :soup: bs4 object (optional)
+    :data: CSS file contents as str (optional)
+    :returns: rewritten str if :data: was given, else :soup: with its
+              <style> contents rewritten in place, else None
+    '''
+    # explicit None checks: an empty stylesheet ('') is still valid input
+    if data is not None:
+        return _rewrite_css(url, prefix, data)
+
+    if soup is not None:
+        for css in soup.find_all('style'):
+            # .string is None for an empty <style> tag, nothing to rewrite
+            if css.string is None:
+                continue
+            css.string = _rewrite_css(url, prefix, css.string)
+        return soup
+
+    return None
diff --git a/lib/request.py b/lib/request.py
new file mode 100644
index 0000000..6cdfa57
--- /dev/null
+++ b/lib/request.py
@@ -0,0 +1,70 @@
+import gzip
+import zlib
+import urllib.request
+import urllib.parse
+from io import BytesIO
+from urllib.error import URLError, HTTPError
+from socket import timeout
+
+# TIMEOUT is reported in retrieve()'s socket-timeout log message; MAX_SIZE
+# is compared against the Content-Length response header in retrieve()
+TIMEOUT = 10 # seconds to wait before killing connection
+MAX_SIZE = 25000000 # maximum content-length of resource (bytes, 25MB default)
+
+def retrieve(url, headers):
+    '''
+    Makes HTTP request to URL and returns response
+
+    :url: URL of resource to request
+    :headers: dict of request headers to send
+
+    Returns dict containing the following:
+        'url': URL of resource, updated from :url: as redirects are followed
+        'code': HTTP response code returned by resource; on local failures
+                408 (timeout), 413 (larger than MAX_SIZE), 502 (connection
+                or decoding error) or 500 (anything else)
+        'data': Downloaded, decoded resource as bytes if successful request,
+                else None
+        'meta': Response headers from resource (dict, lower-cased keys),
+                None on failure
+    '''
+    def failure(code):
+        return {'url': url, 'code': code, 'data': None, 'meta': None}
+
+    try:
+        conn = urllib.request.Request(
+            url,
+            headers=headers
+        )
+
+        # without an explicit timeout urlopen() can block indefinitely
+        request = urllib.request.urlopen(conn, timeout=TIMEOUT)
+
+    except HTTPError as err:
+        print('[%s] %s' % (err.code, url))
+        return failure(err.code)
+
+    except URLError as err:
+        print('error connecting to url, %s: %s' % (err, url))
+        return failure(502)
+
+    except timeout:
+        print('socket timed out, exceeded %s seconds: %s' % (TIMEOUT, url))
+        return failure(408)
+
+    except Exception as err:
+        print('uncaught exception, %s: %s' % (err, url))
+        return failure(500)
+
+    try:
+        end_url = request.geturl() # account for redirects
+        code = request.getcode()
+
+        # fetch headers from resource, lower() them for consistency
+        request_info = dict(request.info())
+        meta = {k.lower(): v for k, v in request_info.items()}
+
+        # ensure size of resource falls within MAX_SIZE before downloading;
+        # the header is optional (e.g. chunked responses) and may be garbage
+        try:
+            declared = int(meta.get('content-length', 0))
+        except (TypeError, ValueError):
+            declared = 0
+        if declared > MAX_SIZE:
+            print('exceeded MAX_SIZE of %s bytes, skipping: %s' %
+                  (MAX_SIZE, url))
+            return failure(413)
+
+        # read one byte past the limit so oversized bodies sent without a
+        # content-length header are detected as well
+        raw = request.read(MAX_SIZE + 1)
+
+    except timeout:
+        print('socket timed out, exceeded %s seconds: %s' % (TIMEOUT, url))
+        return failure(408)
+
+    finally:
+        request.close()
+
+    if len(raw) > MAX_SIZE:
+        print('exceeded MAX_SIZE of %s bytes, skipping: %s' % (MAX_SIZE, url))
+        return failure(413)
+
+    # support gzip and deflate-encoded responses
+    encoding = (meta.get('content-encoding') or '').strip().lower()
+    try:
+        if encoding == 'gzip':
+            data = gzip.decompress(raw)
+        elif encoding == 'deflate':
+            try:
+                data = zlib.decompress(raw)
+            except zlib.error:
+                # some servers send a raw deflate stream w/o zlib header
+                data = zlib.decompress(raw, -zlib.MAX_WBITS)
+        else:
+            data = raw
+    except (OSError, EOFError, zlib.error) as err:
+        print('error decoding %s response, %s: %s' % (encoding, err, url))
+        return failure(502)
+
+    return {'url': end_url, 'code': code, 'data': data, 'meta': meta}
diff --git a/run.py b/run.py
new file mode 100755
index 0000000..791f5f7
--- /dev/null
+++ b/run.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+# proxy requests to FT through Tor by default, comment to disable
+# (socket.socket itself is replaced below, so this applies to every socket
+# created through socket.socket afterwards, not only to requests to FT)
+import socks
+import socket
+socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
+socket.socket = socks.socksocket
+
+import urllib.parse
+import hashlib
+import os
+from flask import Flask, request, Response, send_from_directory
+from lib.request import retrieve
+from lib.db import db_base
+from lib.parser import update_html, update_css
+
+# WSGI application object (the `callable` named in uwsgi.ini.example)
+APP = Flask(__name__, static_url_path='')
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+# directory fetch() serves cached resources from
+CACHE_PATH = os.path.join(ABS_PATH, 'cache')
+
+# hosts fetch() is willing to proxy; any other host is answered with a 403
+WHITELIST = ['www.ft.com', 'media.acast.com', 'next-media-api.ft.com']
+# path fragment fetch() looks for when choosing the Referer header
+CONTENT = '/content/'
+
+@APP.route('/')
+@APP.route('/fetch/<path:resource>')
+def fetch(resource=None):
+    '''
+    Download resource using request headers, return it
+
+    :resource: remainder of the request path after /fetch/, None for '/'
+
+    HTML and CSS responses are rewritten so their includes point back at
+    this proxy; any other resource is written to the cache directory and
+    served from there on later requests.
+    '''
+    # return FT homepage if no resource value provided
+    if resource:
+        # uwsgi decodes characters -> flask, need to split at /fetch/
+        url = request.url.split('/fetch/', maxsplit=1)[1]
+    else:
+        url = 'https://www.ft.com/'
+
+    # refuse requests for resources not in WHITELIST; the url comes from
+    # the client, so also refuse anything that is not plain http(s)
+    url_split = urllib.parse.urlsplit(url)
+    if (url_split.scheme not in ('http', 'https')
+            or url_split.netloc not in WHITELIST):
+        return 'Error 403: Non-FT resource', 403
+
+    # remove unnecessary key/values from header set
+    disabled = ['host', 'accept-encoding', 'accept', 'origin']
+    headers = {k: v for (k, v) in request.headers if k.lower() not in disabled}
+
+    # hash url for cache engine, open sqlite connection
+    url_sha1 = hashlib.sha1(bytes(url, encoding='utf-8')).hexdigest()
+    cache_db = db_base()
+
+    # try/finally: the connection is closed on every path, including the
+    # error return below
+    try:
+        # if resource is cached return w/ paired FT-derived content-type
+        if resource and cache_db.is_cached(url_sha1):
+            content_type = cache_db.get_content_type(url_sha1)
+            return send_from_directory(CACHE_PATH, url_sha1,
+                                       mimetype=content_type)
+
+        # set referer to g-news only when browsing articles, unnecessary
+        # otherwise
+        if CONTENT in url_split.path:
+            headers['Referer'] = 'https://news.google.com/'
+        else:
+            headers['Referer'] = 'https://www.google.com/'
+
+        # use encodings supported by lib/request.py (urllib wrapper)
+        headers['Accept-Encoding'] = 'gzip, deflate'
+
+        # fetch remote resource, pass updated set of headers
+        resp = retrieve(url=url, headers=headers)
+        if resp['data'] is None:
+            return 'Error making request: %s' % url, resp['code']
+
+        content_type = resp['meta'].get('content-type')
+        prefix = urllib.parse.urljoin(request.host_url, 'fetch/')
+
+        if content_type and 'text/html' in content_type:
+            # prefix resource includes w/ proxy fetch() method
+            soup = update_html(url, prefix, resp)
+            soup = update_css(url, prefix, soup=soup) # inline CSS
+
+            # serve up our freshly-mutated HTML w/ delectable utf-8 seasoning
+            return Response(response=soup.decode('utf-8'),
+                            status=resp['code'], content_type=content_type)
+
+        # if resource is CSS file (text/css) parse and update resource
+        # includes
+        if content_type and 'text/css' in content_type:
+            c = update_css(url, prefix, data=resp['data'].decode('utf-8'))
+            return Response(response=c, status=resp['code'],
+                            content_type=content_type)
+
+        # cache and return if resource is neither HTML nor CSS; write to
+        # CACHE_PATH (the directory served above) rather than a path
+        # relative to the current working directory
+        with open(os.path.join(CACHE_PATH, url_sha1), 'wb') as f_cache:
+            f_cache.write(resp['data'])
+        cache_db.cache_add(url_sha1, content_type)
+        return Response(response=resp['data'], status=resp['code'],
+                        content_type=content_type)
+    finally:
+        cache_db.close()
+
+if __name__ == '__main__':
+    # built-in development server, bound to the same address as the one
+    # in uwsgi.ini.example; port is passed as an int, as Flask documents it
+    APP.run(host='127.0.0.1', port=8085, threaded=True)
diff --git a/uwsgi.ini.example b/uwsgi.ini.example
new file mode 100644
index 0000000..93957e0
--- /dev/null
+++ b/uwsgi.ini.example
@@ -0,0 +1,7 @@
+[uwsgi]
+http = 127.0.0.1:8085
+processes = 12
+threads = 4
+wsgi-file = run.py
+callable = APP
+master = true
diff --git a/uwsgi.sh b/uwsgi.sh
new file mode 100755
index 0000000..d5113df
--- /dev/null
+++ b/uwsgi.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+# run from the directory containing this script so the relative paths
+# below (sandbox/, uwsgi.ini) resolve regardless of the caller's cwd
+cd "$(dirname "$0")" || exit 1
+
+# sandbox dir is virtualenv (python3 -m venv sandbox)
+. sandbox/bin/activate
+
+# uwsgi.ini.example provided as template; exec so that signals are
+# delivered to uwsgi directly instead of to this wrapper shell
+exec uwsgi --ini uwsgi.ini