author     Jordan <me@jordan.im>    2019-11-15 23:14:47 -0700
committer  Jordan <me@jordan.im>    2019-11-15 23:14:47 -0700
commit     449935314000fa6391b989f0b90257d15b5c4ffe (patch)
tree       33d5431d2c02b689c6feb82d72f111b7a1f07a8d
initial commit
-rw-r--r--   .gitignore            6
-rw-r--r--   README                8
-rw-r--r--   lib/db.py            64
-rw-r--r--   lib/parser.py       109
-rw-r--r--   lib/request.py       70
-rwxr-xr-x   run.py              103
-rw-r--r--   uwsgi.ini.example     7
-rwxr-xr-x   uwsgi.sh              7

8 files changed, 374 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b47c0bb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+*.swp
+*.swo
+__pycache__
+sandbox
+cache
+uwsgi.ini
diff --git a/README b/README
new file mode 100644
--- /dev/null
+++ b/README
@@ -0,0 +1,8 @@
+ft-proxy: transparent proxy reader for The Financial Times
+
+* bypass paywall (2019-11-09)
+* FT requests routed via Tor by default (SOCKS 9050)
+* minimal static asset caching engine
+* remove JavaScript elements, GDPR popups
+
+ftcleq4k2dd6xf4wdnhnb6bpmeg2q5spxrmevatichmipz6nrvlb5zyd
diff --git a/lib/db.py b/lib/db.py
new file mode 100644
index 0000000..4c99795
--- /dev/null
+++ b/lib/db.py
@@ -0,0 +1,64 @@
+'''
+File: db.py
+
+Library to facilitate database IO
+'''
+
+import os
+import sqlite3
+
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+
+class db_base:
+    def __init__(self):
+        '''
+        Initialize conn/cursor for in-class use
+        '''
+        self.db_path = os.path.join(ABS_PATH, '../cache/cache.db')
+        self.cache_path = os.path.join(ABS_PATH, '../cache')
+        self.conn = self.db_init()
+        self.cursor = self.conn.cursor()
+
+    def db_init(self):
+        '''
+        Initialize database schema if db not found, return conn
+        '''
+        if not os.path.isdir(self.cache_path):
+            os.mkdir(self.cache_path)
+        if not os.path.isfile(self.db_path):
+            conn = sqlite3.connect(self.db_path)
+            print('database not found, initializing...')
+            conn.execute('''CREATE TABLE cache (hash TEXT, content_type
+                            TEXT)''')
+            conn.commit()
+            conn.close()
+        conn = sqlite3.connect(self.db_path)
+        conn.row_factory = sqlite3.Row
+        return conn
+
+    def cache_add(self, _hash, content_type):
+        self.cursor.execute('''INSERT INTO cache(hash, content_type)
+                               VALUES(?,?)''', (_hash, content_type,))
+        self.save()
+
+    def cache_del(self, _hash):
+        self.cursor.execute('DELETE FROM cache WHERE hash=?', (_hash,))
+        self.save()
+
+    def is_cached(self, _hash):
+        self.cursor.execute('''SELECT COUNT(*) FROM cache WHERE
+                               hash=?''', (_hash,))
+        q_count = self.cursor.fetchone()
+        if q_count[0] > 0:
+            return True
+        return False
+
+    def get_content_type(self, _hash):
+        self.cursor.execute('SELECT * FROM cache WHERE hash=?', (_hash,))
+        return self.cursor.fetchall()[0][1]
+
+    def save(self):
+        self.conn.commit()
+
+    def close(self):
+        self.conn.close()
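For reference, a minimal sketch of how the cache layer above might be exercised from the repository root (assumes the sandbox virtualenv is active so lib/ is importable; the URL is just an example):

    import hashlib
    from lib.db import db_base

    # hash a URL the same way run.py does before caching it
    url_sha1 = hashlib.sha1(b'https://www.ft.com/').hexdigest()

    db = db_base()                        # creates cache/ and cache.db on first run
    if not db.is_cached(url_sha1):
        db.cache_add(url_sha1, 'text/html')
    print(db.get_content_type(url_sha1))  # -> 'text/html'
    db.close()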
diff --git a/lib/parser.py b/lib/parser.py
new file mode 100644
index 0000000..6d0a1c8
--- /dev/null
+++ b/lib/parser.py
@@ -0,0 +1,109 @@
+import urllib.parse
+import re
+from bs4 import BeautifulSoup
+
+def update_html(url, prefix, resp):
+    '''
+    Find and update HTML attributes which may contain external dependencies
+
+    e.g. <a href="example.com"> -> <a href="http://proxy/fetch/example.com">
+
+    :url: url of page from which CSS is included/inline
+    :prefix: proxy method to prepend to join()'d resources
+    :resp: request.py urllib response object containing data, headers, etc
+    '''
+    # update href values to point to our proxy
+    soup = BeautifulSoup(resp['data'], 'html.parser')
+
+    # remove <script> elements--Javascript is evil
+    [s.decompose() for s in soup('script')]
+
+    # remove cookie popup
+    [s.decompose() for s in soup.findAll('div', attrs={'class': 'n-messaging-slot'})]
+
+    # remove deprecated, unsupported tags (object, applet, param, embed)
+    obsolete = ['object', 'applet', 'param', 'embed']
+    [[s.decompose() for s in soup(elem)] for elem in obsolete]
+
+    # find and update src, href, srcset, and background key values
+    for elem in soup.findAll(src=True):
+        update_tag(url, prefix, elem, 'src')
+    for elem in soup.findAll(href=True):
+        update_tag(url, prefix, elem, 'href')
+    for elem in soup.findAll(srcset=True):
+        update_tag(url, prefix, elem, 'srcset')
+    for elem in soup.findAll(background=True):
+        update_tag(url, prefix, elem, 'background')
+    return soup
+
+def update_tag(url, prefix, elem, key):
+    '''
+    Update value of element's key to prefix proxy method
+
+    :url: url of page from which CSS is included/inline
+    :prefix: proxy method to prepend to join()'d resources
+    '''
+    # an element's key value can be unpopulated, ignore such cases
+    if not elem.get(key):
+        return
+
+    # urljoin() w/ root url if url is relative/absolute (no scheme specifier)
+    url_split = list(urllib.parse.urlsplit(elem[key]))
+    if not url_split[0]:
+        elem[key] = urllib.parse.urljoin(url, elem[key])
+
+    # strip extraneous values by space, use first defined resource
+    elem[key] = elem[key].split()[0]
+
+    # prepend resource url w/ proxy fetch method
+    elem[key] = prefix + elem[key]
+
+def update_css(url, prefix, soup=None, data=None):
+    '''
+    Update inline OR included CSS file to prefix proxy method
+
+    :url: url of page from which CSS is included/inline
+    :prefix: proxy method to prepend to join()'d resources
+    :soup: bs4 object (optional)
+    :data: CSS file contents as str (optional)
+    '''
+    # regex objects to match url(), src=, and @import CSS directives
+    _url = re.compile(r'(?P<url>url\s*\()(?P<quote_open>\s*["\']?\s*)' +
+                      r'(?P<resource>[^"\']+)' +
+                      r'(?P<quote_close>\s*["\']?\s*\))',
+                      re.MULTILINE | re.IGNORECASE)
+    _import = re.compile(r'(?P<import>@import)' +
+                         r'(?P<quote_open>\s*["\']\s*)' +
+                         r'(?P<resource>[^"\']+)' +
+                         r'(?P<quote_close>\s*["\'])',
+                         re.MULTILINE | re.IGNORECASE)
+    _src = re.compile(r'(?P<src>src\s*=\s*)(?P<quote_open>\s*["\']?\s*)' +
+                      r'(?P<resource>[^"\']+)' +
+                      r'(?P<quote_close>\s*["\']?)',
+                      re.MULTILINE | re.IGNORECASE)
+
+    css_regexes = [_url, _import, _src]
+
+    # re.sub() aforementioned directives, prepend proxy method to resources
+    if data:
+        c = data
+        for reg in css_regexes:
+            c = reg.sub(lambda m: m.group(1) +
+                        m.group('quote_open') +
+                        prefix +
+                        urllib.parse.urljoin(url, m.group('resource')) +
+                        m.group('quote_close'), c)
+        return c
+
+    elif soup:
+        style_tags = soup.findAll('style')
+        for css in style_tags:
+            c = css.string
+            for reg in css_regexes:
+                c = reg.sub(lambda m: m.group(1) +
+                            m.group('quote_open') +
+                            prefix +
+                            urllib.parse.urljoin(url, m.group('resource')) +
+                            m.group('quote_close'), c)
+            css.string = c
+        return soup
diff --git a/lib/request.py b/lib/request.py
new file mode 100644
index 0000000..6cdfa57
--- /dev/null
+++ b/lib/request.py
@@ -0,0 +1,70 @@
+import gzip
+import zlib
+import urllib.request
+import urllib.parse
+from io import BytesIO
+from urllib.error import URLError, HTTPError
+from socket import timeout
+
+TIMEOUT = 10 # seconds to wait before killing connection
+MAX_SIZE = 25000000 # maximum content-length of resource (bytes, 25MB default)
+
+def retrieve(url, headers):
+    '''
+    Makes HTTP request to URL and returns response
+
+    Returns dict containing the following:
+    'url': URL of resource, updated from :url: as redirects are followed
+    'code': HTTP response code returned by resource
+    'data': Downloaded resource as bytes if the request succeeded,
+            else None
+    'meta': Response headers from resource (dict)
+    '''
+    try:
+        conn = urllib.request.Request(
+            url,
+            headers=headers
+        )
+
+        request = urllib.request.urlopen(conn, timeout=TIMEOUT)
+        end_url = request.geturl() # account for redirects
+
+    except HTTPError as err:
+        print('[%s] %s' % (err.code, url))
+        return {'url': url, 'code': err.code, 'data': None, 'meta': None}
+
+    except URLError as err:
+        print('error connecting to url, %s: %s' % (err, url))
+        return {'url': url, 'code': 502, 'data': None, 'meta': None}
+
+    except timeout:
+        print('socket timed out, exceeded %s seconds: %s' % (TIMEOUT, url))
+        return {'url': url, 'code': 408, 'data': None, 'meta': None}
+
+    except Exception as err:
+        print('uncaught exception, %s: %s' % (err, url))
+        return {'url': url, 'code': 500, 'data': None, 'meta': None}
+
+    # fetch headers from resource, lower() them for consistency
+    request_info = dict(request.info())
+    headers = {k.lower(): v for k, v in request_info.items()}
+
+    # ensure size of resource falls within MAX_SIZE before downloading
+    if int(headers.get('content-length', 0)) > MAX_SIZE:
+        print('exceeded MAX_SIZE of %s bytes, skipping: %s' % (MAX_SIZE, url))
+        return {'url': url, 'code': 413, 'data': None, 'meta': None}
+
+    # support gzip and deflate-encoded responses
+    if headers.get('content-encoding') == 'gzip':
+        buff = BytesIO(request.read())
+        gz_f = gzip.GzipFile(fileobj=buff)
+        data = gz_f.read()
+    elif headers.get('content-encoding') == 'deflate':
+        data = zlib.decompress(request.read())
+    else:
+        data = request.read()
+
+    resp_dict = {'url': end_url, 'code': request.getcode(), 'data': data,
+                 'meta': headers}
+
+    return resp_dict
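A quick sketch of calling retrieve() on its own (run from the repository root; the header values are illustrative, and without the SOCKS patching in run.py the request goes out directly rather than over Tor):

    from lib.request import retrieve

    # minimal header set; retrieve() hands these straight to urllib
    headers = {'User-Agent': 'Mozilla/5.0', 'Accept-Encoding': 'gzip, deflate'}

    resp = retrieve('https://www.ft.com/', headers)
    if resp['data'] is not None:
        print(resp['code'], resp['meta'].get('content-type'), len(resp['data']))
    else:
        print('request failed with code', resp['code'])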
diff --git a/run.py b/run.py
new file mode 100755
--- /dev/null
+++ b/run.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+# proxy requests to FT through Tor by default, comment to disable
+import socks
+import socket
+socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
+socket.socket = socks.socksocket
+
+import urllib.parse
+import hashlib
+import os
+from flask import Flask, request, Response, send_from_directory
+from lib.request import retrieve
+from lib.db import db_base
+from lib.parser import update_html, update_css
+
+APP = Flask(__name__, static_url_path='')
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+CACHE_PATH = os.path.join(ABS_PATH, 'cache')
+
+WHITELIST = ['www.ft.com', 'media.acast.com', 'next-media-api.ft.com']
+CONTENT = '/content/'
+
+@APP.route('/')
+@APP.route('/fetch/<path:resource>')
+def fetch(resource=None):
+    '''
+    Download resource using request headers, return it
+    '''
+    # return FT homepage if no resource value provided
+    if resource:
+        # uwsgi decodes characters -> flask, need to split at /fetch/
+        url = request.url.split('/fetch/', maxsplit=1)[1]
+    else:
+        url = 'https://www.ft.com/'
+
+    # refuse requests for resources not in WHITELIST
+    url_split = list(urllib.parse.urlsplit(url))
+    if url_split[1] not in WHITELIST:
+        return 'Error 403: Non-FT resource', 403
+
+    # remove unnecessary key/values from header set
+    disabled = ['host', 'accept-encoding', 'accept', 'origin']
+    headers = {k: v for (k, v) in request.headers if k.lower() not in disabled}
+
+    # hash url for cache engine, open sqlite connection
+    url_sha1 = hashlib.sha1(bytes(url, encoding='utf-8')).hexdigest()
+    cache_db = db_base()
+
+    # if resource is cached return w/ paired FT-derived content-type
+    if resource and cache_db.is_cached(url_sha1):
+        content_type = cache_db.get_content_type(url_sha1)
+        cache_db.close()
+        return send_from_directory(CACHE_PATH, url_sha1, mimetype=content_type)
+
+    # set referer to g-news only when browsing articles, unnecessary otherwise
+    if CONTENT in url_split[2]:
+        headers['Referer'] = 'https://news.google.com/'
+    else:
+        headers['Referer'] = 'https://www.google.com/'
+
+    # use encodings supported by lib/request.py (urllib wrapper)
+    headers['Accept-Encoding'] = 'gzip, deflate'
+
+    # fetch remote resource, pass updated set of headers
+    resp = retrieve(url=url, headers=headers)
+    if resp['data'] is None:
+        return 'Error making request: %s' % url, resp['code']
+
+    content_type = resp['meta'].get('content-type')
+    prefix = urllib.parse.urljoin(request.host_url, 'fetch/')
+
+    if content_type and 'text/html' in content_type:
+        # prefix resource includes w/ proxy fetch() method
+        soup = update_html(url, prefix, resp)
+        soup = update_css(url, prefix, soup=soup) # inline CSS
+
+        # serve up our freshly-mutated HTML w/ delectable utf-8 seasoning
+        response = Response(response=soup.decode('utf-8'), status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+    # if resource is CSS file (text/css) parse and update resource includes
+    elif content_type and 'text/css' in content_type:
+        c = update_css(url, prefix, data=resp['data'].decode('utf-8'))
+        response = Response(response=c, status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+    # cache and return if resource is neither HTML nor CSS
+    else:
+        with open(os.path.join(CACHE_PATH, url_sha1), 'wb') as f_cache:
+            f_cache.write(resp['data'])
+        cache_db.cache_add(url_sha1, content_type)
+        response = Response(response=resp['data'], status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+if __name__ == '__main__':
+    APP.run(host='127.0.0.1', port=8085, threaded=True)
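To illustrate the rewriting that update_html()/update_css() perform with the prefix built in fetch(), a small standalone example (host and port taken from uwsgi.ini.example; the asset path is hypothetical):

    import urllib.parse

    host_url = 'http://127.0.0.1:8085/'   # what request.host_url would be locally
    prefix = urllib.parse.urljoin(host_url, 'fetch/')

    # a relative asset reference as it might appear on a proxied FT page
    asset = '/assets/logo.png'
    absolute = urllib.parse.urljoin('https://www.ft.com/', asset)

    print(prefix + absolute)
    # -> http://127.0.0.1:8085/fetch/https://www.ft.com/assets/logo.png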
diff --git a/uwsgi.ini.example b/uwsgi.ini.example
new file mode 100644
index 0000000..93957e0
--- /dev/null
+++ b/uwsgi.ini.example
@@ -0,0 +1,7 @@
+[uwsgi]
+http = 127.0.0.1:8085
+processes = 12
+threads = 4
+wsgi-file = run.py
+callable = APP
+master = true
diff --git a/uwsgi.sh b/uwsgi.sh
new file mode 100755
index 0000000..d5113df
--- /dev/null
+++ b/uwsgi.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+# sandbox dir is virtualenv (python3 -m venv sandbox)
+. sandbox/bin/activate
+
+# uwsgi.ini.example provided as template
+uwsgi --ini uwsgi.ini
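Once uwsgi (or run.py directly) is listening on 127.0.0.1:8085, pages can be pulled through the proxy; a minimal client-side sketch using only the standard library:

    import urllib.request

    # request the FT homepage via the local proxy's /fetch/ route
    proxy_url = 'http://127.0.0.1:8085/fetch/https://www.ft.com/'
    with urllib.request.urlopen(proxy_url) as resp:
        html = resp.read().decode('utf-8')
    print(html[:200])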