From 449935314000fa6391b989f0b90257d15b5c4ffe Mon Sep 17 00:00:00 2001
From: Jordan
Date: Fri, 15 Nov 2019 23:14:47 -0700
Subject: initial commit

---
 .gitignore        |   6 +++
 README            |   8 ++++
 lib/db.py         |  64 ++++++++++++++++++++++++++++++++
 lib/parser.py     | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/request.py    |  70 +++++++++++++++++++++++++++++++++++
 run.py            | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++
 uwsgi.ini.example |   7 ++++
 uwsgi.sh          |   7 ++++
 8 files changed, 374 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README
 create mode 100644 lib/db.py
 create mode 100644 lib/parser.py
 create mode 100644 lib/request.py
 create mode 100755 run.py
 create mode 100644 uwsgi.ini.example
 create mode 100755 uwsgi.sh

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b47c0bb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+*.swp
+*.swo
+__pycache__
+sandbox
+cache
+uwsgi.ini
diff --git a/README b/README
new file mode 100644
index 0000000..e1a2eca
--- /dev/null
+++ b/README
@@ -0,0 +1,8 @@
+ft-proxy: transparent proxy reader for The Financial Times
+
+* bypass paywall (2019-11-09)
+* FT requests routed via Tor by default (SOCKS 9050)
+* minimal static asset caching engine
+* remove JavaScript elements, GDPR popups
+
+ftcleq4k2dd6xf4wdnhnb6bpmeg2q5spxrmevatichmipz6nrvlb5zyd
diff --git a/lib/db.py b/lib/db.py
new file mode 100644
index 0000000..4c99795
--- /dev/null
+++ b/lib/db.py
@@ -0,0 +1,64 @@
+'''
+File: db.py
+
+Library to facilitate database IO
+'''
+
+import os
+import sqlite3
+
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+
+class db_base:
+    def __init__(self):
+        '''
+        Initialize conn/cursor for in-class use
+        '''
+        self.db_path = os.path.join(ABS_PATH, '../cache/cache.db')
+        self.cache_path = os.path.join(ABS_PATH, '../cache')
+        self.conn = self.db_init()
+        self.cursor = self.conn.cursor()
+
+    def db_init(self):
+        '''
+        Initialize database schema if db not found, return conn
+        '''
+        if not os.path.isdir(self.cache_path):
+            os.mkdir(self.cache_path)
+        if not os.path.isfile(self.db_path):
+            conn = sqlite3.connect(self.db_path)
+            print('database not found, initializing...')
+            conn.execute('''CREATE TABLE cache (hash TEXT, content_type
+                            TEXT)''')
+            conn.commit()
+            conn.close()
+        conn = sqlite3.connect(self.db_path)
+        conn.row_factory = sqlite3.Row
+        return conn
+
+    def cache_add(self, _hash, content_type):
+        self.cursor.execute('''INSERT INTO cache(hash, content_type)
+                               VALUES(?,?)''', (_hash, content_type,))
+        self.save()
+
+    def cache_del(self, _hash):
+        self.cursor.execute('DELETE FROM cache WHERE hash=?', (_hash,))
+        self.save()
+
+    def is_cached(self, _hash):
+        self.cursor.execute('''SELECT COUNT(*) FROM cache WHERE
+                               hash=?''', (_hash,))
+        q_count = self.cursor.fetchone()
+        if q_count[0] > 0:
+            return True
+        return False
+
+    def get_content_type(self, _hash):
+        self.cursor.execute('SELECT * FROM cache WHERE hash=?', (_hash,))
+        return self.cursor.fetchall()[0][1]
+
+    def save(self):
+        self.conn.commit()
+
+    def close(self):
+        self.conn.close()
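
The cache table above is only bookkeeping (URL hash paired with a content-type);
the cached bytes themselves are written to files under cache/ by run.py. A
minimal sketch of how db_base is meant to be driven, assuming the repo root as
the working directory; the URL and content-type below are made-up illustration
values:

    import hashlib
    from lib.db import db_base

    url = 'https://www.ft.com/example.css'        # hypothetical resource URL
    url_sha1 = hashlib.sha1(url.encode('utf-8')).hexdigest()

    cache_db = db_base()                          # creates cache/ and cache.db on first run
    if not cache_db.is_cached(url_sha1):
        cache_db.cache_add(url_sha1, 'text/css')  # remember hash -> content-type
    print(cache_db.get_content_type(url_sha1))    # 'text/css'
    cache_db.close()
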
diff --git a/lib/parser.py b/lib/parser.py
new file mode 100644
index 0000000..6d0a1c8
--- /dev/null
+++ b/lib/parser.py
@@ -0,0 +1,109 @@
+import urllib.parse
+import re
+from bs4 import BeautifulSoup
+
+def update_html(url, prefix, resp):
+    '''
+    Find and update HTML attributes which may contain external dependencies
+
+    e.g. an src/href value of "/image.png" becomes
+    "<prefix>https://www.ft.com/image.png"
+    '''
+
+def update_css(url, prefix, data=None, soup=None):
+    '''
+    Find and update CSS directives (url(), @import, src=) which may contain
+    external dependencies, prepending the proxy prefix to each resource
+    '''
+    _url = re.compile(r'(?P<url>url)' +
+                      r'(?P<quote_open>\s*\(\s*["\']?\s*)' +
+                      r'(?P<resource>[^"\']+)' +
+                      r'(?P<quote_close>\s*["\']?\s*\))',
+                      re.MULTILINE | re.IGNORECASE)
+    _import = re.compile(r'(?P<url>@import)' +
+                         r'(?P<quote_open>\s*["\']\s*)' +
+                         r'(?P<resource>[^"\']+)' +
+                         r'(?P<quote_close>\s*["\'])',
+                         re.MULTILINE | re.IGNORECASE)
+    _src = re.compile(r'(?P<url>src\s*=\s*)(?P<quote_open>\s*["\']?\s*)' +
+                      r'(?P<resource>[^"\']+)' +
+                      r'(?P<quote_close>\s*["\']?)',
+                      re.MULTILINE | re.IGNORECASE)
+
+    css_regexes = [_url, _import, _src]
+
+    # re.sub() aforementioned directives, prepend proxy method to resources
+    if data:
+        c = data
+        for reg in css_regexes:
+            c = reg.sub(lambda m: m.group('url') +
+                        m.group('quote_open') +
+                        prefix +
+                        urllib.parse.urljoin(url, m.group('resource')) +
+                        m.group('quote_close'), c)
+        return c
+
+    elif soup:
+        style_tags = soup.findAll('style')
+        for css in style_tags:
+            c = css.string
+            if not c:  # skip empty style tags
+                continue
+            for reg in css_regexes:
+                c = reg.sub(lambda m: m.group('url') +
+                            m.group('quote_open') +
+                            prefix +
+                            urllib.parse.urljoin(url, m.group('resource')) +
+                            m.group('quote_close'), c)
+            css.string = c
+        return soup
diff --git a/lib/request.py b/lib/request.py
new file mode 100644
index 0000000..6cdfa57
--- /dev/null
+++ b/lib/request.py
@@ -0,0 +1,70 @@
+import gzip
+import zlib
+import urllib.request
+import urllib.parse
+from io import BytesIO
+from urllib.error import URLError, HTTPError
+from socket import timeout
+
+TIMEOUT = 10         # seconds to wait before killing connection
+MAX_SIZE = 25000000  # maximum content-length of resource (bytes, 25MB default)
+
+def retrieve(url, headers):
+    '''
+    Makes HTTP request to URL and returns response
+
+    Returns dict containing the following:
+        'url':  URL of resource, updated from :url: as redirects are followed
+        'code': HTTP response code returned by resource
+        'data': downloaded resource as bytes if the request succeeded, else None
+        'meta': response headers from resource (dict)
+    '''
+    try:
+        conn = urllib.request.Request(
+            url,
+            headers=headers
+        )
+
+        request = urllib.request.urlopen(conn, timeout=TIMEOUT)
+        end_url = request.geturl()  # account for redirects
+
+    except HTTPError as err:
+        print('[%s] %s' % (err.code, url))
+        return {'url': url, 'code': err.code, 'data': None, 'meta': None}
+
+    except URLError as err:
+        print('error connecting to url, %s: %s' % (err, url))
+        return {'url': url, 'code': 502, 'data': None, 'meta': None}
+
+    except timeout:
+        print('socket timed out, exceeded %s seconds: %s' % (TIMEOUT, url))
+        return {'url': url, 'code': 408, 'data': None, 'meta': None}
+
+    except Exception as err:
+        print('uncaught exception, %s: %s' % (err, url))
+        return {'url': url, 'code': 500, 'data': None, 'meta': None}
+
+    # fetch headers from resource, lower() them for consistency
+    request_info = dict(request.info())
+    headers = {k.lower(): v for k, v in request_info.items()}
+
+    # ensure size of resource falls within MAX_SIZE before downloading
+    if int(headers.get('content-length', 0)) > MAX_SIZE:
+        print('exceeded MAX_SIZE of %s bytes, skipping: %s' % (MAX_SIZE, url))
+        return {'url': url, 'code': 413, 'data': None, 'meta': None}
+
+    # support gzip and deflate-encoded responses
+    if headers.get('content-encoding') == 'gzip':
+        buff = BytesIO(request.read())
+        gz_f = gzip.GzipFile(fileobj=buff)
+        data = gz_f.read()
+    elif headers.get('content-encoding') == 'deflate':
+        data = zlib.decompress(request.read())
+    else:
+        data = request.read()
+
+    resp_dict = {'url': end_url, 'code': request.getcode(), 'data': data,
+                 'meta': headers}
+
+    return resp_dict
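
To make the CSS rewriting above concrete, here is a small sketch of the data=
branch of update_css, assuming it is run from the repo root; the prefix mirrors
the one run.py builds from request.host_url, and the stylesheet URLs are
made-up illustration values:

    from lib.parser import update_css

    css = '@import "page.css"'   # sample stylesheet fragment
    out = update_css('https://www.ft.com/style/main.css',
                     'http://127.0.0.1:8085/fetch/',
                     data=css)
    # out -> '@import "http://127.0.0.1:8085/fetch/https://www.ft.com/style/page.css"'
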
diff --git a/run.py b/run.py
new file mode 100755
index 0000000..791f5f7
--- /dev/null
+++ b/run.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+# proxy requests to FT through Tor by default, comment out to disable
+import socks
+import socket
+socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
+socket.socket = socks.socksocket
+
+import urllib.parse
+import hashlib
+import os
+from flask import Flask, request, Response, send_from_directory
+from lib.request import retrieve
+from lib.db import db_base
+from lib.parser import update_html, update_css
+
+APP = Flask(__name__, static_url_path='')
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+CACHE_PATH = os.path.join(ABS_PATH, 'cache')
+
+WHITELIST = ['www.ft.com', 'media.acast.com', 'next-media-api.ft.com']
+CONTENT = '/content/'
+
+@APP.route('/')
+@APP.route('/fetch/<path:resource>')
+def fetch(resource=None):
+    '''
+    Download resource using request headers, return it
+    '''
+    # return FT homepage if no resource value provided
+    if resource:
+        # uwsgi decodes characters before they reach flask, so split the
+        # original URL back out at /fetch/
+        url = request.url.split('/fetch/', maxsplit=1)[1]
+    else:
+        url = 'https://www.ft.com/'
+
+    # refuse requests for resources not in WHITELIST
+    url_split = list(urllib.parse.urlsplit(url))
+    if url_split[1] not in WHITELIST:
+        return 'Error 403: Non-FT resource', 403
+
+    # remove unnecessary key/values from header set
+    disabled = ['host', 'accept-encoding', 'accept', 'origin']
+    headers = {k: v for (k, v) in request.headers if k.lower() not in disabled}
+
+    # hash url for cache engine, open sqlite connection
+    url_sha1 = hashlib.sha1(bytes(url, encoding='utf-8')).hexdigest()
+    cache_db = db_base()
+
+    # if resource is cached, return it w/ paired FT-derived content-type
+    if resource and cache_db.is_cached(url_sha1):
+        content_type = cache_db.get_content_type(url_sha1)
+        cache_db.close()
+        return send_from_directory(CACHE_PATH, url_sha1, mimetype=content_type)
+
+    # set referer to g-news only when browsing articles, unnecessary otherwise
+    if CONTENT in url_split[2]:
+        headers['Referer'] = 'https://news.google.com/'
+    else:
+        headers['Referer'] = 'https://www.google.com/'
+
+    # use encodings supported by lib/request.py (urllib wrapper)
+    headers['Accept-Encoding'] = 'gzip, deflate'
+
+    # fetch remote resource, pass updated set of headers
+    resp = retrieve(url=url, headers=headers)
+    if resp['data'] is None:
+        cache_db.close()
+        return 'Error making request: %s' % url, resp['code']
+
+    content_type = resp['meta'].get('content-type')
+    prefix = urllib.parse.urljoin(request.host_url, 'fetch/')
+
+    if content_type and 'text/html' in content_type:
+        # prefix resource includes w/ proxy fetch() method
+        soup = update_html(url, prefix, resp)
+        soup = update_css(url, prefix, soup=soup)  # inline CSS
+
+        # serve up our freshly-mutated HTML w/ delectable utf-8 seasoning
+        response = Response(response=soup.decode('utf-8'), status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+    # if resource is CSS file (text/css) parse and update resource includes
+    elif content_type and 'text/css' in content_type:
+        c = update_css(url, prefix, data=resp['data'].decode('utf-8'))
+        response = Response(response=c, status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+    # cache and return if resource is neither HTML nor CSS
+    else:
+        with open(os.path.join(CACHE_PATH, url_sha1), 'wb') as f_cache:
+            f_cache.write(resp['data'])
+        cache_db.cache_add(url_sha1, content_type)
+        response = Response(response=resp['data'], status=resp['code'],
+                            content_type=content_type)
+        cache_db.close()
+        return response
+
+if __name__ == '__main__':
+    APP.run(host='127.0.0.1', port=8085, threaded=True)
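
To read a page through the proxy, the browser requests /fetch/ followed by the
full FT URL; fetch() checks the host against WHITELIST and derives a SHA-1 key
from the URL for the cache engine. A sketch of that mapping, using a made-up
article URL and the default local host/port:

    import hashlib
    import urllib.parse

    host_url = 'http://127.0.0.1:8085/'         # what request.host_url would be locally
    url = 'https://www.ft.com/content/example'  # hypothetical article URL

    proxied = urllib.parse.urljoin(host_url, 'fetch/') + url
    cache_key = hashlib.sha1(url.encode('utf-8')).hexdigest()

    print(proxied)    # http://127.0.0.1:8085/fetch/https://www.ft.com/content/example
    print(cache_key)  # sha1 key run.py derives for this URL (cache lookups/writes)
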
diff --git a/uwsgi.ini.example b/uwsgi.ini.example
new file mode 100644
index 0000000..93957e0
--- /dev/null
+++ b/uwsgi.ini.example
@@ -0,0 +1,7 @@
+[uwsgi]
+http = 127.0.0.1:8085
+processes = 12
+threads = 4
+wsgi-file = run.py
+callable = APP
+master = true
diff --git a/uwsgi.sh b/uwsgi.sh
new file mode 100755
index 0000000..d5113df
--- /dev/null
+++ b/uwsgi.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+# sandbox dir is virtualenv (python3 -m venv sandbox)
+. sandbox/bin/activate
+
+# uwsgi.ini.example provided as template
+uwsgi --ini uwsgi.ini
--
cgit v1.2.3-54-g00ecf