author    | Jordan <me@jordan.im> | 2019-11-15 23:14:47 -0700
committer | Jordan <me@jordan.im> | 2019-11-15 23:14:47 -0700
commit    | 449935314000fa6391b989f0b90257d15b5c4ffe (patch)
tree      | 33d5431d2c02b689c6feb82d72f111b7a1f07a8d /lib
initial commit
Diffstat (limited to 'lib')
-rw-r--r-- | lib/db.py      |  64
-rw-r--r-- | lib/parser.py  | 109
-rw-r--r-- | lib/request.py |  70
3 files changed, 243 insertions, 0 deletions
diff --git a/lib/db.py b/lib/db.py
new file mode 100644
index 0000000..4c99795
--- /dev/null
+++ b/lib/db.py
@@ -0,0 +1,64 @@
+'''
+File: db.py
+
+Library to facilitate database IO
+'''
+
+import os
+import sqlite3
+
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+
+class db_base:
+    def __init__(self):
+        '''
+        Initialize conn/cursor for in-class use
+        '''
+        self.db_path = os.path.join(ABS_PATH, '../cache/cache.db')
+        self.cache_path = os.path.join(ABS_PATH, '../cache')
+        self.conn = self.db_init()
+        self.cursor = self.conn.cursor()
+
+    def db_init(self):
+        '''
+        Initialize database schema if db not found, return conn
+        '''
+        if not os.path.isdir(self.cache_path):
+            os.mkdir(self.cache_path)
+        if not os.path.isfile(self.db_path):
+            conn = sqlite3.connect(self.db_path)
+            print('database not found, initializing...')
+            conn.execute('''CREATE TABLE cache (hash TEXT, content_type
+                         TEXT)''')
+            conn.commit()
+            conn.close()
+        conn = sqlite3.connect(self.db_path)
+        conn.row_factory = sqlite3.Row
+        return conn
+
+    def cache_add(self, _hash, content_type):
+        self.cursor.execute('''INSERT INTO cache(hash, content_type)
+                            VALUES(?,?)''', (_hash, content_type,))
+        self.save()
+
+    def cache_del(self, _hash):
+        self.cursor.execute('DELETE FROM cache WHERE hash=?', (_hash,))
+        self.save()
+
+    def is_cached(self, _hash):
+        self.cursor.execute('''SELECT COUNT(*) FROM cache WHERE
+                            hash=?''', (_hash,))
+        q_count = self.cursor.fetchone()
+        if q_count[0] > 0:
+            return True
+        return False
+
+    def get_content_type(self, _hash):
+        self.cursor.execute('SELECT * FROM cache WHERE hash=?', (_hash,))
+        return self.cursor.fetchall()[0][1]
+
+    def save(self):
+        self.conn.commit()
+
+    def close(self):
+        self.conn.close()
diff --git a/lib/parser.py b/lib/parser.py
new file mode 100644
index 0000000..6d0a1c8
--- /dev/null
+++ b/lib/parser.py
@@ -0,0 +1,109 @@
+import urllib.parse
+import re
+from bs4 import BeautifulSoup
+
+def update_html(url, prefix, resp):
+    '''
+    Find and update HTML attributes which may contain external dependencies
+
+    e.g. <a href="example.com"> -> <a href="http://proxy/fetch/example.com">
+
+    :url: url of the page being rewritten
+    :prefix: proxy method to prepend to join()'d resources
+    :resp: request.py response dict containing data, headers, etc
+    '''
+    # update href values to point to our proxy
+    soup = BeautifulSoup(resp['data'], 'html.parser')
+
+    # remove <script> elements--Javascript is evil
+    [s.decompose() for s in soup('script')]
+
+    # remove cookie popup
+    [s.decompose() for s in soup.findAll('div', attrs={'class': 'n-messaging-slot'})]
+
+    # remove deprecated, unsupported tags (object, applet, param, embed)
+    obsolete = ['object', 'applet', 'param', 'embed']
+    [[s.decompose() for s in soup(elem)] for elem in obsolete]
+
+    # find and update src, href, srcset, and background key values
+    for elem in soup.findAll(src=True):
+        update_tag(url, prefix, elem, 'src')
+    for elem in soup.findAll(href=True):
+        update_tag(url, prefix, elem, 'href')
+    for elem in soup.findAll(srcset=True):
+        update_tag(url, prefix, elem, 'srcset')
+    for elem in soup.findAll(background=True):
+        update_tag(url, prefix, elem, 'background')
+    return soup
+
+def update_tag(url, prefix, elem, key):
+    '''
+    Update value of an element's :key: attribute to prefix proxy method
+
+    :url: url of the page containing the element
+    :prefix: proxy method to prepend to join()'d resources
+    '''
+    # an element's key value can be unpopulated, ignore such cases
+    if not elem.get(key):
+        return
+
+    # urljoin() w/ root url if url is relative/absolute (no scheme specifier)
+    url_split = list(urllib.parse.urlsplit(elem[key]))
+    if not url_split[0]:
+        elem[key] = urllib.parse.urljoin(url, elem[key])
+
+    # strip extraneous values by space, use first defined resource
+    elem[key] = elem[key].split()[0]
+
+    # prepend resource url w/ proxy fetch method
+    elem[key] = prefix + elem[key]
+
+def update_css(url, prefix, soup=None, data=None):
+    '''
+    Update inline OR included CSS file to prefix proxy method
+
+    :url: url of page from which CSS is included/inline
+    :prefix: proxy method to prepend to join()'d resources
+    :soup: bs4 object (optional)
+    :data: CSS file contents as str (optional)
+    '''
+    # regex objects to match url(), src=, and @import CSS directives
+    _url = re.compile(r'(?P<url>url\s*\()(?P<quote_open>\s*["\']?\s*)' +
+                      r'(?P<resource>[^"\']+)' +
+                      r'(?P<quote_close>\s*["\']?\s*\))',
+                      re.MULTILINE | re.IGNORECASE)
+    _import = re.compile(r'(?P<import>@import)' +
+                         r'(?P<quote_open>\s*["\']\s*)' +
+                         r'(?P<resource>[^"\']+)' +
+                         r'(?P<quote_close>\s*["\'])',
+                         re.MULTILINE | re.IGNORECASE)
+    _src = re.compile(r'(?P<src>src\s*=\s*)(?P<quote_open>\s*["\']?\s*)' +
+                      r'(?P<resource>[^"\']+)' +
+                      r'(?P<quote_close>\s*["\']?)',
+                      re.MULTILINE | re.IGNORECASE)
+
+    css_regexes = [_url, _import, _src]
+
+    # re.sub() aforementioned directives; group(1) is the matched directive
+    if data:
+        c = data
+        for reg in css_regexes:
+            c = reg.sub(lambda m: m.group(1) +
+                        m.group('quote_open') +
+                        prefix +
+                        urllib.parse.urljoin(url, m.group('resource')) +
+                        m.group('quote_close'), c)
+        return c
+
+    elif soup:
+        style_tags = soup.findAll('style')
+        for css in style_tags:
+            c = css.string
+            for reg in css_regexes:
+                c = reg.sub(lambda m: m.group(1) +
+                            m.group('quote_open') +
+                            prefix +
+                            urllib.parse.urljoin(url, m.group('resource')) +
+                            m.group('quote_close'), c)
+            css.string = c
+        return soup
diff --git a/lib/request.py b/lib/request.py
new file mode 100644
index 0000000..6cdfa57
--- /dev/null
+++ b/lib/request.py
@@ -0,0 +1,70 @@
+import gzip
+import zlib
+import urllib.request
+import urllib.parse
+from io import BytesIO
+from urllib.error import URLError, HTTPError
+from socket import timeout
+
+TIMEOUT = 10 # seconds to wait before killing connection
+MAX_SIZE = 25000000 # maximum content-length of resource (bytes, 25MB default)
+
+def retrieve(url, headers):
+    '''
+    Makes HTTP request to URL and returns response
+
+    Returns dict containing the following:
+    'url': URL of resource, updated from :url: as redirects are followed
+    'code': HTTP response code returned by resource
+    'data': Downloaded resource as bytes if the request succeeds,
+    else None
+    'meta': Response headers from resource (dict)
+    '''
+    try:
+        conn = urllib.request.Request(
+            url,
+            headers=headers
+        )
+
+        request = urllib.request.urlopen(conn, timeout=TIMEOUT)
+        end_url = request.geturl() # account for redirects
+
+    except HTTPError as err:
+        print('[%s] %s' % (err.code, url))
+        return {'url': url, 'code': err.code, 'data': None, 'meta': None}
+
+    except URLError as err:
+        print('error connecting to url, %s: %s' % (err, url))
+        return {'url': url, 'code': 502, 'data': None, 'meta': None}
+
+    except timeout:
+        print('socket timed out, exceeded %s seconds: %s' % (TIMEOUT, url))
+        return {'url': url, 'code': 408, 'data': None, 'meta': None}
+
+    except Exception as err:
+        print('uncaught exception, %s: %s' % (err, url))
+        return {'url': url, 'code': 500, 'data': None, 'meta': None}
+
+    # fetch headers from resource, lower() them for consistency
+    request_info = dict(request.info())
+    headers = {k.lower(): v for k, v in request_info.items()}
+
+    # ensure size of resource falls within MAX_SIZE before downloading
+    if int(headers.get('content-length', 0)) > MAX_SIZE:
+        print('exceeded MAX_SIZE of %s bytes, skipping: %s' % (MAX_SIZE, url))
+        return {'url': url, 'code': 413, 'data': None, 'meta': None}
+
+    # support gzip and deflate-encoded responses
+    if headers.get('content-encoding') == 'gzip':
+        buff = BytesIO(request.read())
+        gz_f = gzip.GzipFile(fileobj=buff)
+        data = gz_f.read()
+    elif headers.get('content-encoding') == 'deflate':
+        data = zlib.decompress(request.read())
+    else:
+        data = request.read()
+
+    resp_dict = {'url': end_url, 'code': request.getcode(), 'data': data,
+                 'meta': headers}
+
+    return resp_dict
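
Taken together, the three modules split the proxy's work into fetching (request.py), rewriting (parser.py), and caching (db.py). A minimal sketch of how a front end might wire them together follows; the proxy_fetch() helper, the '/fetch/' prefix, the User-Agent header, and the SHA-1 cache key are illustrative assumptions, not part of this commit.

# Hypothetical wiring of the three modules, assuming the repo root is on
# sys.path; the '/fetch/' prefix and SHA-1 cache key are illustrative only.
import hashlib

from lib.db import db_base
from lib.parser import update_html
from lib.request import retrieve

def proxy_fetch(url):
    # fetch the upstream resource with a browser-like User-Agent
    resp = retrieve(url, headers={'User-Agent': 'Mozilla/5.0'})
    if resp['data'] is None:
        return None

    # rewrite src/href/srcset/background attributes to point back at the proxy
    soup = update_html(url, '/fetch/', resp)

    # record the resource in the cache db, keyed by a hash of its URL
    db = db_base()
    key = hashlib.sha1(url.encode()).hexdigest()
    if not db.is_cached(key):
        content_type = resp['meta'].get('content-type', 'text/html')
        db.cache_add(key, content_type)
    db.close()

    return str(soup)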