author    Jordan <me@jordan.im>    2019-11-15 23:14:47 -0700
committer Jordan <me@jordan.im>    2019-11-15 23:14:47 -0700
commit    449935314000fa6391b989f0b90257d15b5c4ffe (patch)
tree      33d5431d2c02b689c6feb82d72f111b7a1f07a8d /lib
initial commit
Diffstat (limited to 'lib')
-rw-r--r--  lib/db.py        64
-rw-r--r--  lib/parser.py   109
-rw-r--r--  lib/request.py   70
3 files changed, 243 insertions(+), 0 deletions(-)
diff --git a/lib/db.py b/lib/db.py
new file mode 100644
index 0000000..4c99795
--- /dev/null
+++ b/lib/db.py
@@ -0,0 +1,64 @@
+'''
+File: db.py
+
+Library to facilitate database IO
+'''
+
+import os
+import sqlite3
+
+ABS_PATH = os.path.dirname(os.path.abspath(__file__))
+
+class db_base:
+ def __init__(self):
+ '''
+ Initialize conn/cursor for in-class use
+ '''
+ self.db_path = os.path.join(ABS_PATH, '../cache/cache.db')
+ self.cache_path = os.path.join(ABS_PATH, '../cache')
+ self.conn = self.db_init()
+ self.cursor = self.conn.cursor()
+
+ def db_init(self):
+ '''
+ Initialize database schema if db not found, return conn
+ '''
+ if not os.path.isdir(self.cache_path):
+ os.mkdir(self.cache_path)
+ if not os.path.isfile(self.db_path):
+ conn = sqlite3.connect(self.db_path)
+ print('database not found, initializing...')
+ conn.execute('''CREATE TABLE cache (hash TEXT, content_type
+ TEXT)''')
+ conn.commit()
+ conn.close()
+ conn = sqlite3.connect(self.db_path)
+ conn.row_factory = sqlite3.Row
+ return conn
+
+ def cache_add(self, _hash, content_type):
+ self.cursor.execute('''INSERT INTO cache(hash, content_type)
+ VALUES(?,?)''', (_hash, content_type,))
+ self.save()
+
+ def cache_del(self, _hash):
+ self.cursor.execute('DELETE FROM cache WHERE hash=?', (_hash,))
+ self.save()
+
+ def is_cached(self, _hash):
+ self.cursor.execute('''SELECT COUNT(*) FROM cache WHERE
+ hash=?''', (_hash,))
+ q_count = self.cursor.fetchone()
+ if q_count[0] > 0:
+ return True
+ return False
+
+ def get_content_type(self, _hash):
+ self.cursor.execute('SELECT * FROM cache WHERE hash=?', (_hash,))
+ return self.cursor.fetchall()[0][1]
+
+ def save(self):
+ self.conn.commit()
+
+ def close(self):
+ self.conn.close()
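
For reference, a minimal usage sketch of lib/db.py (the hash value is a hypothetical cache key, e.g. a SHA-1 of the fetched URL, and the import assumes lib/ is importable from the project root; no calling code ships in this commit):

from lib.db import db_base

db = db_base()                                      # creates ../cache and cache.db on first run
key = 'd0be2dc421be4fcd0172e5afceea3970e2f3d940'    # hypothetical hash of a cached URL
db.cache_add(key, 'text/html')

if db.is_cached(key):
    print(db.get_content_type(key))                 # -> 'text/html'

db.cache_del(key)
db.close()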
diff --git a/lib/parser.py b/lib/parser.py
new file mode 100644
index 0000000..6d0a1c8
--- /dev/null
+++ b/lib/parser.py
@@ -0,0 +1,109 @@
+import urllib.parse
+import re
+from bs4 import BeautifulSoup
+
+def update_html(url, prefix, resp):
+ '''
+ Find and update HTML attributes which may contain external dependencies
+
+ e.g. <a href="example.com"> -> <a href="http://proxy/fetch/example.com">
+
+ :url: url of the page whose HTML is being rewritten
+ :prefix: proxy method to prepend to join()'d resources
+ :resp: response dict from request.py's retrieve(), containing data, headers, etc.
+ '''
+ # update href values to point to our proxy
+ soup = BeautifulSoup(resp['data'], 'html.parser')
+
+ # remove <script> elements--Javascript is evil
+ [s.decompose() for s in soup('script')]
+
+ # remove cookie popup
+ [s.decompose() for s in soup.findAll('div', attrs={'class': 'n-messaging-slot'})]
+
+ # remove plugin and legacy embed tags (object, applet, param, embed)
+ obsolete = ['object', 'applet', 'param', 'embed']
+ [[s.decompose() for s in soup(elem)] for elem in obsolete]
+
+ # find and update src, href, srcset, and background key values
+ for elem in soup.findAll(src=True):
+ update_tag(url, prefix, elem, 'src')
+ for elem in soup.findAll(href=True):
+ update_tag(url, prefix, elem, 'href')
+ for elem in soup.findAll(srcset=True):
+ update_tag(url, prefix, elem, 'srcset')
+ for elem in soup.findAll(background=True):
+ update_tag(url, prefix, elem, 'background')
+ return soup
+
+def update_tag(url, prefix, elem, key):
+ '''
+ Update value of element's key to prefix proxy method
+
+ :url: url of the page containing the element
+ :prefix: proxy method to prepend to join()'d resources
+ :elem: bs4 tag whose attribute is being rewritten
+ :key: attribute name to rewrite (src, href, srcset, or background)
+ '''
+ # an element's key value can be unpopulated, ignore such cases
+ if not elem.get(key):
+ return
+
+ # urljoin() w/ page url if the resource url has no scheme (relative or protocol-relative)
+ url_split = list(urllib.parse.urlsplit(elem[key]))
+ if not url_split[0]:
+ elem[key] = urllib.parse.urljoin(url, elem[key])
+
+ # strip extraneous values by space, use first defined resource
+ elem[key] = elem[key].split()[0]
+
+ # prepend resource url w/ proxy fetch method
+ elem[key] = prefix + elem[key]
+
+def update_css(url, prefix, soup=None, data=None):
+ '''
+ Update inline OR included CSS file to prefix proxy method
+
+ :url: url of page from which CSS is included/inline
+ :prefix: proxy method to prepend to join()'d resources
+ :soup: bs4 object (optional)
+ :data: CSS file contents as str (optional)
+ '''
+ # regex objects to match url(), src=, and @import CSS directives
+ _url = re.compile(r'(?P<url>url\s*\()(?P<quote_open>\s*["\']?\s*)' +
+ r'(?P<resource>[^"\']+)' +
+ r'(?P<quote_close>\s*["\']?\s*\))',
+ re.MULTILINE | re.IGNORECASE)
+ _import = re.compile(r'(?P<import>@import)'+
+ r'(?P<quote_open>\s*["\']\s*)' +
+ r'(?P<resource>[^"\']+)' +
+ r'(?P<quote_close>\s*["\'])',
+ re.MULTILINE | re.IGNORECASE)
+ _src = re.compile(r'(?P<src>src\s*=\s*)(?P<quote_open>\s*["\']?\s*)' +
+ r'(?P<resource>[^"\']+)' +
+ r'(?P<quote_close>\s*["\']?)',
+ re.MULTILINE | re.IGNORECASE)
+
+ css_regexes = [_url, _import, _src]
+
+ # re.sub() the directives above, prepending the proxy method to each resource; group 1 is the matched directive (url(, @import, or src=)
+ if data:
+ c = data
+ for reg in css_regexes:
+ c = reg.sub(lambda m: m.group(1) +
+ m.group('quote_open') +
+ prefix +
+ urllib.parse.urljoin(url, m.group('resource')) +
+ m.group('quote_close'), c)
+ return c
+
+ elif soup:
+ style_tags = soup.findAll('style')
+ for css in style_tags:
+ c = css.string
+ for reg in css_regexes:
+ c = reg.sub(lambda m: m.group(1) +
+ m.group('quote_open') +
+ prefix +
+ urllib.parse.urljoin(url, m.group('resource')) +
+ m.group('quote_close'), c)
+ css.string = c
+ return soup
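
A rough sketch of how parser.py and request.py are meant to combine (the prefix value 'http://localhost:5000/fetch/' is an assumed proxy endpoint, not defined in this commit, and the imports assume lib/ is importable from the project root):

from lib import parser, request

url = 'https://www.ft.com/'
prefix = 'http://localhost:5000/fetch/'    # assumed proxy fetch endpoint

resp = request.retrieve(url, headers={'User-Agent': 'Mozilla/5.0'})
if resp['data']:
    soup = parser.update_html(url, prefix, resp)        # rewrite src/href/srcset/background
    soup = parser.update_css(url, prefix, soup=soup)    # rewrite inline <style> blocks
    html = str(soup)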
diff --git a/lib/request.py b/lib/request.py
new file mode 100644
index 0000000..6cdfa57
--- /dev/null
+++ b/lib/request.py
@@ -0,0 +1,70 @@
+import gzip
+import zlib
+import urllib.request
+import urllib.parse
+from io import BytesIO
+from urllib.error import URLError, HTTPError
+from socket import timeout
+
+TIMEOUT = 10 # seconds to wait before killing connection
+MAX_SIZE = 25000000 # maximum content-length of resource (bytes, 25MB default)
+
+def retrieve(url, headers):
+ '''
+ Makes HTTP request to URL and returns response
+
+ Returns dict containing the following:
+ 'url': URL of resource, updated from :url: as redirects are followed
+ 'code': HTTP response code returned by resource
+ 'data': Downloaded resource as bytes if the request succeeded,
+ else None
+ 'meta': Response headers from resource (dict)
+ '''
+ try:
+ conn = urllib.request.Request(
+ url,
+ headers=headers
+ )
+
+ request = urllib.request.urlopen(conn, timeout=TIMEOUT)
+ end_url = request.geturl() # account for redirects
+
+ except HTTPError as err:
+ print('[%s] %s' % (err.code, url))
+ return {'url': url, 'code': err.code, 'data': None, 'meta': None}
+
+ except URLError as err:
+ print('error connecting to url, %s: %s' % (err, url))
+ return {'url': url, 'code': 502, 'data': None, 'meta': None}
+
+ except timeout:
+ print('socket timed out, exceeded %s seconds: %s' % (TIMEOUT, url))
+ return {'url': url, 'code': 408, 'data': None, 'meta': None}
+
+ except Exception as err:
+ print('uncaught exception, %s: %s' % (err, url))
+ return {'url': url, 'code': 500, 'data': None, 'meta': None}
+
+ # fetch headers from resource, lower() them for consistency
+ request_info = dict(request.info())
+ headers = {k.lower(): v for k, v in request_info.items()}
+
+ # ensure size of resource falls within MAX_SIZE before downloading
+ if int(headers.get('content-length') or 0) > MAX_SIZE:
+ print('exceeded MAX_SIZE of %s bytes, skipping: %s' % (MAX_SIZE, url))
+ return {'url': url, 'code': 413, 'data': None, 'meta': None}
+
+ # support gzip and deflate-encoded responses
+ if headers.get('content-encoding') == 'gzip':
+ buff = BytesIO(request.read())
+ gz_f = gzip.GzipFile(fileobj=buff)
+ data = gz_f.read()
+ elif headers.get('content-encoding') == 'deflate':
+ data = zlib.decompress(request.read())
+ else:
+ data = request.read()
+
+ resp_dict = {'url': end_url, 'code': request.getcode(), 'data': data,
+ 'meta': headers}
+
+ return resp_dict
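
As a usage sketch for retrieve() on its own (the URL and headers are illustrative only; the status codes shown are the fallbacks retrieve() returns on failure):

from lib.request import retrieve

resp = retrieve('https://www.ft.com/content/example',    # hypothetical article URL
                headers={'User-Agent': 'Mozilla/5.0', 'Accept-Encoding': 'gzip'})

if resp['data'] is None:
    print('fetch failed with code %s' % resp['code'])    # HTTP error, 408, 413, 500 or 502
else:
    print(resp['meta'].get('content-type'))    # header keys are lower-cased by retrieve()
    body = resp['data']                        # bytes, gzip/deflate already decoded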