Diffstat (limited to 'lib/parser.py')
 lib/parser.py | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+), 0 deletions(-)
diff --git a/lib/parser.py b/lib/parser.py
new file mode 100644
index 0000000..6d0a1c8
--- /dev/null
+++ b/lib/parser.py
@@ -0,0 +1,109 @@
+import urllib.parse
+import re
+from bs4 import BeautifulSoup
+
+def update_html(url, prefix, resp):
+    '''
+    Find and update HTML attributes which may contain external dependencies
+
+    e.g. <a href="example.com"> -> <a href="http://proxy/fetch/example.com">
+
+    :url: url of the page being parsed, used to resolve relative resources
+    :prefix: proxy method to prepend to join()'d resources
+    :resp: request.py urllib response object containing data, headers, etc
+    '''
+    # parse the page so resource attributes can be rewritten to point to our proxy
+    soup = BeautifulSoup(resp['data'], 'html.parser')
+
+    # remove <script> elements--Javascript is evil
+    for s in soup('script'):
+        s.decompose()
+
+    # remove cookie popup (the 'n-messaging-slot' class is site-specific)
+    for s in soup.find_all('div', attrs={'class': 'n-messaging-slot'}):
+        s.decompose()
+
+    # remove deprecated, unsupported tags (object, applet, param, embed)
+    for tag in ('object', 'applet', 'param', 'embed'):
+        for s in soup(tag):
+            s.decompose()
+
+    # find and update src, href, srcset, and background key values
+    for key in ('src', 'href', 'srcset', 'background'):
+        for elem in soup.find_all(**{key: True}):
+            update_tag(url, prefix, elem, key)
+    return soup
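+
+# A minimal usage sketch (hypothetical url and prefix; resp mirrors the
+# dict shape produced by request.py):
+#
+#   resp = {'data': '<a href="/about">about</a>', 'headers': {}}
+#   soup = update_html('https://example.com/', '/fetch/', resp)
+#   str(soup)  # -> '<a href="/fetch/https://example.com/about">about</a>'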
+
+def update_tag(url, prefix, elem, key):
+    '''
+    Update the value of an element's key to prefix the proxy method
+
+    :url: url of the page being parsed, used to resolve relative resources
+    :prefix: proxy method to prepend to join()'d resources
+    :elem: bs4 element whose attribute is being updated
+    :key: attribute name (src, href, srcset, or background)
+    '''
+    # an element's key value can be unpopulated, ignore such cases
+    if not elem.get(key):
+        return
+
+    # skip schemes the proxy cannot usefully fetch (mailto:, javascript:, data:, ...)
+    scheme = urllib.parse.urlsplit(elem[key])[0]
+    if scheme and scheme not in ('http', 'https'):
+        return
+
+    # urljoin() w/ root url if url is relative/absolute (no scheme specifier)
+    if not scheme:
+        elem[key] = urllib.parse.urljoin(url, elem[key])
+
+    # strip extraneous values by space, use first defined resource (for
+    # srcset this keeps the first candidate url and drops its descriptor)
+    elem[key] = elem[key].split()[0]
+
+    # prepend resource url w/ proxy fetch method
+    elem[key] = prefix + elem[key]
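+
+# For example, with url='https://example.com/a/' and prefix='/fetch/'
+# (hypothetical values), <img src="img/logo.png"> becomes
+# <img src="/fetch/https://example.com/a/img/logo.png">.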
+
+def update_css(url, prefix, soup=None, data=None):
+    '''
+    Update inline OR included CSS file to prefix proxy method
+
+    :url: url of page from which CSS is included/inline
+    :prefix: proxy method to prepend to join()'d resources
+    :soup: bs4 object (optional)
+    :data: CSS file contents as str (optional)
+    '''
+    # regex objects to match url(), @import, and src= CSS directives; each
+    # captures its leading text as 'directive' so a single re.sub() callback
+    # below can serve all three patterns
+    _url = re.compile(r'(?P<directive>url\s*\()(?P<quote_open>\s*["\']?\s*)' +
+                      r'(?P<resource>[^"\')]+)' +
+                      r'(?P<quote_close>\s*["\']?\s*\))',
+                      re.MULTILINE | re.IGNORECASE)
+    _import = re.compile(r'(?P<directive>@import)' +
+                         r'(?P<quote_open>\s*["\']\s*)' +
+                         r'(?P<resource>[^"\']+)' +
+                         r'(?P<quote_close>\s*["\'])',
+                         re.MULTILINE | re.IGNORECASE)
+    _src = re.compile(r'(?P<directive>src\s*=\s*)(?P<quote_open>\s*["\']?\s*)' +
+                      r'(?P<resource>[^"\')]+)' +
+                      r'(?P<quote_close>\s*["\']?)',
+                      re.MULTILINE | re.IGNORECASE)
+
+    css_regexes = [_url, _import, _src]
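+    # sample matches: url("img/bg.png"), @import "print.css", and the
+    # src='logo.png' form found in legacy IE filter values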
+
+    # re.sub() aforementioned directives, prepend proxy method to resources
+    if data:
+        c = data
+        for reg in css_regexes:
+            c = reg.sub(lambda m: m.group('directive') +
+                        m.group('quote_open') +
+                        prefix +
+                        urllib.parse.urljoin(url, m.group('resource')) +
+                        m.group('quote_close'), c)
+        return c
+
+    elif soup:
+        for css in soup.find_all('style'):
+            c = css.string
+            # skip <style> tags with no string contents
+            if not c:
+                continue
+            for reg in css_regexes:
+                c = reg.sub(lambda m: m.group('directive') +
+                            m.group('quote_open') +
+                            prefix +
+                            urllib.parse.urljoin(url, m.group('resource')) +
+                            m.group('quote_close'), c)
+            css.string = c
+        return soup
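+
+# Example ('/fetch/' is a hypothetical prefix):
+#
+#   update_css('https://example.com/css/site.css', '/fetch/',
+#              data='body { background: url("../img/bg.png") }')
+#   # -> 'body { background: url("/fetch/https://example.com/img/bg.png") }'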