diff options
Diffstat (limited to 'lib/parser.py')
-rw-r--r-- | lib/parser.py | 109 |
1 file changed, 109 insertions, 0 deletions
import urllib.parse
import re
from bs4 import BeautifulSoup

# Regex objects matching url(), @import, and src= CSS directives.
# Compiled once at import time rather than on every update_css() call.
# NOTE: each pattern's FIRST group carries a different name ('url',
# 'import', 'src'), so replacement callbacks must use group(1), not a name.
_URL_RE = re.compile(r'(?P<url>url\s*\()(?P<quote_open>\s*["\']?\s*)' +
                     r'(?P<resource>[^"\']+)' +
                     r'(?P<quote_close>\s*["\']?\s*\))',
                     re.MULTILINE | re.IGNORECASE)
_IMPORT_RE = re.compile(r'(?P<import>@import)' +
                        r'(?P<quote_open>\s*["\']\s*)' +
                        r'(?P<resource>[^"\']+)' +
                        r'(?P<quote_close>\s*["\'])',
                        re.MULTILINE | re.IGNORECASE)
_SRC_RE = re.compile(r'(?P<src>src\s*=\s*)(?P<quote_open>\s*["\']?\s*)' +
                     r'(?P<resource>[^"\']+)' +
                     r'(?P<quote_close>\s*["\']?)',
                     re.MULTILINE | re.IGNORECASE)

_CSS_REGEXES = [_URL_RE, _IMPORT_RE, _SRC_RE]


def update_html(url, prefix, resp):
    '''
    Find and update HTML attributes which may contain external dependencies

    e.g. <a href="example.com"> -> <a href="http://proxy/fetch/example.com">

    :url: url of page from which resources are included/inline
    :prefix: proxy method to prepend to join()'d resources
    :resp: request.py urllib response object containing data, headers, etc

    Returns the modified BeautifulSoup document.
    '''
    # update href values to point to our proxy
    soup = BeautifulSoup(resp['data'], 'html.parser')

    # remove <script> elements--Javascript is evil
    for tag in soup('script'):
        tag.decompose()

    # remove cookie popup
    for tag in soup.findAll('div', attrs={'class': 'n-messaging-slot'}):
        tag.decompose()

    # remove deprecated, unsupported tags (object, applet, param, embed)
    for name in ('object', 'applet', 'param', 'embed'):
        for tag in soup(name):
            tag.decompose()

    # find and update src, href, srcset, and background key values
    for key in ('src', 'href', 'srcset', 'background'):
        for elem in soup.findAll(**{key: True}):
            update_tag(url, prefix, elem, key)
    return soup


def update_tag(url, prefix, elem, key):
    '''
    Update value of element's key to prefix proxy method

    :url: url of page from which the resource is referenced
    :prefix: proxy method to prepend to join()'d resources
    :elem: bs4 Tag whose attribute is rewritten in place
    :key: attribute name to rewrite ('src', 'href', 'srcset', 'background')
    '''
    # an element's key value can be unpopulated, ignore such cases
    if not elem.get(key):
        return

    # urljoin() w/ root url if url is relative/absolute (no scheme specifier)
    scheme = urllib.parse.urlsplit(elem[key])[0]
    if not scheme:
        elem[key] = urllib.parse.urljoin(url, elem[key])

    # strip extraneous values by space, use first defined resource
    # NOTE(review): for srcset this drops every candidate after the first
    # URL and its descriptor -- presumably intentional simplification
    elem[key] = elem[key].split()[0]

    # prepend resource url w/ proxy fetch method
    elem[key] = prefix + elem[key]


def update_css(url, prefix, soup=None, data=None):
    '''
    Update inline OR included CSS file to prefix proxy method

    :url: url of page from which CSS is included/inline
    :prefix: proxy method to prepend to join()'d resources
    :soup: bs4 object (optional)
    :data: CSS file contents as str (optional)

    Returns the rewritten CSS string when :data: is given, the modified
    soup when :soup: is given, or None when neither is supplied.
    '''
    def _rewrite(m):
        # BUGFIX: the original used m.group('url'), which raises
        # IndexError for the @import and src= patterns -- their first
        # group is named 'import'/'src'.  group(1) works for all three.
        return (m.group(1) +
                m.group('quote_open') +
                prefix +
                urllib.parse.urljoin(url, m.group('resource')) +
                m.group('quote_close'))

    # re.sub() aforementioned directives, prepend proxy method to resources
    if data:
        c = data
        for reg in _CSS_REGEXES:
            c = reg.sub(_rewrite, c)
        return c

    elif soup:
        for css in soup.findAll('style'):
            c = css.string
            # empty <style> tags have .string == None; re.sub would
            # raise TypeError, so skip them
            if not c:
                continue
            for reg in _CSS_REGEXES:
                c = reg.sub(_rewrite, c)
            css.string = c
        return soup