import urllib.parse
import re
from bs4 import BeautifulSoup
def update_html(url, prefix, resp):
    '''
    Find and update HTML attributes which may contain external dependencies.

    NOTE(review): the rest of this docstring (its "e.g." example) and the
    whole body of this function were lost when markup-like text was stripped
    from the file.  Restore the implementation from version control; it is
    deliberately not guessed at here.

    :param url: URL of the page being rewritten
    :param prefix: proxy prefix to prepend to rewritten resources
    :param resp: response object for the page (type not recoverable from
                 this file -- TODO confirm against the caller)
    :raises NotImplementedError: always, until the body is restored
    '''
    raise NotImplementedError(
        'update_html(): implementation was lost from the source file and '
        'must be restored from version control')


# Regex objects matching the url(), @import and src= directives.  Every
# pattern exposes the same four named groups -- url, quote_open, resource,
# quote_close -- which are the groups the substitution below reads.
# NOTE(review): the group names were stripped from the file and are restored
# from the m.group(...) calls; the opening half of the url() pattern was lost
# entirely and is rebuilt by analogy with the src= pattern -- confirm both
# against version control.
_CSS_URL_RE = re.compile(
    r'(?P<url>url\s*\()(?P<quote_open>\s*["\']?\s*)' +
    r'(?P<resource>[^"\']+)' +
    r'(?P<quote_close>\s*["\']?\s*\))',
    re.MULTILINE | re.IGNORECASE)
_CSS_IMPORT_RE = re.compile(
    r'(?P<url>@import)' +
    r'(?P<quote_open>\s*["\']\s*)' +
    r'(?P<resource>[^"\']+)' +
    r'(?P<quote_close>\s*["\'])',
    re.MULTILINE | re.IGNORECASE)
_CSS_SRC_RE = re.compile(
    r'(?P<url>src\s*=\s*)(?P<quote_open>\s*["\']?\s*)' +
    r'(?P<resource>[^"\']+)' +
    r'(?P<quote_close>\s*["\']?)',
    re.MULTILINE | re.IGNORECASE)
# Applied in this order; each pattern runs over the output of the previous.
_CSS_REGEXES = (_CSS_URL_RE, _CSS_IMPORT_RE, _CSS_SRC_RE)


def _proxy_css_resources(css_text, url, prefix):
    '''
    Return css_text with every matched resource rewritten to
    prefix + urljoin(url, resource), keeping the directive and its quotes.
    '''
    def _rewrite(match):
        return (match.group('url') +
                match.group('quote_open') +
                prefix +
                urllib.parse.urljoin(url, match.group('resource')) +
                match.group('quote_close'))

    for regex in _CSS_REGEXES:
        css_text = regex.sub(_rewrite, css_text)
    return css_text


def update_css(url, prefix, soup=None, data=None):
    '''
    Prepend the proxy prefix to resources referenced from CSS.

    NOTE(review): this function's header was lost from the file; the name and
    the soup/data keyword parameters are inferred from the names the body
    uses -- confirm against callers.

    :param url: base URL that relative resources are joined against
    :param prefix: proxy prefix to prepend to each joined resource
    :param soup: parsed document whose <style> tags are rewritten in place
    :param data: raw CSS text; takes precedence over soup when non-empty
    :return: the rewritten text when data is given, the same soup object when
             only soup is given, otherwise None
    '''
    if data:
        return _proxy_css_resources(data, url, prefix)
    if soup:
        for style_tag in soup.find_all('style'):
            css_text = style_tag.string
            # .string is None when the tag is empty or has several children;
            # re.sub() would raise TypeError on it, so leave the tag alone.
            if css_text is None:
                continue
            style_tag.string = _proxy_css_resources(css_text, url, prefix)
        return soup
    return None