import re
import urllib.parse

from bs4 import BeautifulSoup


def update_html(url, prefix, resp):
    '''
    Find and update HTML attributes which may contain external dependencies,
    e.g. <a href="example.com"> -> <a href="http://proxy/fetch/example.com">
    :url: url of the page being rewritten
    :prefix: proxy method to prepend to join()'d resources
    :resp: request.py urllib response object containing data, headers, etc.
    '''
    # parse the response body so resource urls can be pointed at our proxy
    soup = BeautifulSoup(resp['data'], 'html.parser')
    # remove <script> elements -- JavaScript is evil
    for s in soup('script'):
        s.decompose()
    # remove the cookie popup
    for s in soup.find_all('div', attrs={'class': 'n-messaging-slot'}):
        s.decompose()
    # remove deprecated, unsupported tags (object, applet, param, embed)
    for tag in ('object', 'applet', 'param', 'embed'):
        for s in soup(tag):
            s.decompose()
    # find and update src, href, srcset, and background attribute values
    for elem in soup.find_all(src=True):
        update_tag(url, prefix, elem, 'src')
    for elem in soup.find_all(href=True):
        update_tag(url, prefix, elem, 'href')
    for elem in soup.find_all(srcset=True):
        update_tag(url, prefix, elem, 'srcset')
    for elem in soup.find_all(background=True):
        update_tag(url, prefix, elem, 'background')
    return soup
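

# A minimal sanity check for update_html (not part of the original module;
# the page url and '/fetch/' prefix are illustrative assumptions, and resp
# mirrors the dict shape update_html expects from the fetch handler):
def _demo_update_html():
    resp = {'data': '<script>alert(1)</script><img src="/logo.png">'}
    soup = update_html('https://www.ft.com/', '/fetch/', resp)
    assert soup.script is None  # <script> elements were stripped
    assert soup.img['src'] == '/fetch/https://www.ft.com/logo.png'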


def update_tag(url, prefix, elem, key):
    '''
    Update the value of an element's key to point at the proxy
    :url: url of the page on which the element appears
    :prefix: proxy method to prepend to join()'d resources
    :elem: bs4 tag whose attribute is rewritten
    :key: attribute to rewrite (src, href, srcset, or background)
    '''
    # an element's key value can be unpopulated; ignore such cases
    if not elem.get(key):
        return
    # urljoin() with the page url if the resource url is relative
    # or scheme-relative (no scheme specifier)
    url_split = list(urllib.parse.urlsplit(elem[key]))
    if not url_split[0]:
        elem[key] = urllib.parse.urljoin(url, elem[key])
    # strip extraneous values by space, use the first defined resource
    elem[key] = elem[key].split()[0]
    # prepend the resource url with the proxy fetch method
    elem[key] = prefix + elem[key]
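

# Quick illustration of update_tag on a relative href (page url and
# prefix are assumed values): '/world' is joined with the page url,
# then prefixed with the proxy fetch method.
def _demo_update_tag():
    soup = BeautifulSoup('<a href="/world">World</a>', 'html.parser')
    update_tag('https://www.ft.com/', '/fetch/', soup.a, 'href')
    assert soup.a['href'] == '/fetch/https://www.ft.com/world'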


def update_css(url, prefix, soup=None, data=None):
    '''
    Update inline OR included CSS to prefix the proxy method
    :url: url of the page (or stylesheet) from which the CSS comes
    :prefix: proxy method to prepend to join()'d resources
    :soup: bs4 object whose <style> blocks are rewritten (optional)
    :data: CSS file contents as str (optional)
    '''
    # regex objects to match url(), @import, and src= CSS directives;
    # in each pattern, group 1 is the directive itself
    _url = re.compile(r'(?P<url>url\s*\()(?P<quote_open>\s*["\']?\s*)'
                      r'(?P<resource>[^"\']+)'
                      r'(?P<quote_close>\s*["\']?\s*\))',
                      re.MULTILINE | re.IGNORECASE)
    _import = re.compile(r'(?P<import>@import)'
                         r'(?P<quote_open>\s*["\']\s*)'
                         r'(?P<resource>[^"\']+)'
                         r'(?P<quote_close>\s*["\'])',
                         re.MULTILINE | re.IGNORECASE)
    _src = re.compile(r'(?P<src>src\s*=\s*)(?P<quote_open>\s*["\']?\s*)'
                      r'(?P<resource>[^"\']+)'
                      r'(?P<quote_close>\s*["\']?)',
                      re.MULTILINE | re.IGNORECASE)
    css_regexes = [_url, _import, _src]

    def _sub(m):
        # the directive group is named differently per pattern (url,
        # import, src), so reference it by position as group(1)
        return (m.group(1) + m.group('quote_open') + prefix +
                urllib.parse.urljoin(url, m.group('resource')) +
                m.group('quote_close'))

    # re.sub() aforementioned directives, prepend proxy method to resources
    if data:
        for reg in css_regexes:
            data = reg.sub(_sub, data)
        return data
    elif soup:
        for css in soup.find_all('style'):
            # empty <style> tags have css.string == None; skip them
            if not css.string:
                continue
            c = css.string
            for reg in css_regexes:
                c = reg.sub(_sub, c)
            css.string = c
        return soup
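

# Sketch of update_css on raw stylesheet text (url and prefix are assumed
# values): both the @import target and the url() resource get rebased
# onto the stylesheet's url and routed through the proxy.
def _demo_update_css():
    css = '@import "theme.css"; body { background: url(/img/bg.png); }'
    out = update_css('https://www.ft.com/main.css', '/fetch/', data=css)
    assert '@import "/fetch/https://www.ft.com/theme.css"' in out
    assert 'url(/fetch/https://www.ft.com/img/bg.png)' in out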
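

if __name__ == '__main__':
    # run the illustrative checks above
    _demo_update_html()
    _demo_update_tag()
    _demo_update_css()
    print('all demo checks passed')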