1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
|
import gzip
import zlib
import urllib.request
import urllib.parse
from io import BytesIO
from urllib.error import URLError, HTTPError
from socket import timeout
# Network limits used by retrieve()
TIMEOUT = 10 # seconds to wait before killing connection
MAX_SIZE = 25000000 # maximum content-length of resource (bytes, 25MB default)
def retrieve(url, headers):
    '''
    Makes HTTP request to URL and returns response

    Parameters:
        url (str): resource to fetch
        headers (dict): request headers to send with the request

    Returns dict containing the following:
        'url': URL of resource, updated from :url: as redirects are followed
               (the input url on failure)
        'code': HTTP response code returned by resource; synthetic codes on
                failure: 502 connection error, 408 socket timeout,
                413 resource too large, 500 uncaught exception
        'data': Downloaded resource as bytes if the request succeeded,
                else None
        'meta': Response headers from resource (dict, keys lower-cased),
                else None
    '''
    try:
        conn = urllib.request.Request(
            url,
            headers=headers
        )
        # pass TIMEOUT so a hung connection is actually killed; previously
        # the constant was unused and the `except timeout` branch was dead
        request = urllib.request.urlopen(conn, timeout=TIMEOUT)
        end_url = request.geturl()  # account for redirects
    except HTTPError as err:
        print('[%s] %s' % (err.code, url))
        return {'url': url, 'code': err.code, 'data': None, 'meta': None}
    except URLError as err:
        print('error connecting to url, %s: %s' % (err, url))
        return {'url': url, 'code': 502, 'data': None, 'meta': None}
    except timeout:
        print('socket timed out, exceeded %s seconds: %s' % (TIMEOUT, url))
        return {'url': url, 'code': 408, 'data': None, 'meta': None}
    except Exception as err:
        print('uncaught exception, %s: %s' % (err, url))
        return {'url': url, 'code': 500, 'data': None, 'meta': None}
    # close the response socket on every exit path (was previously leaked)
    with request:
        # fetch headers from resource, lower() them for consistency;
        # use a new name so the `headers` parameter is not shadowed
        resp_headers = {k.lower(): v for k, v in dict(request.info()).items()}
        # ensure size of resource falls within MAX_SIZE before downloading.
        # content-length may be absent (e.g. chunked transfer); treat as 0
        # instead of crashing on int(None)
        if int(resp_headers.get('content-length') or 0) > MAX_SIZE:
            print('exceeded MAX_SIZE of %s bytes, skipping: %s' % (MAX_SIZE, url))
            return {'url': url, 'code': 413, 'data': None, 'meta': None}
        # support gzip and deflate-encoded responses
        encoding = resp_headers.get('content-encoding')
        if encoding == 'gzip':
            buff = BytesIO(request.read())
            gz_f = gzip.GzipFile(fileobj=buff)
            data = gz_f.read()
        elif encoding == 'deflate':  # fixed typo: was 'defalte', branch never ran
            raw = request.read()
            try:
                data = zlib.decompress(raw)
            except zlib.error:
                # some servers send raw deflate with no zlib header
                data = zlib.decompress(raw, -zlib.MAX_WBITS)
        else:
            data = request.read()
        resp_dict = {'url': end_url, 'code': request.getcode(), 'data': data,
                     'meta': resp_headers}
    return resp_dict
|