import gzip
import zlib
import urllib.request
import urllib.parse
from io import BytesIO
from urllib.error import URLError, HTTPError
from socket import timeout

TIMEOUT = 10  # seconds to wait before killing connection
MAX_SIZE = 25000000  # maximum content-length of resource (bytes, 25MB default)


def retrieve(url, headers):
    '''
    Makes HTTP request to URL and returns response

    Returns dict containing the following:
        'url':  URL of resource, updated from :url: as redirects are followed
        'code': HTTP response code returned by resource (error code on failure)
        'data': Downloaded resource as bytes if request succeeded, else None
        'meta': Response headers from resource (dict, keys lowercased)
    '''
    try:
        conn = urllib.request.Request(
            url,
            headers=headers
        )
        # BUG FIX: TIMEOUT was defined (and reported in the timeout handler
        # below) but never actually passed to urlopen, so a stalled server
        # would hang indefinitely. Apply it here.
        request = urllib.request.urlopen(conn, timeout=TIMEOUT)
        end_url = request.geturl()  # account for redirects
    except HTTPError as err:
        print('[%s] %s' % (err.code, url))
        return {'url': url, 'code': err.code, 'data': None, 'meta': None}
    except URLError as err:
        print('error connecting to url, %s: %s' % (err, url))
        return {'url': url, 'code': 502, 'data': None, 'meta': None}
    except timeout:
        print('socket timed out, exceeded %s seconds: %s' % (TIMEOUT, url))
        return {'url': url, 'code': 408, 'data': None, 'meta': None}
    except Exception as err:
        print('uncaught exception, %s: %s' % (err, url))
        return {'url': url, 'code': 500, 'data': None, 'meta': None}

    # BUG FIX: the response object was never closed on any path (socket
    # leak); `with` guarantees release on every return below.
    with request:
        # fetch headers from resource, lower() them for consistency
        request_info = dict(request.info())
        headers = {k.lower(): v for k, v in request_info.items()}

        # ensure size of resource falls below MAX_SIZE before downloading
        # TODO: support requests for resources which lack content-length (streams)
        if not headers.get('content-length'):
            print('no content-length provided, blocking: %s' % (url))
            return {'url': url, 'code': 413, 'data': None, 'meta': None}
        if int(headers['content-length']) > MAX_SIZE:
            print('exceeded MAX_SIZE of %s bytes, skipping: %s' % (MAX_SIZE, url))
            return {'url': url, 'code': 413, 'data': None, 'meta': None}

        # support gzip and deflate-encoded responses
        encoding = headers.get('content-encoding')
        if encoding == 'gzip':
            buff = BytesIO(request.read())
            gz_f = gzip.GzipFile(fileobj=buff)
            data = gz_f.read()
        elif encoding == 'deflate':  # BUG FIX: was misspelled 'defalte', branch unreachable
            raw = request.read()
            try:
                data = zlib.decompress(raw)
            except zlib.error:
                # some servers send raw deflate with no zlib header;
                # negative wbits tells zlib to skip the header check
                data = zlib.decompress(raw, -zlib.MAX_WBITS)
        else:
            data = request.read()

        resp_dict = {'url': end_url, 'code': request.getcode(),
                     'data': data, 'meta': headers}
        return resp_dict