import gzip
import zlib
import urllib.request
import urllib.parse
from io import BytesIO
from urllib.error import URLError, HTTPError
from socket import timeout

TIMEOUT = 10         # seconds to wait before killing connection
MAX_SIZE = 25000000  # maximum content-length of resource (bytes, 25MB default)


def retrieve(url, headers):
    '''
    Makes HTTP request to URL and returns response

    :url:     resource to fetch (redirects are followed)
    :headers: dict of request headers to send with the request

    Returns dict containing the following:
        'url':  URL of resource, updated from :url: as redirects are followed
        'code': HTTP response code returned by resource (or a synthetic code
                on error: 502 connect error, 408 timeout, 500 other, 413 too
                large)
        'data': downloaded resource as bytes if successful, else None
        'meta': response headers from resource (dict, keys lowercased), or
                None on error
    '''
    try:
        conn = urllib.request.Request(
            url,
            headers=headers
        )

        # pass TIMEOUT explicitly -- urlopen() never times out on its own,
        # so the `except timeout` handler below could otherwise never fire
        response = urllib.request.urlopen(conn, timeout=TIMEOUT)
        end_url = response.geturl()  # account for redirects

    except HTTPError as err:
        print('[%s] %s' % (err.code, url))
        return {'url': url, 'code': err.code, 'data': None, 'meta': None}

    except URLError as err:
        print('error connecting to url, %s: %s' % (err, url))
        return {'url': url, 'code': 502, 'data': None, 'meta': None}

    except timeout:
        print('socket timed out, exceeded %s seconds: %s' % (TIMEOUT, url))
        return {'url': url, 'code': 408, 'data': None, 'meta': None}

    except Exception as err:
        print('uncaught exception, %s: %s' % (err, url))
        return {'url': url, 'code': 500, 'data': None, 'meta': None}

    # fetch headers from resource, lower() them for consistency; use a new
    # name instead of shadowing the caller-supplied request `headers`
    resp_headers = {k.lower(): v for k, v in dict(response.info()).items()}

    # ensure size of resource falls within MAX_SIZE before downloading;
    # content-length may be absent (e.g. chunked transfer) -- skip the
    # check then rather than crashing on int(None)
    content_length = resp_headers.get('content-length')
    if content_length is not None and int(content_length) > MAX_SIZE:
        print('exceeded MAX_SIZE of %s bytes, skipping: %s' % (MAX_SIZE, url))
        return {'url': url, 'code': 413, 'data': None, 'meta': None}

    # support gzip and deflate-encoded responses
    encoding = resp_headers.get('content-encoding')
    if encoding == 'gzip':
        buff = BytesIO(response.read())
        data = gzip.GzipFile(fileobj=buff).read()
    elif encoding == 'deflate':  # was misspelled 'defalte': branch never ran
        raw = response.read()
        try:
            data = zlib.decompress(raw)
        except zlib.error:
            # some servers send raw deflate with no zlib header
            data = zlib.decompress(raw, -zlib.MAX_WBITS)
    else:
        data = response.read()

    return {'url': end_url, 'code': response.getcode(), 'data': data,
            'meta': resp_headers}