aboutsummaryrefslogtreecommitdiff
path: root/lib/request.py
diff options
context:
space:
mode:
Diffstat (limited to 'lib/request.py')
-rw-r--r--lib/request.py70
1 file changed, 70 insertions, 0 deletions
diff --git a/lib/request.py b/lib/request.py
new file mode 100644
index 0000000..6cdfa57
--- /dev/null
+++ b/lib/request.py
@@ -0,0 +1,70 @@
+import gzip
+import zlib
+import urllib.request
+import urllib.parse
+from io import BytesIO
+from urllib.error import URLError, HTTPError
+from socket import timeout
+
TIMEOUT = 10  # seconds to wait before killing connection
MAX_SIZE = 25000000  # maximum content-length of resource (bytes, 25MB default)

def retrieve(url, headers):
    '''
    Makes HTTP request to URL and returns response

    :url: URL of resource to fetch
    :headers: dict of request headers to send with the request

    Returns dict containing the following:
        'url': URL of resource, updated from :url: as redirects are followed
        'code': HTTP response code returned by resource
        'data': downloaded resource as bytes if the request succeeded,
                else None
        'meta': response headers from resource (dict, keys lowercased),
                or None on error
    '''
    try:
        conn = urllib.request.Request(
            url,
            headers=headers
        )

        # pass timeout= so a stalled connection raises socket.timeout
        # (previously omitted, which made the TIMEOUT constant and the
        # `except timeout` handler below dead code)
        request = urllib.request.urlopen(conn, timeout=TIMEOUT)

    except HTTPError as err:
        print('[%s] %s' % (err.code, url))
        return {'url': url, 'code': err.code, 'data': None, 'meta': None}

    except URLError as err:
        print('error connecting to url, %s: %s' % (err, url))
        return {'url': url, 'code': 502, 'data': None, 'meta': None}

    except timeout:
        print('socket timed out, exceeded %s seconds: %s' % (TIMEOUT, url))
        return {'url': url, 'code': 408, 'data': None, 'meta': None}

    except Exception as err:
        # catch-all boundary: report and map to a generic 500 rather than crash
        print('uncaught exception, %s: %s' % (err, url))
        return {'url': url, 'code': 500, 'data': None, 'meta': None}

    # ensure the response is closed even if reading/decompressing raises
    with request:
        end_url = request.geturl()  # account for redirects

        # fetch headers from resource, lower() them for consistency;
        # renamed from `headers` to avoid shadowing the parameter
        resp_headers = {k.lower(): v for k, v in dict(request.info()).items()}

        # ensure size of resource falls within MAX_SIZE before downloading.
        # content-length may be absent (e.g. chunked transfer); default to 0
        # so the missing header doesn't raise TypeError in int()
        if int(resp_headers.get('content-length', 0)) > MAX_SIZE:
            print('exceeded MAX_SIZE of %s bytes, skipping: %s' % (MAX_SIZE, url))
            return {'url': url, 'code': 413, 'data': None, 'meta': None}

        # support gzip and deflate-encoded responses
        encoding = resp_headers.get('content-encoding')
        if encoding == 'gzip':
            data = gzip.GzipFile(fileobj=BytesIO(request.read())).read()
        elif encoding == 'deflate':
            # was misspelled 'defalte', so this branch could never run
            data = zlib.decompress(request.read())
        else:
            data = request.read()

        code = request.getcode()

    return {'url': end_url, 'code': code, 'data': data, 'meta': resp_headers}