'''
File: relays.py

Relays class object consisting of relays (list of dict) and onionoo fetch
timestamp
'''

# Provenance: recovered from the patch that created allium/relays.py
# (commit caffd07d85a2a181ecc6705c1198036e030da3d2,
# "change project name to allium, closes #4").

import hashlib
import json
import os
import re
import time
import urllib.request
from shutil import rmtree

from jinja2 import Environment, FileSystemLoader

import config
import countries

# Directory holding this module; templates and the timestamp file live next
# to it.
ABS_PATH = os.path.dirname(os.path.abspath(__file__))

# NOTE(review): autoescape is not enabled here, so values coming from onionoo
# are only escaped if the templates do so themselves -- confirm against the
# templates before rendering untrusted fields.
ENV = Environment(loader=FileSystemLoader(os.path.join(ABS_PATH, 'templates')),
                  trim_blocks=True, lstrip_blocks=True)


def hash_filter(value, hash_type='md5'):
    '''
    Custom hash filter for jinja; defaults to "md5" if no type specified

    :param value: string to be hashed (encoded as UTF-8 before hashing)
    :param hash_type: name of a hash constructor exposed by hashlib
    :return: computed hash as a hexadecimal string
    :raises AttributeError: if hashlib has no attribute named :hash_type:
    '''
    hash_func = getattr(hashlib, hash_type, None)
    if not hash_func:
        raise AttributeError(
            'No hashing function named {hname}'.format(hname=hash_type)
        )

    return hash_func(value.encode('utf-8')).hexdigest()


ENV.filters['hash'] = hash_filter


class Relays:
    '''
    Relay class consisting of relays (list of dict) and onionoo fetch timestamp

    :url: onionoo details URL taken from config.CONFIG['onionoo_url']
    :ts_file: absolute path to timestamp file used in setting If-Modified-Since
    :json: relay listings stored as a list of dict, derived from onionoo JSON
    :timestamp: timestamp of onionoo fetch
    '''
    def __init__(self):
        self.url = config.CONFIG['onionoo_url']
        self.ts_file = os.path.join(ABS_PATH, "timestamp")
        self.json = self._fetch_onionoo_details()
        self.timestamp = self._write_timestamp()

        # Order matters: bandwidth must be present before sorting, and the
        # platform must be trimmed before relays are grouped by platform.
        self._fix_missing_observed_bandwidth()
        self._sort_by_bandwidth()
        self._trim_platform()
        self._categorize()

    def _fetch_onionoo_details(self):
        '''
        Make request to onionoo to retrieve the details document and return
        the decoded JSON response

        If a timestamp file from a previous run exists, its content is sent
        as the If-Modified-Since header.

        NOTE(review): urllib reports non-2xx answers as
        urllib.error.HTTPError, which presumably includes "304 Not Modified";
        that exception is not handled here and propagates to the caller.

        :return: parsed JSON document (dict)
        '''
        headers = {}
        if os.path.isfile(self.ts_file):
            with open(self.ts_file, 'r', encoding='utf8') as ts_file:
                # strip stray whitespace/newlines, which are not valid inside
                # an HTTP header value
                prev_timestamp = ts_file.read().strip()
            if prev_timestamp:
                headers['If-Modified-Since'] = prev_timestamp

        conn = urllib.request.Request(self.url, headers=headers)

        # the context manager closes the connection once the body is read
        with urllib.request.urlopen(conn) as response:
            api_response = response.read()

        return json.loads(api_response.decode('utf-8'))

    def _trim_platform(self):
        '''
        Trim platform to retain base operating system without version number or
        unnecessary classification which could affect sorting

        e.g. "Tor 0.3.4.9 on Linux" -> "Linux"

        Relays without a platform value, or whose platform does not contain
        " on ", are left untouched.
        '''
        for relay in self.json['relays']:
            platform = relay.get('platform')
            if not platform:
                continue
            _, separator, os_part = platform.partition(' on ')
            if not separator:
                continue
            base_os = os_part.split(' ')[0]
            relay['platform'] = base_os.split('/')[-1]  # GNU/*

    def _fix_missing_observed_bandwidth(self):
        '''
        Set the observed_bandwidth parameter value for any relay missing the
        parameter to 0; the observed_bandwidth parameter is (apparently)
        optional, I hadn't run into an instance of it missing until 2019-10-03

        "[...] Missing if router descriptor containing this information cannot be
        found."
        --https://metrics.torproject.org/onionoo.html#details_relay_observed_bandwidth

        '''
        for relay in self.json['relays']:
            if not relay.get('observed_bandwidth'):
                relay['observed_bandwidth'] = 0

    def _sort_by_bandwidth(self):
        '''
        Sort full JSON list by highest observed_bandwidth, retain this order
        during subsequent sorting (country, AS, etc)
        '''
        self.json['relays'].sort(key=lambda x: x['observed_bandwidth'],
                                 reverse=True)

    def _write_timestamp(self):
        '''
        Store encoded timestamp in a file to retain time of last request, passed
        to onionoo via If-Modified-Since header during fetch() if exists

        The file is only written when self.json is not None.

        :return: current time formatted as an HTTP date string (GMT)
        '''
        timestamp = time.time()
        f_timestamp = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                    time.gmtime(timestamp))
        if self.json is not None:
            with open(self.ts_file, 'w', encoding='utf8') as ts_file:
                ts_file.write(f_timestamp)

        return f_timestamp

    def _sort(self, relay, idx, k, v):
        '''
        Populate self.json['sorted'] dictionary with values from :relay:

        Values that are empty or contain anything other than ASCII letters,
        digits, "_" or "-" are ignored; :v: is later used as a directory name
        by write_pages_by_key().

        :relay: relay from which values are derived
        :idx: index at which the relay can be found in self.json['relays']
        :k: the name of the key to use in self.json['sorted']
        :v: the name of the subkey to use in self.json['sorted'][k]
        '''
        # fullmatch rather than match with "$": "$" would also accept a value
        # that ends in a newline
        if not v or not re.fullmatch(r'[A-Za-z0-9_-]+', v):
            return

        by_key = self.json['sorted'].setdefault(k, dict())
        if v not in by_key:
            by_key[v] = {
                'relays': list(),
                'bandwidth': 0,
                'exit_count': 0,
                'middle_count': 0
            }
        entry = by_key[v]

        entry['relays'].append(idx)
        entry['bandwidth'] += relay['observed_bandwidth']
        # a relay without a flags list is counted as a middle relay
        if 'Exit' in relay.get('flags', ()):
            entry['exit_count'] += 1
        else:
            entry['middle_count'] += 1

        # string comparison must use ==; "is" tests object identity
        if k == 'as':
            entry['country'] = relay.get('country')
            # NOTE(review): country_name is filled from the 'country' field,
            # as in the original code -- confirm this is intended
            entry['country_name'] = relay.get('country')
            entry['as_name'] = relay.get('as_name')

        if k == 'family':
            entry['contact'] = relay.get('contact')

        # update the first_seen parameter to always contain the oldest
        # relay's first_seen date
        first_seen = relay['first_seen']
        if not entry.get('first_seen') or entry['first_seen'] > first_seen:
            entry['first_seen'] = first_seen

    def _categorize(self):
        '''
        Iterate over self.json['relays'] set and call self._sort() against
        discovered relays with attributes we use to generate static sets

        Resets self.json['sorted'] before grouping.
        '''
        self.json['sorted'] = dict()
        for idx, relay in enumerate(self.json['relays']):
            for key in ('as', 'country', 'platform'):
                self._sort(relay, idx, key, relay.get(key))

            for flag in relay.get('flags', ()):
                self._sort(relay, idx, 'flag', flag)

            # only relays whose effective family has more than one member are
            # grouped by family
            family = relay.get('effective_family', ())
            if len(family) > 1:
                for member in family:
                    self._sort(relay, idx, 'family', member)

            # group by the date part of first_seen (text before the first
            # space)
            self._sort(relay, idx, 'first_seen',
                       relay['first_seen'].split(' ')[0])

            # contact strings are grouped by their md5 hex digest
            c_str = relay.get('contact', '').encode('utf-8')
            c_hash = hashlib.md5(c_str).hexdigest()
            self._sort(relay, idx, 'contact', c_hash)

    def create_output_dir(self):
        '''
        Ensure config:output_root exists (required for write functions)
        '''
        os.makedirs(config.CONFIG['output_root'], exist_ok=True)

    def write_misc(self, template, path, path_prefix='../', sorted_by=None,
                   reverse=True, is_index=False):
        '''
        Render and write unsorted HTML listings to disk

        :template: jinja template name
        :path_prefix: path to prefix other docs/includes
        :path: path to generate HTML document
        :sorted_by: key to sort by, used in family and networks pages
        :reverse: passed to sort() function in family and networks pages
        :is_index: whether document is main index listing, limits list to 500
        '''
        template = ENV.get_template(template)
        # the template reads the relays to list from relays.json['relay_subset']
        self.json['relay_subset'] = self.json['relays']
        template_render = template.render(
            relays=self,
            sorted_by=sorted_by,
            reverse=reverse,
            is_index=is_index,
            path_prefix=path_prefix
        )
        output = os.path.join(config.CONFIG['output_root'], path)
        os.makedirs(os.path.dirname(output), exist_ok=True)
        with open(output, 'w', encoding='utf8') as html:
            html.write(template_render)

    def write_pages_by_key(self, k):
        '''
        Render and write HTML listings to disk sorted by :k:

        Any existing output directory for :k: is removed first; one
        <output_root>/<k>/<value>/index.html is then written per value found
        in self.json['sorted'][k]. Nothing is written when no relay was
        grouped under :k:.

        :k: category name; also the template name without ".html"
        '''
        template = ENV.get_template(k + '.html')
        output_path = os.path.join(config.CONFIG['output_root'], k)
        if os.path.exists(output_path):
            rmtree(output_path)

        for v, i in self.json['sorted'].get(k, {}).items():
            members = [self.json['relays'][m_relay] for m_relay in i['relays']]
            # flag directories are lower-cased
            dir_name = v.lower() if k == 'flag' else v
            dir_path = os.path.join(output_path, dir_name)
            # exist_ok: two flags differing only in case share a directory
            os.makedirs(dir_path, exist_ok=True)
            self.json['relay_subset'] = members
            rendered = template.render(
                relays=self,
                bandwidth=round(i['bandwidth'] / 1000000, 2),
                exit_count=i['exit_count'],
                middle_count=i['middle_count'],
                is_index=False,
                path_prefix='../../',
                key=k,
                value=v,
                sp_countries=countries.THE_PREFIXED
            )
            with open(os.path.join(dir_path, 'index.html'), 'w',
                      encoding='utf8') as html:
                html.write(rendered)

    def write_relay_info(self):
        '''
        Render and write per-relay HTML info documents to disk

        The <output_root>/relay directory is recreated from scratch; relays
        whose fingerprint is not purely alphanumeric are skipped because the
        fingerprint is used as the file name.
        '''
        relay_list = self.json['relays']
        template = ENV.get_template('relay-info.html')
        output_path = os.path.join(config.CONFIG['output_root'], 'relay')
        if os.path.exists(output_path):
            rmtree(output_path)
        os.makedirs(output_path)

        for relay in relay_list:
            if not relay['fingerprint'].isalnum():
                continue
            rendered = template.render(
                relay=relay,
                path_prefix='../',
                relays=self
            )
            with open(os.path.join(output_path, '%s.html' % relay['fingerprint']),
                      'w', encoding='utf8') as html:
                html.write(rendered)