1 files changed, 67 insertions, 35 deletions
diff --git a/scripts/maint/updateFallbackDirs.py b/scripts/maint/updateFallbackDirs.py
index 53676c08fa..5c9b320ee4 100755
--- a/scripts/maint/updateFallbackDirs.py
+++ b/scripts/maint/updateFallbackDirs.py
@@ -27,7 +27,7 @@ import dateutil.parser
 #from bson import json_util
 
 import logging
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.DEBUG)
 
 ## Top-Level Configuration
 
@@ -91,7 +91,7 @@ PERMITTED_BADEXIT = .00
 FALLBACK_PROPORTION_OF_GUARDS = None if OUTPUT_CANDIDATES else 0.2
 
 # Limit the number of fallbacks (eliminating lowest by weight)
-MAX_FALLBACK_COUNT = 500
+MAX_FALLBACK_COUNT = None if OUTPUT_CANDIDATES else 500
 # Emit a C #error if the number of fallbacks is below
 MIN_FALLBACK_COUNT = 100
 
@@ -266,6 +266,17 @@ def load_json_from_file(json_file_name):
 
 ## OnionOO Functions
 
+def datestr_to_datetime(datestr):
+  # Parse date strings like: Fri, 02 Oct 2015 13:34:14 GMT (None is allowed)
+  if datestr is not None:
+    dt = dateutil.parser.parse(datestr)
+  else:
+    # No date string (never modified) - use the start of the epoch
+    dt = datetime.datetime.utcfromtimestamp(0)
+  # strip any timezone (without converting), so the result is always naive
+  dt = dt.replace(tzinfo=None)
+  return dt
+
 def onionoo_fetch(what, **kwargs):
   params = kwargs
   params['type'] = 'relay'
@@ -304,37 +315,42 @@ def onionoo_fetch(what, **kwargs):
     if last_mod_date is not None:
       request.add_header('If-modified-since', last_mod_date)
 
-    # Parse datetimes like: Fri, 02 Oct 2015 13:34:14 GMT
-    if last_mod_date is not None:
-      last_mod = dateutil.parser.parse(last_mod_date)
-    else:
-      # Never modified - use start of epoch
-      last_mod = datetime.datetime.utcfromtimestamp(0)
-    # strip any timezone out (in case they're supported in future)
-    last_mod = last_mod.replace(tzinfo=None)
+    # Parse last modified date
+    last_mod = datestr_to_datetime(last_mod_date)
+
+    # Data older than this is too stale to be useful, whether cached or new
+    # Onionoo / Globe used to use 6 hours, but we can afford a day
+    required_freshness = datetime.datetime.utcnow()
+    # strip any timezone out (to match dateutil.parser)
+    required_freshness = required_freshness.replace(tzinfo=None)
+    required_freshness -= datetime.timedelta(hours=24)
 
+    # Make the OnionOO request
     response_code = 0
     try:
       response = urllib2.urlopen(request)
       response_code = response.getcode()
     except urllib2.HTTPError, error:
       response_code = error.code
-      # strip any timezone out (to match dateutil.parser)
-      six_hours_ago = datetime.datetime.utcnow()
-      six_hours_ago = six_hours_ago.replace(tzinfo=None)
-      six_hours_ago -= datetime.timedelta(hours=6)
-      # Not Modified and still recent enough to be useful (Globe uses 6 hours)
-      if response_code == 304:
-        if last_mod < six_hours_ago:
-          raise Exception("Outdated data from " + url + ": "
-                          + str(error.code) + ": " + error.reason)
-        else:
-          pass
+      if response_code == 304: # not modified
+        pass
       else:
         raise Exception("Could not get " + url + ": "
                         + str(error.code) + ": " + error.reason)
 
     if response_code == 200: # OK
+      last_mod = datestr_to_datetime(response.info().get('Last-Modified'))
+
+    # Check for freshness
+    if last_mod < required_freshness:
+      if last_mod_date is not None:
+        date_message = "Outdated data: last updated " + last_mod_date
+      else:
+        date_message = "No data: never downloaded "
+      raise Exception(date_message + " from " + url)
+
+    # Process the data
+    if response_code == 200: # OK
 
       response_json = load_possibly_compressed_response_json(response)
 
@@ -579,9 +595,15 @@ class Candidate(object):
                      %(p, which))
       for v in reversed(h['values']):
         if (this_ts <= newest):
+          agt1 = now - this_ts
+          agt2 = interval
+          agetmp1 = (agt1.microseconds + (agt1.seconds + agt1.days * 24 * 3600)
+                     * 10**6) / 10**6
+          agetmp2 = (agt2.microseconds + (agt2.seconds + agt2.days * 24 * 3600)
+                     * 10**6) / 10**6
           generic_history.append(
-            { 'age': (now - this_ts).total_seconds(),
-              'length': interval.total_seconds(),
+            { 'age': agetmp1,
+              'length': agetmp2,
               'value': v
             })
           newest = this_ts
@@ -599,6 +621,8 @@ class Candidate(object):
   def _avg_generic_history(generic_history):
     a = []
     for i in generic_history:
+      if i['age'] > (ADDRESS_AND_PORT_STABLE_DAYS * 24 * 3600):
+        continue
       if (i['length'] is not None
           and i['age'] is not None
           and i['value'] is not None):
@@ -608,7 +632,11 @@ class Candidate(object):
     sv = math.fsum(map(lambda x: x[0], a))
     sw = math.fsum(map(lambda x: x[1], a))
 
-    return sv/sw
+    if sw == 0.0:
+      svw = 0.0
+    else:
+      svw = sv/sw
+    return svw
 
   def _add_generic_history(self, history):
     periods = r['read_history'].keys()
@@ -659,10 +687,6 @@ class Candidate(object):
       logging.debug('%s not a candidate: running avg too low (%lf)',
                     self._fpr, self._running)
       return False
-    if self._guard < CUTOFF_GUARD:
-      logging.debug('%s not a candidate: guard avg too low (%lf)',
-                    self._fpr, self._guard)
-      return False
     if self._v2dir < CUTOFF_V2DIR:
       logging.debug('%s not a candidate: v2dir avg too low (%lf)',
                     self._fpr, self._v2dir)
@@ -675,6 +699,10 @@ class Candidate(object):
     if (not self._data.has_key('recommended_version')
         or not self._data['recommended_version']):
       return False
+    if self._guard < CUTOFF_GUARD:
+      logging.debug('%s not a candidate: guard avg too low (%lf)',
+                    self._fpr, self._guard)
+      return False
     return True
 
   def is_in_whitelist(self, relaylist):
@@ -998,7 +1026,8 @@ class CandidateList(dict):
   # starting with the lowest-weighted fallbacks
   # total_weight should be recalculated after calling this
   def exclude_excess_fallbacks(self):
-    self.fallbacks = self.fallbacks[:MAX_FALLBACK_COUNT]
+    if MAX_FALLBACK_COUNT is not None: # None means no limit
+      self.fallbacks = self.fallbacks[:MAX_FALLBACK_COUNT]
 
   # Clamp the weight of all fallbacks to MAX_WEIGHT_FRACTION * total_weight
   # fallbacks are kept sorted, but since excessive weights are reduced to
@@ -1069,15 +1098,15 @@ class CandidateList(dict):
     else:
       fallback_proportion = ' (%d * %f)'%(guard_count,
                                           FALLBACK_PROPORTION_OF_GUARDS)
-    s += 'Final Count:  %d (Eligible %d, Usable %d, Target %d%s, '%(
+    s += 'Final Count:  %d (Eligible %d, Usable %d, Target %d%s'%(
             min(max_count, fallback_count),
             eligible_count,
             fallback_count,
             target_count,
             fallback_proportion)
-    s += 'Clamped to %d)'%(
-            MAX_FALLBACK_COUNT)
-    s += '\n'
+    if MAX_FALLBACK_COUNT is not None:
+      s += ', Clamped to %d'%(MAX_FALLBACK_COUNT)
+    s += ')\n'
     if fallback_count < MIN_FALLBACK_COUNT:
       s += '*/'
       s += '\n'
@@ -1147,13 +1176,16 @@ def list_fallbacks():
 
   guard_count = candidates.count_guards()
   if FALLBACK_PROPORTION_OF_GUARDS is None:
-    target_count = MAX_FALLBACK_COUNT
+    target_count = guard_count
   else:
     target_count = int(guard_count * FALLBACK_PROPORTION_OF_GUARDS)
   # the maximum number of fallbacks is the least of:
   # - the target fallback count (FALLBACK_PROPORTION_OF_GUARDS * guard count)
   # - the maximum fallback count (MAX_FALLBACK_COUNT)
-  max_count = min(target_count, MAX_FALLBACK_COUNT)
+  if MAX_FALLBACK_COUNT is None:
+    max_count = guard_count
+  else:
+    max_count = min(target_count, MAX_FALLBACK_COUNT)
 
   candidates.compute_fallbacks()