Merge remote-tracking branch 'teor/ticket24838'

author: Nick Mathewson <nickm@torproject.org> 2018-12-11 10:04:16 -0500
committer: Nick Mathewson <nickm@torproject.org> 2018-12-11 10:04:16 -0500
commit: 91be6a77e9f16a9817b20c5b0f5ba3df3b54b31f (patch)
tree: e9cdf43b104879c2a3d30cc02dd83e69b86582dc
parent: 1983426073effeb2b35e0f11f4c02e2526c804a1 (diff)
parent: 6bc5c06dc25630c5a5f97f1da2af29f88683d243 (diff)
download: tor-91be6a77e9f16a9817b20c5b0f5ba3df3b54b31f.tar.gz
tor-91be6a77e9f16a9817b20c5b0f5ba3df3b54b31f.zip
4 files changed, 248 insertions, 101 deletions
diff --git a/changes/ticket24838 b/changes/ticket24838
new file mode 100644
index 0000000000..d068e31b91
--- /dev/null
+++ b/changes/ticket24838
@@ -0,0 +1,6 @@
+  o Minor features (fallback directory mirrors):
+    - Accept relays that are a fuzzy match to a fallback whitelist entry.
+      If a relay matches at least one fingerprint, IPv4 address, or IPv6
+      address in the fallback whitelist, it can become a fallback. This
+      reduces the work required to keep the list up to date.
+      Closes ticket 24838.
diff --git a/changes/ticket28768 b/changes/ticket28768
new file mode 100644
index 0000000000..27d90febc8
--- /dev/null
+++ b/changes/ticket28768
@@ -0,0 +1,4 @@
+  o Minor features (fallback directory mirrors):
+    - Accept fallbacks that deliver reasonably live consensuses.
+      (Consensuses that will become valid less than 24 hours in the future,
+      or that expired less than 24 hours ago.) Closes ticket 28768.
diff --git a/scripts/maint/fallback.whitelist b/scripts/maint/fallback.whitelist
index 23c7be58a5..60d3e7bb85 100644
--- a/scripts/maint/fallback.whitelist
+++ b/scripts/maint/fallback.whitelist
@@ -1,34 +1,23 @@
 # updateFallbackDirs.py directory mirror whitelist
 #
-# Format:
-# IPv4:DirPort orport=<ORPort> id=<ID> [ ipv6=<IPv6>:<IPv6 ORPort> ]
-# or use:
-# scripts/maint/generateFallbackDirLine.py fingerprint ...
+# At least one of these keys must match for a directory mirror to be included
+# in the fallback list:
+#   id
+#   ipv4
+#   ipv6
+# The ports and nickname are ignored. Missing or extra ipv6 addresses
+# are ignored.
 #
-# All attributes must match for the directory mirror to be included.
-# If the fallback has an ipv6 key, the whitelist line must also have
-# it, and vice versa, otherwise they don't match.
-# (The blacklist overrides the whitelist.)
-
-# To replace this list with the hard-coded fallback list (for testing), use
-# a command similar to:
-#   cat src/app/config/fallback_dirs.inc | grep \" | grep -v weight | \
-#   tr -d '\n' | \
-#   sed 's/"" / /g' | sed 's/""/"/g' | tr \" '\n' | grep -v '^$' \
-#   > scripts/maint/fallback.whitelist
+# The latest relay details from Onionoo are included in the generated list.
 #
-# When testing before a release, exclusions due to changed details will result
-# in a warning, unless the IPv4 address or port change happened recently.
-# Then it is only logged at info level, as part of the eligibility check.
-# Exclusions due to stability also are only shown at info level.
+# To check the hard-coded fallback list (for testing), use:
+# $ updateFallbackDirs.py check_existing
 #
-# Add the number of selected, slow, and excluded relays, and compare that to
-# the number of hard-coded relays. If it's less, use info-level logs to find
-# out why each of the missing relays was excluded.
-
 # If a relay operator wants their relay to be a FallbackDir,
 # enter the following information here:
-# <IPv4>:<DirPort> orport=<ORPort> id=<ID> [ ipv6=<IPv6>:<IPv6 ORPort> ]
+# <IPv4>:<DirPort> orport=<ORPort> id=<ID> ( ipv6=[<IPv6>]:<IPv6 ORPort> )?
+# or use:
+# scripts/maint/generateFallbackDirLine.py fingerprint ...
 
 # https://lists.torproject.org/pipermail/tor-relays/2015-December/008362.html
 # https://trac.torproject.org/projects/tor/ticket/22321#comment:22
diff --git a/scripts/maint/updateFallbackDirs.py b/scripts/maint/updateFallbackDirs.py
index 0ea3992d8f..14372d0e83 100755
--- a/scripts/maint/updateFallbackDirs.py
+++ b/scripts/maint/updateFallbackDirs.py
@@ -18,8 +18,8 @@
 # Optionally uses ipaddress (python 3 builtin) or py2-ipaddress (package)
 # for netblock analysis.
 #
-# Then read the logs to make sure the fallbacks aren't dominated by a single
-# netblock or port.
+# After running this script, read the logs to make sure the fallbacks aren't
+# dominated by a single netblock or port.
 
 # Script by weasel, April 2015
 # Portions by gsathya & karsten, 2013
@@ -39,8 +39,6 @@ import urllib
 import urllib2
 import hashlib
 import dateutil.parser
-# bson_lazy provides bson
-#from bson import json_util
 import copy
 import re
 
@@ -100,19 +98,29 @@ MUST_BE_RUNNING_NOW = (PERFORM_IPV4_DIRPORT_CHECKS
 # Clients have been using microdesc consensuses by default for a while now
 DOWNLOAD_MICRODESC_CONSENSUS = True
 
-# If a relay delivers an expired consensus, if it expired less than this many
-# seconds ago, we still allow the relay. This should never be less than -90,
-# as all directory mirrors should have downloaded a consensus 90 minutes
-# before it expires. It should never be more than 24 hours, because clients
-# reject consensuses that are older than REASONABLY_LIVE_TIME.
-# For the consensus expiry check to be accurate, the machine running this
-# script needs an accurate clock.
+# Accept a relay as a fallback if the consensus it delivers is not yet valid
+# or has expired, as long as it is within this many seconds of being valid.
+# For the consensus expiry check to be accurate, the machine running this
+# script needs an accurate clock.
 #
-# Relays on 0.3.0 and later return a 404 when they are about to serve an
-# expired consensus. This makes them fail the download check.
-# We use a tolerance of 0, so that 0.2.x series relays also fail the download
-# check if they serve an expired consensus.
-CONSENSUS_EXPIRY_TOLERANCE = 0
+# Relays on 0.3.0 and later return a 404 when they are about to serve a
+# consensus that expired more than 24 hours ago. 0.2.9 and earlier relays
+# will serve consensuses that are very old.
+#
+# Relays on 0.3.5.6-rc? and later return a 404 when they are about to serve a
+# consensus that will become valid more than 24 hours in the future. Older
+# relays don't serve future consensuses.
+#
+# A 404 makes relays fail the download check. We use a tolerance of 24 hours,
+# so that 0.2.9 relays also fail the download check if they serve a consensus
+# that is not reasonably live.
+#
+# REASONABLY_LIVE_TIME should never be more than Tor's REASONABLY_LIVE_TIME
+# (24 hours), because clients reject consensuses that are older than that.
+# Clients on 0.3.5.5-alpha? and earlier also won't select guards from
+# consensuses that have expired, but can bootstrap if they already have guards
+# in their state file.
+REASONABLY_LIVE_TIME = 24*60*60
 
 # Output fallback name, flags, bandwidth, and ContactInfo in a C comment?
 OUTPUT_COMMENTS = True if OUTPUT_CANDIDATES else False
@@ -912,61 +920,181 @@ class Candidate(object):
       return False
     return True
 
-  def is_in_whitelist(self, relaylist):
-    """ A fallback matches if each key in the whitelist line matches:
+  def id_matches(self, id, exact=False):
+    """ Does this fallback's id match id?
+        exact is ignored. """
+    return self._fpr == id
+
+  def ipv4_addr_matches(self, ipv4_addr, exact=False):
+    """ Does this fallback's IPv4 address match ipv4_addr?
+        exact is ignored. """
+    return self.dirip == ipv4_addr
+
+  def ipv4_dirport_matches(self, ipv4_dirport, exact=False):
+    """ Does this fallback's IPv4 dirport match ipv4_dirport?
+        If exact is False, always return True. """
+    if exact:
+      return self.dirport == int(ipv4_dirport)
+    else:
+      return True
+
+  def ipv4_and_dirport_matches(self, ipv4_addr, ipv4_dirport, exact=False):
+    """ Does this fallback's IPv4 address match ipv4_addr?
+        If exact is True, also check ipv4_dirport. """
+    ipv4_match = self.ipv4_addr_matches(ipv4_addr, exact=exact)
+    if exact:
+      return ipv4_match and self.ipv4_dirport_matches(ipv4_dirport,
+                                                      exact=exact)
+    else:
+      return ipv4_match
+
+  def ipv4_orport_matches(self, ipv4_orport, exact=False):
+    """ Does this fallback's IPv4 orport match ipv4_orport?
+        If exact is False, always return True. """
+    if exact:
+      return self.orport == int(ipv4_orport)
+    else:
+      return True
+
+  def ipv4_and_orport_matches(self, ipv4_addr, ipv4_orport, exact=False):
+    """ Does this fallback's IPv4 address match ipv4_addr?
+        If exact is True, also check ipv4_orport. """
+    ipv4_match = self.ipv4_addr_matches(ipv4_addr, exact=exact)
+    if exact:
+      return ipv4_match and self.ipv4_orport_matches(ipv4_orport,
+                                                     exact=exact)
+    else:
+      return ipv4_match
+
+  def ipv6_addr_matches(self, ipv6_addr, exact=False):
+    """ Does this fallback's IPv6 address match ipv6_addr?
+        Both addresses must be present to match.
+        exact is ignored. """
+    if self.has_ipv6() and ipv6_addr is not None:
+      # Check that we have a bracketed IPv6 address without a port
+      assert(ipv6_addr.startswith('[') and ipv6_addr.endswith(']'))
+      return self.ipv6addr == ipv6_addr
+    else:
+      return False
+
+  def ipv6_orport_matches(self, ipv6_orport, exact=False):
+    """ Does this fallback's IPv6 orport match ipv6_orport?
+        Both ports must be present to match.
+        If exact is False, always return True. """
+    if exact:
+      return (self.has_ipv6() and ipv6_orport is not None and
+              self.ipv6orport == int(ipv6_orport))
+    else:
+      return True
+
+  def ipv6_and_orport_matches(self, ipv6_addr, ipv6_orport, exact=False):
+    """ Does this fallback's IPv6 address match ipv6_addr?
+        If exact is True, also check ipv6_orport. """
+    ipv6_match = self.ipv6_addr_matches(ipv6_addr, exact=exact)
+    if exact:
+      return ipv6_match and self.ipv6_orport_matches(ipv6_orport,
+                                                     exact=exact)
+    else:
+      return ipv6_match
+
+  def entry_matches_exact(self, entry):
+    """ Is entry an exact match for this fallback?
+        A fallback is an exact match for entry if each key in entry matches:
           ipv4
           dirport
           orport
           id
-          ipv6 address and port (if present)
+          ipv6 address and port (if present in the fallback or the whitelist)
         If the fallback has an ipv6 key, the whitelist line must also have
-        it, and vice versa, otherwise they don't match. """
-    ipv6 = None
-    if self.has_ipv6():
-      ipv6 = '%s:%d'%(self.ipv6addr, self.ipv6orport)
-    for entry in relaylist:
-      if entry['id'] != self._fpr:
-        # can't log here unless we match an IP and port, because every relay's
-        # fingerprint is compared to every entry's fingerprint
-        if entry['ipv4'] == self.dirip and int(entry['orport']) == self.orport:
-          logging.warning('%s excluded: has OR %s:%d changed fingerprint to ' +
-                          '%s?', entry['id'], self.dirip, self.orport,
-                          self._fpr)
-        if self.has_ipv6() and entry.has_key('ipv6') and entry['ipv6'] == ipv6:
-          logging.warning('%s excluded: has OR %s changed fingerprint to ' +
-                          '%s?', entry['id'], ipv6, self._fpr)
-        continue
-      if entry['ipv4'] != self.dirip:
-        logging.warning('%s excluded: has it changed IPv4 from %s to %s?',
-                        self._fpr, entry['ipv4'], self.dirip)
-        continue
-      if int(entry['dirport']) != self.dirport:
-        logging.warning('%s excluded: has it changed DirPort from %s:%d to ' +
-                        '%s:%d?', self._fpr, self.dirip, int(entry['dirport']),
-                        self.dirip, self.dirport)
-        continue
-      if int(entry['orport']) != self.orport:
-        logging.warning('%s excluded: has it changed ORPort from %s:%d to ' +
-                        '%s:%d?', self._fpr, self.dirip, int(entry['orport']),
-                        self.dirip, self.orport)
-        continue
-      if entry.has_key('ipv6') and self.has_ipv6():
-        # if both entry and fallback have an ipv6 address, compare them
-        if entry['ipv6'] != ipv6:
-          logging.warning('%s excluded: has it changed IPv6 ORPort from %s ' +
-                          'to %s?', self._fpr, entry['ipv6'], ipv6)
-          continue
-      # if the fallback has an IPv6 address but the whitelist entry
-      # doesn't, or vice versa, the whitelist entry doesn't match
-      elif entry.has_key('ipv6') and not self.has_ipv6():
-        logging.warning('%s excluded: has it lost its former IPv6 address %s?',
-                        self._fpr, entry['ipv6'])
-        continue
-      elif not entry.has_key('ipv6') and self.has_ipv6():
-        logging.warning('%s excluded: has it gained an IPv6 address %s?',
-                        self._fpr, ipv6)
-        continue
+        it (and vice versa), otherwise they don't match.
+
+        Logs a warning-level message if the fallback would be an exact match,
+        but one of the id, ipv4, ipv4 orport, ipv4 dirport, or ipv6 orport
+        has changed. """
+    if not self.id_matches(entry['id'], exact=True):
+      # can't log here unless we match an IP and port, because every relay's
+      # fingerprint is compared to every entry's fingerprint
+      if self.ipv4_and_orport_matches(entry['ipv4'],
+                                      entry['orport'],
+                                      exact=True):
+        logging.warning('%s excluded: has OR %s:%d changed fingerprint to ' +
+                        '%s?', entry['id'], self.dirip, self.orport,
+                        self._fpr)
+      if self.ipv6_and_orport_matches(entry.get('ipv6_addr'),
+                                      entry.get('ipv6_orport'),
+                                      exact=True):
+        logging.warning('%s excluded: has OR %s changed fingerprint to ' +
+                        '%s?', entry['id'], entry['ipv6'], self._fpr)
+      return False
+    if not self.ipv4_addr_matches(entry['ipv4'], exact=True):
+      logging.warning('%s excluded: has it changed IPv4 from %s to %s?',
+                      self._fpr, entry['ipv4'], self.dirip)
+      return False
+    if not self.ipv4_dirport_matches(entry['dirport'], exact=True):
+      logging.warning('%s excluded: has it changed DirPort from %s:%d to ' +
+                      '%s:%d?', self._fpr, self.dirip, int(entry['dirport']),
+                      self.dirip, self.dirport)
+      return False
+    if not self.ipv4_orport_matches(entry['orport'], exact=True):
+      logging.warning('%s excluded: has it changed ORPort from %s:%d to ' +
+                      '%s:%d?', self._fpr, self.dirip, int(entry['orport']),
+                      self.dirip, self.orport)
+      return False
+    if entry.has_key('ipv6') and self.has_ipv6():
+      # if both entry and fallback have an ipv6 address, compare them
+      if not self.ipv6_and_orport_matches(entry['ipv6_addr'],
+                                          entry['ipv6_orport'],
+                                          exact=True):
+        logging.warning('%s excluded: has it changed IPv6 ORPort from %s ' +
+                        'to %s:%d?', self._fpr, entry['ipv6'],
+                        self.ipv6addr, self.ipv6orport)
+        return False
+    # if the fallback has an IPv6 address but the whitelist entry
+    # doesn't, or vice versa, the whitelist entry doesn't match
+    elif entry.has_key('ipv6') and not self.has_ipv6():
+      logging.warning('%s excluded: has it lost its former IPv6 address %s?',
+                      self._fpr, entry['ipv6'])
+      return False
+    elif not entry.has_key('ipv6') and self.has_ipv6():
+      logging.warning('%s excluded: has it gained an IPv6 address %s:%d?',
+                      self._fpr, self.ipv6addr, self.ipv6orport)
+      return False
+    return True
+
+  def entry_matches_fuzzy(self, entry):
+    """ Is entry a fuzzy match for this fallback?
+        A fallback is a fuzzy match for entry if at least one of these keys
+        in entry matches:
+          id
+          ipv4
+          ipv6 (if present in both the fallback and whitelist)
+        The ports and nickname are ignored. Missing or extra ipv6 addresses
+        are ignored.
+
+        Doesn't log any warning messages. """
+    if self.id_matches(entry['id'], exact=False):
       return True
+    if self.ipv4_addr_matches(entry['ipv4'], exact=False):
+      return True
+    if entry.has_key('ipv6') and self.has_ipv6():
+      # if both entry and fallback have an ipv6 address, compare them
+      if self.ipv6_addr_matches(entry['ipv6_addr'], exact=False):
+        return True
+    return False
+
+  def is_in_whitelist(self, relaylist, exact=False):
+    """ If exact is True (existing fallback list), check if this fallback is
+        an exact match for any whitelist entry, using entry_matches_exact().
+
+        If exact is False (new fallback whitelist), check if this fallback is
+        a fuzzy match for any whitelist entry, using entry_matches_fuzzy(). """
+    for entry in relaylist:
+      if exact:
+        if self.entry_matches_exact(entry):
+          return True
+      else:
+        if self.entry_matches_fuzzy(entry):
+          return True
     return False
 
   def cw_to_bw_factor(self):
@@ -1124,6 +1252,7 @@ class Candidate(object):
                                 ).run()[0]
       end = datetime.datetime.utcnow()
       time_since_expiry = (end - consensus.valid_until).total_seconds()
+      time_until_valid = (consensus.valid_after - end).total_seconds()
     except Exception, stem_error:
       end = datetime.datetime.utcnow()
       log_excluded('Unable to retrieve a consensus from %s: %s', nickname,
@@ -1141,8 +1270,17 @@ class Candidate(object):
       download_failed = True
     elif (time_since_expiry > 0):
       status = 'outdated consensus, expired %ds ago'%(int(time_since_expiry))
-      if time_since_expiry <= CONSENSUS_EXPIRY_TOLERANCE:
-        status += ', tolerating up to %ds'%(CONSENSUS_EXPIRY_TOLERANCE)
+      if time_since_expiry <= REASONABLY_LIVE_TIME:
+        status += ', tolerating up to %ds'%(REASONABLY_LIVE_TIME)
+        level = logging.INFO
+      else:
+        status += ', invalid'
+        level = logging.WARNING
+        download_failed = True
+    elif (time_until_valid > 0):
+      status = 'future consensus, valid in %ds'%(int(time_until_valid))
+      if time_until_valid <= REASONABLY_LIVE_TIME:
+        status += ', tolerating up to %ds'%(REASONABLY_LIVE_TIME)
         level = logging.INFO
       else:
         status += ', invalid'
@@ -1400,7 +1538,7 @@ class CandidateList(dict):
         each line's key/value pairs are placed in a dictonary,
         (of string -> string key/value pairs),
         and these dictionaries are placed in an array.
-        comments start with # and are ignored """
+        comments start with # and are ignored. """
     file_data = file_obj['data']
     file_name = file_obj['name']
     relaylist = []
@@ -1440,18 +1578,28 @@ class CandidateList(dict):
             relay_entry['dirport'] = ipv4_maybe_dirport_split[1]
         elif kvl == 2:
           relay_entry[key_value_split[0]] = key_value_split[1]
+          # split ipv6 addresses and orports
+          if key_value_split[0] == 'ipv6':
+            ipv6_orport_split = key_value_split[1].rsplit(':', 1)
+            ipv6l = len(ipv6_orport_split)
+            if ipv6l != 2:
+              print '#error Bad %s IPv6 item: %s, format is [ipv6]:orport.'%(
+                                                          file_name, item)
+            relay_entry['ipv6_addr'] = ipv6_orport_split[0]
+            relay_entry['ipv6_orport'] = ipv6_orport_split[1]
       relaylist.append(relay_entry)
     return relaylist
 
-  # apply the fallback whitelist
-  def apply_filter_lists(self, whitelist_obj):
+  def apply_filter_lists(self, whitelist_obj, exact=False):
+    """ Apply the fallback whitelist_obj to this fallback list,
+        passing exact to is_in_whitelist(). """
     excluded_count = 0
     logging.debug('Applying whitelist')
     # parse the whitelist
     whitelist = self.load_relaylist(whitelist_obj)
     filtered_fallbacks = []
     for f in self.fallbacks:
-      in_whitelist = f.is_in_whitelist(whitelist)
+      in_whitelist = f.is_in_whitelist(whitelist, exact=exact)
       if in_whitelist:
         # include
         filtered_fallbacks.append(f)
@@ -2064,14 +2212,14 @@ def process_existing():
   logging.getLogger('stem').setLevel(logging.INFO)
   whitelist = {'data': parse_fallback_file(FALLBACK_FILE_NAME),
                'name': FALLBACK_FILE_NAME}
-  list_fallbacks(whitelist)
+  list_fallbacks(whitelist, exact=True)
 
 def process_default():
   logging.basicConfig(level=logging.WARNING)
   logging.getLogger('stem').setLevel(logging.WARNING)
   whitelist = {'data': read_from_file(WHITELIST_FILE_NAME, MAX_LIST_FILE_SIZE),
                'name': WHITELIST_FILE_NAME}
-  list_fallbacks(whitelist)
+  list_fallbacks(whitelist, exact=False)
 
 ## Main Function
 def main():
@@ -2092,10 +2240,10 @@ def log_excluded(msg, *args):
   else:
     logging.info(msg, *args)
 
-def list_fallbacks(whitelist):
+def list_fallbacks(whitelist, exact=False):
   """ Fetches required onionoo documents and evaluates the
-      fallback directory criteria for each of the relays """
-
+      fallback directory criteria for each of the relays,
+      passing exact to apply_filter_lists(). """
   print "/* type=fallback */"
   print ("/* version={} */"
          .format(cleanse_c_multiline_comment(FALLBACK_FORMAT_VERSION)))
@@ -2135,7 +2283,7 @@ def list_fallbacks(whitelist):
   # warning that the details have changed from those in the whitelist.
   # instead, there will be an info-level log during the eligibility check.
   initial_count = len(candidates.fallbacks)
-  excluded_count = candidates.apply_filter_lists(whitelist)
+  excluded_count = candidates.apply_filter_lists(whitelist, exact=exact)
   print candidates.summarise_filters(initial_count, excluded_count)
   eligible_count = len(candidates.fallbacks)
author	Nick Mathewson <nickm@torproject.org>	2018-12-11 10:04:16 -0500
committer	Nick Mathewson <nickm@torproject.org>	2018-12-11 10:04:16 -0500
commit	91be6a77e9f16a9817b20c5b0f5ba3df3b54b31f (patch)
tree	e9cdf43b104879c2a3d30cc02dd83e69b86582dc
parent	1983426073effeb2b35e0f11f4c02e2526c804a1 (diff)
parent	6bc5c06dc25630c5a5f97f1da2af29f88683d243 (diff)
download	tor-91be6a77e9f16a9817b20c5b0f5ba3df3b54b31f.tar.gz tor-91be6a77e9f16a9817b20c5b0f5ba3df3b54b31f.zip