1 files changed, 552 insertions, 486 deletions
diff --git a/src/or/scheduler.c b/src/or/scheduler.c
index 49ac1b939a..382b3e3ca9 100644
--- a/src/or/scheduler.c
+++ b/src/or/scheduler.c
@@ -1,178 +1,409 @@
-/* * Copyright (c) 2013-2016, The Tor Project, Inc. */
+/* Copyright (c) 2013-2017, The Tor Project, Inc. */
 /* See LICENSE for licensing information */
 
-/**
- * \file scheduler.c
- * \brief Relay scheduling system
- **/
-
 #include "or.h"
-
-#define TOR_CHANNEL_INTERNAL_ /* For channel_flush_some_cells() */
-#include "channel.h"
+#include "config.h"
 
 #include "compat_libevent.h"
 #define SCHEDULER_PRIVATE_
+#define SCHEDULER_KIST_PRIVATE
 #include "scheduler.h"
+#include "main.h"
+#include "buffers.h"
+#define TOR_CHANNEL_INTERNAL_
+#include "channeltls.h"
 
 #include <event2/event.h>
 
-/*
- * Scheduler high/low watermarks
- */
-
-static uint32_t sched_q_low_water = 16384;
-static uint32_t sched_q_high_water = 32768;
-
-/*
- * Maximum cells to flush in a single call to channel_flush_some_cells();
- * setting this low means more calls, but too high and we could overshoot
- * sched_q_high_water.
- */
-
-static uint32_t sched_max_flush_cells = 16;
-
-/*
- * Write scheduling works by keeping track of which channels can
- * accept cells, and have cells to write.  From the scheduler's perspective,
- * a channel can be in four possible states:
+/**
+ * \file scheduler.c
+ * \brief Channel scheduling system: decides which channels should send and
+ * receive when.
+ *
+ * This module is the global/common parts of the scheduling system. This system
+ * is what decides what channels get to send cells on their circuits and when.
  *
- * 1.) Not open for writes, no cells to send
- *     - Not much to do here, and the channel will have scheduler_state ==
- *       SCHED_CHAN_IDLE
- *     - Transitions from:
- *       - Open for writes/has cells by simultaneously draining all circuit
+ * Terms:
+ * - "Scheduling system": the collection of scheduler*.{h,c} files and their
+ *   aggregate behavior.
+ * - "Scheduler implementation": a scheduler_t. The scheduling system has one
+ *   active scheduling implementation at a time.
+ *
+ * In this file you will find state that any scheduler implementation can have
+ * access to as well as the functions the rest of Tor uses to interact with the
+ * scheduling system.
+ *
+ * The earliest versions of Tor approximated a kind of round-robin system
+ * among active connections, but only approximated it. It would only consider
+ * one connection (roughly equal to a channel in today's terms) at a time, and
+ * thus could only prioritize circuits against others on the same connection.
+ *
+ * Then in response to the KIST paper[0], Tor implemented a global
+ * circuit scheduler. It was supposed to prioritize circuits across many
+ * channels, but wasn't effective. It is preserved in scheduler_vanilla.c.
+ *
+ * [0]: http://www.robgjansen.com/publications/kist-sec2014.pdf
+ *
+ * Then we actually got around to implementing KIST for real. We decided to
+ * modularize the scheduler so new ones can be implemented. You can find KIST
+ * in scheduler_kist.c.
+ *
+ * Channels have one of four scheduling states based on whether or not they
+ * have cells to send and whether or not they are able to send.
+ *
+ * <ol>
+ * <li>
+ *   Not open for writes, no cells to send.
+ *     <ul><li> Not much to do here, and the channel will have scheduler_state
+ *       == SCHED_CHAN_IDLE
+ *     <li> Transitions from:
+ *       <ul>
+ *       <li>Open for writes/has cells by simultaneously draining all circuit
  *         queues and filling the output buffer.
- *     - Transitions to:
- *       - Not open for writes/has cells by arrival of cells on an attached
+ *       </ul>
+ *     <li> Transitions to:
+ *      <ul>
+ *       <li> Not open for writes/has cells by arrival of cells on an attached
  *         circuit (this would be driven from append_cell_to_circuit_queue())
- *       - Open for writes/no cells by a channel type specific path;
+ *       <li> Open for writes/no cells by a channel type specific path;
  *         driven from connection_or_flushed_some() for channel_tls_t.
+ *      </ul>
+ *    </ul>
  *
- * 2.) Open for writes, no cells to send
- *     - Not much here either; this will be the state an idle but open channel
- *       can be expected to settle in.  It will have scheduler_state ==
- *       SCHED_CHAN_WAITING_FOR_CELLS
- *     - Transitions from:
- *       - Not open for writes/no cells by flushing some of the output
+ * <li> Open for writes, no cells to send
+ *   <ul>
+ *     <li>Not much here either; this will be the state an idle but open
+ *       channel can be expected to settle in.  It will have scheduler_state
+ *       == SCHED_CHAN_WAITING_FOR_CELLS
+ *     <li> Transitions from:
+ *       <ul>
+ *       <li>Not open for writes/no cells by flushing some of the output
  *         buffer.
- *       - Open for writes/has cells by the scheduler moving cells from
+ *       <li>Open for writes/has cells by the scheduler moving cells from
  *         circuit queues to channel output queue, but not having enough
  *         to fill the output queue.
- *     - Transitions to:
- *       - Open for writes/has cells by arrival of new cells on an attached
+ *       </ul>
+ *     <li> Transitions to:
+ *       <ul>
+ *        <li>Open for writes/has cells by arrival of new cells on an attached
  *         circuit, in append_cell_to_circuit_queue()
+ *       </ul>
+ *     </ul>
  *
- * 3.) Not open for writes, cells to send
- *     - This is the state of a busy circuit limited by output bandwidth;
+ * <li>Not open for writes, cells to send
+ *     <ul>
+ *     <li>This is the state of a busy circuit limited by output bandwidth;
  *       cells have piled up in the circuit queues waiting to be relayed.
  *       The channel will have scheduler_state == SCHED_CHAN_WAITING_TO_WRITE.
- *     - Transitions from:
- *       - Not open for writes/no cells by arrival of cells on an attached
+ *     <li> Transitions from:
+ *       <ul>
+ *       <li>Not open for writes/no cells by arrival of cells on an attached
  *         circuit
- *       - Open for writes/has cells by filling an output buffer without
+ *       <li>Open for writes/has cells by filling an output buffer without
  *         draining all cells from attached circuits
- *    - Transitions to:
- *       - Opens for writes/has cells by draining some of the output buffer
+ *       </ul>
+ *    <li> Transitions to:
+ *       <ul>
+ *       <li>Open for writes/has cells by draining some of the output buffer
  *         via the connection_or_flushed_some() path (for channel_tls_t).
+ *       </ul>
+ *    </ul>
  *
- * 4.) Open for writes, cells to send
- *     - This connection is ready to relay some cells and waiting for
+ * <li>Open for writes, cells to send
+ *     <ul>
+ *     <li>This connection is ready to relay some cells and waiting for
  *       the scheduler to choose it.  The channel will have scheduler_state ==
  *       SCHED_CHAN_PENDING.
- *     - Transitions from:
- *       - Not open for writes/has cells by the connection_or_flushed_some()
+ *     <li>Transitions from:
+ *       <ul>
+ *       <li>Not open for writes/has cells by the connection_or_flushed_some()
  *         path
- *       - Open for writes/no cells by the append_cell_to_circuit_queue()
+ *       <li>Open for writes/no cells by the append_cell_to_circuit_queue()
  *         path
- *     - Transitions to:
- *       - Not open for writes/no cells by draining all circuit queues and
- *         simultaneously filling the output buffer.
- *       - Not open for writes/has cells by writing enough cells to fill the
+ *       </ul>
+ *     <li> Transitions to:
+ *       <ul>
+ *        <li>Not open for writes/no cells by draining all circuit queues and
+ *          simultaneously filling the output buffer.
+ *        <li>Not open for writes/has cells by writing enough cells to fill the
  *         output buffer
- *       - Open for writes/no cells by draining all attached circuit queues
+ *        <li>Open for writes/no cells by draining all attached circuit queues
  *         without also filling the output buffer
+ *       </ul>
+ *    </ul>
+ * </ol>
  *
  * Other event-driven parts of the code move channels between these scheduling
- * states by calling scheduler functions; the scheduler only runs on open-for-
- * writes/has-cells channels and is the only path for those to transition to
- * other states.  The scheduler_run() function gives us the opportunity to do
- * scheduling work, and is called from other scheduler functions whenever a
- * state transition occurs, and periodically from the main event loop.
+ * states by calling scheduler functions. The scheduling system builds up a
+ * list of channels in the SCHED_CHAN_PENDING state that the scheduler
+ * implementation should then use when it runs. Scheduling implementations need
+ * to properly update channel states during their scheduler_t->run() function
+ * as that is the only opportunity for channels to move from SCHED_CHAN_PENDING
+ * to any other state.
+ *
+ * The remainder of this file is a small amount of state that any scheduler
+ * implementation should have access to, and the functions the rest of Tor uses
+ * to interact with the scheduling system.
  */
 
-/* Scheduler global data structures */
+/*****************************************************************************
+ * Scheduling system state
+ *
+ * State that can be accessed from any scheduler implementation (but not
+ * outside the scheduling system)
+ *****************************************************************************/
 
-/*
+/** The scheduler implementation currently in use; NULL until one is set. */
+STATIC const scheduler_t *the_scheduler;
+
+/**
  * We keep a list of channels that are pending - i.e, have cells to write
- * and can accept them to send.  The enum scheduler_state in channel_t
+ * and can accept them to send. The enum scheduler_state in channel_t
  * is reserved for our use.
+ *
+ * Priority queue of channels that can write and have cells (pending work)
  */
-
-/* Pqueue of channels that can write and have cells (pending work) */
 STATIC smartlist_t *channels_pending = NULL;
 
-/*
+/**
  * This event runs the scheduler from its callback, and is manually
  * activated whenever a channel enters open for writes/cells to send.
  */
-
 STATIC struct event *run_sched_ev = NULL;
 
-/*
- * Queue heuristic; this is not the queue size, but an 'effective queuesize'
- * that ages out contributions from stalled channels.
- */
+static int have_logged_kist_suddenly_disabled = 0;
 
-STATIC uint64_t queue_heuristic = 0;
+/*****************************************************************************
+ * Scheduling system static function definitions
+ *
+ * Functions that can only be accessed from this file.
+ *****************************************************************************/
 
-/*
- * Timestamp for last queue heuristic update
+/** Return a constant, human-readable name for the given scheduler type. */
+static const char *
+get_scheduler_type_string(scheduler_types_t type)
+{
+  switch (type) {
+  case SCHEDULER_VANILLA:
+    return "Vanilla";
+  case SCHEDULER_KIST:
+    return "KIST";
+  case SCHEDULER_KIST_LITE:
+    return "KISTLite";
+  case SCHEDULER_NONE:
+    /* fallthrough: NONE is handled like an unknown type */
+  default:
+    tor_assert_unreached();
+    return "(N/A)";
+  }
+}
+
+/**
+ * Scheduler event callback; this should get triggered once per event loop
+ * if any scheduling work was created during the event loop.
  */
+static void
+scheduler_evt_callback(evutil_socket_t fd, short events, void *arg)
+{
+  (void) fd;
+  (void) events;
+  (void) arg;
 
-STATIC time_t queue_heuristic_timestamp = 0;
+  log_debug(LD_SCHED, "Scheduler event callback called");
 
-/* Scheduler static function declarations */
+  /* Run the scheduler. This is a mandatory function. */
 
-static void scheduler_evt_callback(evutil_socket_t fd,
-                                   short events, void *arg);
-static int scheduler_more_work(void);
-static void scheduler_retrigger(void);
-#if 0
-static void scheduler_trigger(void);
-#endif
+  /* We might as well assert on this. If this function doesn't exist, no cells
+   * are getting scheduled. Things are very broken. scheduler_t says the run()
+   * function is mandatory. */
+  tor_assert(the_scheduler->run);
+  the_scheduler->run();
 
-/* Scheduler function implementations */
+  /* Schedule itself back in if it has more work. */
 
-/** Free everything and shut down the scheduling system */
+  /* Again, might as well assert on this mandatory scheduler_t function. If it
+   * doesn't exist, there's no way to tell libevent to run the scheduler again
+   * in the future. */
+  tor_assert(the_scheduler->schedule);
+  the_scheduler->schedule();
+}
 
-void
-scheduler_free_all(void)
+/** Using the global options, select the scheduler we should be using. */
+static void
+select_scheduler(void)
 {
-  log_debug(LD_SCHED, "Shutting down scheduler");
-
-  if (run_sched_ev) {
-    if (event_del(run_sched_ev) < 0) {
-      log_warn(LD_BUG, "Problem deleting run_sched_ev");
+  scheduler_t *new_scheduler = NULL;
+
+#ifdef TOR_UNIT_TESTS
+  /* This is hella annoying to set in the options for every test that passes
+   * through the scheduler and there are many so if we don't explicitly have
+   * a list of types set, just put the vanilla one. */
+  if (get_options()->SchedulerTypes_ == NULL) {
+    the_scheduler = get_vanilla_scheduler();
+    return;
+  }
+#endif /* defined(TOR_UNIT_TESTS) */
+
+  /* This list is ordered: the first entry has the highest priority. Thus, as
+   * soon as we find a scheduler type that we can use, we use it and stop. */
+  SMARTLIST_FOREACH_BEGIN(get_options()->SchedulerTypes_, int *, type) {
+    switch (*type) {
+    case SCHEDULER_VANILLA:
+      new_scheduler = get_vanilla_scheduler();
+      goto end;
+    case SCHEDULER_KIST:
+      if (!scheduler_can_use_kist()) {
+#ifdef HAVE_KIST_SUPPORT
+        if (!have_logged_kist_suddenly_disabled) {
+          /* We should only log this once in most cases. If it was the kernel
+           * losing support for kist that caused scheduler_can_use_kist() to
+           * return false, then this flag makes sure we only log this message
+           * once. If it was the consensus that switched from "yes use kist"
+           * to "no don't use kist", then we still set the flag so we log
+           * once, but we unset the flag elsewhere if we ever can_use_kist()
+           * again.
+           */
+          have_logged_kist_suddenly_disabled = 1;
+          log_notice(LD_SCHED, "Scheduler type KIST has been disabled by "
+                               "the consensus or no kernel support.");
+        }
+#else /* !(defined(HAVE_KIST_SUPPORT)) */
+        log_info(LD_SCHED, "Scheduler type KIST not built in");
+#endif /* defined(HAVE_KIST_SUPPORT) */
+        continue;
+      }
+      /* This flag will only get set in one of two cases:
+       * 1 - the kernel lost support for kist. In that case, we don't expect to
+       *     ever end up here
+       * 2 - the consensus went from "yes use kist" to "no don't use kist".
+       * We might end up here if the consensus changes back to "yes", in which
+       * case we might want to warn the user again if it goes back to "no"
+       * yet again. Thus we unset the flag */
+      have_logged_kist_suddenly_disabled = 0;
+      new_scheduler = get_kist_scheduler();
+      scheduler_kist_set_full_mode();
+      goto end;
+    case SCHEDULER_KIST_LITE:
+      new_scheduler = get_kist_scheduler();
+      scheduler_kist_set_lite_mode();
+      goto end;
+    case SCHEDULER_NONE:
+      /* fallthrough */
+    default:
+      /* Our option validation should have caught this. */
+      tor_assert_unreached();
     }
-    tor_event_free(run_sched_ev);
-    run_sched_ev = NULL;
+  } SMARTLIST_FOREACH_END(type);
+
+ end:
+  if (new_scheduler == NULL) {
+    log_err(LD_SCHED, "Tor was unable to select a scheduler type. Please "
+                      "make sure Schedulers is correctly configured with "
+                      "what Tor does support.");
+    /* We weren't able to choose a scheduler which means that none of the ones
+     * set in Schedulers are supported or usable. We respect the user's wish to
+     * use only what has been configured, so we don't do a sneaky
+     * fallback. Because this can be changed at runtime, we have to stop tor
+     * right now. */
+    exit(1); // XXXX bad exit
   }
 
-  if (channels_pending) {
-    smartlist_free(channels_pending);
-    channels_pending = NULL;
-  }
+  /* Set the chosen scheduler. */
+  the_scheduler = new_scheduler;
 }
 
 /**
- * Comparison function to use when sorting pending channels
+ * Helper function called from a few different places. It changes the
+ * scheduler implementation, if necessary. And if it did, it then tells the
+ * old one to free its state and the new one to initialize.
  */
+static void
+set_scheduler(void)
+{
+  const scheduler_t *old_scheduler = the_scheduler;
+  scheduler_types_t old_scheduler_type = SCHEDULER_NONE;
+
+  /* We keep track of the type in order to log only if the type switched. We
+   * can't just use the scheduler pointers because KIST and KISTLite share the
+   * same object. */
+  if (the_scheduler) {
+    old_scheduler_type = the_scheduler->type;
+  }
+
+  /* From the options, select the scheduler type to set. */
+  select_scheduler();
+  tor_assert(the_scheduler);
+
+  /* Compare pointers rather than types: KIST and KISTLite share one scheduler
+   * object, so switching between those two must not free/init anything. */
+  if (old_scheduler != the_scheduler) {
+    /* Allow the old scheduler to clean up, if needed. */
+    if (old_scheduler && old_scheduler->free_all) {
+      old_scheduler->free_all();
+    }
+
+    /* Initialize the new scheduler. */
+    if (the_scheduler->init) {
+      the_scheduler->init();
+    }
+  }
+
+  /* Finally, log a notice if we switched schedulers. We use the type in case
+   * two schedulers share a scheduler object. */
+  if (old_scheduler_type != the_scheduler->type) {
+    log_notice(LD_CONFIG, "Scheduler type %s has been enabled.",
+               get_scheduler_type_string(the_scheduler->type));
+  }
+}
+
+/*****************************************************************************
+ * Scheduling system private function definitions
+ *
+ * Functions that can only be accessed from scheduler*.c
+ *****************************************************************************/
 
-MOCK_IMPL(STATIC int,
+/** Return a human-readable name for scheduler_state, or "(invalid)". */
+const char *
+get_scheduler_state_string(int scheduler_state)
+{
+  switch (scheduler_state) {
+  case SCHED_CHAN_IDLE:
+    return "IDLE";
+  case SCHED_CHAN_WAITING_FOR_CELLS:
+    return "WAITING_FOR_CELLS";
+  case SCHED_CHAN_WAITING_TO_WRITE:
+    return "WAITING_TO_WRITE";
+  case SCHED_CHAN_PENDING:
+    return "PENDING";
+  default:
+    return "(invalid)";
+  }
+}
+
+/** Set chan's scheduler_state to new_state, logging the old and new state at
+ * debug level. Use this instead of assigning scheduler_state directly. */
+void
+scheduler_set_channel_state(channel_t *chan, int new_state)
+{
+  log_debug(LD_SCHED, "chan %" PRIu64 " changed from scheduler state %s to %s",
+      chan->global_identifier,
+      get_scheduler_state_string(chan->scheduler_state),
+      get_scheduler_state_string(new_state));
+  chan->scheduler_state = new_state;
+}
+
+/** Return the priority queue of pending channels (NULL if not allocated). */
+smartlist_t *
+get_channels_pending(void)
+{
+  return channels_pending;
+}
+
+/** Comparison function to use when sorting pending channels. */
+MOCK_IMPL(int,
 scheduler_compare_channels, (const void *c1_v, const void *c2_v))
 {
-  channel_t *c1 = NULL, *c2 = NULL;
+  const channel_t *c1 = NULL, *c2 = NULL;
   /* These are a workaround for -Wbad-function-cast throwing a fit */
   const circuitmux_policy_t *p1, *p2;
   uintptr_t p1_i, p2_i;
@@ -180,11 +411,8 @@ scheduler_compare_channels, (const void *c1_v, const void *c2_v))
   tor_assert(c1_v);
   tor_assert(c2_v);
 
-  c1 = (channel_t *)(c1_v);
-  c2 = (channel_t *)(c2_v);
-
-  tor_assert(c1);
-  tor_assert(c2);
+  c1 = (const channel_t *)(c1_v);
+  c2 = (const channel_t *)(c2_v);
 
   if (c1 != c2) {
     if (circuitmux_get_policy(c1->cmux) ==
@@ -211,36 +439,83 @@ scheduler_compare_channels, (const void *c1_v, const void *c2_v))
   }
 }
 
-/*
- * Scheduler event callback; this should get triggered once per event loop
- * if any scheduling work was created during the event loop.
- */
+/*****************************************************************************
+ * Scheduling system global functions
+ *
+ * Functions that can be accessed from anywhere in Tor.
+ *****************************************************************************/
 
-static void
-scheduler_evt_callback(evutil_socket_t fd, short events, void *arg)
+/**
+ * This is how the scheduling system is notified of Tor's configuration
+ * changing. For example: a SIGHUP was issued.
+ */
+void
+scheduler_conf_changed(void)
 {
-  (void)fd;
-  (void)events;
-  (void)arg;
-  log_debug(LD_SCHED, "Scheduler event callback called");
+  /* Re-select the scheduler implementation based on the new options. */
+  set_scheduler();
 
-  tor_assert(run_sched_ev);
+  /* Then tell the (possibly new) scheduler that we have new options. */
+  if (the_scheduler->on_new_options) {
+    the_scheduler->on_new_options();
+  }
+}
 
-  /* Run the scheduler */
-  scheduler_run();
+/**
+ * Whenever we get a new consensus, this function is called.
+ */
+void
+scheduler_notify_networkstatus_changed(void)
+{
+  /* Maybe the consensus param made us change the scheduler. */
+  set_scheduler();
 
-  /* Do we have more work to do? */
-  if (scheduler_more_work()) scheduler_retrigger();
+  /* Then tell the (possibly new) scheduler that we have a new consensus */
+  if (the_scheduler->on_new_consensus) {
+    the_scheduler->on_new_consensus();
+  }
 }
 
-/** Mark a channel as no longer ready to accept writes */
+/**
+ * Free all global scheduling state; meant to be called from main.c only when
+ * Tor is shutting down. By contrast, scheduler_t->free_all() is called both
+ * when Tor is shutting down and when we are switching schedulers.
+ */
+void
+scheduler_free_all(void)
+{
+  log_debug(LD_SCHED, "Shutting down scheduler");
+
+  if (run_sched_ev) {
+    if (event_del(run_sched_ev) < 0) {
+      log_warn(LD_BUG, "Problem deleting run_sched_ev");
+    }
+    tor_event_free(run_sched_ev);
+    run_sched_ev = NULL;
+  }
+
+  if (channels_pending) {
+    /* Free only the list: we don't own the channels stored in it. */
+    smartlist_free(channels_pending);
+    channels_pending = NULL;
+  }
+
+  if (the_scheduler && the_scheduler->free_all) {
+    the_scheduler->free_all();
+  }
+  the_scheduler = NULL;
+}
 
+/** Mark a channel as no longer ready to accept writes. */
 MOCK_IMPL(void,
 scheduler_channel_doesnt_want_writes,(channel_t *chan))
 {
-  tor_assert(chan);
-
-  tor_assert(channels_pending);
+  IF_BUG_ONCE(!chan) {
+    return;
+  }
+  IF_BUG_ONCE(!channels_pending) {
+    return;
+  }
 
   /* If it's already in pending, we can put it in waiting_to_write */
   if (chan->scheduler_state == SCHED_CHAN_PENDING) {
@@ -251,13 +526,9 @@ scheduler_channel_doesnt_want_writes,(channel_t *chan))
      */
     smartlist_pqueue_remove(channels_pending,
                             scheduler_compare_channels,
-                            STRUCT_OFFSET(channel_t, sched_heap_idx),
+                            offsetof(channel_t, sched_heap_idx),
                             chan);
-    chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
-    log_debug(LD_SCHED,
-              "Channel " U64_FORMAT " at %p went from pending "
-              "to waiting_to_write",
-              U64_PRINTF_ARG(chan->global_identifier), chan);
+    scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_TO_WRITE);
   } else {
     /*
      * It's not in pending, so it can't become waiting_to_write; it's
@@ -265,41 +536,39 @@ scheduler_channel_doesnt_want_writes,(channel_t *chan))
      * waiting_for_cells (remove it, can't write any more).
      */
     if (chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS) {
-      chan->scheduler_state = SCHED_CHAN_IDLE;
-      log_debug(LD_SCHED,
-                "Channel " U64_FORMAT " at %p left waiting_for_cells",
-                U64_PRINTF_ARG(chan->global_identifier), chan);
+      scheduler_set_channel_state(chan, SCHED_CHAN_IDLE);
     }
   }
 }
 
-/** Mark a channel as having waiting cells */
-
+/** Mark a channel as having waiting cells. */
 MOCK_IMPL(void,
 scheduler_channel_has_waiting_cells,(channel_t *chan))
 {
-  int became_pending = 0;
-
-  tor_assert(chan);
-  tor_assert(channels_pending);
+  IF_BUG_ONCE(!chan) {
+    return;
+  }
+  IF_BUG_ONCE(!channels_pending) {
+    return;
+  }
 
-  /* First, check if this one also writeable */
+  /* First, check if it's also writeable */
   if (chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS) {
     /*
      * It's in channels_waiting_for_cells, so it shouldn't be in any of
      * the other lists.  It has waiting cells now, so it goes to
      * channels_pending.
      */
-    chan->scheduler_state = SCHED_CHAN_PENDING;
-    smartlist_pqueue_add(channels_pending,
-                         scheduler_compare_channels,
-                         STRUCT_OFFSET(channel_t, sched_heap_idx),
-                         chan);
-    log_debug(LD_SCHED,
-              "Channel " U64_FORMAT " at %p went from waiting_for_cells "
-              "to pending",
-              U64_PRINTF_ARG(chan->global_identifier), chan);
-    became_pending = 1;
+    scheduler_set_channel_state(chan, SCHED_CHAN_PENDING);
+    if (!SCHED_BUG(chan->sched_heap_idx != -1, chan)) {
+      smartlist_pqueue_add(channels_pending,
+                           scheduler_compare_channels,
+                           offsetof(channel_t, sched_heap_idx),
+                           chan);
+    }
+    /* If we made a channel pending, we potentially have scheduling work to
+     * do. */
+    the_scheduler->schedule();
   } else {
     /*
      * It's not in waiting_for_cells, so it can't become pending; it's
@@ -308,256 +577,122 @@ scheduler_channel_has_waiting_cells,(channel_t *chan))
      */
     if (!(chan->scheduler_state == SCHED_CHAN_WAITING_TO_WRITE ||
           chan->scheduler_state == SCHED_CHAN_PENDING)) {
-      chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
-      log_debug(LD_SCHED,
-                "Channel " U64_FORMAT " at %p entered waiting_to_write",
-                U64_PRINTF_ARG(chan->global_identifier), chan);
+      scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_TO_WRITE);
     }
   }
+}
 
-  /*
-   * If we made a channel pending, we potentially have scheduling work
-   * to do.
-   */
-  if (became_pending) scheduler_retrigger();
+/** Add the scheduler event to the set of pending events with next_run being
+ * the longest time libevent should wait before triggering the event. */
+void
+scheduler_ev_add(const struct timeval *next_run)
+{
+  tor_assert(run_sched_ev);
+  tor_assert(next_run);
+  if (BUG(event_add(run_sched_ev, next_run) < 0)) {
+    /* time_t is not long everywhere; cast both fields to match "%ld". */
+    log_warn(LD_SCHED, "Adding to libevent failed. Next run time was set to: "
+             "%ld.%06ld", (long)next_run->tv_sec, (long)next_run->tv_usec);
+  }
 }
 
-/** Set up the scheduling system */
+/** Make run_sched_ev active via event_active(), forwarding the given flags. */
+void
+scheduler_ev_active(int flags)
+{
+  tor_assert(run_sched_ev);
+  event_active(run_sched_ev, flags, 1);
+}
 
+/*
+ * Initialize everything scheduling-related from config.c. Note this is only
+ * called when Tor is starting up, while scheduler_t->init() is called both
+ * when Tor is starting up and when we are switching schedulers.
+ */
 void
 scheduler_init(void)
 {
   log_debug(LD_SCHED, "Initting scheduler");
 
-  tor_assert(!run_sched_ev);
+  // Two '!' because we really do want to check if the pointer is non-NULL
+  IF_BUG_ONCE(!!run_sched_ev) {
+    log_warn(LD_SCHED, "We should not already have a libevent scheduler event."
+             " I'll clean the old one up, but this is odd.");
+    tor_event_free(run_sched_ev);
+    run_sched_ev = NULL;
+  }
   run_sched_ev = tor_event_new(tor_libevent_get_base(), -1,
                                0, scheduler_evt_callback, NULL);
-
   channels_pending = smartlist_new();
-  queue_heuristic = 0;
-  queue_heuristic_timestamp = approx_time();
-}
-
-/** Check if there's more scheduling work */
-
-static int
-scheduler_more_work(void)
-{
-  tor_assert(channels_pending);
 
-  return ((scheduler_get_queue_heuristic() < sched_q_low_water) &&
-          ((smartlist_len(channels_pending) > 0))) ? 1 : 0;
-}
-
-/** Retrigger the scheduler in a way safe to use from the callback */
-
-static void
-scheduler_retrigger(void)
-{
-  tor_assert(run_sched_ev);
-  event_active(run_sched_ev, EV_TIMEOUT, 1);
+  set_scheduler();
 }
 
-/** Notify the scheduler of a channel being closed */
-
+/**
+ * Inform the scheduling system that a channel is going away: drop it from
+ * channels_pending if it is queued there, call the current scheduler's
+ * on_channel_free() hook (if set) so it can release any state specific to
+ * this channel, and reset the channel's scheduler state to IDLE.
+ */
 MOCK_IMPL(void,
 scheduler_release_channel,(channel_t *chan))
 {
-  tor_assert(chan);
-  tor_assert(channels_pending);
+  IF_BUG_ONCE(!chan) {
+    return;
+  }
+  IF_BUG_ONCE(!channels_pending) {
+    return;
+  }
 
-  if (chan->scheduler_state == SCHED_CHAN_PENDING) {
+  /* Try to remove the channel from the pending list regardless of its
+   * scheduler state. We can release a channel in many places in the tor code
+   * so we can't rely on the channel state (PENDING) to remove it from the
+   * list.
+   *
+   * For instance, the channel can change state from OPEN to CLOSING while
+   * being handled in the scheduler loop leading to the channel being in
+   * PENDING state but not in the pending list. Furthermore, we release the
+   * channel when it changes state to close and a second time when we free it.
+   * Not ideal at all but for now that is the way it is. */
+  if (chan->sched_heap_idx != -1) {
     smartlist_pqueue_remove(channels_pending,
                             scheduler_compare_channels,
-                            STRUCT_OFFSET(channel_t, sched_heap_idx),
+                            offsetof(channel_t, sched_heap_idx),
                             chan);
   }
 
-  chan->scheduler_state = SCHED_CHAN_IDLE;
-}
-
-/** Run the scheduling algorithm if necessary */
-
-MOCK_IMPL(void,
-scheduler_run, (void))
-{
-  int n_cells, n_chans_before, n_chans_after;
-  uint64_t q_len_before, q_heur_before, q_len_after, q_heur_after;
-  ssize_t flushed, flushed_this_time;
-  smartlist_t *to_readd = NULL;
-  channel_t *chan = NULL;
-
-  log_debug(LD_SCHED, "We have a chance to run the scheduler");
-
-  if (scheduler_get_queue_heuristic() < sched_q_low_water) {
-    n_chans_before = smartlist_len(channels_pending);
-    q_len_before = channel_get_global_queue_estimate();
-    q_heur_before = scheduler_get_queue_heuristic();
-
-    while (scheduler_get_queue_heuristic() <= sched_q_high_water &&
-           smartlist_len(channels_pending) > 0) {
-      /* Pop off a channel */
-      chan = smartlist_pqueue_pop(channels_pending,
-                                  scheduler_compare_channels,
-                                  STRUCT_OFFSET(channel_t, sched_heap_idx));
-      tor_assert(chan);
-
-      /* Figure out how many cells we can write */
-      n_cells = channel_num_cells_writeable(chan);
-      if (n_cells > 0) {
-        log_debug(LD_SCHED,
-                  "Scheduler saw pending channel " U64_FORMAT " at %p with "
-                  "%d cells writeable",
-                  U64_PRINTF_ARG(chan->global_identifier), chan, n_cells);
-
-        flushed = 0;
-        while (flushed < n_cells &&
-               scheduler_get_queue_heuristic() <= sched_q_high_water) {
-          flushed_this_time =
-            channel_flush_some_cells(chan,
-                                     MIN(sched_max_flush_cells,
-                                         (size_t) n_cells - flushed));
-          if (flushed_this_time <= 0) break;
-          flushed += flushed_this_time;
-        }
-
-        if (flushed < n_cells) {
-          /* We ran out of cells to flush */
-          chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
-          log_debug(LD_SCHED,
-                    "Channel " U64_FORMAT " at %p "
-                    "entered waiting_for_cells from pending",
-                    U64_PRINTF_ARG(chan->global_identifier),
-                    chan);
-        } else {
-          /* The channel may still have some cells */
-          if (channel_more_to_flush(chan)) {
-          /* The channel goes to either pending or waiting_to_write */
-            if (channel_num_cells_writeable(chan) > 0) {
-              /* Add it back to pending later */
-              if (!to_readd) to_readd = smartlist_new();
-              smartlist_add(to_readd, chan);
-              log_debug(LD_SCHED,
-                        "Channel " U64_FORMAT " at %p "
-                        "is still pending",
-                        U64_PRINTF_ARG(chan->global_identifier),
-                        chan);
-            } else {
-              /* It's waiting to be able to write more */
-              chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
-              log_debug(LD_SCHED,
-                        "Channel " U64_FORMAT " at %p "
-                        "entered waiting_to_write from pending",
-                        U64_PRINTF_ARG(chan->global_identifier),
-                        chan);
-            }
-          } else {
-            /* No cells left; it can go to idle or waiting_for_cells */
-            if (channel_num_cells_writeable(chan) > 0) {
-              /*
-               * It can still accept writes, so it goes to
-               * waiting_for_cells
-               */
-              chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
-              log_debug(LD_SCHED,
-                        "Channel " U64_FORMAT " at %p "
-                        "entered waiting_for_cells from pending",
-                        U64_PRINTF_ARG(chan->global_identifier),
-                        chan);
-            } else {
-              /*
-               * We exactly filled up the output queue with all available
-               * cells; go to idle.
-               */
-              chan->scheduler_state = SCHED_CHAN_IDLE;
-              log_debug(LD_SCHED,
-                        "Channel " U64_FORMAT " at %p "
-                        "become idle from pending",
-                        U64_PRINTF_ARG(chan->global_identifier),
-                        chan);
-            }
-          }
-        }
-
-        log_debug(LD_SCHED,
-                  "Scheduler flushed %d cells onto pending channel "
-                  U64_FORMAT " at %p",
-                  (int)flushed, U64_PRINTF_ARG(chan->global_identifier),
-                  chan);
-      } else {
-        log_info(LD_SCHED,
-                 "Scheduler saw pending channel " U64_FORMAT " at %p with "
-                 "no cells writeable",
-                 U64_PRINTF_ARG(chan->global_identifier), chan);
-        /* Put it back to WAITING_TO_WRITE */
-        chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
-      }
-    }
-
-    /* Readd any channels we need to */
-    if (to_readd) {
-      SMARTLIST_FOREACH_BEGIN(to_readd, channel_t *, readd_chan) {
-        readd_chan->scheduler_state = SCHED_CHAN_PENDING;
-        smartlist_pqueue_add(channels_pending,
-                             scheduler_compare_channels,
-                             STRUCT_OFFSET(channel_t, sched_heap_idx),
-                             readd_chan);
-      } SMARTLIST_FOREACH_END(readd_chan);
-      smartlist_free(to_readd);
-    }
-
-    n_chans_after = smartlist_len(channels_pending);
-    q_len_after = channel_get_global_queue_estimate();
-    q_heur_after = scheduler_get_queue_heuristic();
-    log_debug(LD_SCHED,
-              "Scheduler handled %d of %d pending channels, queue size from "
-              U64_FORMAT " to " U64_FORMAT ", queue heuristic from "
-              U64_FORMAT " to " U64_FORMAT,
-              n_chans_before - n_chans_after, n_chans_before,
-              U64_PRINTF_ARG(q_len_before), U64_PRINTF_ARG(q_len_after),
-              U64_PRINTF_ARG(q_heur_before), U64_PRINTF_ARG(q_heur_after));
+  if (the_scheduler->on_channel_free) {
+    the_scheduler->on_channel_free(chan);
   }
+  scheduler_set_channel_state(chan, SCHED_CHAN_IDLE);
 }
 
-/** Trigger the scheduling event so we run the scheduler later */
-
-#if 0
-static void
-scheduler_trigger(void)
-{
-  log_debug(LD_SCHED, "Triggering scheduler event");
-
-  tor_assert(run_sched_ev);
-
-  event_add(run_sched_ev, EV_TIMEOUT, 1);
-}
-#endif
-
 /** Mark a channel as ready to accept writes */
 
 void
 scheduler_channel_wants_writes(channel_t *chan)
 {
-  int became_pending = 0;
-
-  tor_assert(chan);
-  tor_assert(channels_pending);
+  IF_BUG_ONCE(!chan) {
+    return;
+  }
+  IF_BUG_ONCE(!channels_pending) {
+    return;
+  }
 
   /* If it's already in waiting_to_write, we can put it in pending */
   if (chan->scheduler_state == SCHED_CHAN_WAITING_TO_WRITE) {
     /*
      * It can write now, so it goes to channels_pending.
      */
-    smartlist_pqueue_add(channels_pending,
-                         scheduler_compare_channels,
-                         STRUCT_OFFSET(channel_t, sched_heap_idx),
-                         chan);
-    chan->scheduler_state = SCHED_CHAN_PENDING;
-    log_debug(LD_SCHED,
-              "Channel " U64_FORMAT " at %p went from waiting_to_write "
-              "to pending",
-              U64_PRINTF_ARG(chan->global_identifier), chan);
-    became_pending = 1;
+    scheduler_set_channel_state(chan, SCHED_CHAN_PENDING);
+    if (!SCHED_BUG(chan->sched_heap_idx != -1, chan)) {
+      smartlist_pqueue_add(channels_pending,
+                           scheduler_compare_channels,
+                           offsetof(channel_t, sched_heap_idx),
+                           chan);
+    }
+    /* We just made a channel pending, we have scheduling work to do. */
+    the_scheduler->schedule();
   } else {
     /*
      * It's not in SCHED_CHAN_WAITING_TO_WRITE, so it can't become pending;
@@ -565,143 +700,74 @@ scheduler_channel_wants_writes(channel_t *chan)
      */
     if (!(chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS ||
           chan->scheduler_state == SCHED_CHAN_PENDING)) {
-      chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
-      log_debug(LD_SCHED,
-                "Channel " U64_FORMAT " at %p entered waiting_for_cells",
-                U64_PRINTF_ARG(chan->global_identifier), chan);
+      scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_FOR_CELLS);
     }
   }
+}
+
+/** Log a rate-limited warning describing chan and extra scheduler context.
+ * Used by SCHED_BUG() in order to be able to extract as much information as
+ * we can when we hit a bug. Channel chan can be NULL. */
+void
+scheduler_bug_occurred(const channel_t *chan)
+{
+  char buf[128];
+
+  if (chan != NULL) {
+    const size_t outbuf_len =
+      buf_datalen(TO_CONN(BASE_CHAN_TO_TLS((channel_t *) chan)->conn)->outbuf);
+    tor_snprintf(buf, sizeof(buf),
+                 "Channel %" PRIu64 " in state %s and scheduler state %s."
+                 " Num cells on cmux: %d. Connection outbuf len: %lu.",
+                 chan->global_identifier,
+                 channel_state_to_string(chan->state),
+                 get_scheduler_state_string(chan->scheduler_state),
+                 circuitmux_num_cells(chan->cmux),
+                 (unsigned long)outbuf_len);
+  }
 
-  /*
-   * If we made a channel pending, we potentially have scheduling work
-   * to do.
-   */
-  if (became_pending) scheduler_retrigger();
+  {
+    char *msg;
+    /* Rate limit every 60 seconds. If we start seeing this every 60 sec, we
+     * know something is stuck/wrong. It *should* be loud but not too much. */
+    static ratelim_t rlimit = RATELIM_INIT(60);
+    if ((msg = rate_limit_log(&rlimit, approx_time()))) {
+      log_warn(LD_BUG, "%s Num pending channels: %d. "
+                       "Channel in pending list: %s.%s",
+               (chan != NULL) ? buf : "No channel in bug context.",
+               smartlist_len(channels_pending),
+               (smartlist_pos(channels_pending, chan) == -1) ? "no" : "yes",
+               msg);
+      tor_free(msg);
+    }
+  }
 }
 
-/**
- * Notify the scheduler that a channel's position in the pqueue may have
- * changed
- */
+#ifdef TOR_UNIT_TESTS
 
+/**
+ * A channel's queue position may have changed; re-queue it if it is PENDING.
+ */
 void
 scheduler_touch_channel(channel_t *chan)
 {
-  tor_assert(chan);
+  IF_BUG_ONCE(!chan) {
+    return;
+  }
 
   if (chan->scheduler_state == SCHED_CHAN_PENDING) {
     /* Remove and re-add it */
     smartlist_pqueue_remove(channels_pending,
                             scheduler_compare_channels,
-                            STRUCT_OFFSET(channel_t, sched_heap_idx),
+                            offsetof(channel_t, sched_heap_idx),
                             chan);
     smartlist_pqueue_add(channels_pending,
                          scheduler_compare_channels,
-                         STRUCT_OFFSET(channel_t, sched_heap_idx),
+                         offsetof(channel_t, sched_heap_idx),
                          chan);
   }
   /* else no-op, since it isn't in the queue */
 }
 
-/**
- * Notify the scheduler of a queue size adjustment, to recalculate the
- * queue heuristic.
- */
-
-void
-scheduler_adjust_queue_size(channel_t *chan, int dir, uint64_t adj)
-{
-  time_t now = approx_time();
-
-  log_debug(LD_SCHED,
-            "Queue size adjustment by %s" U64_FORMAT " for channel "
-            U64_FORMAT,
-            (dir >= 0) ? "+" : "-",
-            U64_PRINTF_ARG(adj),
-            U64_PRINTF_ARG(chan->global_identifier));
-
-  /* Get the queue heuristic up to date */
-  scheduler_update_queue_heuristic(now);
-
-  /* Adjust as appropriate */
-  if (dir >= 0) {
-    /* Increasing it */
-    queue_heuristic += adj;
-  } else {
-    /* Decreasing it */
-    if (queue_heuristic > adj) queue_heuristic -= adj;
-    else queue_heuristic = 0;
-  }
-
-  log_debug(LD_SCHED,
-            "Queue heuristic is now " U64_FORMAT,
-            U64_PRINTF_ARG(queue_heuristic));
-}
-
-/**
- * Query the current value of the queue heuristic
- */
-
-STATIC uint64_t
-scheduler_get_queue_heuristic(void)
-{
-  time_t now = approx_time();
-
-  scheduler_update_queue_heuristic(now);
-
-  return queue_heuristic;
-}
-
-/**
- * Adjust the queue heuristic value to the present time
- */
-
-STATIC void
-scheduler_update_queue_heuristic(time_t now)
-{
-  time_t diff;
-
-  if (queue_heuristic_timestamp == 0) {
-    /*
-     * Nothing we can sensibly do; must not have been initted properly.
-     * Oh well.
-     */
-    queue_heuristic_timestamp = now;
-  } else if (queue_heuristic_timestamp < now) {
-    diff = now - queue_heuristic_timestamp;
-    /*
-     * This is a simple exponential age-out; the other proposed alternative
-     * was a linear age-out using the bandwidth history in rephist.c; I'm
-     * going with this out of concern that if an adversary can jam the
-     * scheduler long enough, it would cause the bandwidth to drop to
-     * zero and render the aging mechanism ineffective thereafter.
-     */
-    if (0 <= diff && diff < 64) queue_heuristic >>= diff;
-    else queue_heuristic = 0;
-
-    queue_heuristic_timestamp = now;
-
-    log_debug(LD_SCHED,
-              "Queue heuristic is now " U64_FORMAT,
-              U64_PRINTF_ARG(queue_heuristic));
-  }
-  /* else no update needed, or time went backward */
-}
-
-/**
- * Set scheduler watermarks and flush size
- */
-
-void
-scheduler_set_watermarks(uint32_t lo, uint32_t hi, uint32_t max_flush)
-{
-  /* Sanity assertions - caller should ensure these are true */
-  tor_assert(lo > 0);
-  tor_assert(hi > lo);
-  tor_assert(max_flush > 0);
-
-  sched_q_low_water = lo;
-  sched_q_high_water = hi;
-  sched_max_flush_cells = max_flush;
-}
+#endif /* defined(TOR_UNIT_TESTS) */