diff options
Diffstat (limited to 'src/or/scheduler.c')
-rw-r--r-- | src/or/scheduler.c | 976 |
1 files changed, 517 insertions, 459 deletions
diff --git a/src/or/scheduler.c b/src/or/scheduler.c index 49ac1b939a..1984084feb 100644 --- a/src/or/scheduler.c +++ b/src/or/scheduler.c @@ -1,178 +1,379 @@ -/* * Copyright (c) 2013-2016, The Tor Project, Inc. */ +/* Copyright (c) 2013-2017, The Tor Project, Inc. */ /* See LICENSE for licensing information */ -/** - * \file scheduler.c - * \brief Relay scheduling system - **/ - #include "or.h" - -#define TOR_CHANNEL_INTERNAL_ /* For channel_flush_some_cells() */ -#include "channel.h" +#include "config.h" #include "compat_libevent.h" #define SCHEDULER_PRIVATE_ +#define SCHEDULER_KIST_PRIVATE #include "scheduler.h" +#include "main.h" +#include "buffers.h" +#define TOR_CHANNEL_INTERNAL_ +#include "channeltls.h" #include <event2/event.h> -/* - * Scheduler high/low watermarks - */ - -static uint32_t sched_q_low_water = 16384; -static uint32_t sched_q_high_water = 32768; - -/* - * Maximum cells to flush in a single call to channel_flush_some_cells(); - * setting this low means more calls, but too high and we could overshoot - * sched_q_high_water. - */ - -static uint32_t sched_max_flush_cells = 16; - -/* - * Write scheduling works by keeping track of which channels can - * accept cells, and have cells to write. From the scheduler's perspective, - * a channel can be in four possible states: +/** + * \file scheduler.c + * \brief Channel scheduling system: decides which channels should send and + * receive when. + * + * This module is the global/common parts of the scheduling system. This system + * is what decides what channels get to send cells on their circuits and when. * - * 1.) Not open for writes, no cells to send - * - Not much to do here, and the channel will have scheduler_state == - * SCHED_CHAN_IDLE - * - Transitions from: - * - Open for writes/has cells by simultaneously draining all circuit + * Terms: + * - "Scheduling system": the collection of scheduler*.{h,c} files and their + * aggregate behavior. + * - "Scheduler implementation": a scheduler_t. The scheduling system has one + * active scheduling implementation at a time. + * + * In this file you will find state that any scheduler implementation can have + * access to as well as the functions the rest of Tor uses to interact with the + * scheduling system. + * + * The earliest versions of Tor approximated a kind of round-robin system + * among active connections, but only approximated it. It would only consider + * one connection (roughly equal to a channel in today's terms) at a time, and + * thus could only prioritize circuits against others on the same connection. + * + * Then in response to the KIST paper[0], Tor implemented a global + * circuit scheduler. It was supposed to prioritize circuits across many + * channels, but wasn't effective. It is preserved in scheduler_vanilla.c. + * + * [0]: http://www.robgjansen.com/publications/kist-sec2014.pdf + * + * Then we actually got around to implementing KIST for real. We decided to + * modularize the scheduler so new ones can be implemented. You can find KIST + * in scheduler_kist.c. + * + * Channels have one of four scheduling states based on whether or not they + * have cells to send and whether or not they are able to send. + * + * <ol> + * <li> + * Not open for writes, no cells to send. + * <ul><li> Not much to do here, and the channel will have scheduler_state + * == SCHED_CHAN_IDLE + * <li> Transitions from: + * <ul> + * <li>Open for writes/has cells by simultaneously draining all circuit * queues and filling the output buffer. - * - Transitions to: - * - Not open for writes/has cells by arrival of cells on an attached + * </ul> + * <li> Transitions to: + * <ul> + * <li> Not open for writes/has cells by arrival of cells on an attached * circuit (this would be driven from append_cell_to_circuit_queue()) - * - Open for writes/no cells by a channel type specific path; + * <li> Open for writes/no cells by a channel type specific path; * driven from connection_or_flushed_some() for channel_tls_t. + * </ul> + * </ul> * - * 2.) Open for writes, no cells to send - * - Not much here either; this will be the state an idle but open channel - * can be expected to settle in. It will have scheduler_state == - * SCHED_CHAN_WAITING_FOR_CELLS - * - Transitions from: - * - Not open for writes/no cells by flushing some of the output + * <li> Open for writes, no cells to send + * <ul> + * <li>Not much here either; this will be the state an idle but open + * channel can be expected to settle in. It will have scheduler_state + * == SCHED_CHAN_WAITING_FOR_CELLS + * <li> Transitions from: + * <ul> + * <li>Not open for writes/no cells by flushing some of the output * buffer. - * - Open for writes/has cells by the scheduler moving cells from + * <li>Open for writes/has cells by the scheduler moving cells from * circuit queues to channel output queue, but not having enough * to fill the output queue. - * - Transitions to: - * - Open for writes/has cells by arrival of new cells on an attached + * </ul> + * <li> Transitions to: + * <ul> + * <li>Open for writes/has cells by arrival of new cells on an attached * circuit, in append_cell_to_circuit_queue() + * </ul> + * </ul> * - * 3.) Not open for writes, cells to send - * - This is the state of a busy circuit limited by output bandwidth; + * <li>Not open for writes, cells to send + * <ul> + * <li>This is the state of a busy circuit limited by output bandwidth; * cells have piled up in the circuit queues waiting to be relayed. * The channel will have scheduler_state == SCHED_CHAN_WAITING_TO_WRITE. - * - Transitions from: - * - Not open for writes/no cells by arrival of cells on an attached + * <li> Transitions from: + * <ul> + * <li>Not open for writes/no cells by arrival of cells on an attached * circuit - * - Open for writes/has cells by filling an output buffer without + * <li>Open for writes/has cells by filling an output buffer without * draining all cells from attached circuits - * - Transitions to: - * - Opens for writes/has cells by draining some of the output buffer + * </ul> + * <li> Transitions to: + * <ul> + * <li>Opens for writes/has cells by draining some of the output buffer * via the connection_or_flushed_some() path (for channel_tls_t). + * </ul> + * </ul> * - * 4.) Open for writes, cells to send - * - This connection is ready to relay some cells and waiting for + * <li>Open for writes, cells to send + * <ul> + * <li>This connection is ready to relay some cells and waiting for * the scheduler to choose it. The channel will have scheduler_state == * SCHED_CHAN_PENDING. - * - Transitions from: - * - Not open for writes/has cells by the connection_or_flushed_some() + * <li>Transitions from: + * <ul> + * <li>Not open for writes/has cells by the connection_or_flushed_some() * path - * - Open for writes/no cells by the append_cell_to_circuit_queue() + * <li>Open for writes/no cells by the append_cell_to_circuit_queue() * path - * - Transitions to: - * - Not open for writes/no cells by draining all circuit queues and - * simultaneously filling the output buffer. - * - Not open for writes/has cells by writing enough cells to fill the + * </ul> + * <li> Transitions to: + * <ul> + * <li>Not open for writes/no cells by draining all circuit queues and + * simultaneously filling the output buffer. + * <li>Not open for writes/has cells by writing enough cells to fill the * output buffer - * - Open for writes/no cells by draining all attached circuit queues + * <li>Open for writes/no cells by draining all attached circuit queues * without also filling the output buffer + * </ul> + * </ul> + * </ol> * * Other event-driven parts of the code move channels between these scheduling - * states by calling scheduler functions; the scheduler only runs on open-for- - * writes/has-cells channels and is the only path for those to transition to - * other states. The scheduler_run() function gives us the opportunity to do - * scheduling work, and is called from other scheduler functions whenever a - * state transition occurs, and periodically from the main event loop. + * states by calling scheduler functions. The scheduling system builds up a + * list of channels in the SCHED_CHAN_PENDING state that the scheduler + * implementation should then use when it runs. Scheduling implementations need + * to properly update channel states during their scheduler_t->run() function + * as that is the only opportunity for channels to move from SCHED_CHAN_PENDING + * to any other state. + * + * The remainder of this file is a small amount of state that any scheduler + * implementation should have access to, and the functions the rest of Tor uses + * to interact with the scheduling system. */ -/* Scheduler global data structures */ +/***************************************************************************** + * Scheduling system state + * + * State that can be accessed from any scheduler implementation (but not + * outside the scheduling system) + *****************************************************************************/ -/* +/** DOCDOC */ +STATIC const scheduler_t *the_scheduler; + +/** * We keep a list of channels that are pending - i.e, have cells to write - * and can accept them to send. The enum scheduler_state in channel_t + * and can accept them to send. The enum scheduler_state in channel_t * is reserved for our use. + * + * Priority queue of channels that can write and have cells (pending work) */ - -/* Pqueue of channels that can write and have cells (pending work) */ STATIC smartlist_t *channels_pending = NULL; -/* +/** * This event runs the scheduler from its callback, and is manually * activated whenever a channel enters open for writes/cells to send. */ - STATIC struct event *run_sched_ev = NULL; -/* - * Queue heuristic; this is not the queue size, but an 'effective queuesize' - * that ages out contributions from stalled channels. - */ +static int have_logged_kist_suddenly_disabled = 0; -STATIC uint64_t queue_heuristic = 0; +/***************************************************************************** + * Scheduling system static function definitions + * + * Functions that can only be accessed from this file. + *****************************************************************************/ -/* - * Timestamp for last queue heuristic update +/** Return a human readable string for the given scheduler type. */ +static const char * +get_scheduler_type_string(scheduler_types_t type) +{ + switch (type) { + case SCHEDULER_VANILLA: + return "Vanilla"; + case SCHEDULER_KIST: + return "KIST"; + case SCHEDULER_KIST_LITE: + return "KISTLite"; + case SCHEDULER_NONE: + /* fallthrough */ + default: + tor_assert_unreached(); + return "(N/A)"; + } +} + +/** + * Scheduler event callback; this should get triggered once per event loop + * if any scheduling work was created during the event loop. */ +static void +scheduler_evt_callback(evutil_socket_t fd, short events, void *arg) +{ + (void) fd; + (void) events; + (void) arg; -STATIC time_t queue_heuristic_timestamp = 0; + log_debug(LD_SCHED, "Scheduler event callback called"); -/* Scheduler static function declarations */ + /* Run the scheduler. This is a mandatory function. */ -static void scheduler_evt_callback(evutil_socket_t fd, - short events, void *arg); -static int scheduler_more_work(void); -static void scheduler_retrigger(void); -#if 0 -static void scheduler_trigger(void); -#endif + /* We might as well assert on this. If this function doesn't exist, no cells + * are getting scheduled. Things are very broken. scheduler_t says the run() + * function is mandatory. */ + tor_assert(the_scheduler->run); + the_scheduler->run(); -/* Scheduler function implementations */ + /* Schedule itself back in if it has more work. */ -/** Free everything and shut down the scheduling system */ + /* Again, might as well assert on this mandatory scheduler_t function. If it + * doesn't exist, there's no way to tell libevent to run the scheduler again + * in the future. */ + tor_assert(the_scheduler->schedule); + the_scheduler->schedule(); +} -void -scheduler_free_all(void) +/** Using the global options, select the scheduler we should be using. */ +static void +select_scheduler(void) { - log_debug(LD_SCHED, "Shutting down scheduler"); - - if (run_sched_ev) { - if (event_del(run_sched_ev) < 0) { - log_warn(LD_BUG, "Problem deleting run_sched_ev"); + scheduler_t *new_scheduler = NULL; + +#ifdef TOR_UNIT_TESTS + /* This is hella annoying to set in the options for every test that passes + * through the scheduler and there are many so if we don't explicitly have + * a list of types set, just put the vanilla one. */ + if (get_options()->SchedulerTypes_ == NULL) { + the_scheduler = get_vanilla_scheduler(); + return; + } +#endif /* defined(TOR_UNIT_TESTS) */ + + /* This list is ordered that is first entry has the first priority. Thus, as + * soon as we find a scheduler type that we can use, we use it and stop. */ + SMARTLIST_FOREACH_BEGIN(get_options()->SchedulerTypes_, int *, type) { + switch (*type) { + case SCHEDULER_VANILLA: + new_scheduler = get_vanilla_scheduler(); + goto end; + case SCHEDULER_KIST: + if (!scheduler_can_use_kist()) { +#ifdef HAVE_KIST_SUPPORT + if (!have_logged_kist_suddenly_disabled) { + /* We should only log this once in most cases. If it was the kernel + * losing support for kist that caused scheduler_can_use_kist() to + * return false, then this flag makes sure we only log this message + * once. If it was the consensus that switched from "yes use kist" + * to "no don't use kist", then we still set the flag so we log + * once, but we unset the flag elsewhere if we ever can_use_kist() + * again. + */ + have_logged_kist_suddenly_disabled = 1; + log_notice(LD_SCHED, "Scheduler type KIST has been disabled by " + "the consensus or no kernel support."); + } +#else /* !(defined(HAVE_KIST_SUPPORT)) */ + log_info(LD_SCHED, "Scheduler type KIST not built in"); +#endif /* defined(HAVE_KIST_SUPPORT) */ + continue; + } + /* This flag will only get set in one of two cases: + * 1 - the kernel lost support for kist. In that case, we don't expect to + * ever end up here + * 2 - the consensus went from "yes use kist" to "no don't use kist". + * We might end up here if the consensus changes back to "yes", in which + * case we might want to warn the user again if it goes back to "no" + * yet again. Thus we unset the flag */ + have_logged_kist_suddenly_disabled = 0; + new_scheduler = get_kist_scheduler(); + scheduler_kist_set_full_mode(); + goto end; + case SCHEDULER_KIST_LITE: + new_scheduler = get_kist_scheduler(); + scheduler_kist_set_lite_mode(); + goto end; + case SCHEDULER_NONE: + /* fallthrough */ + default: + /* Our option validation should have caught this. */ + tor_assert_unreached(); } - tor_event_free(run_sched_ev); - run_sched_ev = NULL; + } SMARTLIST_FOREACH_END(type); + + end: + if (new_scheduler == NULL) { + log_err(LD_SCHED, "Tor was unable to select a scheduler type. Please " + "make sure Schedulers is correctly configured with " + "what Tor does support."); + /* We weren't able to choose a scheduler which means that none of the ones + * set in Schedulers are supported or usable. We will respect the user + * wishes of using what it has been configured and don't do a sneaky + * fallback. Because this can be changed at runtime, we have to stop tor + * right now. */ + exit(1); } - if (channels_pending) { - smartlist_free(channels_pending); - channels_pending = NULL; - } + /* Set the chosen scheduler. */ + the_scheduler = new_scheduler; } /** - * Comparison function to use when sorting pending channels + * Helper function called from a few different places. It changes the + * scheduler implementation, if necessary. And if it did, it then tells the + * old one to free its state and the new one to initialize. */ +static void +set_scheduler(void) +{ + const scheduler_t *old_scheduler = the_scheduler; + scheduler_types_t old_scheduler_type = SCHEDULER_NONE; + + /* We keep track of the type in order to log only if the type switched. We + * can't just use the scheduler pointers because KIST and KISTLite share the + * same object. */ + if (the_scheduler) { + old_scheduler_type = the_scheduler->type; + } + + /* From the options, select the scheduler type to set. */ + select_scheduler(); + tor_assert(the_scheduler); + + /* We look at the pointer difference in case the old sched and new sched + * share the same scheduler object, as is the case with KIST and KISTLite. */ + if (old_scheduler != the_scheduler) { + /* Allow the old scheduler to clean up, if needed. */ + if (old_scheduler && old_scheduler->free_all) { + old_scheduler->free_all(); + } + + /* Initialize the new scheduler. */ + if (the_scheduler->init) { + the_scheduler->init(); + } + } -MOCK_IMPL(STATIC int, + /* Finally we notice log if we switched schedulers. We use the type in case + * two schedulers share a scheduler object. */ + if (old_scheduler_type != the_scheduler->type) { + log_notice(LD_CONFIG, "Scheduler type %s has been enabled.", + get_scheduler_type_string(the_scheduler->type)); + } +} + +/***************************************************************************** + * Scheduling system private function definitions + * + * Functions that can only be accessed from scheduler*.c + *****************************************************************************/ + +/** Return the pending channel list. */ +smartlist_t * +get_channels_pending(void) +{ + return channels_pending; +} + +/** Comparison function to use when sorting pending channels. */ +MOCK_IMPL(int, scheduler_compare_channels, (const void *c1_v, const void *c2_v)) { - channel_t *c1 = NULL, *c2 = NULL; + const channel_t *c1 = NULL, *c2 = NULL; /* These are a workaround for -Wbad-function-cast throwing a fit */ const circuitmux_policy_t *p1, *p2; uintptr_t p1_i, p2_i; @@ -180,11 +381,8 @@ scheduler_compare_channels, (const void *c1_v, const void *c2_v)) tor_assert(c1_v); tor_assert(c2_v); - c1 = (channel_t *)(c1_v); - c2 = (channel_t *)(c2_v); - - tor_assert(c1); - tor_assert(c2); + c1 = (const channel_t *)(c1_v); + c2 = (const channel_t *)(c2_v); if (c1 != c2) { if (circuitmux_get_policy(c1->cmux) == @@ -211,36 +409,83 @@ scheduler_compare_channels, (const void *c1_v, const void *c2_v)) } } -/* - * Scheduler event callback; this should get triggered once per event loop - * if any scheduling work was created during the event loop. - */ +/***************************************************************************** + * Scheduling system global functions + * + * Functions that can be accessed from anywhere in Tor. + *****************************************************************************/ -static void -scheduler_evt_callback(evutil_socket_t fd, short events, void *arg) +/** + * This is how the scheduling system is notified of Tor's configuration + * changing. For example: a SIGHUP was issued. + */ +void +scheduler_conf_changed(void) { - (void)fd; - (void)events; - (void)arg; - log_debug(LD_SCHED, "Scheduler event callback called"); + /* Let the scheduler decide what it should do. */ + set_scheduler(); - tor_assert(run_sched_ev); + /* Then tell the (possibly new) scheduler that we have new options. */ + if (the_scheduler->on_new_options) { + the_scheduler->on_new_options(); + } +} - /* Run the scheduler */ - scheduler_run(); +/** + * Whenever we get a new consensus, this function is called. + */ +void +scheduler_notify_networkstatus_changed(void) +{ + /* Maybe the consensus param made us change the scheduler. */ + set_scheduler(); - /* Do we have more work to do? */ - if (scheduler_more_work()) scheduler_retrigger(); + /* Then tell the (possibly new) scheduler that we have a new consensus */ + if (the_scheduler->on_new_consensus) { + the_scheduler->on_new_consensus(); + } } -/** Mark a channel as no longer ready to accept writes */ +/** + * Free everything scheduling-related from main.c. Note this is only called + * when Tor is shutting down, while scheduler_t->free_all() is called both when + * Tor is shutting down and when we are switching schedulers. + */ +void +scheduler_free_all(void) +{ + log_debug(LD_SCHED, "Shutting down scheduler"); + + if (run_sched_ev) { + if (event_del(run_sched_ev) < 0) { + log_warn(LD_BUG, "Problem deleting run_sched_ev"); + } + tor_event_free(run_sched_ev); + run_sched_ev = NULL; + } + + if (channels_pending) { + /* We don't have ownership of the objects in this list. */ + smartlist_free(channels_pending); + channels_pending = NULL; + } + + if (the_scheduler && the_scheduler->free_all) { + the_scheduler->free_all(); + } + the_scheduler = NULL; +} +/** Mark a channel as no longer ready to accept writes. */ MOCK_IMPL(void, scheduler_channel_doesnt_want_writes,(channel_t *chan)) { - tor_assert(chan); - - tor_assert(channels_pending); + IF_BUG_ONCE(!chan) { + return; + } + IF_BUG_ONCE(!channels_pending) { + return; + } /* If it's already in pending, we can put it in waiting_to_write */ if (chan->scheduler_state == SCHED_CHAN_PENDING) { @@ -251,7 +496,7 @@ scheduler_channel_doesnt_want_writes,(channel_t *chan)) */ smartlist_pqueue_remove(channels_pending, scheduler_compare_channels, - STRUCT_OFFSET(channel_t, sched_heap_idx), + offsetof(channel_t, sched_heap_idx), chan); chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE; log_debug(LD_SCHED, @@ -273,17 +518,18 @@ scheduler_channel_doesnt_want_writes,(channel_t *chan)) } } -/** Mark a channel as having waiting cells */ - +/** Mark a channel as having waiting cells. */ MOCK_IMPL(void, scheduler_channel_has_waiting_cells,(channel_t *chan)) { - int became_pending = 0; - - tor_assert(chan); - tor_assert(channels_pending); + IF_BUG_ONCE(!chan) { + return; + } + IF_BUG_ONCE(!channels_pending) { + return; + } - /* First, check if this one also writeable */ + /* First, check if it's also writeable */ if (chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS) { /* * It's in channels_waiting_for_cells, so it shouldn't be in any of @@ -291,15 +537,19 @@ scheduler_channel_has_waiting_cells,(channel_t *chan)) * channels_pending. */ chan->scheduler_state = SCHED_CHAN_PENDING; - smartlist_pqueue_add(channels_pending, - scheduler_compare_channels, - STRUCT_OFFSET(channel_t, sched_heap_idx), - chan); + if (!SCHED_BUG(chan->sched_heap_idx != -1, chan)) { + smartlist_pqueue_add(channels_pending, + scheduler_compare_channels, + offsetof(channel_t, sched_heap_idx), + chan); + } log_debug(LD_SCHED, "Channel " U64_FORMAT " at %p went from waiting_for_cells " "to pending", U64_PRINTF_ARG(chan->global_identifier), chan); - became_pending = 1; + /* If we made a channel pending, we potentially have scheduling work to + * do. */ + the_scheduler->schedule(); } else { /* * It's not in waiting_for_cells, so it can't become pending; it's @@ -314,250 +564,125 @@ scheduler_channel_has_waiting_cells,(channel_t *chan)) U64_PRINTF_ARG(chan->global_identifier), chan); } } +} - /* - * If we made a channel pending, we potentially have scheduling work - * to do. - */ - if (became_pending) scheduler_retrigger(); +/** Add the scheduler event to the set of pending events with next_run being + * the longest time libevent should wait before triggering the event. */ +void +scheduler_ev_add(const struct timeval *next_run) +{ + tor_assert(run_sched_ev); + tor_assert(next_run); + if (BUG(event_add(run_sched_ev, next_run) < 0)) { + log_warn(LD_SCHED, "Adding to libevent failed. Next run time was set to: " + "%ld.%06ld", next_run->tv_sec, (long)next_run->tv_usec); + return; + } } -/** Set up the scheduling system */ +/** Make the scheduler event active with the given flags. */ +void +scheduler_ev_active(int flags) +{ + tor_assert(run_sched_ev); + event_active(run_sched_ev, flags, 1); +} +/* + * Initialize everything scheduling-related from config.c. Note this is only + * called when Tor is starting up, while scheduler_t->init() is called both + * when Tor is starting up and when we are switching schedulers. + */ void scheduler_init(void) { log_debug(LD_SCHED, "Initting scheduler"); - tor_assert(!run_sched_ev); + // Two '!' because we really do want to check if the pointer is non-NULL + IF_BUG_ONCE(!!run_sched_ev) { + log_warn(LD_SCHED, "We should not already have a libevent scheduler event." + "I'll clean the old one up, but this is odd."); + tor_event_free(run_sched_ev); + run_sched_ev = NULL; + } run_sched_ev = tor_event_new(tor_libevent_get_base(), -1, 0, scheduler_evt_callback, NULL); - channels_pending = smartlist_new(); - queue_heuristic = 0; - queue_heuristic_timestamp = approx_time(); -} - -/** Check if there's more scheduling work */ -static int -scheduler_more_work(void) -{ - tor_assert(channels_pending); - - return ((scheduler_get_queue_heuristic() < sched_q_low_water) && - ((smartlist_len(channels_pending) > 0))) ? 1 : 0; + set_scheduler(); } -/** Retrigger the scheduler in a way safe to use from the callback */ - -static void -scheduler_retrigger(void) -{ - tor_assert(run_sched_ev); - event_active(run_sched_ev, EV_TIMEOUT, 1); -} - -/** Notify the scheduler of a channel being closed */ - +/* + * If a channel is going away, this is how the scheduling system is informed + * so it can do any freeing necessary. This ultimately calls + * scheduler_t->on_channel_free() so the current scheduler can release any + * state specific to this channel. + */ MOCK_IMPL(void, scheduler_release_channel,(channel_t *chan)) { - tor_assert(chan); - tor_assert(channels_pending); + IF_BUG_ONCE(!chan) { + return; + } + IF_BUG_ONCE(!channels_pending) { + return; + } - if (chan->scheduler_state == SCHED_CHAN_PENDING) { + /* Try to remove the channel from the pending list regardless of its + * scheduler state. We can release a channel in many places in the tor code + * so we can't rely on the channel state (PENDING) to remove it from the + * list. + * + * For instance, the channel can change state from OPEN to CLOSING while + * being handled in the scheduler loop leading to the channel being in + * PENDING state but not in the pending list. Furthermore, we release the + * channel when it changes state to close and a second time when we free it. + * Not ideal at all but for now that is the way it is. */ + if (chan->sched_heap_idx != -1) { smartlist_pqueue_remove(channels_pending, scheduler_compare_channels, - STRUCT_OFFSET(channel_t, sched_heap_idx), + offsetof(channel_t, sched_heap_idx), chan); } - chan->scheduler_state = SCHED_CHAN_IDLE; -} - -/** Run the scheduling algorithm if necessary */ - -MOCK_IMPL(void, -scheduler_run, (void)) -{ - int n_cells, n_chans_before, n_chans_after; - uint64_t q_len_before, q_heur_before, q_len_after, q_heur_after; - ssize_t flushed, flushed_this_time; - smartlist_t *to_readd = NULL; - channel_t *chan = NULL; - - log_debug(LD_SCHED, "We have a chance to run the scheduler"); - - if (scheduler_get_queue_heuristic() < sched_q_low_water) { - n_chans_before = smartlist_len(channels_pending); - q_len_before = channel_get_global_queue_estimate(); - q_heur_before = scheduler_get_queue_heuristic(); - - while (scheduler_get_queue_heuristic() <= sched_q_high_water && - smartlist_len(channels_pending) > 0) { - /* Pop off a channel */ - chan = smartlist_pqueue_pop(channels_pending, - scheduler_compare_channels, - STRUCT_OFFSET(channel_t, sched_heap_idx)); - tor_assert(chan); - - /* Figure out how many cells we can write */ - n_cells = channel_num_cells_writeable(chan); - if (n_cells > 0) { - log_debug(LD_SCHED, - "Scheduler saw pending channel " U64_FORMAT " at %p with " - "%d cells writeable", - U64_PRINTF_ARG(chan->global_identifier), chan, n_cells); - - flushed = 0; - while (flushed < n_cells && - scheduler_get_queue_heuristic() <= sched_q_high_water) { - flushed_this_time = - channel_flush_some_cells(chan, - MIN(sched_max_flush_cells, - (size_t) n_cells - flushed)); - if (flushed_this_time <= 0) break; - flushed += flushed_this_time; - } - - if (flushed < n_cells) { - /* We ran out of cells to flush */ - chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS; - log_debug(LD_SCHED, - "Channel " U64_FORMAT " at %p " - "entered waiting_for_cells from pending", - U64_PRINTF_ARG(chan->global_identifier), - chan); - } else { - /* The channel may still have some cells */ - if (channel_more_to_flush(chan)) { - /* The channel goes to either pending or waiting_to_write */ - if (channel_num_cells_writeable(chan) > 0) { - /* Add it back to pending later */ - if (!to_readd) to_readd = smartlist_new(); - smartlist_add(to_readd, chan); - log_debug(LD_SCHED, - "Channel " U64_FORMAT " at %p " - "is still pending", - U64_PRINTF_ARG(chan->global_identifier), - chan); - } else { - /* It's waiting to be able to write more */ - chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE; - log_debug(LD_SCHED, - "Channel " U64_FORMAT " at %p " - "entered waiting_to_write from pending", - U64_PRINTF_ARG(chan->global_identifier), - chan); - } - } else { - /* No cells left; it can go to idle or waiting_for_cells */ - if (channel_num_cells_writeable(chan) > 0) { - /* - * It can still accept writes, so it goes to - * waiting_for_cells - */ - chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS; - log_debug(LD_SCHED, - "Channel " U64_FORMAT " at %p " - "entered waiting_for_cells from pending", - U64_PRINTF_ARG(chan->global_identifier), - chan); - } else { - /* - * We exactly filled up the output queue with all available - * cells; go to idle. - */ - chan->scheduler_state = SCHED_CHAN_IDLE; - log_debug(LD_SCHED, - "Channel " U64_FORMAT " at %p " - "become idle from pending", - U64_PRINTF_ARG(chan->global_identifier), - chan); - } - } - } - - log_debug(LD_SCHED, - "Scheduler flushed %d cells onto pending channel " - U64_FORMAT " at %p", - (int)flushed, U64_PRINTF_ARG(chan->global_identifier), - chan); - } else { - log_info(LD_SCHED, - "Scheduler saw pending channel " U64_FORMAT " at %p with " - "no cells writeable", - U64_PRINTF_ARG(chan->global_identifier), chan); - /* Put it back to WAITING_TO_WRITE */ - chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE; - } - } - - /* Readd any channels we need to */ - if (to_readd) { - SMARTLIST_FOREACH_BEGIN(to_readd, channel_t *, readd_chan) { - readd_chan->scheduler_state = SCHED_CHAN_PENDING; - smartlist_pqueue_add(channels_pending, - scheduler_compare_channels, - STRUCT_OFFSET(channel_t, sched_heap_idx), - readd_chan); - } SMARTLIST_FOREACH_END(readd_chan); - smartlist_free(to_readd); - } - - n_chans_after = smartlist_len(channels_pending); - q_len_after = channel_get_global_queue_estimate(); - q_heur_after = scheduler_get_queue_heuristic(); - log_debug(LD_SCHED, - "Scheduler handled %d of %d pending channels, queue size from " - U64_FORMAT " to " U64_FORMAT ", queue heuristic from " - U64_FORMAT " to " U64_FORMAT, - n_chans_before - n_chans_after, n_chans_before, - U64_PRINTF_ARG(q_len_before), U64_PRINTF_ARG(q_len_after), - U64_PRINTF_ARG(q_heur_before), U64_PRINTF_ARG(q_heur_after)); + if (the_scheduler->on_channel_free) { + the_scheduler->on_channel_free(chan); } + chan->scheduler_state = SCHED_CHAN_IDLE; } -/** Trigger the scheduling event so we run the scheduler later */ - -#if 0 -static void -scheduler_trigger(void) -{ - log_debug(LD_SCHED, "Triggering scheduler event"); - - tor_assert(run_sched_ev); - - event_add(run_sched_ev, EV_TIMEOUT, 1); -} -#endif - /** Mark a channel as ready to accept writes */ void scheduler_channel_wants_writes(channel_t *chan) { - int became_pending = 0; - - tor_assert(chan); - tor_assert(channels_pending); + IF_BUG_ONCE(!chan) { + return; + } + IF_BUG_ONCE(!channels_pending) { + return; + } /* If it's already in waiting_to_write, we can put it in pending */ if (chan->scheduler_state == SCHED_CHAN_WAITING_TO_WRITE) { /* * It can write now, so it goes to channels_pending. */ - smartlist_pqueue_add(channels_pending, - scheduler_compare_channels, - STRUCT_OFFSET(channel_t, sched_heap_idx), - chan); + log_debug(LD_SCHED, "chan=%" PRIu64 " became pending", + chan->global_identifier); + if (!SCHED_BUG(chan->sched_heap_idx != -1, chan)) { + smartlist_pqueue_add(channels_pending, + scheduler_compare_channels, + offsetof(channel_t, sched_heap_idx), + chan); + } chan->scheduler_state = SCHED_CHAN_PENDING; log_debug(LD_SCHED, "Channel " U64_FORMAT " at %p went from waiting_to_write " "to pending", U64_PRINTF_ARG(chan->global_identifier), chan); - became_pending = 1; + /* We just made a channel pending, we have scheduling work to do. */ + the_scheduler->schedule(); } else { /* * It's not in SCHED_CHAN_WAITING_TO_WRITE, so it can't become pending; @@ -571,137 +696,70 @@ scheduler_channel_wants_writes(channel_t *chan) U64_PRINTF_ARG(chan->global_identifier), chan); } } +} + +/* Log warn the given channel and extra scheduler context as well. This is + * used by SCHED_BUG() in order to be able to extract as much information as + * we can when we hit a bug. Channel chan can be NULL. */ +void +scheduler_bug_occurred(const channel_t *chan) +{ + char buf[128]; + + if (chan != NULL) { + const size_t outbuf_len = + buf_datalen(TO_CONN(BASE_CHAN_TO_TLS((channel_t *) chan)->conn)->outbuf); + tor_snprintf(buf, sizeof(buf), + "Channel %" PRIu64 " in state %s and scheduler state %d." + " Num cells on cmux: %d. Connection outbuf len: %lu.", + chan->global_identifier, + channel_state_to_string(chan->state), + chan->scheduler_state, circuitmux_num_cells(chan->cmux), + (unsigned long)outbuf_len); + } - /* - * If we made a channel pending, we potentially have scheduling work - * to do. - */ - if (became_pending) scheduler_retrigger(); + { + char *msg; + /* Rate limit every 60 seconds. If we start seeing this every 60 sec, we + * know something is stuck/wrong. It *should* be loud but not too much. */ + static ratelim_t rlimit = RATELIM_INIT(60); + if ((msg = rate_limit_log(&rlimit, approx_time()))) { + log_warn(LD_BUG, "%s Num pending channels: %d. " + "Channel in pending list: %s.%s", + (chan != NULL) ? buf : "No channel in bug context.", + smartlist_len(channels_pending), + (smartlist_pos(channels_pending, chan) == -1) ? "no" : "yes", + msg); + tor_free(msg); + } + } } -/** - * Notify the scheduler that a channel's position in the pqueue may have - * changed - */ +#ifdef TOR_UNIT_TESTS +/* + * Notify scheduler that a channel's queue position may have changed. + */ void scheduler_touch_channel(channel_t *chan) { - tor_assert(chan); + IF_BUG_ONCE(!chan) { + return; + } if (chan->scheduler_state == SCHED_CHAN_PENDING) { /* Remove and re-add it */ smartlist_pqueue_remove(channels_pending, scheduler_compare_channels, - STRUCT_OFFSET(channel_t, sched_heap_idx), + offsetof(channel_t, sched_heap_idx), chan); smartlist_pqueue_add(channels_pending, scheduler_compare_channels, - STRUCT_OFFSET(channel_t, sched_heap_idx), + offsetof(channel_t, sched_heap_idx), chan); } /* else no-op, since it isn't in the queue */ } -/** - * Notify the scheduler of a queue size adjustment, to recalculate the - * queue heuristic. - */ - -void -scheduler_adjust_queue_size(channel_t *chan, int dir, uint64_t adj) -{ - time_t now = approx_time(); - - log_debug(LD_SCHED, - "Queue size adjustment by %s" U64_FORMAT " for channel " - U64_FORMAT, - (dir >= 0) ? "+" : "-", - U64_PRINTF_ARG(adj), - U64_PRINTF_ARG(chan->global_identifier)); - - /* Get the queue heuristic up to date */ - scheduler_update_queue_heuristic(now); - - /* Adjust as appropriate */ - if (dir >= 0) { - /* Increasing it */ - queue_heuristic += adj; - } else { - /* Decreasing it */ - if (queue_heuristic > adj) queue_heuristic -= adj; - else queue_heuristic = 0; - } - - log_debug(LD_SCHED, - "Queue heuristic is now " U64_FORMAT, - U64_PRINTF_ARG(queue_heuristic)); -} - -/** - * Query the current value of the queue heuristic - */ - -STATIC uint64_t -scheduler_get_queue_heuristic(void) -{ - time_t now = approx_time(); - - scheduler_update_queue_heuristic(now); - - return queue_heuristic; -} - -/** - * Adjust the queue heuristic value to the present time - */ - -STATIC void -scheduler_update_queue_heuristic(time_t now) -{ - time_t diff; - - if (queue_heuristic_timestamp == 0) { - /* - * Nothing we can sensibly do; must not have been initted properly. - * Oh well. - */ - queue_heuristic_timestamp = now; - } else if (queue_heuristic_timestamp < now) { - diff = now - queue_heuristic_timestamp; - /* - * This is a simple exponential age-out; the other proposed alternative - * was a linear age-out using the bandwidth history in rephist.c; I'm - * going with this out of concern that if an adversary can jam the - * scheduler long enough, it would cause the bandwidth to drop to - * zero and render the aging mechanism ineffective thereafter. - */ - if (0 <= diff && diff < 64) queue_heuristic >>= diff; - else queue_heuristic = 0; - - queue_heuristic_timestamp = now; - - log_debug(LD_SCHED, - "Queue heuristic is now " U64_FORMAT, - U64_PRINTF_ARG(queue_heuristic)); - } - /* else no update needed, or time went backward */ -} - -/** - * Set scheduler watermarks and flush size - */ - -void -scheduler_set_watermarks(uint32_t lo, uint32_t hi, uint32_t max_flush) -{ - /* Sanity assertions - caller should ensure these are true */ - tor_assert(lo > 0); - tor_assert(hi > lo); - tor_assert(max_flush > 0); - - sched_q_low_water = lo; - sched_q_high_water = hi; - sched_max_flush_cells = max_flush; -} +#endif /* defined(TOR_UNIT_TESTS) */ |