diff options
-rw-r--r-- | changes/cbt-bugfixes | 33 | ||||
-rw-r--r-- | doc/spec/control-spec.txt | 7 | ||||
-rw-r--r-- | doc/spec/path-spec.txt | 177 | ||||
-rw-r--r-- | doc/spec/proposals/151-path-selection-improvements.txt | 1 | ||||
-rw-r--r-- | doc/tor.1.txt | 9 | ||||
-rw-r--r-- | src/or/circuitbuild.c | 534 | ||||
-rw-r--r-- | src/or/circuituse.c | 66 | ||||
-rw-r--r-- | src/or/config.c | 6 | ||||
-rw-r--r-- | src/or/control.c | 8 | ||||
-rw-r--r-- | src/or/or.h | 85 | ||||
-rw-r--r-- | src/test/test.c | 54 |
11 files changed, 755 insertions, 225 deletions
diff --git a/changes/cbt-bugfixes b/changes/cbt-bugfixes new file mode 100644 index 0000000000..31241c90e9 --- /dev/null +++ b/changes/cbt-bugfixes @@ -0,0 +1,33 @@ + o Major bugfixes: + - Ignore negative and large timeout values that can happen during a + suspend or hibernate. These values caused various asserts to fire + in the circuit build times code, crashing Tor. Bug 1245, bugfix on + 0.2.2.2-alpha. + - Alter calculation of Pareto distribution parameter 'Xm' for Circuit Build + Timeout learning to use the weighted average of the top N=3 modes. This + should improve the timeout calculation in some cases, and prevent + extremely high timeout values. Bug 1335, bugfix on 0.2.2.2-alpha. + - Alter calculation of Pareto distribution parameter 'Alpha' to use a + right censored distribution model. This allows us to remove the synthetic + timeout generation and instead calculate build timeouts using truncated + times. Bugs 1245+1335, bugfix on 0.2.2.2-alpha. + - Keep circuits open (but do not use them) after the circuit timeout for + up until the time corresponding to the 95th percentile on the Pareto CDF + or 60 seconds, whichever is greater. This is done to provide better data + for the new Pareto model. This percentile can be controlled by the + consensus. + + o Minor bugfixes: + - Eliminate a case where a circuit build time warning was displayed after + network connectivity resumed. Bugfix on 0.2.2.2-alpha. + + o Minor features: + - Add a TIMEOUT_RATE keyword to the BUILDTIMEOUT_SET control port event, + to give information on the current rate of circuit timeouts over our + stored history. + - Add ability to disable circuit build time learning via consensus + parameter and via a LearnCircuitBuildTimeout config option. Also + automatically disable circuit build time calculation if we are either + a AuthoritativeDirectory, or if we fail to write our state file. Bug 1296. + + diff --git a/doc/spec/control-spec.txt b/doc/spec/control-spec.txt index 1eb699c8b6..5a68864b29 100644 --- a/doc/spec/control-spec.txt +++ b/doc/spec/control-spec.txt @@ -1675,13 +1675,18 @@ The syntax is: "650" SP "BUILDTIMEOUT_SET" SP Type SP "TOTAL_TIMES=" Total SP "TIMEOUT_MS=" Timeout SP "XM=" Xm SP "ALPHA=" Alpha SP - "CUTOFF_QUANTILE=" Quantile CRLF + "CUTOFF_QUANTILE=" Quantile SP "TIMEOUT_RATE=" TimeoutRate SP + "CLOSE_MS=" CloseTimeout SP "CLOSE_RATE=" CloseRate + CRLF Type = "COMPUTED" / "RESET" / "SUSPENDED" / "DISCARD" / "RESUME" Total = Integer count of timeouts stored Timeout = Integer timeout in milliseconds Xm = Estimated integer Pareto parameter Xm in milliseconds Alpha = Estimated floating point Paredo paremter alpha Quantile = Floating point CDF quantile cutoff point for this timeout + TimeoutRate = Floating point ratio of circuits that timeout + CloseTimeout = How long to keep measurement circs in milliseconds + CloseRate = Floating point ratio of measurement circuits that are closed A new circuit build timeout time has been set. If Type is "COMPUTED", Tor has computed the value based on historical data. If Type is "RESET", diff --git a/doc/spec/path-spec.txt b/doc/spec/path-spec.txt index b137eca89b..2e4207bd56 100644 --- a/doc/spec/path-spec.txt +++ b/doc/spec/path-spec.txt @@ -180,6 +180,7 @@ of their choices. XXXX + 2.2. Path selection and constraints We choose the path for each new circuit before we build it. We choose the @@ -300,8 +301,182 @@ of their choices. at a given node -- either via the ".exit" notation or because the destination is running at the same location as an exit node. +2.4. Learning when to give up ("timeout") on circuit construction + + Since version 0.2.2.8-alpha, Tor attempts to learn when to give up on + circuits based on network conditions. + +2.4.1 Distribution choice and parameter estimation + + Based on studies of build times, we found that the distribution of + circuit build times appears to be a Frechet distribution. However, + estimators and quantile functions of the Frechet distribution are + difficult to work with and slow to converge. So instead, since we + are only interested in the accuracy of the tail, we approximate + the tail of the distribution with a Pareto curve. + + We calculate the parameters for a Pareto distribution fitting the data + using the estimators in equation 4 from: + http://portal.acm.org/citation.cfm?id=1647962.1648139 + + This is: + + alpha_m = s/(ln(U(X)/Xm^n)) + + where s is the total number of completed circuits we have seen, and + + U(X) = x_max^u * Prod_s{x_i} + + with x_i as our i-th completed circuit time, x_max as the longest + completed circuit build time we have yet observed, u as the + number of unobserved timeouts that have no exact value recorded, + and n as u+s, the total number of circuits that either timeout or + complete. + + Using log laws, we compute this as the sum of logs to avoid + overflow and ln(1.0+epsilon) precision issues: + + alpha_m = s/(u*ln(x_max) + Sum_s{ln(x_i)} - n*ln(Xm)) + + This estimator is closely related to the parameters present in: + http://en.wikipedia.org/wiki/Pareto_distribution#Parameter_estimation + except they are adjusted to handle the fact that our samples are + right-censored at the timeout cutoff. + + Additionally, because this is not a true Pareto distribution, we alter + how Xm is computed. The Xm parameter is computed as the midpoint of the most + frequently occurring 50ms histogram bin, until the point where 1000 + circuits are recorded. After this point, the weighted average of the top + 'cbtnummodes' (default: 3) midpoint modes is used as Xm. All times below + this value are counted as having the midpoint value of this weighted average bin. + + The timeout itself is calculated by using the Pareto Quantile function (the + inverted CDF) to give us the value on the CDF such that 80% of the mass + of the distribution is below the timeout value. + + Thus, we expect that the Tor client will accept the fastest 80% of + the total number of paths on the network. + +2.4.2. How much data to record + + From our observations, the minimum number of circuit build times for a + reasonable fit appears to be on the order of 100. However, to keep a + good fit over the long term, we store 1000 most recent circuit build times + in a circular array. + + The Tor client should build test circuits at a rate of one per + minute up until 100 circuits are built. This allows a fresh Tor to have + a CircuitBuildTimeout estimated within 1.5 hours after install, + upgrade, or network change (see below). + + Timeouts are stored on disk in a histogram of 50ms bin width, the same + width used to calculate the Xm value above. This histogram must be shuffled + after being read from disk, to preserve a proper expiration of old values + after restart. + +2.4.3. How to record timeouts + + Circuits that pass the timeout threshold should be allowed to continue + building until a time corresponding to the point 'cbtclosequantile' + (default 95) on the Pareto curve, or 60 seconds, whichever is greater. + + The actual completion times for these circuits should be recorded. + Implementations should completely abandon a circuit and record a value + as an 'unknown' timeout if the total build time exceeds this threshold. + + The reason for this is that right-censored pareto estimators begin to lose + their accuracy if more than approximately 5% of the values are censored. + Since we wish to set the cutoff at 20%, we must allow circuits to continue + building past this cutoff point up to the 95th percentile. + +2.4.4. Detecting Changing Network Conditions + + We attempt to detect both network connectivity loss and drastic + changes in the timeout characteristics. + + We assume that we've had network connectivity loss if 3 circuits + timeout and we've received no cells or TLS handshakes since those + circuits began. We then temporarily set the timeout to 60 seconds + and stop counting timeouts. + + If 3 more circuits timeout and the network still has not been + live within this new 60 second timeout window, we then discard + the previous timeouts during this period from our history. + + To detect changing network conditions, we keep a history of + the timeout or non-timeout status of the past 20 circuits that + successfully completed at least one hop. If more than 90% of + these circuits timeout, we discard all buildtimes history, reset + the timeout to 60, and then begin recomputing the timeout. + + If the timeout was already 60 or higher, we double the timeout. + +2.4.5. Consensus parameters governing behavior + + Clients that implement circuit build timeout learning should obey the + following consensus parameters that govern behavior, in order to allow + us to handle bugs or other emergent behaviors due to client circuit + construction. If these parameters are not present in the consensus, + the listed default values should be used instead. -2.4. Handling failure + cbtdisabled + Default: 0 + Effect: If non-zero, all CircuitBuildTime learning code should be + disabled and history should be discarded. For use in + emergency situations only. + + cbtnummodes + Default: 3 + Effect: This value governs how many modes to use in the weighted + average calculation of Pareto paramter Xm. A value of 3 introduces + some bias (2-5% of CDF) under ideal conditions, but allows for better + performance in the event that a client chooses guard nodes of radically + different performance characteristics. + + cbtrecentcount + Default: 20 + Effect: This is the number of circuit build times to keep track of + for the following option. + + cbtmaxtimeouts + Default: 18 + Effect: When this many timeouts happen in the last 'cbtrecentcount' + circuit attempts, the client should discard all of its + history and begin learning a fresh timeout value. + + cbtmincircs + Default: 100 + Effect: This is the minimum number of circuits to build before + computing a timeout. + + cbtquantile + Default: 80 + Effect: This is the position on the quantile curve to use to set the + timeout value. It is a percent (0-99). + + cbtclosequantile + Default: 95 + Effect: This is the position on the quantile curve to use to set the + timeout value to use to actually close circuits. It is a percent + (0-99). + + cbttestfreq + Default: 60 + Effect: Describes how often in seconds to build a test circuit to + gather timeout values. Only applies if less than 'cbtmincircs' + have been recorded. + + cbtmintimeout + Default: 2000 + Effect: This is the minimum allowed timeout value in milliseconds. + + cbtinitialtimeout + Default: 60000 + Effect: This is the timeout value to use before computing a timeout, + in milliseconds. + + +2.5. Handling failure If an attempt to extend a circuit fails (either because the first create failed or a subsequent extend failed) then the circuit is torn down and is diff --git a/doc/spec/proposals/151-path-selection-improvements.txt b/doc/spec/proposals/151-path-selection-improvements.txt index 363eebae84..af89f21193 100644 --- a/doc/spec/proposals/151-path-selection-improvements.txt +++ b/doc/spec/proposals/151-path-selection-improvements.txt @@ -3,6 +3,7 @@ Title: Improving Tor Path Selection Author: Fallon Chen, Mike Perry Created: 5-Jul-2008 Status: Finished +In-Spec: path-spec.txt Overview diff --git a/doc/tor.1.txt b/doc/tor.1.txt index 15ecb79eba..72b091bbbb 100644 --- a/doc/tor.1.txt +++ b/doc/tor.1.txt @@ -419,9 +419,16 @@ The following options are useful only for clients (that is, if fingerprint to look up the bridge descriptor at the bridge authority, if it's provided and if UpdateBridgesFromAuthority is set too. +**LearnCircuitBuildTimeout** **0**|**1**:: + If 0, CircuitBuildTimeout adaptive learning is disabled. (Default: 1) + **CircuitBuildTimeout** __NUM__:: + Try for at most NUM seconds when building circuits. If the circuit isn't - open in that time, give up on it. (Default: 1 minute.) + open in that time, give up on it. If LearnCircuitBuildTimeout is 1, this + value serves as the initial value to use before a timeout is learned. If + LearnCircuitBuildTimeout is 0, this value is the only value used. + (Default: 60 seconds.) **CircuitIdleTimeout** __NUM__:: If we have kept a clean (never used) circuit around for NUM seconds, then diff --git a/src/or/circuitbuild.c b/src/or/circuitbuild.c index 417d8ec8d8..1e51ffbe76 100644 --- a/src/or/circuitbuild.c +++ b/src/or/circuitbuild.c @@ -20,6 +20,8 @@ #define MIN(a,b) ((a)<(b)?(a):(b)) #endif +#define CBT_BIN_TO_MS(bin) ((bin)*CBT_BIN_WIDTH + (CBT_BIN_WIDTH/2)) + /********* START VARIABLES **********/ /** Global list of circuit build times */ // FIXME: Add this as a member for entry_guard_t instead of global? @@ -79,6 +81,32 @@ static int onion_append_hop(crypt_path_t **head_ptr, extend_info_t *choice); static void entry_guards_changed(void); +static int +circuit_build_times_disabled(void) +{ + if (unit_tests) { + return 0; + } else { + int consensus_disabled = networkstatus_get_param(NULL, "cbtdisabled", + 0); + int config_disabled = !get_options()->LearnCircuitBuildTimeout; + int dirauth_disabled = get_options()->AuthoritativeDir; + int state_disabled = (get_or_state()->LastWritten == -1); + + if (consensus_disabled || config_disabled || dirauth_disabled || + state_disabled) { + log_info(LD_CIRC, + "CircuitBuildTime learning is disabled. " + "Consensus=%d, Config=%d, AuthDir=%d, StateFile=%d", + consensus_disabled, config_disabled, dirauth_disabled, + state_disabled); + return 1; + } else { + return 0; + } + } +} + static int32_t circuit_build_times_max_timeouts(void) { @@ -88,6 +116,14 @@ circuit_build_times_max_timeouts(void) } static int32_t +circuit_build_times_default_num_xm_modes(void) +{ + int32_t num = networkstatus_get_param(NULL, "cbtnummodes", + CBT_DEFAULT_NUM_XM_MODES); + return num; +} + +static int32_t circuit_build_times_min_circs_to_observe(void) { int32_t num = networkstatus_get_param(NULL, "cbtmincircs", @@ -103,6 +139,15 @@ circuit_build_times_quantile_cutoff(void) return num/100.0; } +static double +circuit_build_times_close_quantile(void) +{ + int32_t num = networkstatus_get_param(NULL, "cbtclosequantile", + CBT_DEFAULT_CLOSE_QUANTILE); + + return num/100.0; +} + static int32_t circuit_build_times_test_frequency(void) { @@ -222,7 +267,6 @@ void circuit_build_times_reset(circuit_build_times_t *cbt) { memset(cbt->circuit_build_times, 0, sizeof(cbt->circuit_build_times)); - cbt->pre_timeouts = 0; cbt->total_build_times = 0; cbt->build_times_idx = 0; cbt->have_computed_timeout = 0; @@ -241,34 +285,18 @@ circuit_build_times_init(circuit_build_times_t *cbt) cbt->liveness.num_recent_circs = circuit_build_times_recent_circuit_count(); cbt->liveness.timeouts_after_firsthop = tor_malloc_zero(sizeof(int8_t)* cbt->liveness.num_recent_circs); - cbt->timeout_ms = circuit_build_times_get_initial_timeout(); + cbt->close_ms = cbt->timeout_ms = circuit_build_times_get_initial_timeout(); control_event_buildtimeout_set(cbt, BUILDTIMEOUT_SET_EVENT_RESET); } /** - * Rewind our timeout history by n timeout positions. + * Rewind our build time history by n positions. */ static void circuit_build_times_rewind_history(circuit_build_times_t *cbt, int n) { int i = 0; - if (cbt->pre_timeouts) { - /* If we have pre-timeouts, it means we're not yet storing - * timeouts in our normal array. Only rewind the counter. */ - if (cbt->pre_timeouts > n) { - cbt->pre_timeouts -= n; - } else { - cbt->pre_timeouts = 0; - } - log_info(LD_CIRC, - "Rewound history by %d places. Current index: %d. Total: %d. " - "Pre-timeouts: %d", n, cbt->build_times_idx, - cbt->total_build_times, cbt->pre_timeouts); - - return; - } - cbt->build_times_idx -= n; cbt->build_times_idx %= CBT_NCIRCUITS_TO_OBSERVE; @@ -289,7 +317,7 @@ circuit_build_times_rewind_history(circuit_build_times_t *cbt, int n) } /** - * Add a new timeout value <b>time</b> to the set of build times. Time + * Add a new build time value <b>time</b> to the set of build times. Time * units are milliseconds. * * circuit_build_times <b>cbt</a> is a circular array, so loop around when @@ -298,14 +326,14 @@ circuit_build_times_rewind_history(circuit_build_times_t *cbt, int n) int circuit_build_times_add_time(circuit_build_times_t *cbt, build_time_t time) { - tor_assert(time <= CBT_BUILD_TIME_MAX); - if (time <= 0) { - log_warn(LD_CIRC, "Circuit build time is %u!", time); + if (time <= 0 || time > CBT_BUILD_TIME_MAX) { + log_warn(LD_BUG, "Circuit build time is too large (%u)." + "This is probably a bug.", time); + tor_fragile_assert(); return -1; } - // XXX: Probably want to demote this to debug for the release. - log_info(LD_CIRC, "Adding circuit build time %u", time); + log_debug(LD_CIRC, "Adding circuit build time %u", time); cbt->circuit_build_times[cbt->build_times_idx] = time; cbt->build_times_idx = (cbt->build_times_idx + 1) % CBT_NCIRCUITS_TO_OBSERVE; @@ -330,7 +358,8 @@ circuit_build_times_max(circuit_build_times_t *cbt) int i = 0; build_time_t max_build_time = 0; for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) { - if (cbt->circuit_build_times[i] > max_build_time) + if (cbt->circuit_build_times[i] > max_build_time + && cbt->circuit_build_times[i] != CBT_BUILD_ABANDONED) max_build_time = cbt->circuit_build_times[i]; } return max_build_time; @@ -377,7 +406,9 @@ circuit_build_times_create_histogram(circuit_build_times_t *cbt, // calculate histogram for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) { - if (cbt->circuit_build_times[i] == 0) continue; /* 0 <-> uninitialized */ + if (cbt->circuit_build_times[i] == 0 + || cbt->circuit_build_times[i] == CBT_BUILD_ABANDONED) + continue; /* 0 <-> uninitialized */ c = (cbt->circuit_build_times[i] / CBT_BIN_WIDTH); histogram[c]++; @@ -387,25 +418,55 @@ circuit_build_times_create_histogram(circuit_build_times_t *cbt, } /** - * Return the most frequent build time (rounded to CBT_BIN_WIDTH ms). + * Return the Pareto start-of-curve parameter Xm. * - * Ties go in favor of the slower time. + * Because we are not a true Pareto curve, we compute this as the + * weighted average of the N=3 most frequent build time bins. */ static build_time_t -circuit_build_times_mode(circuit_build_times_t *cbt) +circuit_build_times_get_xm(circuit_build_times_t *cbt) { - build_time_t i, nbins, max_bin=0; + build_time_t i, nbins; + build_time_t *nth_max_bin; + int32_t bin_counts=0; + build_time_t ret = 0; uint32_t *histogram = circuit_build_times_create_histogram(cbt, &nbins); + int n=0; + int num_modes = circuit_build_times_default_num_xm_modes(); + + // Only use one mode if < 1000 buildtimes. Not enough data + // for multiple. + if (cbt->total_build_times < CBT_NCIRCUITS_TO_OBSERVE) + num_modes = 1; + + nth_max_bin = (build_time_t*)tor_malloc_zero(num_modes*sizeof(build_time_t)); for (i = 0; i < nbins; i++) { - if (histogram[i] >= histogram[max_bin]) { - max_bin = i; + if (histogram[i] >= histogram[nth_max_bin[0]]) { + nth_max_bin[0] = i; } + + for (n = 1; n < num_modes; n++) { + if (histogram[i] >= histogram[nth_max_bin[n]] && + (!histogram[nth_max_bin[n-1]] + || histogram[i] < histogram[nth_max_bin[n-1]])) { + nth_max_bin[n] = i; + } + } + } + + for (n = 0; n < num_modes; n++) { + bin_counts += histogram[nth_max_bin[n]]; + ret += CBT_BIN_TO_MS(nth_max_bin[n])*histogram[nth_max_bin[n]]; + log_info(LD_CIRC, "Xm mode #%d: %u %u", n, CBT_BIN_TO_MS(nth_max_bin[n]), + histogram[nth_max_bin[n]]); } + ret /= bin_counts; tor_free(histogram); + tor_free(nth_max_bin); - return max_bin*CBT_BIN_WIDTH+CBT_BIN_WIDTH/2; + return ret; } /** @@ -428,6 +489,12 @@ circuit_build_times_update_state(circuit_build_times_t *cbt, *next = NULL; state->TotalBuildTimes = cbt->total_build_times; + state->CircuitBuildAbandonedCount = 0; + + for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) { + if (cbt->circuit_build_times[i] == CBT_BUILD_ABANDONED) + state->CircuitBuildAbandonedCount++; + } for (i = 0; i < nbins; i++) { // compress the histogram by skipping the blanks @@ -436,7 +503,7 @@ circuit_build_times_update_state(circuit_build_times_t *cbt, line->key = tor_strdup("CircuitBuildTimeBin"); line->value = tor_malloc(25); tor_snprintf(line->value, 25, "%d %d", - i*CBT_BIN_WIDTH+CBT_BIN_WIDTH/2, histogram[i]); + CBT_BIN_TO_MS(i), histogram[i]); next = &(line->next); } @@ -480,6 +547,43 @@ circuit_build_times_shuffle_and_store_array(circuit_build_times_t *cbt, } /** + * Filter old synthetic timeouts that were created before the + * new right-censored Pareto calculation was deployed. + * + * Once all clients before 0.2.1.13-alpha are gone, this code + * will be unused. + */ +static int +circuit_build_times_filter_timeouts(circuit_build_times_t *cbt) +{ + int num_filtered=0, i=0; + double timeout_rate = 0; + build_time_t max_timeout = 0; + + timeout_rate = circuit_build_times_timeout_rate(cbt); + max_timeout = (build_time_t)cbt->close_ms; + + for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) { + if (cbt->circuit_build_times[i] > max_timeout) { + build_time_t replaced = cbt->circuit_build_times[i]; + num_filtered++; + cbt->circuit_build_times[i] = CBT_BUILD_ABANDONED; + + log_debug(LD_CIRC, "Replaced timeout %d with %d", replaced, + cbt->circuit_build_times[i]); + } + } + + log_info(LD_CIRC, + "We had %d timeouts out of %d build times, " + "and filtered %d above the max of %u", + (int)(cbt->total_build_times*timeout_rate), + cbt->total_build_times, num_filtered, max_timeout); + + return num_filtered; +} + +/** * Load histogram from <b>state</b>, shuffling the resulting array * after we do so. Use this result to estimate parameters and * calculate the timeout. @@ -493,12 +597,18 @@ circuit_build_times_parse_state(circuit_build_times_t *cbt, int tot_values = 0; uint32_t loaded_cnt = 0, N = 0; config_line_t *line; - int i; - build_time_t *loaded_times = tor_malloc(sizeof(build_time_t) - * state->TotalBuildTimes); + unsigned int i; + build_time_t *loaded_times; circuit_build_times_init(cbt); *msg = NULL; + if (circuit_build_times_disabled()) { + return 0; + } + + /* build_time_t 0 means uninitialized */ + loaded_times = tor_malloc_zero(sizeof(build_time_t)*state->TotalBuildTimes); + for (line = state->BuildtimeHistogram; line; line = line->next) { smartlist_t *args = smartlist_create(); smartlist_split_string(args, line->value, " ", @@ -534,7 +644,8 @@ circuit_build_times_parse_state(circuit_build_times_t *cbt, break; } - if (loaded_cnt+count > state->TotalBuildTimes) { + if (loaded_cnt+count+state->CircuitBuildAbandonedCount + > state->TotalBuildTimes) { log_warn(LD_CIRC, "Too many build times in state file. " "Stopping short before %d", @@ -553,10 +664,17 @@ circuit_build_times_parse_state(circuit_build_times_t *cbt, } } + log_info(LD_CIRC, + "Adding %d timeouts.", state->CircuitBuildAbandonedCount); + for (i=0; i < state->CircuitBuildAbandonedCount; i++) { + loaded_times[loaded_cnt++] = CBT_BUILD_ABANDONED; + } + if (loaded_cnt != state->TotalBuildTimes) { log_warn(LD_CIRC, "Corrupt state file? Build times count mismatch. " - "Read %d, file says %d", loaded_cnt, state->TotalBuildTimes); + "Read %d times, but file says %d", loaded_cnt, + state->TotalBuildTimes); } circuit_build_times_shuffle_and_store_array(cbt, loaded_times, loaded_cnt); @@ -572,7 +690,13 @@ circuit_build_times_parse_state(circuit_build_times_t *cbt, tot_values, cbt->total_build_times, N); tor_assert(cbt->total_build_times == tot_values); tor_assert(cbt->total_build_times <= CBT_NCIRCUITS_TO_OBSERVE); + circuit_build_times_set_timeout(cbt); + + if (!state->CircuitBuildAbandonedCount && cbt->total_build_times) { + circuit_build_times_filter_timeouts(cbt); + } + tor_free(loaded_times); return *msg ? -1 : 0; } @@ -591,12 +715,15 @@ circuit_build_times_update_alpha(circuit_build_times_t *cbt) { build_time_t *x=cbt->circuit_build_times; double a = 0; - int n=0,i=0; + int n=0,i=0,abandoned_count=0; + build_time_t max_time=0; /* http://en.wikipedia.org/wiki/Pareto_distribution#Parameter_estimation */ /* We sort of cheat here and make our samples slightly more pareto-like * and less frechet-like. */ - cbt->Xm = circuit_build_times_mode(cbt); + cbt->Xm = circuit_build_times_get_xm(cbt); + + tor_assert(cbt->Xm > 0); for (i=0; i< CBT_NCIRCUITS_TO_OBSERVE; i++) { if (!x[i]) { @@ -605,8 +732,12 @@ circuit_build_times_update_alpha(circuit_build_times_t *cbt) if (x[i] < cbt->Xm) { a += tor_mathlog(cbt->Xm); + } else if (x[i] == CBT_BUILD_ABANDONED) { + abandoned_count++; } else { a += tor_mathlog(x[i]); + if (x[i] > max_time) + max_time = x[i]; } n++; } @@ -617,8 +748,15 @@ circuit_build_times_update_alpha(circuit_build_times_t *cbt) } tor_assert(n==cbt->total_build_times); + tor_assert(max_time > 0); + + a += abandoned_count*tor_mathlog(max_time); + a -= n*tor_mathlog(cbt->Xm); - a = n/a; + // Estimator comes from Eq #4 in: + // "Bayesian estimation based on trimmed samples from Pareto populations" + // by Arturo J. Fernández. We are right-censored only. + a = (n-abandoned_count)/a; cbt->alpha = a; } @@ -698,35 +836,6 @@ circuit_build_times_generate_sample(circuit_build_times_t *cbt, return ret; } -/** Generate points in [cutoff, 1.0) on the CDF. */ -void -circuit_build_times_add_timeout_worker(circuit_build_times_t *cbt, - double quantile_cutoff) -{ - // XXX: This may be failing when the number of samples is small? - // Keep getting values for the largest timeout bucket over and over - // again... Probably because alpha is very very large in that case.. - build_time_t gentime = circuit_build_times_generate_sample(cbt, - quantile_cutoff, CBT_MAX_SYNTHETIC_QUANTILE); - - if (gentime < (build_time_t)tor_lround(cbt->timeout_ms)) { - log_warn(LD_CIRC, - "Generated a synthetic timeout LESS than the current timeout: " - "%ums vs %lfms using Xm: %d a: %lf, q: %lf", - gentime, cbt->timeout_ms, cbt->Xm, cbt->alpha, quantile_cutoff); - } else if (gentime > CBT_BUILD_TIME_MAX) { - log_info(LD_CIRC, - "Generated a synthetic timeout larger than the max: %u", - gentime); - gentime = CBT_BUILD_TIME_MAX; - } else { - log_info(LD_CIRC, "Generated synthetic circuit build time %u for timeout", - gentime); - } - - circuit_build_times_add_time(cbt, gentime); -} - /** * Estimate an initial alpha parameter by solving the quantile * function with a quantile point and a specific timeout value. @@ -749,33 +858,6 @@ circuit_build_times_initial_alpha(circuit_build_times_t *cbt, } /** - * Generate synthetic timeout values for the timeouts - * that have happened before we estimated our parameters. - */ -static void -circuit_build_times_count_pretimeouts(circuit_build_times_t *cbt) -{ - /* Store a timeout as a random position past the current - * cutoff on the pareto curve */ - if (cbt->pre_timeouts) { - double timeout_quantile = 1.0- - ((double)cbt->pre_timeouts)/ - (cbt->pre_timeouts+cbt->total_build_times); - /* Make sure it doesn't exceed the synthetic max */ - timeout_quantile *= CBT_MAX_SYNTHETIC_QUANTILE; - cbt->Xm = circuit_build_times_mode(cbt); - tor_assert(cbt->Xm > 0); - /* Use current timeout to get an estimate on alpha */ - circuit_build_times_initial_alpha(cbt, timeout_quantile, - cbt->timeout_ms); - while (cbt->pre_timeouts-- != 0) { - circuit_build_times_add_timeout_worker(cbt, timeout_quantile); - } - cbt->pre_timeouts = 0; - } -} - -/** * Returns true if we need circuits to be built */ int @@ -825,16 +907,27 @@ circuit_build_times_network_circ_success(circuit_build_times_t *cbt) } /** - * A circuit just timed out. If there has been no recent network activity - * at all, but this circuit was launched back when we thought the network - * was live, increment the number of "nonlive" circuit timeouts. - * - * Also distinguish between whether it failed before the first hop - * and record that in our history for later deciding if the network has - * changed. + * A circuit just timed out. If it failed after the first hop, record it + * in our history for later deciding if the network speed has changed. */ static void circuit_build_times_network_timeout(circuit_build_times_t *cbt, + int did_onehop) +{ + if (did_onehop) { + cbt->liveness.timeouts_after_firsthop[cbt->liveness.after_firsthop_idx]=1; + cbt->liveness.after_firsthop_idx++; + cbt->liveness.after_firsthop_idx %= cbt->liveness.num_recent_circs; + } +} + +/** + * A circuit was just forcibly closed. If there has been no recent network + * activity at all, but this circuit was launched back when we thought the + * network was live, increment the number of "nonlive" circuit timeouts. + */ +static void +circuit_build_times_network_close(circuit_build_times_t *cbt, int did_onehop, time_t start_time) { time_t now = time(NULL); @@ -843,16 +936,21 @@ circuit_build_times_network_timeout(circuit_build_times_t *cbt, * entire existence during a time where we have had no network activity. * * Also double check that it is a valid timeout after we have possibly - * just recently reset cbt->timeout_ms. + * just recently reset cbt->close_ms. + * + * We use close_ms here because timeouts aren't actually counted as timeouts + * until close_ms elapses. */ if (cbt->liveness.network_last_live <= start_time && - start_time <= (now - cbt->timeout_ms/1000.0)) { + start_time <= (now - cbt->close_ms/1000.0)) { + if (did_onehop) { + log_warn(LD_BUG, + "Circuit somehow completed a hop while the network was " + "not live. Network was last live at %ld, but circuit launched " + "at %ld. It's now %ld.", cbt->liveness.network_last_live, + start_time, now); + } cbt->liveness.nonlive_timeouts++; - } else if (did_onehop) { - /* Count a one-hop timeout */ - cbt->liveness.timeouts_after_firsthop[cbt->liveness.after_firsthop_idx]=1; - cbt->liveness.after_firsthop_idx++; - cbt->liveness.after_firsthop_idx %= cbt->liveness.num_recent_circs; } } @@ -888,17 +986,22 @@ circuit_build_times_network_check_live(circuit_build_times_t *cbt) "Temporarily raising timeout to %lds.", (long int)(now - cbt->liveness.network_last_live), tor_lround(circuit_build_times_get_initial_timeout()/1000)); - cbt->timeout_ms = circuit_build_times_get_initial_timeout(); - cbt->liveness.net_suspended = 1; + cbt->liveness.suspended_timeout = cbt->timeout_ms; + cbt->liveness.suspended_close_timeout = cbt->close_ms; + cbt->close_ms = cbt->timeout_ms + = circuit_build_times_get_initial_timeout(); control_event_buildtimeout_set(cbt, BUILDTIMEOUT_SET_EVENT_SUSPENDED); } return 0; - } else if (cbt->liveness.net_suspended) { + } else if (cbt->liveness.suspended_timeout > 0) { log_notice(LD_CIRC, "Network activity has resumed. " "Resuming circuit timeout calculations."); - cbt->liveness.net_suspended = 0; + cbt->timeout_ms = cbt->liveness.suspended_timeout; + cbt->close_ms = cbt->liveness.suspended_close_timeout; + cbt->liveness.suspended_timeout = 0; + cbt->liveness.suspended_close_timeout = 0; control_event_buildtimeout_set(cbt, BUILDTIMEOUT_SET_EVENT_RESUME); } @@ -942,9 +1045,17 @@ circuit_build_times_network_check_changed(circuit_build_times_t *cbt) /* Check to see if this has happened before. If so, double the timeout * to give people on abysmally bad network connections a shot at access */ if (cbt->timeout_ms >= circuit_build_times_get_initial_timeout()) { - cbt->timeout_ms *= 2; + if (cbt->timeout_ms > INT32_MAX/2 || cbt->close_ms > INT32_MAX/2) { + log_warn(LD_CIRC, "Insanely large circuit build timeout value. " + "(timeout = %lfmsec, close = %lfmsec)", + cbt->timeout_ms, cbt->close_ms); + } else { + cbt->timeout_ms *= 2; + cbt->close_ms *= 2; + } } else { - cbt->timeout_ms = circuit_build_times_get_initial_timeout(); + cbt->close_ms = cbt->timeout_ms + = circuit_build_times_get_initial_timeout(); } control_event_buildtimeout_set(cbt, BUILDTIMEOUT_SET_EVENT_RESET); @@ -959,77 +1070,180 @@ circuit_build_times_network_check_changed(circuit_build_times_t *cbt) } /** + * Count the number of timeouts in a set of cbt data. + */ +double +circuit_build_times_timeout_rate(const circuit_build_times_t *cbt) +{ + int i=0,timeouts=0; + for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) { + if (cbt->circuit_build_times[i] >= cbt->timeout_ms) { + timeouts++; + } + } + + if (!cbt->total_build_times) + return 0; + + return ((double)timeouts)/cbt->total_build_times; +} + +/** + * Count the number of closed circuits in a set of cbt data. + */ +double +circuit_build_times_close_rate(const circuit_build_times_t *cbt) +{ + int i=0,closed=0; + for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) { + if (cbt->circuit_build_times[i] == CBT_BUILD_ABANDONED) { + closed++; + } + } + + if (!cbt->total_build_times) + return 0; + + return ((double)closed)/cbt->total_build_times; +} + +/** * Store a timeout as a synthetic value. * * Returns true if the store was successful and we should possibly * update our timeout estimate. */ int -circuit_build_times_add_timeout(circuit_build_times_t *cbt, +circuit_build_times_count_close(circuit_build_times_t *cbt, int did_onehop, time_t start_time) { - circuit_build_times_network_timeout(cbt, did_onehop, start_time); + if (circuit_build_times_disabled()) { + cbt->close_ms = cbt->timeout_ms + = circuit_build_times_get_initial_timeout(); + return 0; + } + + /* Record this force-close to help determine if the network is dead */ + circuit_build_times_network_close(cbt, did_onehop, start_time); /* Only count timeouts if network is live.. */ if (!circuit_build_times_network_check_live(cbt)) { return 0; } - /* If there are a ton of timeouts, we should reduce - * the circuit build timeout */ - if (circuit_build_times_network_check_changed(cbt)) { - return 0; - } + circuit_build_times_add_time(cbt, CBT_BUILD_ABANDONED); + return 1; +} - if (!cbt->have_computed_timeout) { - /* Store a timeout before we have enough data */ - cbt->pre_timeouts++; - log_info(LD_CIRC, - "Not enough circuits yet to calculate a new build timeout." - " Need %d more.", circuit_build_times_min_circs_to_observe() - - cbt->total_build_times); - return 0; +/** + * Update timeout counts to determine if we need to expire + * our build time history due to excessive timeouts. + */ +void +circuit_build_times_count_timeout(circuit_build_times_t *cbt, + int did_onehop) +{ + if (circuit_build_times_disabled()) { + cbt->close_ms = cbt->timeout_ms + = circuit_build_times_get_initial_timeout(); + return; } - circuit_build_times_count_pretimeouts(cbt); - circuit_build_times_add_timeout_worker(cbt, - circuit_build_times_quantile_cutoff()); + circuit_build_times_network_timeout(cbt, did_onehop); - return 1; + /* If there are a ton of timeouts, we should reset + * the circuit build timeout. + */ + circuit_build_times_network_check_changed(cbt); } /** * Estimate a new timeout based on history and set our timeout * variable accordingly. */ -void -circuit_build_times_set_timeout(circuit_build_times_t *cbt) +static int +circuit_build_times_set_timeout_worker(circuit_build_times_t *cbt) { if (cbt->total_build_times < circuit_build_times_min_circs_to_observe()) { - return; + return 0; } - circuit_build_times_count_pretimeouts(cbt); circuit_build_times_update_alpha(cbt); cbt->timeout_ms = circuit_build_times_calculate_timeout(cbt, circuit_build_times_quantile_cutoff()); + cbt->close_ms = circuit_build_times_calculate_timeout(cbt, + circuit_build_times_close_quantile()); + + /* Sometimes really fast guard nodes give us such a steep curve + * that this ends up being not that much greater than timeout_ms. + * Make it be at least 1 min to handle this case. */ + cbt->close_ms = MAX(cbt->close_ms, circuit_build_times_initial_timeout()); + cbt->have_computed_timeout = 1; + return 1; +} + +/** + * Exposed function to compute a new timeout. Dispatches events and + * also filters out extremely high timeout values. + */ +void +circuit_build_times_set_timeout(circuit_build_times_t *cbt) +{ + long prev_timeout = tor_lround(cbt->timeout_ms/1000); + double timeout_rate; + + if (!circuit_build_times_set_timeout_worker(cbt)) + return; if (cbt->timeout_ms < circuit_build_times_min_timeout()) { log_warn(LD_CIRC, "Set buildtimeout to low value %lfms. Setting to %dms", cbt->timeout_ms, circuit_build_times_min_timeout()); cbt->timeout_ms = circuit_build_times_min_timeout(); + if (cbt->close_ms < cbt->timeout_ms) { + /* This shouldn't happen because of MAX() in timeout_worker above, + * but doing it just in case */ + cbt->close_ms = circuit_build_times_initial_timeout(); + } } control_event_buildtimeout_set(cbt, BUILDTIMEOUT_SET_EVENT_COMPUTED); - log_info(LD_CIRC, - "Set circuit build timeout to %lds (%lfms, Xm: %d, a: %lf) " - "based on %d circuit times", tor_lround(cbt->timeout_ms/1000), - cbt->timeout_ms, cbt->Xm, cbt->alpha, cbt->total_build_times); + timeout_rate = circuit_build_times_timeout_rate(cbt); + + if (prev_timeout > tor_lround(cbt->timeout_ms/1000)) { + log_notice(LD_CIRC, + "Based on %d circuit times, it looks like we don't need to " + "wait so long for circuits to finish. We will now assume a " + "circuit is too slow to use after waiting %ld seconds.", + cbt->total_build_times, + tor_lround(cbt->timeout_ms/1000)); + log_info(LD_CIRC, + "Circuit timeout data: %lfms, %lfms, Xm: %d, a: %lf, r: %lf", + cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha, + timeout_rate); + } else if (prev_timeout < tor_lround(cbt->timeout_ms/1000)) { + log_notice(LD_CIRC, + "Based on %d circuit times, it looks like we need to wait " + "longer for circuits to finish. We will now assume a " + "circuit is too slow to use after waiting %ld seconds.", + cbt->total_build_times, + tor_lround(cbt->timeout_ms/1000)); + log_info(LD_CIRC, + "Circuit timeout data: %lfms, %lfms, Xm: %d, a: %lf, r: %lf", + cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha, + timeout_rate); + } else { + log_info(LD_CIRC, + "Set circuit build timeout to %lds (%lfms, %lfms, Xm: %d, a: %lf," + " r: %lf) based on %d circuit times", + tor_lround(cbt->timeout_ms/1000), + cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha, timeout_rate, + cbt->total_build_times); + } } /** Iterate over values of circ_id, starting from conn-\>next_circ_id, @@ -1620,11 +1834,25 @@ circuit_send_next_onion_skin(origin_circuit_t *circ) long timediff; tor_gettimeofday(&end); timediff = tv_mdiff(&circ->_base.highres_created, &end); - if (timediff > INT32_MAX) - timediff = INT32_MAX; - circuit_build_times_add_time(&circ_times, (build_time_t)timediff); - circuit_build_times_network_circ_success(&circ_times); - circuit_build_times_set_timeout(&circ_times); + /* + * If the circuit build time is much greater than we would have cut + * it off at, we probably had a suspend event along this codepath, + * and we should discard the value. + */ + if (timediff < 0 || timediff > 2*circ_times.close_ms+1000) { + log_notice(LD_CIRC, "Strange value for circuit build time: %ldmsec. " + "Assuming clock jump.", timediff); + } else if (!circuit_build_times_disabled()) { + /* Don't count circuit times if the network was not live */ + if (circuit_build_times_network_check_live(&circ_times)) { + circuit_build_times_add_time(&circ_times, (build_time_t)timediff); + circuit_build_times_set_timeout(&circ_times); + } + + if (circ->_base.purpose != CIRCUIT_PURPOSE_C_MEASURE_TIMEOUT) { + circuit_build_times_network_circ_success(&circ_times); + } + } } log_info(LD_CIRC,"circuit built!"); circuit_reset_failure_count(0); @@ -1646,6 +1874,10 @@ circuit_send_next_onion_skin(origin_circuit_t *circ) } circuit_rep_hist_note_result(circ); circuit_has_opened(circ); /* do other actions as necessary */ + + /* We're done with measurement circuits here. Just close them */ + if (circ->_base.purpose == CIRCUIT_PURPOSE_C_MEASURE_TIMEOUT) + circuit_mark_for_close(TO_CIRCUIT(circ), END_CIRC_REASON_FINISHED); return 0; } diff --git a/src/or/circuituse.c b/src/or/circuituse.c index 7e47e60559..c1b39b18e7 100644 --- a/src/or/circuituse.c +++ b/src/or/circuituse.c @@ -270,6 +270,7 @@ circuit_expire_building(time_t now) * decided on a customized one yet */ time_t general_cutoff = now - lround(circ_times.timeout_ms/1000); time_t begindir_cutoff = now - lround(circ_times.timeout_ms/2000); + time_t close_cutoff = now - lround(circ_times.close_ms/1000); time_t introcirc_cutoff = begindir_cutoff; cpath_build_state_t *build_state; @@ -286,8 +287,11 @@ circuit_expire_building(time_t now) cutoff = begindir_cutoff; else if (victim->purpose == CIRCUIT_PURPOSE_C_INTRODUCING) cutoff = introcirc_cutoff; + else if (victim->purpose == CIRCUIT_PURPOSE_C_MEASURE_TIMEOUT) + cutoff = close_cutoff; else cutoff = general_cutoff; + if (victim->timestamp_created > cutoff) continue; /* it's still young, leave it alone */ @@ -350,9 +354,39 @@ circuit_expire_building(time_t now) } else { /* circuit not open, consider recording failure as timeout */ int first_hop_succeeded = TO_ORIGIN_CIRCUIT(victim)->cpath && TO_ORIGIN_CIRCUIT(victim)->cpath->state == CPATH_STATE_OPEN; - if (circuit_build_times_add_timeout(&circ_times, first_hop_succeeded, - victim->timestamp_created)) + + if (TO_ORIGIN_CIRCUIT(victim)->p_streams != NULL) { + log_warn(LD_BUG, "Circuit %d (purpose %d) has timed out, " + "yet has attached streams!", + TO_ORIGIN_CIRCUIT(victim)->global_identifier, + victim->purpose); + tor_fragile_assert(); + continue; + } + + /* circuits are allowed to last longer for measurement. + * Switch their purpose and wait. */ + if (victim->purpose != CIRCUIT_PURPOSE_C_MEASURE_TIMEOUT) { + victim->purpose = CIRCUIT_PURPOSE_C_MEASURE_TIMEOUT; + circuit_build_times_count_timeout(&circ_times, + first_hop_succeeded); + continue; + } + + /* + * If the circuit build time is much greater than we would have cut + * it off at, we probably had a suspend event along this codepath, + * and we should discard the value. + */ + if (now - victim->timestamp_created > 2*circ_times.close_ms/1000+1) { + log_notice(LD_CIRC, + "Extremely large value for circuit build timeout: %lds. " + "Assuming clock jump.", now - victim->timestamp_created); + } else if (circuit_build_times_count_close(&circ_times, + first_hop_succeeded, + victim->timestamp_created)) { circuit_build_times_set_timeout(&circ_times); + } } if (victim->n_conn) @@ -683,14 +717,28 @@ circuit_expire_old_circuits_clientside(time_t now) circ->n_circ_id, (int)(now - circ->timestamp_dirty), circ->purpose); circuit_mark_for_close(circ, END_CIRC_REASON_FINISHED); - } else if (!circ->timestamp_dirty && - circ->state == CIRCUIT_STATE_OPEN && - circ->purpose == CIRCUIT_PURPOSE_C_GENERAL) { + } else if (!circ->timestamp_dirty && circ->state == CIRCUIT_STATE_OPEN) { if (circ->timestamp_created < cutoff) { - log_debug(LD_CIRC, - "Closing circuit that has been unused for %d seconds.", - (int)(now - circ->timestamp_created)); - circuit_mark_for_close(circ, END_CIRC_REASON_FINISHED); + if (circ->purpose == CIRCUIT_PURPOSE_C_GENERAL || + circ->purpose == CIRCUIT_PURPOSE_C_MEASURE_TIMEOUT || + circ->purpose == CIRCUIT_PURPOSE_S_ESTABLISH_INTRO || + circ->purpose == CIRCUIT_PURPOSE_TESTING || + (circ->purpose >= CIRCUIT_PURPOSE_C_INTRODUCING && + circ->purpose <= CIRCUIT_PURPOSE_C_REND_READY_INTRO_ACKED) || + circ->purpose == CIRCUIT_PURPOSE_S_CONNECT_REND) { + log_debug(LD_CIRC, + "Closing circuit that has been unused for %d seconds.", + (int)(now - circ->timestamp_created)); + circuit_mark_for_close(circ, END_CIRC_REASON_FINISHED); + } else if (!TO_ORIGIN_CIRCUIT(circ)->is_ancient) { + log_notice(LD_CIRC, + "Ancient non-dirty circuit %d is still around after " + "%ld seconds. Purpose: %d", + TO_ORIGIN_CIRCUIT(circ)->global_identifier, + now - circ->timestamp_created, + circ->purpose); + TO_ORIGIN_CIRCUIT(circ)->is_ancient = 1; + } } } } diff --git a/src/or/config.c b/src/or/config.c index 400b5cbc7d..771fa03bf7 100644 --- a/src/or/config.c +++ b/src/or/config.c @@ -167,6 +167,7 @@ static config_var_t _option_vars[] = { V(BridgeRecordUsageByCountry, BOOL, "1"), V(BridgeRelay, BOOL, "0"), V(CellStatistics, BOOL, "0"), + V(LearnCircuitBuildTimeout, BOOL, "1"), V(CircuitBuildTimeout, INTERVAL, "0"), V(CircuitIdleTimeout, INTERVAL, "1 hour"), V(CircuitStreamTimeout, INTERVAL, "0"), @@ -425,6 +426,7 @@ static config_var_t _state_vars[] = { V(LastWritten, ISOTIME, NULL), V(TotalBuildTimes, UINT, NULL), + V(CircuitBuildAbandonedCount, UINT, "0"), VAR("CircuitBuildTimeBin", LINELIST_S, BuildtimeHistogram, NULL), VAR("BuildtimeHistogram", LINELIST_V, BuildtimeHistogram, NULL), @@ -4989,7 +4991,6 @@ or_state_save(time_t now) if (accounting_is_enabled(get_options())) accounting_run_housekeeping(now); - global_state->LastWritten = time(NULL); tor_free(global_state->TorVersion); tor_asprintf(&global_state->TorVersion, "Tor %s", get_version()); @@ -5004,10 +5005,13 @@ or_state_save(time_t now) fname = get_datadir_fname("state"); if (write_str_to_file(fname, contents, 0)<0) { log_warn(LD_FS, "Unable to write state to file \"%s\"", fname); + global_state->LastWritten = -1; tor_free(fname); tor_free(contents); return -1; } + + global_state->LastWritten = time(NULL); log_info(LD_GENERAL, "Saved state to \"%s\"", fname); tor_free(fname); tor_free(contents); diff --git a/src/or/control.c b/src/or/control.c index 61aaed4667..ea6ce4c464 100644 --- a/src/or/control.c +++ b/src/or/control.c @@ -3528,10 +3528,14 @@ control_event_buildtimeout_set(const circuit_build_times_t *cbt, send_control_event(EVENT_BUILDTIMEOUT_SET, ALL_FORMATS, "650 BUILDTIMEOUT_SET %s TOTAL_TIMES=%lu " - "TIMEOUT_MS=%lu XM=%lu ALPHA=%lf CUTOFF_QUANTILE=%lf\r\n", + "TIMEOUT_MS=%lu XM=%lu ALPHA=%lf CUTOFF_QUANTILE=%lf " + "TIMEOUT_RATE=%lf CLOSE_MS=%lu CLOSE_RATE=%lf\r\n", type_string, (unsigned long)cbt->total_build_times, (unsigned long)cbt->timeout_ms, - (unsigned long)cbt->Xm, cbt->alpha, qnt); + (unsigned long)cbt->Xm, cbt->alpha, qnt, + circuit_build_times_timeout_rate(cbt), + (unsigned long)cbt->close_ms, + circuit_build_times_close_rate(cbt)); return 0; } diff --git a/src/or/or.h b/src/or/or.h index f922de2d8a..5393b27d4d 100644 --- a/src/or/or.h +++ b/src/or/or.h @@ -158,7 +158,7 @@ #define MAX_DNS_TTL (3*60*60) /** How small can a TTL be before we stop believing it? Provides rudimentary * pinning. */ -#define MIN_DNS_TTL (60) +#define MIN_DNS_TTL 60 /** How often do we rotate onion keys? */ #define MIN_ONION_KEY_LIFETIME (7*24*60*60) @@ -467,23 +467,23 @@ typedef enum { #define CIRCUIT_PURPOSE_C_REND_READY_INTRO_ACKED 11 /** Client-side circuit purpose: at Alice, rendezvous established. */ #define CIRCUIT_PURPOSE_C_REND_JOINED 12 - -#define _CIRCUIT_PURPOSE_C_MAX 12 - +/** This circuit is used for build time measurement only */ +#define CIRCUIT_PURPOSE_C_MEASURE_TIMEOUT 13 +#define _CIRCUIT_PURPOSE_C_MAX 13 /** Hidden-service-side circuit purpose: at Bob, waiting for introductions. */ -#define CIRCUIT_PURPOSE_S_ESTABLISH_INTRO 13 +#define CIRCUIT_PURPOSE_S_ESTABLISH_INTRO 14 /** Hidden-service-side circuit purpose: at Bob, successfully established * intro. */ -#define CIRCUIT_PURPOSE_S_INTRO 14 +#define CIRCUIT_PURPOSE_S_INTRO 15 /** Hidden-service-side circuit purpose: at Bob, connecting to rend point. */ -#define CIRCUIT_PURPOSE_S_CONNECT_REND 15 +#define CIRCUIT_PURPOSE_S_CONNECT_REND 16 /** Hidden-service-side circuit purpose: at Bob, rendezvous established. */ -#define CIRCUIT_PURPOSE_S_REND_JOINED 16 +#define CIRCUIT_PURPOSE_S_REND_JOINED 17 /** A testing circuit; not meant to be used for actual traffic. */ -#define CIRCUIT_PURPOSE_TESTING 17 +#define CIRCUIT_PURPOSE_TESTING 18 /** A controller made this circuit and Tor should not use it. */ -#define CIRCUIT_PURPOSE_CONTROLLER 18 -#define _CIRCUIT_PURPOSE_MAX 18 +#define CIRCUIT_PURPOSE_CONTROLLER 19 +#define _CIRCUIT_PURPOSE_MAX 19 /** A catch-all for unrecognized purposes. Currently we don't expect * to make or see any circuits with this purpose. */ #define CIRCUIT_PURPOSE_UNKNOWN 255 @@ -2157,6 +2157,9 @@ typedef struct origin_circuit_t { * to the specification? */ unsigned int remaining_relay_early_cells : 4; + /** Set if this circuit insanely old and if we already informed the user */ + unsigned int is_ancient : 1; + /** What commands were sent over this circuit that decremented the * RELAY_EARLY counter? This is for debugging task 878. */ uint8_t relay_early_commands[MAX_RELAY_EARLY_CELLS_PER_CIRCUIT]; @@ -2496,9 +2499,12 @@ typedef struct { * connections alive? */ int SocksTimeout; /**< How long do we let a socks connection wait * unattached before we fail it? */ - int CircuitBuildTimeout; /**< If non-zero, cull non-open circuits that - * were born at least this many seconds ago. If - * zero, use the internal adaptive algorithm. */ + int LearnCircuitBuildTimeout; /**< If non-zero, we attempt to learn a value + * for CircuitBuildTimeout based on timeout + * history */ + int CircuitBuildTimeout; /**< Cull non-open circuits that were born at + * least this many seconds ago. Used until + * adaptive algorithm learns a new value. */ int CircuitIdleTimeout; /**< Cull open clean circuits that were born * at least this many seconds ago. */ int CircuitStreamTimeout; /**< If non-zero, detach streams from circuits @@ -2842,7 +2848,8 @@ typedef struct { /** Build time histogram */ config_line_t * BuildtimeHistogram; - uint16_t TotalBuildTimes; + unsigned int TotalBuildTimes; + unsigned int CircuitBuildAbandonedCount; /** What version of Tor wrote this state file? */ char *TorVersion; @@ -3021,16 +3028,20 @@ void entry_guards_free_all(void); * 1000 is approx 2.5 days worth of continual-use circuits. */ #define CBT_NCIRCUITS_TO_OBSERVE 1000 -/** Maximum quantile to use to generate synthetic timeouts. - * We want to stay a bit short of 1.0, because longtail is - * loooooooooooooooooooooooooooooooooooooooooooooooooooong. */ -#define CBT_MAX_SYNTHETIC_QUANTILE 0.985 - /** Width of the histogram bins in milliseconds */ #define CBT_BIN_WIDTH ((build_time_t)50) +/** Number of modes to use in the weighted-avg computation of Xm */ +#define CBT_DEFAULT_NUM_XM_MODES 3 + /** A build_time_t is milliseconds */ typedef uint32_t build_time_t; + +/** + * CBT_BUILD_ABANDONED is our flag value to represent a force-closed + * circuit (Aka a 'right-censored' pareto value). + */ +#define CBT_BUILD_ABANDONED ((build_time_t)(INT32_MAX-1)) #define CBT_BUILD_TIME_MAX ((build_time_t)(INT32_MAX)) /** Save state every 10 circuits */ @@ -3046,7 +3057,7 @@ typedef uint32_t build_time_t; * at which point we switch back to computing the timeout from * our saved history. */ -#define CBT_NETWORK_NONLIVE_TIMEOUT_COUNT (3) +#define CBT_NETWORK_NONLIVE_TIMEOUT_COUNT 3 /** * This tells us when to toss out the last streak of N timeouts. @@ -3059,6 +3070,12 @@ typedef uint32_t build_time_t; /* Circuit build times consensus parameters */ /** + * How long to wait before actually closing circuits that take too long to + * build in terms of CDF quantile. + */ +#define CBT_DEFAULT_CLOSE_QUANTILE 95 + +/** * How many circuits count as recent when considering if the * connection has gone gimpy or changed. */ @@ -3111,8 +3128,12 @@ typedef struct { int num_recent_circs; /** Index into circular array. */ int after_firsthop_idx; - /** The network is not live. Timeout gathering is suspended */ - int net_suspended; + /** Timeout gathering is suspended if non-zero. The old timeout value + * is stored here in that case. */ + double suspended_timeout; + /** Timeout gathering is suspended if non-zero. The old close value + * is stored here in that case. */ + double suspended_close_timeout; } network_liveness_t; /** Structure for circuit build times history */ @@ -3127,17 +3148,17 @@ typedef struct { network_liveness_t liveness; /** Last time we built a circuit. Used to decide to build new test circs */ time_t last_circ_at; - /** Number of timeouts that have happened before estimating pareto - * parameters */ - int pre_timeouts; /** "Minimum" value of our pareto distribution (actually mode) */ build_time_t Xm; /** alpha exponent for pareto dist. */ double alpha; /** Have we computed a timeout? */ int have_computed_timeout; - /** The exact value for that timeout in milliseconds */ + /** The exact value for that timeout in milliseconds. Stored as a double + * to maintain precision from calculations to and from quantile value. */ double timeout_ms; + /** How long we wait before actually closing the circuit. */ + double close_ms; } circuit_build_times_t; extern circuit_build_times_t circ_times; @@ -3145,7 +3166,9 @@ void circuit_build_times_update_state(circuit_build_times_t *cbt, or_state_t *state); int circuit_build_times_parse_state(circuit_build_times_t *cbt, or_state_t *state, char **msg); -int circuit_build_times_add_timeout(circuit_build_times_t *cbt, +void circuit_build_times_count_timeout(circuit_build_times_t *cbt, + int did_onehop); +int circuit_build_times_count_close(circuit_build_times_t *cbt, int did_onehop, time_t start_time); void circuit_build_times_set_timeout(circuit_build_times_t *cbt); int circuit_build_times_add_time(circuit_build_times_t *cbt, @@ -3155,6 +3178,8 @@ int circuit_build_times_needs_circuits_now(circuit_build_times_t *cbt); void circuit_build_times_init(circuit_build_times_t *cbt); void circuit_build_times_new_consensus_params(circuit_build_times_t *cbt, networkstatus_t *ns); +double circuit_build_times_timeout_rate(const circuit_build_times_t *cbt); +double circuit_build_times_close_rate(const circuit_build_times_t *cbt); #ifdef CIRCUIT_PRIVATE double circuit_build_times_calculate_timeout(circuit_build_times_t *cbt, @@ -3835,7 +3860,7 @@ download_status_mark_impossible(download_status_t *dl) /********************************* dirserv.c ***************************/ /** Maximum length of an exit policy summary. */ -#define MAX_EXITPOLICY_SUMMARY_LEN (1000) +#define MAX_EXITPOLICY_SUMMARY_LEN 1000 /** Maximum allowable length of a version line in a networkstatus. */ #define MAX_V_LINE_LEN 128 @@ -4393,7 +4418,7 @@ int nt_service_parse_options(int argc, char **argv, int *should_exit); int nt_service_is_stopping(void); void nt_service_set_state(DWORD state); #else -#define nt_service_is_stopping() (0) +#define nt_service_is_stopping() 0 #endif /********************************* onion.c ***************************/ diff --git a/src/test/test.c b/src/test/test.c index 8d596981ef..fbcfedbc9a 100644 --- a/src/test/test.c +++ b/src/test/test.c @@ -494,14 +494,13 @@ test_circuit_timeout(void) timeout1 = circuit_build_times_calculate_timeout(&estimate, CBT_DEFAULT_QUANTILE_CUTOFF/100.0); circuit_build_times_set_timeout(&estimate); - log_warn(LD_CIRC, "Timeout is %lf, Xm is %d", timeout1, estimate.Xm); - /* XXX: 5% distribution error may not be the right metric */ + log_warn(LD_CIRC, "Timeout1 is %lf, Xm is %d", timeout1, estimate.Xm); } while (fabs(circuit_build_times_cdf(&initial, timeout0) - - circuit_build_times_cdf(&initial, timeout1)) > 0.05 - /* 5% error */ + circuit_build_times_cdf(&initial, timeout1)) > 0.02 + /* 2% error */ && estimate.total_build_times < CBT_NCIRCUITS_TO_OBSERVE); - test_assert(estimate.total_build_times < CBT_NCIRCUITS_TO_OBSERVE); + test_assert(estimate.total_build_times <= CBT_NCIRCUITS_TO_OBSERVE); circuit_build_times_update_state(&estimate, &state); test_assert(circuit_build_times_parse_state(&final, &state, &msg) == 0); @@ -511,8 +510,9 @@ test_circuit_timeout(void) CBT_DEFAULT_QUANTILE_CUTOFF/100.0); circuit_build_times_set_timeout(&final); - log_warn(LD_CIRC, "Timeout is %lf, Xm is %d", timeout2, final.Xm); + log_warn(LD_CIRC, "Timeout2 is %lf, Xm is %d", timeout2, final.Xm); + /* 5% here because some accuracy is lost due to histogram conversion */ test_assert(fabs(circuit_build_times_cdf(&initial, timeout0) - circuit_build_times_cdf(&initial, timeout2)) < 0.05); @@ -520,20 +520,20 @@ test_circuit_timeout(void) int build_times_idx = 0; int total_build_times = 0; - final.timeout_ms = CBT_DEFAULT_TIMEOUT_INITIAL_VALUE; - estimate.timeout_ms = CBT_DEFAULT_TIMEOUT_INITIAL_VALUE; + final.close_ms = final.timeout_ms = CBT_DEFAULT_TIMEOUT_INITIAL_VALUE; + estimate.close_ms = estimate.timeout_ms + = CBT_DEFAULT_TIMEOUT_INITIAL_VALUE; for (i = 0; i < CBT_DEFAULT_RECENT_CIRCUITS*2; i++) { circuit_build_times_network_circ_success(&estimate); circuit_build_times_add_time(&estimate, circuit_build_times_generate_sample(&estimate, 0, CBT_DEFAULT_QUANTILE_CUTOFF/100.0)); - estimate.have_computed_timeout = 1; + circuit_build_times_network_circ_success(&estimate); circuit_build_times_add_time(&final, circuit_build_times_generate_sample(&final, 0, CBT_DEFAULT_QUANTILE_CUTOFF/100.0)); - final.have_computed_timeout = 1; } test_assert(!circuit_build_times_network_check_changed(&estimate)); @@ -549,26 +549,22 @@ test_circuit_timeout(void) test_assert(circuit_build_times_network_check_live(&estimate)); test_assert(circuit_build_times_network_check_live(&final)); - if (circuit_build_times_add_timeout(&estimate, 0, - (time_t)(approx_time()-estimate.timeout_ms/1000.0-1))) - estimate.have_computed_timeout = 1; - if (circuit_build_times_add_timeout(&final, 0, - (time_t)(approx_time()-final.timeout_ms/1000.0-1))) - final.have_computed_timeout = 1; + circuit_build_times_count_close(&estimate, 0, + (time_t)(approx_time()-estimate.close_ms/1000.0-1)); + circuit_build_times_count_close(&final, 0, + (time_t)(approx_time()-final.close_ms/1000.0-1)); } test_assert(!circuit_build_times_network_check_live(&estimate)); test_assert(!circuit_build_times_network_check_live(&final)); for ( ; i < CBT_NETWORK_NONLIVE_DISCARD_COUNT; i++) { - if (circuit_build_times_add_timeout(&estimate, 0, - (time_t)(approx_time()-estimate.timeout_ms/1000.0-1))) - estimate.have_computed_timeout = 1; + circuit_build_times_count_close(&estimate, 0, + (time_t)(approx_time()-estimate.close_ms/1000.0-1)); if (i < CBT_NETWORK_NONLIVE_DISCARD_COUNT-1) { - if (circuit_build_times_add_timeout(&final, 0, - (time_t)(approx_time()-final.timeout_ms/1000.0-1))) - final.have_computed_timeout = 1; + circuit_build_times_count_close(&final, 0, + (time_t)(approx_time()-final.close_ms/1000.0-1)); } } @@ -581,7 +577,10 @@ test_circuit_timeout(void) /* Check rollback index. Should match top of loop. */ test_assert(build_times_idx == estimate.build_times_idx); - test_assert(total_build_times == estimate.total_build_times); + // This can fail if estimate.total_build_times == 1000, because + // in that case, rewind actually causes us to lose timeouts + if (total_build_times != CBT_NCIRCUITS_TO_OBSERVE) + test_assert(total_build_times == estimate.total_build_times); /* Now simulate that the network has become live and we need * a change */ @@ -589,12 +588,10 @@ test_circuit_timeout(void) circuit_build_times_network_is_live(&final); for (i = 0; i < CBT_DEFAULT_MAX_RECENT_TIMEOUT_COUNT; i++) { - if (circuit_build_times_add_timeout(&estimate, 1, approx_time()-1)) - estimate.have_computed_timeout = 1; + circuit_build_times_count_timeout(&estimate, 1); if (i < CBT_DEFAULT_MAX_RECENT_TIMEOUT_COUNT-1) { - if (circuit_build_times_add_timeout(&final, 1, approx_time()-1)) - final.have_computed_timeout = 1; + circuit_build_times_count_timeout(&final, 1); } } @@ -605,8 +602,7 @@ test_circuit_timeout(void) test_assert(circuit_build_times_network_check_live(&estimate)); test_assert(circuit_build_times_network_check_live(&final)); - if (circuit_build_times_add_timeout(&final, 1, approx_time()-1)) - final.have_computed_timeout = 1; + circuit_build_times_count_timeout(&final, 1); } done: |