From cda7acb35d40c505dc4d2c3b55d611faab189477 Mon Sep 17 00:00:00 2001 From: David Goulet Date: Mon, 13 Dec 2021 10:22:29 -0500 Subject: relay: Don't make DNS timeout trigger an overload Tor has configure libevent to attempt up to 3 times a DNS query for a maximum of 5 seconds each. Once that 5 seconds has elapsed, it consider the query "Timed Out" but tor only gets a timeout if all 3 attempts have failed. For example, using Unbound, it has a much higher threshold of timeout. It is well defined in https://www.nlnetlabs.nl/documentation/unbound/info-timeout/ and has some complexity to it. But the gist is that if it times out, it will be much more than 5 seconds. And so the Tor DNS timeouts are more of a "UX issue" rather than a "network issue". For this reason, we are removing this metric from the overload general signal. See https://gitlab.torproject.org/tpo/network-health/team/-/issues/139 for more information. Fixes #40527 Signed-off-by: David Goulet --- changes/ticket40527 | 5 +++ src/feature/stats/rephist.c | 12 ------- src/test/test_stats.c | 82 ++------------------------------------------- 3 files changed, 8 insertions(+), 91 deletions(-) create mode 100644 changes/ticket40527 diff --git a/changes/ticket40527 b/changes/ticket40527 new file mode 100644 index 0000000000..631b3d4bb9 --- /dev/null +++ b/changes/ticket40527 @@ -0,0 +1,5 @@ + o Major bugfixes (relay, overload): + - Don't make Tor DNS timeout trigger an overload general state. These + timeouts are different from DNS server timeout. They have to be seen as + timeout related to UX and not because of a network problem. Fixes bug + 40527; bugfix on 0.4.6.1-alpha. diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c index c3a281a8c2..2bfa14d326 100644 --- a/src/feature/stats/rephist.c +++ b/src/feature/stats/rephist.c @@ -283,18 +283,6 @@ overload_general_dns_assessment(void) return; } - /* Lets see if we can signal a general overload. */ - double fraction = (double) overload_dns_stats.stats_n_error_timeout / - (double) overload_dns_stats.stats_n_request; - if (fraction >= overload_dns_timeout_fraction) { - log_notice(LD_HIST, "General overload -> DNS timeouts (%" PRIu64 ") " - "fraction %.4f%% is above threshold of %.4f%%", - overload_dns_stats.stats_n_error_timeout, - fraction * 100.0, - overload_dns_timeout_fraction * 100.0); - rep_hist_note_overload(OVERLOAD_GENERAL); - } - reset: /* Reset counters for the next period. */ overload_dns_stats.stats_n_error_timeout = 0; diff --git a/src/test/test_stats.c b/src/test/test_stats.c index 3b9a192c75..af035ae54c 100644 --- a/src/test/test_stats.c +++ b/src/test/test_stats.c @@ -721,7 +721,7 @@ test_overload_stats(void *arg) stats_str = rep_hist_get_overload_stats_lines(); tt_assert(!stats_str); - /* Note a DNS overload */ + /* Note a overload */ rep_hist_note_overload(OVERLOAD_GENERAL); /* Move the time forward one hour */ @@ -742,7 +742,7 @@ test_overload_stats(void *arg) /* Now the time should be 2002-01-07 00:00:00 */ - /* Note a DNS overload */ + /* Note a overload */ rep_hist_note_overload(OVERLOAD_GENERAL); stats_str = rep_hist_get_overload_general_line(); @@ -760,7 +760,7 @@ test_overload_stats(void *arg) tt_str_op("overload-fd-exhausted 1 2002-01-07 00:00:00\n", OP_EQ, stats_str); tor_free(stats_str); - /* Move the time forward. Register DNS overload. See that the time changed */ + /* Move the time forward. Register overload. See that the time changed */ current_time += 3600*2; update_approx_time(current_time); @@ -867,81 +867,6 @@ test_overload_stats(void *arg) tor_free(stats_str); } -/** Test the overload stats logic. */ -static void -test_overload_dns_timeout(void *arg) -{ - char *stats_str = NULL; - (void) arg; - - /* Lets simulate a series of timeouts but below our default 1% threshold. */ - - for (int i = 0; i < 1000; i++) { - /* This should trigger 9 timeouts which is just below 1% (10) */ - if (i > 0 && !(i % 100)) { - rep_hist_note_dns_query(0, DNS_ERR_TIMEOUT); - } else { - rep_hist_note_dns_query(0, DNS_ERR_NONE); - } - } - - /* No overload yet. */ - stats_str = rep_hist_get_overload_general_line(); - tt_assert(!stats_str); - - /* Move it 10 minutes in the future and see if we get a general overload. */ - update_approx_time(approx_time() + (10 * 60)); - - /* This query should NOT trigger the general overload because we are below - * our default of 1%. */ - rep_hist_note_dns_query(0, DNS_ERR_NONE); - stats_str = rep_hist_get_overload_general_line(); - tt_assert(!stats_str); - - /* We'll now go above our 1% threshold. */ - for (int i = 0; i < 1000; i++) { - /* This should trigger 10 timeouts which is our threshold of 1% (10) */ - if (!(i % 10)) { - rep_hist_note_dns_query(0, DNS_ERR_TIMEOUT); - } else { - rep_hist_note_dns_query(0, DNS_ERR_NONE); - } - } - - /* Move it 10 minutes in the future and see if we get a general overload. */ - update_approx_time(approx_time() + (10 * 60)); - - /* This query should trigger the general overload because we are above 1%. */ - rep_hist_note_dns_query(0, DNS_ERR_NONE); - stats_str = rep_hist_get_overload_general_line(); - tt_assert(stats_str); - tor_free(stats_str); - - /* Move 72h in the future, we should NOT get an overload anymore. */ - update_approx_time(approx_time() + (72 * 3600)); - - stats_str = rep_hist_get_overload_general_line(); - tt_assert(!stats_str); - - /* This query should NOT trigger the general overload. */ - rep_hist_note_dns_query(0, DNS_ERR_TIMEOUT); - stats_str = rep_hist_get_overload_general_line(); - tt_assert(!stats_str); - - /* Move it 10 minutes in the future and see if we get a general overload. We - * have now 100% of requests timing out. */ - update_approx_time(approx_time() + (10 * 60)); - - /* This query should trigger the general overload with 50% of timeouts. */ - rep_hist_note_dns_query(0, DNS_ERR_NONE); - stats_str = rep_hist_get_overload_general_line(); - tt_assert(stats_str); - tor_free(stats_str); - - done: - tor_free(stats_str); -} - #define ENT(name) \ { #name, test_ ## name , 0, NULL, NULL } #define FORK(name) \ @@ -958,7 +883,6 @@ struct testcase_t stats_tests[] = { FORK(rephist_v3_onions), FORK(load_stats_file), FORK(overload_stats), - FORK(overload_dns_timeout), END_OF_TESTCASES }; -- cgit v1.2.3-54-g00ecf