diff options
author | David Goulet <dgoulet@torproject.org> | 2021-10-19 14:13:33 -0400 |
---|---|---|
committer | David Goulet <dgoulet@torproject.org> | 2021-10-20 09:09:25 -0400 |
commit | de907893befdb2bf6bb0b95b183cc1f01b8a464c (patch) | |
tree | 2d69389904193985c5062406b3f27a653eda7ed7 /src/feature/stats/rephist.c | |
parent | af48f5736aec6ae67b52bc26e3cbb74d47dd8db1 (diff) | |
download | tor-de907893befdb2bf6bb0b95b183cc1f01b8a464c.tar.gz tor-de907893befdb2bf6bb0b95b183cc1f01b8a464c.zip |
relay: Overload state on DNS timeout is now X% over Y secs
With this commit, we will only report a general overload state if we've
seen more than X% of DNS timeout errors over Y seconds. Previous
behavior was to report when a single timeout occured which is really too
small of a threshold.
The value X is a consensus parameters called
"overload_dns_timeout_scale_percent" which is a scaled percentage
(factor of 1000) so we can represent decimal points for X like 0.5% for
instance. Its default is 1000 which ends up being 1%.
The value Y is a consensus parameters called
"overload_dns_timeout_period_secs" which is the time period for which
will gather DNS errors and once over, we assess if that X% has been
reached ultimately triggering a general overload signal.
Closes #40491
Signed-off-by: David Goulet <dgoulet@torproject.org>
Diffstat (limited to 'src/feature/stats/rephist.c')
-rw-r--r-- | src/feature/stats/rephist.c | 133 |
1 files changed, 133 insertions, 0 deletions
diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c index e25c01331d..c3a281a8c2 100644 --- a/src/feature/stats/rephist.c +++ b/src/feature/stats/rephist.c @@ -84,6 +84,8 @@ #include "feature/nodelist/networkstatus_st.h" #include "core/or/or_circuit_st.h" +#include <event2/dns.h> + #ifdef HAVE_FCNTL_H #include <fcntl.h> #endif @@ -204,6 +206,54 @@ typedef struct { uint64_t overload_fd_exhausted; } overload_stats_t; +/***** DNS statistics *****/ + +/** Represents the statistics of DNS queries seen if it is an Exit. */ +typedef struct { + /** Total number of DNS request seen at an Exit. They might not all end + * successfully or might even be lost by tor. This counter is incremented + * right before the DNS request is initiated. */ + uint64_t stats_n_request; + + /** Total number of DNS timeout errors. */ + uint64_t stats_n_error_timeout; + + /** When is the next assessment time of the general overload for DNS errors. + * Once this time is reached, all stats are reset and this time is set to the + * next assessment time. */ + time_t next_assessment_time; +} overload_dns_stats_t; + +/** Keep track of the DNS requests for the general overload state. */ +static overload_dns_stats_t overload_dns_stats; + +/* We use a scale here so we can represent percentages with decimal points by + * scaling the value by this factor and so 0.5% becomes a value of 500. + * Default is 1% and thus min and max range is 0 to 100%. */ +#define OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE 1000.0 +#define OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT 1000 +#define OVERLOAD_DNS_TIMEOUT_PERCENT_MIN 0 +#define OVERLOAD_DNS_TIMEOUT_PERCENT_MAX 100000 + +/** Consensus parameter: indicate what fraction of DNS timeout errors over the + * total number of DNS requests must be reached before we trigger a general + * overload signal .*/ +static double overload_dns_timeout_fraction = + OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT / + OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0; + +/* Number of seconds for the assessment period. Default is 10 minutes (600) and + * the min max range is within a 32bit value. */ +#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT (10 * 60) +#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN 0 +#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX INT32_MAX + +/** Consensus parameter: Period, in seconds, over which we count the number of + * DNS requests and timeout errors. After that period, we assess if we trigger + * an overload or not. */ +static int32_t overload_dns_timeout_period_secs = + OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT; + /** Current state of overload stats */ static overload_stats_t overload_stats; @@ -218,6 +268,89 @@ overload_happened_recently(time_t overload_time, int n_hours) return false; } +/** Assess the DNS timeout errors and if we have enough to trigger a general + * overload. */ +static void +overload_general_dns_assessment(void) +{ + /* Initialize the time. Should be done once. */ + if (overload_dns_stats.next_assessment_time == 0) { + goto reset; + } + + /* Not the time yet. */ + if (overload_dns_stats.next_assessment_time > approx_time()) { + return; + } + + /* Lets see if we can signal a general overload. */ + double fraction = (double) overload_dns_stats.stats_n_error_timeout / + (double) overload_dns_stats.stats_n_request; + if (fraction >= overload_dns_timeout_fraction) { + log_notice(LD_HIST, "General overload -> DNS timeouts (%" PRIu64 ") " + "fraction %.4f%% is above threshold of %.4f%%", + overload_dns_stats.stats_n_error_timeout, + fraction * 100.0, + overload_dns_timeout_fraction * 100.0); + rep_hist_note_overload(OVERLOAD_GENERAL); + } + + reset: + /* Reset counters for the next period. */ + overload_dns_stats.stats_n_error_timeout = 0; + overload_dns_stats.stats_n_request = 0; + overload_dns_stats.next_assessment_time = + approx_time() + overload_dns_timeout_period_secs; +} + +/** Called just before the consensus will be replaced. Update the consensus + * parameters in case they changed. */ +void +rep_hist_consensus_has_changed(const networkstatus_t *ns) +{ + overload_dns_timeout_fraction = + networkstatus_get_param(ns, "overload_dns_timeout_scale_percent", + OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT, + OVERLOAD_DNS_TIMEOUT_PERCENT_MIN, + OVERLOAD_DNS_TIMEOUT_PERCENT_MAX) / + OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0; + + overload_dns_timeout_period_secs = + networkstatus_get_param(ns, "overload_dns_timeout_period_secs", + OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT, + OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN, + OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX); +} + +/** Note a DNS error for the given given libevent DNS record type and error + * code. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. + * + * IMPORTANT: Libevent is _not_ returning the type in case of an error and so + * if error is anything but DNS_ERR_NONE, the type is not usable and set to 0. + * + * See: https://gitlab.torproject.org/tpo/core/tor/-/issues/40490 */ +void +rep_hist_note_dns_query(int type, uint8_t error) +{ + (void) type; + + /* Assess if we need to trigger a general overload with regards to the DNS + * errors or not. */ + overload_general_dns_assessment(); + + /* We only care about timeouts for the moment. */ + switch (error) { + case DNS_ERR_TIMEOUT: + overload_dns_stats.stats_n_error_timeout++; + break; + default: + break; + } + + /* Increment total number of requests. */ + overload_dns_stats.stats_n_request++; +} + /* The current version of the overload stats version */ #define OVERLOAD_STATS_VERSION 1 |