diff options
Diffstat (limited to 'src/feature/stats/rephist.c')
-rw-r--r-- | src/feature/stats/rephist.c | 337 |
1 files changed, 300 insertions, 37 deletions
diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c index 2bfa14d326..68c9349bc3 100644 --- a/src/feature/stats/rephist.c +++ b/src/feature/stats/rephist.c @@ -206,26 +206,18 @@ typedef struct { uint64_t overload_fd_exhausted; } overload_stats_t; -/***** DNS statistics *****/ +/** Current state of overload stats */ +static overload_stats_t overload_stats; -/** Represents the statistics of DNS queries seen if it is an Exit. */ -typedef struct { - /** Total number of DNS request seen at an Exit. They might not all end - * successfully or might even be lost by tor. This counter is incremented - * right before the DNS request is initiated. */ - uint64_t stats_n_request; +/** Counters to count the number of times we've reached an overload for the + * global connection read/write limit. Reported on the MetricsPort. */ +static uint64_t stats_n_read_limit_reached = 0; +static uint64_t stats_n_write_limit_reached = 0; - /** Total number of DNS timeout errors. */ - uint64_t stats_n_error_timeout; +/** Total number of times we've reached TCP port exhaustion. */ +static uint64_t stats_n_tcp_exhaustion = 0; - /** When is the next assessment time of the general overload for DNS errors. - * Once this time is reached, all stats are reset and this time is set to the - * next assessment time. */ - time_t next_assessment_time; -} overload_dns_stats_t; - -/** Keep track of the DNS requests for the general overload state. */ -static overload_dns_stats_t overload_dns_stats; +/***** DNS statistics *****/ /* We use a scale here so we can represent percentages with decimal points by * scaling the value by this factor and so 0.5% becomes a value of 500. @@ -254,19 +246,96 @@ static double overload_dns_timeout_fraction = static int32_t overload_dns_timeout_period_secs = OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT; -/** Current state of overload stats */ -static overload_stats_t overload_stats; +/** Overload DNS statistics. The information in this object is used to assess + * if, due to DNS errors, we should emit a general overload signal or not. + * + * NOTE: This structure is _not_ per DNS query type like the statistics below + * because of a libevent bug + * (https://github.com/libevent/libevent/issues/1219), on error, the type is + * not propagated up back to the user and so we need to keep our own stats for + * the overload signal. */ +typedef struct { + /** Total number of DNS request seen at an Exit. They might not all end + * successfully or might even be lost by tor. This counter is incremented + * right before the DNS request is initiated. */ + uint64_t stats_n_request; -/** Return true if this overload happened within the last `n_hours`. */ -static bool -overload_happened_recently(time_t overload_time, int n_hours) + /** Total number of DNS timeout errors. */ + uint64_t stats_n_error_timeout; + + /** When is the next assessment time of the general overload for DNS errors. + * Once this time is reached, all stats are reset and this time is set to the + * next assessment time. */ + time_t next_assessment_time; +} overload_dns_stats_t; + +/** Keep track of the DNS requests for the general overload state. */ +static overload_dns_stats_t overload_dns_stats; + +/** Represents the statistics of DNS queries seen if it is an Exit. */ +typedef struct { + /* Total number of DNS errors found in RFC 1035 (from 0 to 5 code). */ + uint64_t stats_n_error_none; /* 0 */ + uint64_t stats_n_error_format; /* 1 */ + uint64_t stats_n_error_serverfailed; /* 2 */ + uint64_t stats_n_error_notexist; /* 3 */ + uint64_t stats_n_error_notimpl; /* 4 */ + uint64_t stats_n_error_refused; /* 5 */ + + /* Total number of DNS errors specific to libevent. */ + uint64_t stats_n_error_truncated; /* 65 */ + uint64_t stats_n_error_unknown; /* 66 */ + uint64_t stats_n_error_timeout; /* 67 */ + uint64_t stats_n_error_shutdown; /* 68 */ + uint64_t stats_n_error_cancel; /* 69 */ + uint64_t stats_n_error_nodata; /* 70 */ + + /* Total number of DNS request seen at an Exit. They might not all end + * successfully or might even be lost by tor. This counter is incremented + * right before the DNS request is initiated. */ + uint64_t stats_n_request; +} dns_stats_t; + +/* This is disabled because of the libevent bug where on error we don't get the + * DNS query type back. Once it is fixed, we can re-enable this. */ +#if 0 +/** DNS statistics store for each DNS record type for which tor supports only + * three at the moment: A, PTR and AAAA. */ +static dns_stats_t dns_A_stats; +static dns_stats_t dns_PTR_stats; +static dns_stats_t dns_AAAA_stats; +#endif + +/** DNS query statistics store. It covers all type of queries. */ +static dns_stats_t dns_all_stats; + +/** Return the point to the DNS statistics store. Ignore the type for now + * because of a libevent problem. */ +static inline dns_stats_t * +get_dns_stats_by_type(const int type) { - /* An overload is relevant if it happened in the last 72 hours */ - if (overload_time > approx_time() - 3600 * n_hours) { - return true; + (void) type; + return &dns_all_stats; +} + +#if 0 +/** From a libevent record type, return a pointer to the corresponding DNS + * statistics store. NULL is returned if the type is unhandled. */ +static inline dns_stats_t * +get_dns_stats_by_type(const int type) +{ + switch (type) { + case DNS_IPv4_A: + return &dns_A_stats; + case DNS_PTR: + return &dns_PTR_stats; + case DNS_IPv6_AAAA: + return &dns_AAAA_stats; + default: + return NULL; } - return false; } +#endif /** Assess the DNS timeout errors and if we have enough to trigger a general * overload. */ @@ -310,38 +379,175 @@ rep_hist_consensus_has_changed(const networkstatus_t *ns) OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX); } +/** Return the DNS error count for the given libevent DNS type and error code. + * The possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. */ +uint64_t +rep_hist_get_n_dns_error(int type, uint8_t error) +{ + dns_stats_t *dns_stats = get_dns_stats_by_type(type); + if (BUG(!dns_stats)) { + return 0; + } + + switch (error) { + case DNS_ERR_NONE: + return dns_stats->stats_n_error_none; + case DNS_ERR_FORMAT: + return dns_stats->stats_n_error_format; + case DNS_ERR_SERVERFAILED: + return dns_stats->stats_n_error_serverfailed; + case DNS_ERR_NOTEXIST: + return dns_stats->stats_n_error_notexist; + case DNS_ERR_NOTIMPL: + return dns_stats->stats_n_error_notimpl; + case DNS_ERR_REFUSED: + return dns_stats->stats_n_error_refused; + case DNS_ERR_TRUNCATED: + return dns_stats->stats_n_error_truncated; + case DNS_ERR_UNKNOWN: + return dns_stats->stats_n_error_unknown; + case DNS_ERR_TIMEOUT: + return dns_stats->stats_n_error_timeout; + case DNS_ERR_SHUTDOWN: + return dns_stats->stats_n_error_shutdown; + case DNS_ERR_CANCEL: + return dns_stats->stats_n_error_cancel; + case DNS_ERR_NODATA: + return dns_stats->stats_n_error_nodata; + default: + /* Unhandled code sent back by libevent. */ + return 0; + } +} + +/** Return the total number of DNS request seen for the given libevent DNS + * record type. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. */ +uint64_t +rep_hist_get_n_dns_request(int type) +{ + dns_stats_t *dns_stats = get_dns_stats_by_type(type); + if (BUG(!dns_stats)) { + return 0; + } + return dns_stats->stats_n_request; +} + /** Note a DNS error for the given given libevent DNS record type and error * code. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. * - * IMPORTANT: Libevent is _not_ returning the type in case of an error and so - * if error is anything but DNS_ERR_NONE, the type is not usable and set to 0. + * NOTE: Libevent is _not_ returning the type in case of an error and so if + * error is anything but DNS_ERR_NONE, the type is not usable and set to 0. * * See: https://gitlab.torproject.org/tpo/core/tor/-/issues/40490 */ void -rep_hist_note_dns_query(int type, uint8_t error) +rep_hist_note_dns_error(int type, uint8_t error) { - (void) type; - /* Assess if we need to trigger a general overload with regards to the DNS * errors or not. */ overload_general_dns_assessment(); - /* We only care about timeouts for the moment. */ + /* Because of the libevent problem (see note in the function comment), we + * disregard the DNS query type and keep track of DNS timeout for the + * overload state. */ + if (error == DNS_ERR_TIMEOUT) { + overload_dns_stats.stats_n_error_timeout++; + } + overload_dns_stats.stats_n_request++; + + /* Again, the libevent bug (see function comment), for an error that is + * anything but DNS_ERR_NONE, the type is always 0 which means that we don't + * have a DNS stat object for it so this code will do nothing until libevent + * is fixed. */ + dns_stats_t *dns_stats = get_dns_stats_by_type(type); + /* Unsupported DNS query type. */ + if (!dns_stats) { + return; + } + switch (error) { + case DNS_ERR_NONE: + dns_stats->stats_n_error_none++; + break; + case DNS_ERR_FORMAT: + dns_stats->stats_n_error_format++; + break; + case DNS_ERR_SERVERFAILED: + dns_stats->stats_n_error_serverfailed++; + break; + case DNS_ERR_NOTEXIST: + dns_stats->stats_n_error_notexist++; + break; + case DNS_ERR_NOTIMPL: + dns_stats->stats_n_error_notimpl++; + break; + case DNS_ERR_REFUSED: + dns_stats->stats_n_error_refused++; + break; + case DNS_ERR_TRUNCATED: + dns_stats->stats_n_error_truncated++; + break; + case DNS_ERR_UNKNOWN: + dns_stats->stats_n_error_unknown++; + break; case DNS_ERR_TIMEOUT: - overload_dns_stats.stats_n_error_timeout++; + dns_stats->stats_n_error_timeout++; + break; + case DNS_ERR_SHUTDOWN: + dns_stats->stats_n_error_shutdown++; + break; + case DNS_ERR_CANCEL: + dns_stats->stats_n_error_cancel++; + break; + case DNS_ERR_NODATA: + dns_stats->stats_n_error_nodata++; break; default: + /* Unhandled code sent back by libevent. */ break; } +} - /* Increment total number of requests. */ - overload_dns_stats.stats_n_request++; +/** Note a DNS request for the given given libevent DNS record type. */ +void +rep_hist_note_dns_request(int type) +{ + dns_stats_t *dns_stats = get_dns_stats_by_type(type); + if (BUG(!dns_stats)) { + return; + } + dns_stats->stats_n_request++; +} + +/***** END of DNS statistics *****/ + +/** Return true if this overload happened within the last `n_hours`. */ +static bool +overload_happened_recently(time_t overload_time, int n_hours) +{ + /* An overload is relevant if it happened in the last 72 hours */ + if (overload_time > approx_time() - 3600 * n_hours) { + return true; + } + return false; } /* The current version of the overload stats version */ #define OVERLOAD_STATS_VERSION 1 +/** Return the stats_n_read_limit_reached counter. */ +uint64_t +rep_hist_get_n_read_limit_reached(void) +{ + return stats_n_read_limit_reached; +} + +/** Return the stats_n_write_limit_reached counter. */ +uint64_t +rep_hist_get_n_write_limit_reached(void) +{ + return stats_n_write_limit_reached; +} + /** Returns an allocated string for server descriptor for publising information * on whether we are overloaded or not. */ char * @@ -420,6 +626,7 @@ rep_hist_note_overload(overload_type_t overload) SET_TO_START_OF_HOUR(overload_stats.overload_general_time); break; case OVERLOAD_READ: { + stats_n_read_limit_reached++; SET_TO_START_OF_HOUR(overload_stats.overload_ratelimits_time); if (approx_time() >= last_read_counted + 60) { /* Count once a minute */ overload_stats.overload_read_count++; @@ -428,6 +635,7 @@ rep_hist_note_overload(overload_type_t overload) break; } case OVERLOAD_WRITE: { + stats_n_write_limit_reached++; SET_TO_START_OF_HOUR(overload_stats.overload_ratelimits_time); if (approx_time() >= last_write_counted + 60) { /* Count once a minute */ overload_stats.overload_write_count++; @@ -442,6 +650,22 @@ rep_hist_note_overload(overload_type_t overload) } } +/** Note down that we've reached a TCP port exhaustion. This triggers an + * overload general event. */ +void +rep_hist_note_tcp_exhaustion(void) +{ + stats_n_tcp_exhaustion++; + rep_hist_note_overload(OVERLOAD_GENERAL); +} + +/** Return the total number of TCP exhaustion times we've reached. */ +uint64_t +rep_hist_get_n_tcp_exhaustion(void) +{ + return stats_n_tcp_exhaustion; +} + /** Return the or_history_t for the OR with identity digest <b>id</b>, * creating it if necessary. */ static or_history_t * @@ -641,7 +865,7 @@ rep_hist_downrate_old_runs(time_t now) return stability_last_downrated + STABILITY_INTERVAL; /* Okay, we should downrate the data. By how much? */ - while (stability_last_downrated + STABILITY_INTERVAL < now) { + while (stability_last_downrated + STABILITY_INTERVAL <= now) { stability_last_downrated += STABILITY_INTERVAL; alpha *= STABILITY_ALPHA; } @@ -1908,11 +2132,18 @@ rep_hist_note_desc_served(const char * desc) /** Internal statistics to track how many requests of each type of * handshake we've received, and how many we've assigned to cpuworkers. * Useful for seeing trends in cpu load. + * + * They are reset at every heartbeat. * @{ */ STATIC int onion_handshakes_requested[MAX_ONION_HANDSHAKE_TYPE+1] = {0}; STATIC int onion_handshakes_assigned[MAX_ONION_HANDSHAKE_TYPE+1] = {0}; /**@}*/ +/** Counters keeping the same stats as above but for the entire duration of the + * process (not reset). */ +static uint64_t stats_n_onionskin_assigned[MAX_ONION_HANDSHAKE_TYPE+1] = {0}; +static uint64_t stats_n_onionskin_dropped[MAX_ONION_HANDSHAKE_TYPE+1] = {0}; + /** A new onionskin (using the <b>type</b> handshake) has arrived. */ void rep_hist_note_circuit_handshake_requested(uint16_t type) @@ -1926,8 +2157,20 @@ rep_hist_note_circuit_handshake_requested(uint16_t type) void rep_hist_note_circuit_handshake_assigned(uint16_t type) { - if (type <= MAX_ONION_HANDSHAKE_TYPE) + if (type <= MAX_ONION_HANDSHAKE_TYPE) { onion_handshakes_assigned[type]++; + stats_n_onionskin_assigned[type]++; + } +} + +/** We've just drop an onionskin (using the <b>type</b> handshake) due to being + * overloaded. */ +void +rep_hist_note_circuit_handshake_dropped(uint16_t type) +{ + if (type <= MAX_ONION_HANDSHAKE_TYPE) { + stats_n_onionskin_dropped[type]++; + } } /** Get the circuit handshake value that is requested. */ @@ -1950,6 +2193,26 @@ rep_hist_get_circuit_handshake_assigned, (uint16_t type)) return onion_handshakes_assigned[type]; } +/** Get the total number of circuit handshake value that is assigned. */ +MOCK_IMPL(uint64_t, +rep_hist_get_circuit_n_handshake_assigned, (uint16_t type)) +{ + if (BUG(type > MAX_ONION_HANDSHAKE_TYPE)) { + return 0; + } + return stats_n_onionskin_assigned[type]; +} + +/** Get the total number of circuit handshake value that is dropped. */ +MOCK_IMPL(uint64_t, +rep_hist_get_circuit_n_handshake_dropped, (uint16_t type)) +{ + if (BUG(type > MAX_ONION_HANDSHAKE_TYPE)) { + return 0; + } + return stats_n_onionskin_dropped[type]; +} + /** Log our onionskin statistics since the last time we were called. */ void rep_hist_log_circuit_handshake_stats(time_t now) |