summaryrefslogtreecommitdiff
path: root/src/feature/stats/rephist.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/feature/stats/rephist.c')
-rw-r--r--src/feature/stats/rephist.c337
1 files changed, 300 insertions, 37 deletions
diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c
index 2bfa14d326..68c9349bc3 100644
--- a/src/feature/stats/rephist.c
+++ b/src/feature/stats/rephist.c
@@ -206,26 +206,18 @@ typedef struct {
uint64_t overload_fd_exhausted;
} overload_stats_t;
-/***** DNS statistics *****/
+/** Current state of overload stats */
+static overload_stats_t overload_stats;
-/** Represents the statistics of DNS queries seen if it is an Exit. */
-typedef struct {
- /** Total number of DNS request seen at an Exit. They might not all end
- * successfully or might even be lost by tor. This counter is incremented
- * right before the DNS request is initiated. */
- uint64_t stats_n_request;
+/** Counters to count the number of times we've reached an overload for the
+ * global connection read/write limit. Reported on the MetricsPort. */
+static uint64_t stats_n_read_limit_reached = 0;
+static uint64_t stats_n_write_limit_reached = 0;
- /** Total number of DNS timeout errors. */
- uint64_t stats_n_error_timeout;
+/** Total number of times we've reached TCP port exhaustion. */
+static uint64_t stats_n_tcp_exhaustion = 0;
- /** When is the next assessment time of the general overload for DNS errors.
- * Once this time is reached, all stats are reset and this time is set to the
- * next assessment time. */
- time_t next_assessment_time;
-} overload_dns_stats_t;
-
-/** Keep track of the DNS requests for the general overload state. */
-static overload_dns_stats_t overload_dns_stats;
+/***** DNS statistics *****/
/* We use a scale here so we can represent percentages with decimal points by
* scaling the value by this factor and so 0.5% becomes a value of 500.
@@ -254,19 +246,96 @@ static double overload_dns_timeout_fraction =
static int32_t overload_dns_timeout_period_secs =
OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT;
-/** Current state of overload stats */
-static overload_stats_t overload_stats;
+/** Overload DNS statistics. The information in this object is used to assess
+ * if, due to DNS errors, we should emit a general overload signal or not.
+ *
+ * NOTE: This structure is _not_ per DNS query type like the statistics below
+ * because of a libevent bug
+ * (https://github.com/libevent/libevent/issues/1219), on error, the type is
+ * not propagated up back to the user and so we need to keep our own stats for
+ * the overload signal. */
+typedef struct {
+ /** Total number of DNS request seen at an Exit. They might not all end
+ * successfully or might even be lost by tor. This counter is incremented
+ * right before the DNS request is initiated. */
+ uint64_t stats_n_request;
-/** Return true if this overload happened within the last `n_hours`. */
-static bool
-overload_happened_recently(time_t overload_time, int n_hours)
+ /** Total number of DNS timeout errors. */
+ uint64_t stats_n_error_timeout;
+
+ /** When is the next assessment time of the general overload for DNS errors.
+ * Once this time is reached, all stats are reset and this time is set to the
+ * next assessment time. */
+ time_t next_assessment_time;
+} overload_dns_stats_t;
+
+/** Keep track of the DNS requests for the general overload state. */
+static overload_dns_stats_t overload_dns_stats;
+
+/** Represents the statistics of DNS queries seen if it is an Exit. */
+typedef struct {
+ /* Total number of DNS errors found in RFC 1035 (from 0 to 5 code). */
+ uint64_t stats_n_error_none; /* 0 */
+ uint64_t stats_n_error_format; /* 1 */
+ uint64_t stats_n_error_serverfailed; /* 2 */
+ uint64_t stats_n_error_notexist; /* 3 */
+ uint64_t stats_n_error_notimpl; /* 4 */
+ uint64_t stats_n_error_refused; /* 5 */
+
+ /* Total number of DNS errors specific to libevent. */
+ uint64_t stats_n_error_truncated; /* 65 */
+ uint64_t stats_n_error_unknown; /* 66 */
+ uint64_t stats_n_error_timeout; /* 67 */
+ uint64_t stats_n_error_shutdown; /* 68 */
+ uint64_t stats_n_error_cancel; /* 69 */
+ uint64_t stats_n_error_nodata; /* 70 */
+
+ /* Total number of DNS request seen at an Exit. They might not all end
+ * successfully or might even be lost by tor. This counter is incremented
+ * right before the DNS request is initiated. */
+ uint64_t stats_n_request;
+} dns_stats_t;
+
+/* This is disabled because of the libevent bug where on error we don't get the
+ * DNS query type back. Once it is fixed, we can re-enable this. */
+#if 0
+/** DNS statistics store for each DNS record type for which tor supports only
+ * three at the moment: A, PTR and AAAA. */
+static dns_stats_t dns_A_stats;
+static dns_stats_t dns_PTR_stats;
+static dns_stats_t dns_AAAA_stats;
+#endif
+
+/** DNS query statistics store. It covers all type of queries. */
+static dns_stats_t dns_all_stats;
+
+/** Return the point to the DNS statistics store. Ignore the type for now
+ * because of a libevent problem. */
+static inline dns_stats_t *
+get_dns_stats_by_type(const int type)
{
- /* An overload is relevant if it happened in the last 72 hours */
- if (overload_time > approx_time() - 3600 * n_hours) {
- return true;
+ (void) type;
+ return &dns_all_stats;
+}
+
+#if 0
+/** From a libevent record type, return a pointer to the corresponding DNS
+ * statistics store. NULL is returned if the type is unhandled. */
+static inline dns_stats_t *
+get_dns_stats_by_type(const int type)
+{
+ switch (type) {
+ case DNS_IPv4_A:
+ return &dns_A_stats;
+ case DNS_PTR:
+ return &dns_PTR_stats;
+ case DNS_IPv6_AAAA:
+ return &dns_AAAA_stats;
+ default:
+ return NULL;
}
- return false;
}
+#endif
/** Assess the DNS timeout errors and if we have enough to trigger a general
* overload. */
@@ -310,38 +379,175 @@ rep_hist_consensus_has_changed(const networkstatus_t *ns)
OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX);
}
+/** Return the DNS error count for the given libevent DNS type and error code.
+ * The possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. */
+uint64_t
+rep_hist_get_n_dns_error(int type, uint8_t error)
+{
+ dns_stats_t *dns_stats = get_dns_stats_by_type(type);
+ if (BUG(!dns_stats)) {
+ return 0;
+ }
+
+ switch (error) {
+ case DNS_ERR_NONE:
+ return dns_stats->stats_n_error_none;
+ case DNS_ERR_FORMAT:
+ return dns_stats->stats_n_error_format;
+ case DNS_ERR_SERVERFAILED:
+ return dns_stats->stats_n_error_serverfailed;
+ case DNS_ERR_NOTEXIST:
+ return dns_stats->stats_n_error_notexist;
+ case DNS_ERR_NOTIMPL:
+ return dns_stats->stats_n_error_notimpl;
+ case DNS_ERR_REFUSED:
+ return dns_stats->stats_n_error_refused;
+ case DNS_ERR_TRUNCATED:
+ return dns_stats->stats_n_error_truncated;
+ case DNS_ERR_UNKNOWN:
+ return dns_stats->stats_n_error_unknown;
+ case DNS_ERR_TIMEOUT:
+ return dns_stats->stats_n_error_timeout;
+ case DNS_ERR_SHUTDOWN:
+ return dns_stats->stats_n_error_shutdown;
+ case DNS_ERR_CANCEL:
+ return dns_stats->stats_n_error_cancel;
+ case DNS_ERR_NODATA:
+ return dns_stats->stats_n_error_nodata;
+ default:
+ /* Unhandled code sent back by libevent. */
+ return 0;
+ }
+}
+
+/** Return the total number of DNS request seen for the given libevent DNS
+ * record type. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. */
+uint64_t
+rep_hist_get_n_dns_request(int type)
+{
+ dns_stats_t *dns_stats = get_dns_stats_by_type(type);
+ if (BUG(!dns_stats)) {
+ return 0;
+ }
+ return dns_stats->stats_n_request;
+}
+
/** Note a DNS error for the given given libevent DNS record type and error
* code. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA.
*
- * IMPORTANT: Libevent is _not_ returning the type in case of an error and so
- * if error is anything but DNS_ERR_NONE, the type is not usable and set to 0.
+ * NOTE: Libevent is _not_ returning the type in case of an error and so if
+ * error is anything but DNS_ERR_NONE, the type is not usable and set to 0.
*
* See: https://gitlab.torproject.org/tpo/core/tor/-/issues/40490 */
void
-rep_hist_note_dns_query(int type, uint8_t error)
+rep_hist_note_dns_error(int type, uint8_t error)
{
- (void) type;
-
/* Assess if we need to trigger a general overload with regards to the DNS
* errors or not. */
overload_general_dns_assessment();
- /* We only care about timeouts for the moment. */
+ /* Because of the libevent problem (see note in the function comment), we
+ * disregard the DNS query type and keep track of DNS timeout for the
+ * overload state. */
+ if (error == DNS_ERR_TIMEOUT) {
+ overload_dns_stats.stats_n_error_timeout++;
+ }
+ overload_dns_stats.stats_n_request++;
+
+ /* Again, the libevent bug (see function comment), for an error that is
+ * anything but DNS_ERR_NONE, the type is always 0 which means that we don't
+ * have a DNS stat object for it so this code will do nothing until libevent
+ * is fixed. */
+ dns_stats_t *dns_stats = get_dns_stats_by_type(type);
+ /* Unsupported DNS query type. */
+ if (!dns_stats) {
+ return;
+ }
+
switch (error) {
+ case DNS_ERR_NONE:
+ dns_stats->stats_n_error_none++;
+ break;
+ case DNS_ERR_FORMAT:
+ dns_stats->stats_n_error_format++;
+ break;
+ case DNS_ERR_SERVERFAILED:
+ dns_stats->stats_n_error_serverfailed++;
+ break;
+ case DNS_ERR_NOTEXIST:
+ dns_stats->stats_n_error_notexist++;
+ break;
+ case DNS_ERR_NOTIMPL:
+ dns_stats->stats_n_error_notimpl++;
+ break;
+ case DNS_ERR_REFUSED:
+ dns_stats->stats_n_error_refused++;
+ break;
+ case DNS_ERR_TRUNCATED:
+ dns_stats->stats_n_error_truncated++;
+ break;
+ case DNS_ERR_UNKNOWN:
+ dns_stats->stats_n_error_unknown++;
+ break;
case DNS_ERR_TIMEOUT:
- overload_dns_stats.stats_n_error_timeout++;
+ dns_stats->stats_n_error_timeout++;
+ break;
+ case DNS_ERR_SHUTDOWN:
+ dns_stats->stats_n_error_shutdown++;
+ break;
+ case DNS_ERR_CANCEL:
+ dns_stats->stats_n_error_cancel++;
+ break;
+ case DNS_ERR_NODATA:
+ dns_stats->stats_n_error_nodata++;
break;
default:
+ /* Unhandled code sent back by libevent. */
break;
}
+}
- /* Increment total number of requests. */
- overload_dns_stats.stats_n_request++;
+/** Note a DNS request for the given given libevent DNS record type. */
+void
+rep_hist_note_dns_request(int type)
+{
+ dns_stats_t *dns_stats = get_dns_stats_by_type(type);
+ if (BUG(!dns_stats)) {
+ return;
+ }
+ dns_stats->stats_n_request++;
+}
+
+/***** END of DNS statistics *****/
+
+/** Return true if this overload happened within the last `n_hours`. */
+static bool
+overload_happened_recently(time_t overload_time, int n_hours)
+{
+ /* An overload is relevant if it happened in the last 72 hours */
+ if (overload_time > approx_time() - 3600 * n_hours) {
+ return true;
+ }
+ return false;
}
/* The current version of the overload stats version */
#define OVERLOAD_STATS_VERSION 1
+/** Return the stats_n_read_limit_reached counter. */
+uint64_t
+rep_hist_get_n_read_limit_reached(void)
+{
+ return stats_n_read_limit_reached;
+}
+
+/** Return the stats_n_write_limit_reached counter. */
+uint64_t
+rep_hist_get_n_write_limit_reached(void)
+{
+ return stats_n_write_limit_reached;
+}
+
/** Returns an allocated string for server descriptor for publising information
* on whether we are overloaded or not. */
char *
@@ -420,6 +626,7 @@ rep_hist_note_overload(overload_type_t overload)
SET_TO_START_OF_HOUR(overload_stats.overload_general_time);
break;
case OVERLOAD_READ: {
+ stats_n_read_limit_reached++;
SET_TO_START_OF_HOUR(overload_stats.overload_ratelimits_time);
if (approx_time() >= last_read_counted + 60) { /* Count once a minute */
overload_stats.overload_read_count++;
@@ -428,6 +635,7 @@ rep_hist_note_overload(overload_type_t overload)
break;
}
case OVERLOAD_WRITE: {
+ stats_n_write_limit_reached++;
SET_TO_START_OF_HOUR(overload_stats.overload_ratelimits_time);
if (approx_time() >= last_write_counted + 60) { /* Count once a minute */
overload_stats.overload_write_count++;
@@ -442,6 +650,22 @@ rep_hist_note_overload(overload_type_t overload)
}
}
+/** Note down that we've reached a TCP port exhaustion. This triggers an
+ * overload general event. */
+void
+rep_hist_note_tcp_exhaustion(void)
+{
+ stats_n_tcp_exhaustion++;
+ rep_hist_note_overload(OVERLOAD_GENERAL);
+}
+
+/** Return the total number of TCP exhaustion times we've reached. */
+uint64_t
+rep_hist_get_n_tcp_exhaustion(void)
+{
+ return stats_n_tcp_exhaustion;
+}
+
/** Return the or_history_t for the OR with identity digest <b>id</b>,
* creating it if necessary. */
static or_history_t *
@@ -641,7 +865,7 @@ rep_hist_downrate_old_runs(time_t now)
return stability_last_downrated + STABILITY_INTERVAL;
/* Okay, we should downrate the data. By how much? */
- while (stability_last_downrated + STABILITY_INTERVAL < now) {
+ while (stability_last_downrated + STABILITY_INTERVAL <= now) {
stability_last_downrated += STABILITY_INTERVAL;
alpha *= STABILITY_ALPHA;
}
@@ -1908,11 +2132,18 @@ rep_hist_note_desc_served(const char * desc)
/** Internal statistics to track how many requests of each type of
* handshake we've received, and how many we've assigned to cpuworkers.
* Useful for seeing trends in cpu load.
+ *
+ * They are reset at every heartbeat.
* @{ */
STATIC int onion_handshakes_requested[MAX_ONION_HANDSHAKE_TYPE+1] = {0};
STATIC int onion_handshakes_assigned[MAX_ONION_HANDSHAKE_TYPE+1] = {0};
/**@}*/
+/** Counters keeping the same stats as above but for the entire duration of the
+ * process (not reset). */
+static uint64_t stats_n_onionskin_assigned[MAX_ONION_HANDSHAKE_TYPE+1] = {0};
+static uint64_t stats_n_onionskin_dropped[MAX_ONION_HANDSHAKE_TYPE+1] = {0};
+
/** A new onionskin (using the <b>type</b> handshake) has arrived. */
void
rep_hist_note_circuit_handshake_requested(uint16_t type)
@@ -1926,8 +2157,20 @@ rep_hist_note_circuit_handshake_requested(uint16_t type)
void
rep_hist_note_circuit_handshake_assigned(uint16_t type)
{
- if (type <= MAX_ONION_HANDSHAKE_TYPE)
+ if (type <= MAX_ONION_HANDSHAKE_TYPE) {
onion_handshakes_assigned[type]++;
+ stats_n_onionskin_assigned[type]++;
+ }
+}
+
+/** We've just drop an onionskin (using the <b>type</b> handshake) due to being
+ * overloaded. */
+void
+rep_hist_note_circuit_handshake_dropped(uint16_t type)
+{
+ if (type <= MAX_ONION_HANDSHAKE_TYPE) {
+ stats_n_onionskin_dropped[type]++;
+ }
}
/** Get the circuit handshake value that is requested. */
@@ -1950,6 +2193,26 @@ rep_hist_get_circuit_handshake_assigned, (uint16_t type))
return onion_handshakes_assigned[type];
}
+/** Get the total number of circuit handshake value that is assigned. */
+MOCK_IMPL(uint64_t,
+rep_hist_get_circuit_n_handshake_assigned, (uint16_t type))
+{
+ if (BUG(type > MAX_ONION_HANDSHAKE_TYPE)) {
+ return 0;
+ }
+ return stats_n_onionskin_assigned[type];
+}
+
+/** Get the total number of circuit handshake value that is dropped. */
+MOCK_IMPL(uint64_t,
+rep_hist_get_circuit_n_handshake_dropped, (uint16_t type))
+{
+ if (BUG(type > MAX_ONION_HANDSHAKE_TYPE)) {
+ return 0;
+ }
+ return stats_n_onionskin_dropped[type];
+}
+
/** Log our onionskin statistics since the last time we were called. */
void
rep_hist_log_circuit_handshake_stats(time_t now)