summaryrefslogtreecommitdiff
path: root/src/feature/stats/rephist.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/feature/stats/rephist.c')
-rw-r--r--src/feature/stats/rephist.c351
1 files changed, 266 insertions, 85 deletions
diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c
index 2bfa14d326..5ff4ef1d2e 100644
--- a/src/feature/stats/rephist.c
+++ b/src/feature/stats/rephist.c
@@ -206,18 +206,33 @@ typedef struct {
uint64_t overload_fd_exhausted;
} overload_stats_t;
+/** Current state of overload stats */
+static overload_stats_t overload_stats;
+
+/** Counters to count the number of times we've reached an overload for the
+ * global connection read/write limit. Reported on the MetricsPort. */
+static uint64_t stats_n_read_limit_reached = 0;
+static uint64_t stats_n_write_limit_reached = 0;
+
+/** Total number of times we've reached TCP port exhaustion. */
+static uint64_t stats_n_tcp_exhaustion = 0;
+
/***** DNS statistics *****/
-/** Represents the statistics of DNS queries seen if it is an Exit. */
+/** Overload DNS statistics. The information in this object is used to assess
+ * if, due to DNS errors, we should emit a general overload signal or not.
+ *
+ * NOTE: This structure is _not_ per DNS query type, unlike the statistics
+ * below, because of a libevent bug
+ * (https://github.com/libevent/libevent/issues/1219): on error, the type is
+ * not propagated back up to the caller, so we need to keep our own stats for
+ * the overload signal. */
typedef struct {
/** Total number of DNS request seen at an Exit. They might not all end
* successfully or might even be lost by tor. This counter is incremented
* right before the DNS request is initiated. */
uint64_t stats_n_request;
- /** Total number of DNS timeout errors. */
- uint64_t stats_n_error_timeout;
-
/** When is the next assessment time of the general overload for DNS errors.
* Once this time is reached, all stats are reset and this time is set to the
* next assessment time. */
@@ -227,121 +242,230 @@ typedef struct {
/** Keep track of the DNS requests for the general overload state. */
static overload_dns_stats_t overload_dns_stats;
-/* We use a scale here so we can represent percentages with decimal points by
- * scaling the value by this factor and so 0.5% becomes a value of 500.
- * Default is 1% and thus min and max range is 0 to 100%. */
-#define OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE 1000.0
-#define OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT 1000
-#define OVERLOAD_DNS_TIMEOUT_PERCENT_MIN 0
-#define OVERLOAD_DNS_TIMEOUT_PERCENT_MAX 100000
-
-/** Consensus parameter: indicate what fraction of DNS timeout errors over the
- * total number of DNS requests must be reached before we trigger a general
- * overload signal .*/
-static double overload_dns_timeout_fraction =
- OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT /
- OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0;
-
-/* Number of seconds for the assessment period. Default is 10 minutes (600) and
- * the min max range is within a 32bit value. */
-#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT (10 * 60)
-#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN 0
-#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX INT32_MAX
-
-/** Consensus parameter: Period, in seconds, over which we count the number of
- * DNS requests and timeout errors. After that period, we assess if we trigger
- * an overload or not. */
-static int32_t overload_dns_timeout_period_secs =
- OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT;
+/** Represents the statistics of DNS queries seen if it is an Exit. */
+typedef struct {
+ /* Number of DNS errors seen, per RFC 1035 response code (0 to 5). */
+ uint64_t stats_n_error_none; /* 0 */
+ uint64_t stats_n_error_format; /* 1 */
+ uint64_t stats_n_error_serverfailed; /* 2 */
+ uint64_t stats_n_error_notexist; /* 3 */
+ uint64_t stats_n_error_notimpl; /* 4 */
+ uint64_t stats_n_error_refused; /* 5 */
+
+ /* Total number of DNS errors specific to libevent. */
+ uint64_t stats_n_error_truncated; /* 65 */
+ uint64_t stats_n_error_unknown; /* 66 */
+ uint64_t stats_n_error_tor_timeout; /* 67 */
+ uint64_t stats_n_error_shutdown; /* 68 */
+ uint64_t stats_n_error_cancel; /* 69 */
+ uint64_t stats_n_error_nodata; /* 70 */
+
+ /* Total number of DNS requests seen at an Exit. They might not all end
+ * successfully or might even be lost by tor. This counter is incremented
+ * right before the DNS request is initiated. */
+ uint64_t stats_n_request;
+} dns_stats_t;
+
+/* This is disabled because of the libevent bug where on error we don't get the
+ * DNS query type back. Once it is fixed, we can re-enable this. */
+#if 0
+/** DNS statistics store for each DNS record type for which tor supports only
+ * three at the moment: A, PTR and AAAA. */
+static dns_stats_t dns_A_stats;
+static dns_stats_t dns_PTR_stats;
+static dns_stats_t dns_AAAA_stats;
+#endif
-/** Current state of overload stats */
-static overload_stats_t overload_stats;
+/** DNS query statistics store. It covers all type of queries. */
+static dns_stats_t dns_all_stats;
-/** Return true if this overload happened within the last `n_hours`. */
-static bool
-overload_happened_recently(time_t overload_time, int n_hours)
+/** Return a pointer to the DNS statistics store. The type is ignored for now
+ * because of a libevent problem. */
+static inline dns_stats_t *
+get_dns_stats_by_type(const int type)
{
- /* An overload is relevant if it happened in the last 72 hours */
- if (overload_time > approx_time() - 3600 * n_hours) {
- return true;
- }
- return false;
+ (void) type;
+ return &dns_all_stats;
}
-/** Assess the DNS timeout errors and if we have enough to trigger a general
- * overload. */
-static void
-overload_general_dns_assessment(void)
+#if 0
+/** From a libevent record type, return a pointer to the corresponding DNS
+ * statistics store. NULL is returned if the type is unhandled. */
+static inline dns_stats_t *
+get_dns_stats_by_type(const int type)
{
- /* Initialize the time. Should be done once. */
- if (overload_dns_stats.next_assessment_time == 0) {
- goto reset;
+ switch (type) {
+ case DNS_IPv4_A:
+ return &dns_A_stats;
+ case DNS_PTR:
+ return &dns_PTR_stats;
+ case DNS_IPv6_AAAA:
+ return &dns_AAAA_stats;
+ default:
+ return NULL;
}
+}
+#endif
- /* Not the time yet. */
- if (overload_dns_stats.next_assessment_time > approx_time()) {
- return;
+/** Return the DNS error count for the given libevent DNS type and error code.
+ * The possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. */
+uint64_t
+rep_hist_get_n_dns_error(int type, uint8_t error)
+{
+ dns_stats_t *dns_stats = get_dns_stats_by_type(type);
+ if (BUG(!dns_stats)) {
+ return 0;
}
- reset:
- /* Reset counters for the next period. */
- overload_dns_stats.stats_n_error_timeout = 0;
- overload_dns_stats.stats_n_request = 0;
- overload_dns_stats.next_assessment_time =
- approx_time() + overload_dns_timeout_period_secs;
+ switch (error) {
+ case DNS_ERR_NONE:
+ return dns_stats->stats_n_error_none;
+ case DNS_ERR_FORMAT:
+ return dns_stats->stats_n_error_format;
+ case DNS_ERR_SERVERFAILED:
+ return dns_stats->stats_n_error_serverfailed;
+ case DNS_ERR_NOTEXIST:
+ return dns_stats->stats_n_error_notexist;
+ case DNS_ERR_NOTIMPL:
+ return dns_stats->stats_n_error_notimpl;
+ case DNS_ERR_REFUSED:
+ return dns_stats->stats_n_error_refused;
+ case DNS_ERR_TRUNCATED:
+ return dns_stats->stats_n_error_truncated;
+ case DNS_ERR_UNKNOWN:
+ return dns_stats->stats_n_error_unknown;
+ case DNS_ERR_TIMEOUT:
+ return dns_stats->stats_n_error_tor_timeout;
+ case DNS_ERR_SHUTDOWN:
+ return dns_stats->stats_n_error_shutdown;
+ case DNS_ERR_CANCEL:
+ return dns_stats->stats_n_error_cancel;
+ case DNS_ERR_NODATA:
+ return dns_stats->stats_n_error_nodata;
+ default:
+ /* Unhandled code sent back by libevent. */
+ return 0;
+ }
}
-/** Called just before the consensus will be replaced. Update the consensus
- * parameters in case they changed. */
-void
-rep_hist_consensus_has_changed(const networkstatus_t *ns)
+/** Return the total number of DNS requests seen for the given libevent DNS
+ * record type. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. */
+uint64_t
+rep_hist_get_n_dns_request(int type)
{
- overload_dns_timeout_fraction =
- networkstatus_get_param(ns, "overload_dns_timeout_scale_percent",
- OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT,
- OVERLOAD_DNS_TIMEOUT_PERCENT_MIN,
- OVERLOAD_DNS_TIMEOUT_PERCENT_MAX) /
- OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0;
-
- overload_dns_timeout_period_secs =
- networkstatus_get_param(ns, "overload_dns_timeout_period_secs",
- OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT,
- OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN,
- OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX);
+ dns_stats_t *dns_stats = get_dns_stats_by_type(type);
+ if (BUG(!dns_stats)) {
+ return 0;
+ }
+ return dns_stats->stats_n_request;
}
/** Note a DNS error for the given given libevent DNS record type and error
* code. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA.
*
- * IMPORTANT: Libevent is _not_ returning the type in case of an error and so
- * if error is anything but DNS_ERR_NONE, the type is not usable and set to 0.
+ * NOTE: Libevent is _not_ returning the type in case of an error and so if
+ * error is anything but DNS_ERR_NONE, the type is not usable and set to 0.
*
* See: https://gitlab.torproject.org/tpo/core/tor/-/issues/40490 */
void
-rep_hist_note_dns_query(int type, uint8_t error)
+rep_hist_note_dns_error(int type, uint8_t error)
{
- (void) type;
+ overload_dns_stats.stats_n_request++;
- /* Assess if we need to trigger a general overload with regards to the DNS
- * errors or not. */
- overload_general_dns_assessment();
+ /* Again, because of the libevent bug (see function comment), for an error
+ * that is anything but DNS_ERR_NONE, the type is always 0. For now,
+ * get_dns_stats_by_type() ignores the type and returns the store covering
+ * all query types, so the error is still counted there. */
+ dns_stats_t *dns_stats = get_dns_stats_by_type(type);
+ /* Unsupported DNS query type. */
+ if (!dns_stats) {
+ return;
+ }
- /* We only care about timeouts for the moment. */
switch (error) {
+ case DNS_ERR_NONE:
+ dns_stats->stats_n_error_none++;
+ break;
+ case DNS_ERR_FORMAT:
+ dns_stats->stats_n_error_format++;
+ break;
+ case DNS_ERR_SERVERFAILED:
+ dns_stats->stats_n_error_serverfailed++;
+ break;
+ case DNS_ERR_NOTEXIST:
+ dns_stats->stats_n_error_notexist++;
+ break;
+ case DNS_ERR_NOTIMPL:
+ dns_stats->stats_n_error_notimpl++;
+ break;
+ case DNS_ERR_REFUSED:
+ dns_stats->stats_n_error_refused++;
+ break;
+ case DNS_ERR_TRUNCATED:
+ dns_stats->stats_n_error_truncated++;
+ break;
+ case DNS_ERR_UNKNOWN:
+ dns_stats->stats_n_error_unknown++;
+ break;
case DNS_ERR_TIMEOUT:
- overload_dns_stats.stats_n_error_timeout++;
+ dns_stats->stats_n_error_tor_timeout++;
+ break;
+ case DNS_ERR_SHUTDOWN:
+ dns_stats->stats_n_error_shutdown++;
+ break;
+ case DNS_ERR_CANCEL:
+ dns_stats->stats_n_error_cancel++;
+ break;
+ case DNS_ERR_NODATA:
+ dns_stats->stats_n_error_nodata++;
break;
default:
+ /* Unhandled code sent back by libevent. */
break;
}
+}
- /* Increment total number of requests. */
- overload_dns_stats.stats_n_request++;
+/** Note a DNS request for the given libevent DNS record type. */
+void
+rep_hist_note_dns_request(int type)
+{
+ dns_stats_t *dns_stats = get_dns_stats_by_type(type);
+ if (BUG(!dns_stats)) {
+ return;
+ }
+ dns_stats->stats_n_request++;
+}
+
+/***** END of DNS statistics *****/
+
+/** Return true if this overload happened within the last `n_hours`. */
+static bool
+overload_happened_recently(time_t overload_time, int n_hours)
+{
+ /* An overload is relevant if it happened within the last n_hours hours. */
+ if (overload_time > approx_time() - 3600 * n_hours) {
+ return true;
+ }
+ return false;
}
/* The current version of the overload stats version */
#define OVERLOAD_STATS_VERSION 1
+/** Return the stats_n_read_limit_reached counter. */
+uint64_t
+rep_hist_get_n_read_limit_reached(void)
+{
+ return stats_n_read_limit_reached;
+}
+
+/** Return the stats_n_write_limit_reached counter. */
+uint64_t
+rep_hist_get_n_write_limit_reached(void)
+{
+ return stats_n_write_limit_reached;
+}
+
/** Returns an allocated string for server descriptor for publising information
* on whether we are overloaded or not. */
char *
@@ -420,6 +544,7 @@ rep_hist_note_overload(overload_type_t overload)
SET_TO_START_OF_HOUR(overload_stats.overload_general_time);
break;
case OVERLOAD_READ: {
+ stats_n_read_limit_reached++;
SET_TO_START_OF_HOUR(overload_stats.overload_ratelimits_time);
if (approx_time() >= last_read_counted + 60) { /* Count once a minute */
overload_stats.overload_read_count++;
@@ -428,6 +553,7 @@ rep_hist_note_overload(overload_type_t overload)
break;
}
case OVERLOAD_WRITE: {
+ stats_n_write_limit_reached++;
SET_TO_START_OF_HOUR(overload_stats.overload_ratelimits_time);
if (approx_time() >= last_write_counted + 60) { /* Count once a minute */
overload_stats.overload_write_count++;
@@ -442,6 +568,22 @@ rep_hist_note_overload(overload_type_t overload)
}
}
+/** Note down that we've reached a TCP port exhaustion. This triggers an
+ * overload general event. */
+void
+rep_hist_note_tcp_exhaustion(void)
+{
+ stats_n_tcp_exhaustion++;
+ rep_hist_note_overload(OVERLOAD_GENERAL);
+}
+
+/** Return the total number of times we've reached TCP port exhaustion. */
+uint64_t
+rep_hist_get_n_tcp_exhaustion(void)
+{
+ return stats_n_tcp_exhaustion;
+}
+
/** Return the or_history_t for the OR with identity digest <b>id</b>,
* creating it if necessary. */
static or_history_t *
@@ -641,7 +783,7 @@ rep_hist_downrate_old_runs(time_t now)
return stability_last_downrated + STABILITY_INTERVAL;
/* Okay, we should downrate the data. By how much? */
- while (stability_last_downrated + STABILITY_INTERVAL < now) {
+ while (stability_last_downrated + STABILITY_INTERVAL <= now) {
stability_last_downrated += STABILITY_INTERVAL;
alpha *= STABILITY_ALPHA;
}
@@ -1908,11 +2050,18 @@ rep_hist_note_desc_served(const char * desc)
/** Internal statistics to track how many requests of each type of
* handshake we've received, and how many we've assigned to cpuworkers.
* Useful for seeing trends in cpu load.
+ *
+ * They are reset at every heartbeat.
* @{ */
STATIC int onion_handshakes_requested[MAX_ONION_HANDSHAKE_TYPE+1] = {0};
STATIC int onion_handshakes_assigned[MAX_ONION_HANDSHAKE_TYPE+1] = {0};
/**@}*/
+/** Counters keeping the same stats as above but for the entire duration of the
+ * process (not reset). */
+static uint64_t stats_n_onionskin_assigned[MAX_ONION_HANDSHAKE_TYPE+1] = {0};
+static uint64_t stats_n_onionskin_dropped[MAX_ONION_HANDSHAKE_TYPE+1] = {0};
+
/** A new onionskin (using the <b>type</b> handshake) has arrived. */
void
rep_hist_note_circuit_handshake_requested(uint16_t type)
@@ -1926,8 +2075,20 @@ rep_hist_note_circuit_handshake_requested(uint16_t type)
void
rep_hist_note_circuit_handshake_assigned(uint16_t type)
{
- if (type <= MAX_ONION_HANDSHAKE_TYPE)
+ if (type <= MAX_ONION_HANDSHAKE_TYPE) {
onion_handshakes_assigned[type]++;
+ stats_n_onionskin_assigned[type]++;
+ }
+}
+
+/** We've just dropped an onionskin (using the <b>type</b> handshake) due to
+ * being overloaded. */
+void
+rep_hist_note_circuit_handshake_dropped(uint16_t type)
+{
+ if (type <= MAX_ONION_HANDSHAKE_TYPE) {
+ stats_n_onionskin_dropped[type]++;
+ }
}
/** Get the circuit handshake value that is requested. */
@@ -1950,6 +2111,26 @@ rep_hist_get_circuit_handshake_assigned, (uint16_t type))
return onion_handshakes_assigned[type];
}
+/** Get the total number of assigned circuit handshakes for <b>type</b>. */
+MOCK_IMPL(uint64_t,
+rep_hist_get_circuit_n_handshake_assigned, (uint16_t type))
+{
+ if (BUG(type > MAX_ONION_HANDSHAKE_TYPE)) {
+ return 0;
+ }
+ return stats_n_onionskin_assigned[type];
+}
+
+/** Get the total number of dropped circuit handshakes for <b>type</b>. */
+MOCK_IMPL(uint64_t,
+rep_hist_get_circuit_n_handshake_dropped, (uint16_t type))
+{
+ if (BUG(type > MAX_ONION_HANDSHAKE_TYPE)) {
+ return 0;
+ }
+ return stats_n_onionskin_dropped[type];
+}
+
/** Log our onionskin statistics since the last time we were called. */
void
rep_hist_log_circuit_handshake_stats(time_t now)