aboutsummaryrefslogtreecommitdiff
path: root/src/feature/stats/rephist.c
diff options
context:
space:
mode:
authorDavid Goulet <dgoulet@torproject.org>2021-10-20 09:59:04 -0400
committerDavid Goulet <dgoulet@torproject.org>2021-10-20 10:00:03 -0400
commit7a8108ea87320da3008e65747baa43c1bbcf13c2 (patch)
tree380e171d32b66081529847027a2eb4415061df10 /src/feature/stats/rephist.c
parent3752a71ded247cd6b32e0ee6c3dfaf906a63a915 (diff)
downloadtor-7a8108ea87320da3008e65747baa43c1bbcf13c2.tar.gz
tor-7a8108ea87320da3008e65747baa43c1bbcf13c2.zip
relay: Overload state on DNS timeout is now X% over Y secs
With this commit, we will only report a general overload state if we've seen more than X% of DNS timeout errors over Y seconds. Previous behavior was to report when a single timeout occured which is really too small of a threshold. The value X is a consensus parameters called "overload_dns_timeout_scale_percent" which is a scaled percentage (factor of 1000) so we can represent decimal points for X like 0.5% for instance. Its default is 1000 which ends up being 1%. The value Y is a consensus parameters called "overload_dns_timeout_period_secs" which is the time period for which will gather DNS errors and once over, we assess if that X% has been reached ultimately triggering a general overload signal. Closes #40491 Signed-off-by: David Goulet <dgoulet@torproject.org>
Diffstat (limited to 'src/feature/stats/rephist.c')
-rw-r--r--src/feature/stats/rephist.c130
1 files changed, 129 insertions, 1 deletions
diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c
index cb3ccdc91e..aef12e5628 100644
--- a/src/feature/stats/rephist.c
+++ b/src/feature/stats/rephist.c
@@ -219,6 +219,59 @@ static uint64_t stats_n_tcp_exhaustion = 0;
/***** DNS statistics *****/
+/* We use a scale here so we can represent percentages with decimal points by
+ * scaling the value by this factor and so 0.5% becomes a value of 500.
+ * Default is 1% and thus min and max range is 0 to 100%. */
+#define OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE 1000.0
+#define OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT 1000
+#define OVERLOAD_DNS_TIMEOUT_PERCENT_MIN 0
+#define OVERLOAD_DNS_TIMEOUT_PERCENT_MAX 100000
+
+/** Consensus parameter: indicate what fraction of DNS timeout errors over the
+ * total number of DNS requests must be reached before we trigger a general
+ * overload signal .*/
+static double overload_dns_timeout_fraction =
+ OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT /
+ OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0;
+
+/* Number of seconds for the assessment period. Default is 10 minutes (600) and
+ * the min max range is within a 32bit value. */
+#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT (10 * 60)
+#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN 0
+#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX INT32_MAX
+
+/** Consensus parameter: Period, in seconds, over which we count the number of
+ * DNS requests and timeout errors. After that period, we assess if we trigger
+ * an overload or not. */
+static int32_t overload_dns_timeout_period_secs =
+ OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT;
+
+/** Overload DNS statistics. The information in this object is used to assess
+ * if, due to DNS errors, we should emit a general overload signal or not.
+ *
+ * NOTE: This structure is _not_ per DNS query type like the statistics below
+ * because of a libevent bug
+ * (https://github.com/libevent/libevent/issues/1219), on error, the type is
+ * not propagated up back to the user and so we need to keep our own stats for
+ * the overload signal. */
+typedef struct {
+ /** Total number of DNS request seen at an Exit. They might not all end
+ * successfully or might even be lost by tor. This counter is incremented
+ * right before the DNS request is initiated. */
+ uint64_t stats_n_request;
+
+ /** Total number of DNS timeout errors. */
+ uint64_t stats_n_error_timeout;
+
+ /** When is the next assessment time of the general overload for DNS errors.
+ * Once this time is reached, all stats are reset and this time is set to the
+ * next assessment time. */
+ time_t next_assessment_time;
+} overload_dns_stats_t;
+
+/** Keep track of the DNS requests for the general overload state. */
+static overload_dns_stats_t overload_dns_stats;
+
/** Represents the statistics of DNS queries seen if it is an Exit. */
typedef struct {
/* Total number of DNS errors found in RFC 1035 (from 0 to 5 code). */
@@ -266,6 +319,60 @@ get_dns_stats_by_type(const int type)
}
}
+/** Assess the DNS timeout errors and if we have enough to trigger a general
+ * overload. */
+static void
+overload_general_dns_assessment(void)
+{
+ /* Initialize the time. Should be done once. */
+ if (overload_dns_stats.next_assessment_time == 0) {
+ goto reset;
+ }
+
+ /* Not the time yet. */
+ if (overload_dns_stats.next_assessment_time > approx_time()) {
+ return;
+ }
+
+ /* Lets see if we can signal a general overload. */
+ double fraction = (double) overload_dns_stats.stats_n_error_timeout /
+ (double) overload_dns_stats.stats_n_request;
+ if (fraction >= overload_dns_timeout_fraction) {
+ log_notice(LD_HIST, "General overload -> DNS timeouts (%" PRIu64 ") "
+ "fraction %.4f%% is above threshold of %.4f%%",
+ overload_dns_stats.stats_n_error_timeout,
+ fraction * 100.0,
+ overload_dns_timeout_fraction * 100.0);
+ rep_hist_note_overload(OVERLOAD_GENERAL);
+ }
+
+ reset:
+ /* Reset counters for the next period. */
+ overload_dns_stats.stats_n_error_timeout = 0;
+ overload_dns_stats.stats_n_request = 0;
+ overload_dns_stats.next_assessment_time =
+ approx_time() + overload_dns_timeout_period_secs;
+}
+
+/** Called just before the consensus will be replaced. Update the consensus
+ * parameters in case they changed. */
+void
+rep_hist_consensus_has_changed(const networkstatus_t *ns)
+{
+ overload_dns_timeout_fraction =
+ networkstatus_get_param(ns, "overload_dns_timeout_scale_percent",
+ OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT,
+ OVERLOAD_DNS_TIMEOUT_PERCENT_MIN,
+ OVERLOAD_DNS_TIMEOUT_PERCENT_MAX) /
+ OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0;
+
+ overload_dns_timeout_period_secs =
+ networkstatus_get_param(ns, "overload_dns_timeout_period_secs",
+ OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT,
+ OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN,
+ OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX);
+}
+
/** Return the DNS error count for the given libevent DNS type and error code.
* The possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. */
uint64_t
@@ -320,10 +427,31 @@ rep_hist_get_n_dns_request(int type)
}
/** Note a DNS error for the given given libevent DNS record type and error
- * code. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. */
+ * code. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA.
+ *
+ * NOTE: Libevent is _not_ returning the type in case of an error and so if
+ * error is anything but DNS_ERR_NONE, the type is not usable and set to 0.
+ *
+ * See: https://gitlab.torproject.org/tpo/core/tor/-/issues/40490 */
void
rep_hist_note_dns_error(int type, uint8_t error)
{
+ /* Assess if we need to trigger a general overload with regards to the DNS
+ * errors or not. */
+ overload_general_dns_assessment();
+
+ /* Because of the libevent problem (see note in the function comment), we
+ * disregard the DNS query type and keep track of DNS timeout for the
+ * overload state. */
+ if (error == DNS_ERR_TIMEOUT) {
+ overload_dns_stats.stats_n_error_timeout++;
+ }
+ overload_dns_stats.stats_n_request++;
+
+ /* Again, the libevent bug (see function comment), for an error that is
+ * anything but DNS_ERR_NONE, the type is always 0 which means that we don't
+ * have a DNS stat object for it so this code will do nothing until libevent
+ * is fixed. */
dns_stats_t *dns_stats = get_dns_stats_by_type(type);
/* Unsupported DNS query type. */
if (!dns_stats) {