Fix cases where edge connections can stall.

During testing, we discovered two cases where edge connections can stall:
  1. Due to final data sitting in the edge inbuf when it was resumed
  2. Due to flag synchronization between the token bucket and XON/XOFF

The first issue has always existed in C-Tor, but we were able to tickle it in scp testing. If the last data from the protocol fits in the inbuf but is not large enough to send, and an XOFF or connection block comes in at exactly that point, then when the edge connection resumes there will be no data to read from the socket, and the inbuf can just sit there, never draining.

We noticed the second issue along the way to finding the first. It seems wrong, but it didn't seem to affect anything in practice.

These stalls are extremely rare in normal operation, but with conflux, XON/XOFF activity is more common, so we hit them.

Signed-off-by: David Goulet <dgoulet@torproject.org>
author: David Goulet <dgoulet@torproject.org> 2023-04-02 21:18:25 +0000
committer: Mike Perry <mikeperry-git@torproject.org> 2023-04-06 15:57:11 +0000
commit: 2bb8988629b6d7ddb5a15d5490154c9a92e0c866 (patch)
tree: 1669656b2ba01699a1de8089f4ca92d82bf9180c /src/core/mainloop/connection.c
parent: 7c70f713c31c0989a0008c7d0d92a1f12d498e32 (diff)
download: tor-2bb8988629b6d7ddb5a15d5490154c9a92e0c866.tar.gz
tor-2bb8988629b6d7ddb5a15d5490154c9a92e0c866.zip
1 files changed, 20 insertions, 7 deletions
diff --git a/src/core/mainloop/connection.c b/src/core/mainloop/connection.c
index 7204b69e54..5a769f38be 100644
--- a/src/core/mainloop/connection.c
+++ b/src/core/mainloop/connection.c
@@ -187,7 +187,6 @@ static int connection_reached_eof(connection_t *conn);
 static int connection_buf_read_from_socket(connection_t *conn,
                                            ssize_t *max_to_read,
                                            int *socket_error);
-static int connection_process_inbuf(connection_t *conn, int package_partial);
 static void client_check_address_changed(tor_socket_t sock);
 static void set_constrained_socket_buffers(tor_socket_t sock, int size);
 
@@ -3744,9 +3743,16 @@ void
 connection_read_bw_exhausted(connection_t *conn, bool is_global_bw)
 {
   (void)is_global_bw;
-  conn->read_blocked_on_bw = 1;
-  connection_stop_reading(conn);
-  reenable_blocked_connection_schedule();
+  // Double-calls to stop-reading are correlated with stalling for
+  // ssh uploads. Might as well prevent this from happening,
+  // especially the read_blocked_on_bw flag. That was clearly getting
+  // set when it should not be, during an already-blocked XOFF
+  // condition.
+  if (!CONN_IS_EDGE(conn) || !TO_EDGE_CONN(conn)->xoff_received) {
+    conn->read_blocked_on_bw = 1;
+    connection_stop_reading(conn);
+    reenable_blocked_connection_schedule();
+  }
 }
 
 /**
@@ -3923,10 +3929,17 @@ reenable_blocked_connections_cb(mainloop_event_t *ev, void *arg)
   (void)ev;
   (void)arg;
   SMARTLIST_FOREACH_BEGIN(get_connection_array(), connection_t *, conn) {
-    if (conn->read_blocked_on_bw == 1) {
+    /* For conflux, we noticed logs of connection_start_reading() called
+     * multiple times while we were blocked from a previous XOFF, and this
+     * log was correlated with stalls during ssh uploads. So we added
+     * this additional check, to avoid connection_start_reading() without
+     * getting an XON. The most important piece is always allowing
+     * the read_blocked_on_bw to get cleared, either way. */
+    if (conn->read_blocked_on_bw == 1 &&
+        (!CONN_IS_EDGE(conn) || !TO_EDGE_CONN(conn)->xoff_received)) {
       connection_start_reading(conn);
-      conn->read_blocked_on_bw = 0;
     }
+    conn->read_blocked_on_bw = 0;
     if (conn->write_blocked_on_bw == 1) {
       connection_start_writing(conn);
       conn->write_blocked_on_bw = 0;
@@ -5198,7 +5211,7 @@ set_constrained_socket_buffers(tor_socket_t sock, int size)
  * connection_*_process_inbuf() function. It also passes in
  * package_partial if wanted.
  */
-static int
+int
 connection_process_inbuf(connection_t *conn, int package_partial)
 {
   tor_assert(conn);
author	David Goulet <dgoulet@torproject.org>	2023-04-02 21:18:25 +0000
committer	Mike Perry <mikeperry-git@torproject.org>	2023-04-06 15:57:11 +0000
commit	2bb8988629b6d7ddb5a15d5490154c9a92e0c866 (patch)
tree	1669656b2ba01699a1de8089f4ca92d82bf9180c /src/core/mainloop/connection.c
parent	7c70f713c31c0989a0008c7d0d92a1f12d498e32 (diff)
download	tor-2bb8988629b6d7ddb5a15d5490154c9a92e0c866.tar.gz tor-2bb8988629b6d7ddb5a15d5490154c9a92e0c866.zip