aboutsummaryrefslogtreecommitdiff
path: root/src/or/scheduler_kist.c
blob: 97722cb25497452ba8ee67f7838280987e8b1fba (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
/* Copyright (c) 2017, The Tor Project, Inc. */
/* See LICENSE for licensing information */

#include <event2/event.h>
#include <netinet/tcp.h>

#include "or.h"
#include "buffers.h"
#include "config.h"
#include "connection.h"
#include "networkstatus.h"
#define TOR_CHANNEL_INTERNAL_
#include "channel.h"
#include "channeltls.h"
#define SCHEDULER_PRIVATE_
#include "scheduler.h"

#define TLS_PER_CELL_OVERHEAD 29

/* Kernel interface needed for KIST. */
#include <linux/sockios.h>

/*****************************************************************************
 * Data structures and supporting functions
 *****************************************************************************/

/* Socket_table hash table stuff. The socket_table keeps track of per-socket
 * limit information imposed by kist and used by kist. */

static uint32_t
socket_table_ent_hash(const socket_table_ent_t *ent)
{
  return (uint32_t)ent->chan->global_identifier;
}

static unsigned
socket_table_ent_eq(const socket_table_ent_t *a, const socket_table_ent_t *b)
{
  return a->chan->global_identifier == b->chan->global_identifier;
}

typedef HT_HEAD(socket_table_s, socket_table_ent_s) socket_table_t;

static socket_table_t socket_table = HT_INITIALIZER();

HT_PROTOTYPE(socket_table_s, socket_table_ent_s, node, socket_table_ent_hash,
             socket_table_ent_eq)
HT_GENERATE2(socket_table_s, socket_table_ent_s, node, socket_table_ent_hash,
             socket_table_ent_eq, 0.6, tor_reallocarray, tor_free_)

/* outbuf_table hash table stuff. The outbuf_table keeps track of which
 * channels have data sitting in their outbuf so the kist scheduler can force
 * a write from outbuf to kernel periodically during a run and at the end of a
 * run. */

typedef struct outbuf_table_ent_s {
  HT_ENTRY(outbuf_table_ent_s) node;
  channel_t *chan;
} outbuf_table_ent_t;

static uint32_t
outbuf_table_ent_hash(const outbuf_table_ent_t *ent)
{
  return (uint32_t)ent->chan->global_identifier;
}

static unsigned
outbuf_table_ent_eq(const outbuf_table_ent_t *a, const outbuf_table_ent_t *b)
{
  return a->chan->global_identifier == b->chan->global_identifier;
}

static outbuf_table_t outbuf_table = HT_INITIALIZER();

HT_PROTOTYPE(outbuf_table_s, outbuf_table_ent_s, node, outbuf_table_ent_hash,
             outbuf_table_ent_eq)
HT_GENERATE2(outbuf_table_s, outbuf_table_ent_s, node, outbuf_table_ent_hash,
             outbuf_table_ent_eq, 0.6, tor_reallocarray, tor_free_)

/*****************************************************************************
 * Other internal data
 *****************************************************************************/

static struct timeval scheduler_last_run = {0, 0};
static double sock_buf_size_factor = 1.0;
STATIC int32_t sched_run_interval = 10;
static scheduler_t *kist_scheduler = NULL;

/*****************************************************************************
 * Internally called function implementations
 *****************************************************************************/

/* Little helper function to get the length of a channel's output buffer */
static inline size_t
channel_outbuf_length(channel_t *chan)
{
  return buf_datalen(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn)->outbuf);
}

/* Little helper function for HT_FOREACH_FN. */
static int
each_channel_write_to_kernel(outbuf_table_ent_t *ent, void *data)
{
  (void) data; /* Make compiler happy. */
  channel_write_to_kernel(ent->chan);
  return 0; /* Returning non-zero removes the element from the table. */
}

/* Free the given outbuf table entry ent. */
static int
free_outbuf_info_by_ent(outbuf_table_ent_t *ent, void *data)
{
  (void) data; /* Make compiler happy. */
  log_debug(LD_SCHED, "Freeing outbuf table entry from chan=%" PRIu64,
            ent->chan->global_identifier);
  tor_free(ent);
  return 1; /* So HT_FOREACH_FN will remove the element */
}

/* Clean up outbuf_table. Probably because the KIST sched impl is going away */
static void
free_all_outbuf_info(void)
{
  HT_FOREACH_FN(outbuf_table_s, &outbuf_table, free_outbuf_info_by_ent, NULL);
}

/* Free the given socket table entry ent. */
static int
free_socket_info_by_ent(socket_table_ent_t *ent, void *data)
{
  (void) data; /* Make compiler happy. */
  log_debug(LD_SCHED, "Freeing socket table entry from chan=%" PRIu64,
            ent->chan->global_identifier);
  tor_free(ent);
  return 1; /* So HT_FOREACH_FN will remove the element */
}

/* Clean up socket_table. Probably because the KIST sched impl is going away */
static void
free_all_socket_info(void)
{
  HT_FOREACH_FN(socket_table_s, &socket_table, free_socket_info_by_ent, NULL);
}

static socket_table_ent_t *
socket_table_search(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t search, *ent = NULL;
  search.chan = chan;
  ent = HT_FIND(socket_table_s, table, &search);
  return ent;
}

/* Free a socket entry in table for the given chan. */
static void
free_socket_info_by_chan(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  if (!ent)
    return;
  log_debug(LD_SCHED, "scheduler free socket info for chan=%" PRIu64,
            chan->global_identifier);
  HT_REMOVE(socket_table_s, table, ent);
  free_socket_info_by_ent(ent, NULL);
}

MOCK_IMPL(void,
update_socket_info_impl, (socket_table_ent_t *ent))
{
  int64_t tcp_space, extra_space;
  const tor_socket_t sock =
    TO_CONN(BASE_CHAN_TO_TLS((channel_t *) ent->chan)->conn)->s;
  struct tcp_info tcp;
  socklen_t tcp_info_len = sizeof(tcp);

  /* Gather information */
  getsockopt(sock, SOL_TCP, TCP_INFO, (void *)&(tcp), &tcp_info_len);
  ioctl(sock, SIOCOUTQNSD, &(ent->notsent));
  ent->cwnd = tcp.tcpi_snd_cwnd;
  ent->unacked = tcp.tcpi_unacked;
  ent->mss = tcp.tcpi_snd_mss;

  tcp_space = (ent->cwnd - ent->unacked) * ent->mss;
  if (tcp_space < 0) {
    tcp_space = 0;
  }
  extra_space =
    clamp_double_to_int64((ent->cwnd * ent->mss) * sock_buf_size_factor) -
    ent->notsent;
  if (extra_space < 0) {
    extra_space = 0;
  }
  ent->limit = tcp_space + extra_space;
  return;
}

/* Given a socket that isn't in the table, add it.
 * Given a socket that is in the table, reinit values that need init-ing
 * every scheduling run
 */
static void
init_socket_info(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  if (!ent) {
    log_debug(LD_SCHED, "scheduler init socket info for chan=%" PRIu64,
              chan->global_identifier);
    ent = tor_malloc_zero(sizeof(*ent));
    ent->chan = chan;
    HT_INSERT(socket_table_s, table, ent);
  }
  ent->written = 0;
}

/* Add chan to the outbuf table if it isn't already in it. If it is, then don't
 * do anything */
static void
outbuf_table_add(outbuf_table_t *table, channel_t *chan)
{
  outbuf_table_ent_t search, *ent;
  search.chan = chan;
  ent = HT_FIND(outbuf_table_s, table, &search);
  if (!ent) {
    log_debug(LD_SCHED, "scheduler init outbuf info for chan=%" PRIu64,
              chan->global_identifier);
    ent = tor_malloc_zero(sizeof(*ent));
    ent->chan = chan;
    HT_INSERT(outbuf_table_s, table, ent);
  }
}

static void
outbuf_table_remove(outbuf_table_t *table, channel_t *chan)
{
  outbuf_table_ent_t search, *ent;
  search.chan = chan;
  ent = HT_FIND(outbuf_table_s, table, &search);
  if (ent) {
    HT_REMOVE(outbuf_table_s, table, ent);
    free_outbuf_info_by_ent(ent, NULL);
  }
}

/* Set the scheduler running interval. */
static void
set_scheduler_run_interval(const networkstatus_t *ns)
{
  int32_t old_sched_run_interval = sched_run_interval;
  sched_run_interval = kist_scheduler_run_interval(ns);
  if (old_sched_run_interval != sched_run_interval) {
    log_info(LD_SCHED, "Scheduler KIST changing its running interval "
                       "from %" PRId32 " to %" PRId32,
             old_sched_run_interval, sched_run_interval);
  }
}

/* Return true iff the channel associated socket can write to the kernel that
 * is hasn't reach the limit. */
static int
socket_can_write(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  tor_assert(ent);

  int64_t kist_limit_space =
    (int64_t) (ent->limit - ent->written) /
    (CELL_MAX_NETWORK_SIZE + TLS_PER_CELL_OVERHEAD);
  return kist_limit_space > 0;
}

/* Update the channel's socket kernel information. */
static void
update_socket_info(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  tor_assert(ent);
  update_socket_info_impl(ent);
}

/* Increament the channel's socket written value by the number of bytes. */
static void
update_socket_written(socket_table_t *table, channel_t *chan, size_t bytes)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  tor_assert(ent);

  log_debug(LD_SCHED, "chan=%" PRIu64 " wrote %lu bytes, old was %" PRIi64,
            chan->global_identifier, bytes, ent->written);

  ent->written += bytes;
}

/*
 * A naive KIST impl would write every single cell all the way to the kernel.
 * That would take a lot of system calls. A less bad KIST impl would write a
 * channel's outbuf to the kernel only when we are switching to a different
 * channel. But if we have two channels with equal priority, we end up writing
 * one cell for each and bouncing back and forth. This KIST impl avoids that
 * by only writing a channel's outbuf to the kernel if it has 8 cells or more
 * in it.
 */
MOCK_IMPL(int, channel_should_write_to_kernel,
          (outbuf_table_t *table, channel_t *chan))
{
  outbuf_table_add(table, chan);
  /* CELL_MAX_NETWORK_SIZE * 8 because we only want to write the outbuf to the
   * kernel if there's 8 or more cells waiting */
  return channel_outbuf_length(chan) > (CELL_MAX_NETWORK_SIZE * 8);
}

/* Little helper function to write a channel's outbuf all the way to the
 * kernel */
MOCK_IMPL(void, channel_write_to_kernel, (channel_t *chan))
{
  log_debug(LD_SCHED, "Writing %lu bytes to kernel for chan %" PRIu64,
            channel_outbuf_length(chan), chan->global_identifier);
  connection_handle_write(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn), 0);
}

/* Return true iff the scheduler has work to perform. */
static int
have_work(void)
{
  smartlist_t *cp = get_channels_pending();
  tor_assert(cp);
  return smartlist_len(cp) > 0;
}

/* Function of the scheduler interface: free_all() */
static void
kist_free_all(void)
{
  free_all_outbuf_info();
  free_all_socket_info();
}

/* Function of the scheduler interface: on_channel_free() */
static void
kist_on_channel_free(const channel_t *chan)
{
  free_socket_info_by_chan(&socket_table, chan);
}

/* Function of the scheduler interface: on_new_consensus() */
static void
kist_scheduler_on_new_consensus(const networkstatus_t *old_c,
                                const networkstatus_t *new_c)
{
  (void) old_c;
  (void) new_c;

  set_scheduler_run_interval(new_c);
}

/* Function of the scheduler interface: run() */
static void
kist_scheduler_on_new_options(void)
{
  sock_buf_size_factor = get_options()->KISTSockBufSizeFactor;

  /* Calls kist_scheduler_run_interval which calls get_options(). */
  set_scheduler_run_interval(NULL);
}

/* Function of the scheduler interface: init() */
static void
kist_scheduler_init(void)
{
  kist_scheduler_on_new_options();
  tor_assert(sched_run_interval > 0);
}

/* Function of the scheduler interface: schedule() */
static void
kist_scheduler_schedule(void)
{
  struct timeval now, next_run;
  int32_t diff;
  struct event *ev = get_run_sched_ev();
  tor_assert(ev);
  if (!have_work()) {
    return;
  }
  tor_gettimeofday(&now);
  diff = (int32_t) tv_mdiff(&scheduler_last_run, &now);
  if (diff < sched_run_interval) {
    next_run.tv_sec = 0;
    /* 1000 for ms -> us */
    next_run.tv_usec = (sched_run_interval - diff) * 1000;
    /* Readding an event reschedules it. It does not duplicate it. */
    event_add(ev, &next_run);
  } else {
    event_active(ev, EV_TIMEOUT, 1);
  }
}

/* Function of the scheduler interface: run() */
static void
kist_scheduler_run(void)
{
  /* Define variables */
  channel_t *chan = NULL; // current working channel
  /* The last distinct chan served in a sched loop. */
  channel_t *prev_chan = NULL;
  int flush_result; // temporarily store results from flush calls
  /* Channels to be readding to pending at the end */
  smartlist_t *to_readd = NULL;
  smartlist_t *cp = get_channels_pending();

  /* For each pending channel, collect new kernel information */
  SMARTLIST_FOREACH_BEGIN(cp, const channel_t *, pchan) {
      init_socket_info(&socket_table, pchan);
      update_socket_info(&socket_table, pchan);
  } SMARTLIST_FOREACH_END(pchan);

  log_debug(LD_SCHED, "Running the scheduler. %d channels pending",
            smartlist_len(cp));

  /* The main scheduling loop. Loop until there are no more pending channels */
  while (smartlist_len(cp) > 0) {
    /* get best channel */
    chan = smartlist_pqueue_pop(cp, scheduler_compare_channels,
                                offsetof(channel_t, sched_heap_idx));
    tor_assert(chan);
    outbuf_table_add(&outbuf_table, chan);

    /* if we have switched to a new channel, consider writing the previous
     * channel's outbuf to the kernel. */
    if (!prev_chan) prev_chan = chan;
    if (prev_chan != chan) {
      if (channel_should_write_to_kernel(&outbuf_table, prev_chan)) {
        channel_write_to_kernel(prev_chan);
        outbuf_table_remove(&outbuf_table, prev_chan);
      }
      prev_chan = chan;
    }

    /* Only flush and write if the per-socket limit hasn't been hit */
    if (socket_can_write(&socket_table, chan)) {
      /* flush to channel queue/outbuf */
      flush_result = (int)channel_flush_some_cells(chan, 1); // 1 for num cells
      /* flush_result has the # cells flushed */
      if (flush_result > 0) {
        update_socket_written(&socket_table, chan, flush_result *
                              (CELL_MAX_NETWORK_SIZE + TLS_PER_CELL_OVERHEAD));
      }
      /* XXX What if we didn't flush? */
    }

    /* Decide what to do with the channel now */

    if (!channel_more_to_flush(chan) &&
        !socket_can_write(&socket_table, chan)) {

      /* Case 1: no more cells to send, and cannot write */

      /*
       * You might think we should put the channel in SCHED_CHAN_IDLE. And
       * you're probably correct. While implementing KIST, we found that the
       * scheduling system would sometimes lose track of channels when we did
       * that. We suspect it has to do with the difference between "can't
       * write because socket/outbuf is full" and KIST's "can't write because
       * we've arbitrarily decided that that's enough for now." Sometimes
       * channels run out of cells at the same time they hit their
       * kist-imposed write limit and maybe the rest of Tor doesn't put the
       * channel back in pending when it is supposed to.
       *
       * This should be investigated again. It is as simple as changing
       * SCHED_CHAN_WAITING_FOR_CELLS to SCHED_CHAN_IDLE and seeing if Tor
       * starts having serious throughput issues. Best done in shadow/chutney.
       */
      chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
      log_debug(LD_SCHED, "chan=%" PRIu64 " now waiting_for_cells",
                chan->global_identifier);
    } else if (!channel_more_to_flush(chan)) {

      /* Case 2: no more cells to send, but still open for writes */

      chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
      log_debug(LD_SCHED, "chan=%" PRIu64 " now waiting_for_cells",
                chan->global_identifier);
    } else if (!socket_can_write(&socket_table, chan)) {

      /* Case 3: cells to send, but cannot write */

      chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
      if (!to_readd)
        to_readd = smartlist_new();
      smartlist_add(to_readd, chan);
      log_debug(LD_SCHED, "chan=%" PRIu64 " now waiting_to_write",
                chan->global_identifier);
    } else {

      /* Case 4: cells to send, and still open for writes */

      chan->scheduler_state = SCHED_CHAN_PENDING;
      smartlist_pqueue_add(cp, scheduler_compare_channels,
                           offsetof(channel_t, sched_heap_idx), chan);
    }
  } /* End of main scheduling loop */

  /* Write the outbuf of any channels that still have data */
  HT_FOREACH_FN(outbuf_table_s, &outbuf_table, each_channel_write_to_kernel,
                NULL);
  free_all_outbuf_info();
  HT_CLEAR(outbuf_table_s, &outbuf_table);

  log_debug(LD_SCHED, "len pending=%d, len to_readd=%d",
            smartlist_len(cp),
            (to_readd ? smartlist_len(to_readd) : -1));

  /* Readd any channels we need to */
  if (to_readd) {
    SMARTLIST_FOREACH_BEGIN(to_readd, channel_t *, readd_chan) {
      readd_chan->scheduler_state = SCHED_CHAN_PENDING;
      if (!smartlist_contains(cp, readd_chan)) {
        smartlist_pqueue_add(cp, scheduler_compare_channels,
                             offsetof(channel_t, sched_heap_idx), readd_chan);
      }
    } SMARTLIST_FOREACH_END(readd_chan);
    smartlist_free(to_readd);
  }

  tor_gettimeofday(&scheduler_last_run);
}

/*****************************************************************************
 * Externally called function implementations not called through scheduler_t
 *****************************************************************************/

/* Return the KIST scheduler object. If it didn't exists, return a newly
 * allocated one but init() is not called. */
scheduler_t *
get_kist_scheduler(void)
{
  if (!kist_scheduler) {
    log_debug(LD_SCHED, "Allocating kist scheduler struct");
    kist_scheduler = tor_malloc_zero(sizeof(*kist_scheduler));
    kist_scheduler->free_all = kist_free_all;
    kist_scheduler->on_channel_free = kist_on_channel_free;
    kist_scheduler->init = kist_scheduler_init;
    kist_scheduler->on_new_consensus = kist_scheduler_on_new_consensus;
    kist_scheduler->schedule = kist_scheduler_schedule;
    kist_scheduler->run = kist_scheduler_run;
    kist_scheduler->on_new_options = kist_scheduler_on_new_options;
  }
  return kist_scheduler;
}

/* Default interval that KIST runs (in ms). */
#define KIST_SCHED_RUN_INTERVAL_DEFAULT 10
/* Minimum interval that KIST runs. This value disables KIST. */
#define KIST_SCHED_RUN_INTERVAL_MIN 0
/* Maximum interval that KIST runs (in ms). */
#define KIST_SCHED_RUN_INTERVAL_MAX 100

/* Check the torrc for the configured KIST scheduler run frequency.
 * - If torrc < 0, then return the negative torrc value (shouldn't even be
 *   using KIST)
 * - If torrc > 0, then return the positive torrc value (should use KIST, and
 *   should use the set value)
 * - If torrc == 0, then look in the consensus for what the value should be.
 *   - If == 0, then return -1 (don't use KIST)
 *   - If > 0, then return the positive consensus value
 *   - If consensus doesn't say anything, return 10 milliseconds
 */
int32_t
kist_scheduler_run_interval(const networkstatus_t *ns)
{
  int32_t run_interval = (int32_t)get_options()->KISTSchedRunInterval;
  if (run_interval != 0) {
    log_debug(LD_SCHED, "Found KISTSchedRunInterval in torrc. Using that.");
    return run_interval;
  }

  log_debug(LD_SCHED, "Turning to the consensus for KISTSchedRunInterval");
  run_interval = networkstatus_get_param(ns, "KISTSchedRunInterval",
                                         KIST_SCHED_RUN_INTERVAL_DEFAULT,
                                         KIST_SCHED_RUN_INTERVAL_MIN,
                                         KIST_SCHED_RUN_INTERVAL_MAX);
  if (run_interval <= 0)
    return -1;
  return run_interval;
}