/* Copyright (c) 2017-2021, The Tor Project, Inc. */
/* See LICENSE for licensing information */

/**
 * @file scheduler_kist.c
 * @brief Implements the KIST cell scheduler.
 **/

#define SCHEDULER_KIST_PRIVATE

#include "core/or/or.h"
#include "lib/buf/buffers.h"
#include "app/config/config.h"
#include "core/mainloop/connection.h"
#include "feature/nodelist/networkstatus.h"
#define CHANNEL_OBJECT_PRIVATE
#include "core/or/channel.h"
#include "core/or/channeltls.h"
#define SCHEDULER_PRIVATE
#include "core/or/scheduler.h"
#include "lib/math/fp.h"

#include "core/or/or_connection_st.h"

#ifdef HAVE_SYS_IOCTL_H
#include <sys/ioctl.h>
#endif

#ifdef HAVE_KIST_SUPPORT
/* Kernel interface needed for KIST. */
#include <netinet/tcp.h>
#include <linux/sockios.h>
#endif /* HAVE_KIST_SUPPORT */

/*****************************************************************************
 * Data structures and supporting functions
 *****************************************************************************/

/* Socket_table hash table stuff. The socket_table keeps track of the
 * per-socket limit information imposed and used by KIST. */

static uint32_t
socket_table_ent_hash(const socket_table_ent_t *ent)
{
  return (uint32_t)ent->chan->global_identifier;
}

static unsigned
socket_table_ent_eq(const socket_table_ent_t *a, const socket_table_ent_t *b)
{
  return a->chan == b->chan;
}

typedef HT_HEAD(socket_table_s, socket_table_ent_t) socket_table_t;

static socket_table_t socket_table = HT_INITIALIZER();

HT_PROTOTYPE(socket_table_s, socket_table_ent_t, node, socket_table_ent_hash,
             socket_table_ent_eq);
HT_GENERATE2(socket_table_s, socket_table_ent_t, node, socket_table_ent_hash,
             socket_table_ent_eq, 0.6, tor_reallocarray, tor_free_);

/* outbuf_table hash table stuff. The outbuf_table keeps track of which
 * channels have data sitting in their outbuf so the kist scheduler can force
 * a write from outbuf to kernel periodically during a run and at the end of a
 * run. */

typedef struct outbuf_table_ent_t {
  HT_ENTRY(outbuf_table_ent_t) node;
  channel_t *chan;
} outbuf_table_ent_t;

static uint32_t
outbuf_table_ent_hash(const outbuf_table_ent_t *ent)
{
  return (uint32_t)ent->chan->global_identifier;
}

static unsigned
outbuf_table_ent_eq(const outbuf_table_ent_t *a, const outbuf_table_ent_t *b)
{
  return a->chan->global_identifier == b->chan->global_identifier;
}

HT_PROTOTYPE(outbuf_table_s, outbuf_table_ent_t, node, outbuf_table_ent_hash,
             outbuf_table_ent_eq);
HT_GENERATE2(outbuf_table_s, outbuf_table_ent_t, node, outbuf_table_ent_hash,
             outbuf_table_ent_eq, 0.6, tor_reallocarray, tor_free_);

/*****************************************************************************
 * Other internal data
 *****************************************************************************/

/* Store the last time the scheduler was run so we can decide when to next run
 * the scheduler based on it. */
static monotime_t scheduler_last_run;
/* This is a factor for the extra_space calculation in kist per-socket limits.
 * It is the number of extra congestion windows we want to write to the kernel.
 */
static double sock_buf_size_factor = 1.0;
/* How often the scheduler runs. */
STATIC int sched_run_interval = KIST_SCHED_RUN_INTERVAL_DEFAULT;
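/* For illustration only: these knobs map to torrc options that are parsed in
 * config.c and read via get_options() below. A sketch of a typical
 * configuration (values are examples, not recommendations) might be:
 *
 *   Schedulers KIST,KISTLite,Vanilla
 *   KISTSchedRunInterval 10
 *   KISTSockBufSizeFactor 1.0
 */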

#ifdef HAVE_KIST_SUPPORT
/* Indicate if KIST lite mode is on or off. We can disable it at runtime.
 * Important to have because of the possible KISTLite -> KIST transition. */
static unsigned int kist_lite_mode = 0;
/* Indicate if we don't have the kernel support. This can happen if the kernel
 * changed and no longer recognizes the values passed to the syscalls needed
 * by KIST. In that case, fall back to the naive approach. */
static unsigned int kist_no_kernel_support = 0;
#else /* !defined(HAVE_KIST_SUPPORT) */
static unsigned int kist_lite_mode = 1;
#endif /* defined(HAVE_KIST_SUPPORT) */

/*****************************************************************************
 * Internally called function implementations
 *****************************************************************************/

/* Little helper function to get the length of a channel's output buffer. */
static inline size_t
channel_outbuf_length(channel_t *chan)
{
  tor_assert(chan);
  /* In theory, this cannot happen because we cannot schedule a channel
   * without a connection that has its outbuf initialized. Just in case, treat
   * it as a bug so we can understand a bit more about why it happened. */
  if (SCHED_BUG(BASE_CHAN_TO_TLS(chan)->conn == NULL, chan)) {
    return 0;
  }
  return buf_datalen(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn)->outbuf);
}

/* Little helper function for HT_FOREACH_FN. */
static int
each_channel_write_to_kernel(outbuf_table_ent_t *ent, void *data)
{
  (void) data; /* Make compiler happy. */
  channel_write_to_kernel(ent->chan);
  return 0; /* Returning non-zero removes the element from the table. */
}

/* Free the given outbuf table entry ent. */
static int
free_outbuf_info_by_ent(outbuf_table_ent_t *ent, void *data)
{
  (void) data; /* Make compiler happy. */
  log_debug(LD_SCHED, "Freeing outbuf table entry from chan=%" PRIu64,
            ent->chan->global_identifier);
  tor_free(ent);
  return 1; /* So HT_FOREACH_FN will remove the element. */
}

/* Free the given socket table entry ent. */
static int
free_socket_info_by_ent(socket_table_ent_t *ent, void *data)
{
  (void) data; /* Make compiler happy. */
  log_debug(LD_SCHED, "Freeing socket table entry from chan=%" PRIu64,
            ent->chan->global_identifier);
  tor_free(ent);
  return 1; /* So HT_FOREACH_FN will remove the element. */
}

/* Clean up socket_table. Probably because the KIST sched impl is going away. */
static void
free_all_socket_info(void)
{
  HT_FOREACH_FN(socket_table_s, &socket_table, free_socket_info_by_ent, NULL);
  HT_CLEAR(socket_table_s, &socket_table);
}

static socket_table_ent_t *
socket_table_search(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t search, *ent = NULL;
  search.chan = chan;
  ent = HT_FIND(socket_table_s, table, &search);
  return ent;
}

/* Free a socket entry in table for the given chan. */
static void
free_socket_info_by_chan(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  if (!ent)
    return;
  log_debug(LD_SCHED, "scheduler free socket info for chan=%" PRIu64,
            chan->global_identifier);
  HT_REMOVE(socket_table_s, table, ent);
  free_socket_info_by_ent(ent, NULL);
}

/* Perform system calls for the given socket in order to calculate KIST's
 * per-socket limit as documented in the function body. */
MOCK_IMPL(void,
update_socket_info_impl, (socket_table_ent_t *ent))
{
#ifdef HAVE_KIST_SUPPORT
  int64_t tcp_space, extra_space;
  tor_assert(ent);
  tor_assert(ent->chan);
  const tor_socket_t sock =
    TO_CONN(CONST_BASE_CHAN_TO_TLS(ent->chan)->conn)->s;
  struct tcp_info tcp;
  socklen_t tcp_info_len = sizeof(tcp);

  if (kist_no_kernel_support || kist_lite_mode) {
    goto fallback;
  }

  /* Gather information */
  if (getsockopt(sock, SOL_TCP, TCP_INFO, (void *)&(tcp), &tcp_info_len) < 0) {
    if (errno == EINVAL) {
      /* Oops, this option is not provided by the kernel, we'll have to
       * disable KIST entirely. This can happen if tor was built on a machine
       * that had the support and the kernel it now runs on doesn't, or if
       * the kernel was updated and lost the support. */
      log_notice(LD_SCHED, "Looks like our kernel doesn't have the support "
                           "for KIST anymore. We will fall back to the naive "
                           "approach. Remove KIST from the Schedulers list "
                           "to disable.");
      kist_no_kernel_support = 1;
    }
    goto fallback;
  }
  if (ioctl(sock, SIOCOUTQNSD, &(ent->notsent)) < 0) {
    if (errno == EINVAL) {
      log_notice(LD_SCHED, "Looks like our kernel doesn't have the support "
                           "for KIST anymore. We will fall back to the naive "
                           "approach. Remove KIST from the Schedulers list "
                           "to disable.");
      /* Same reason as the above. */
      kist_no_kernel_support = 1;
    }
    goto fallback;
  }
  ent->cwnd = tcp.tcpi_snd_cwnd;
  ent->unacked = tcp.tcpi_unacked;
  ent->mss = tcp.tcpi_snd_mss;

  /* In order to reduce outbound kernel queuing delays and thus improve Tor's
   * ability to prioritize circuits, KIST wants to set a socket write limit
   * that is near the amount that the socket would be able to immediately send
   * into the Internet.
   *
   * We first calculate how much the socket could send immediately (assuming
   * completely full packets) according to the congestion window and the
   * number of unacked packets.
   *
   * Then we add a little extra space in a controlled way. We do this so that
   * when the kernel gets ACKs back for data currently sitting in the "TCP
   * space", it will already have some more data to send immediately. It will
   * not have to wait for the scheduler to run again. The amount of extra
   * space is a factor of the current congestion window. With the suggested
   * sock_buf_size_factor value of 1.0, we allow at most 2*cwnd bytes to sit
   * in the kernel: 1 cwnd on the wire waiting for ACKs and 1 cwnd ready and
   * waiting to be sent when those ACKs finally come.
   *
   * In the below diagram, we see some bytes in the TCP-space (denoted by '*')
   * that have been sent onto the wire and are waiting for ACKs. We have a
   * little more room in "TCP space" that we can fill with data that will be
   * immediately sent. We also see the "extra space" KIST calculates. The sum
   * of the empty "TCP space" and the "extra space" is the kist-imposed write
   * limit for this socket.
   *
   * <----------------kernel-outbound-socket-queue----------------|
   * <*********---------------------------------------------------|
   *          |----TCP-space-----|----extra-space-----|
   *          |------------------|
   *                             ^ ((cwnd - unacked) * mss) bytes
   *          |--------------------|
   *                               ^ ((cwnd * mss) * factor) bytes
   */
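
  /* A worked example with illustrative numbers (not taken from a real
   * kernel): with cwnd = 10, unacked = 4, mss = 1448,
   * sock_buf_size_factor = 1.0, notsent = 2000 and an empty outbuf, the
   * computation below gives
   *
   *   tcp_space   = (10 - 4) * 1448          =  8688 bytes
   *   extra_space = (10 * 1448) * 1.0 - 2000 = 12480 bytes
   *   limit       = 8688 + 12480             = 21168 bytes
   *
   * so KIST would allow roughly 21 KB more to be written to this socket
   * during this scheduling run. */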

  /* These values from the kernel are uint32_t; they will always fit into an
   * int64_t tcp_space variable, but if the congestion window cwnd is smaller
   * than the unacked packets, the remaining TCP space is set to 0. */
  if (ent->cwnd >= ent->unacked) {
    tcp_space = (ent->cwnd - ent->unacked) * (int64_t)(ent->mss);
  } else {
    tcp_space = 0;
  }

  /* The clamp_double_to_int64 makes sure the first part fits into an int64_t.
   * In fact, if sock_buf_size_factor is still forced to be >= 0 in config.c,
   * then it will be positive for sure. Then we subtract a uint32_t. Getting a
   * negative value is OK; see below for how it is handled. */
  extra_space =
    clamp_double_to_int64(
      (ent->cwnd * (int64_t)ent->mss) * sock_buf_size_factor) -
    ent->notsent - (int64_t)channel_outbuf_length((channel_t *) ent->chan);
  if ((tcp_space + extra_space) < 0) {
    /* This means that the "notsent" queue is just too big so we shouldn't put
     * more in the kernel for now. */
    ent->limit = 0;
  } else {
    /* The positive sum of two int64_t will always fit into a uint64_t.
     * And we know this will always be positive, since we checked above. */
    ent->limit = (uint64_t)tcp_space + (uint64_t)extra_space;
  }
  return;

#else /* !defined(HAVE_KIST_SUPPORT) */
  goto fallback;
#endif /* defined(HAVE_KIST_SUPPORT) */

 fallback:
  /* If all of a sudden we don't have kist support, we just zero out all the
   * variables for this socket since we don't know what they should be. We
   * also allow the socket to write as much as it can from the estimated
   * number of cells the lower layer can accept, effectively returning it to
   * Vanilla scheduler behavior. */
  ent->cwnd = ent->unacked = ent->mss = ent->notsent = 0;
  /* This function calls the specialized channel object (currently channeltls)
   * and asks how many cells it can write on the outbuf, which we then
   * multiply by the size of the cells for this channel. The cast is because
   * this function requires a non-const channel object, meh. */
  ent->limit = channel_num_cells_writeable((channel_t *) ent->chan) *
               (get_cell_network_size(ent->chan->wide_circ_ids) +
                TLS_PER_CELL_OVERHEAD);
}

/* Given a socket that isn't in the table, add it.
 * Given a socket that is in the table, re-initialize the values that need to
 * be reset every scheduling run.
 */
static void
init_socket_info(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  if (!ent) {
    log_debug(LD_SCHED, "scheduler init socket info for chan=%" PRIu64,
              chan->global_identifier);
    ent = tor_malloc_zero(sizeof(*ent));
    ent->chan = chan;
    HT_INSERT(socket_table_s, table, ent);
  }
  ent->written = 0;
}

/* Add chan to the outbuf table if it isn't already in it. If it is, then
 * don't do anything. */
static void
outbuf_table_add(outbuf_table_t *table, channel_t *chan)
{
  outbuf_table_ent_t search, *ent;
  search.chan = chan;
  ent = HT_FIND(outbuf_table_s, table, &search);
  if (!ent) {
    log_debug(LD_SCHED, "scheduler init outbuf info for chan=%" PRIu64,
              chan->global_identifier);
    ent = tor_malloc_zero(sizeof(*ent));
    ent->chan = chan;
    HT_INSERT(outbuf_table_s, table, ent);
  }
}

static void
outbuf_table_remove(outbuf_table_t *table, channel_t *chan)
{
  outbuf_table_ent_t search, *ent;
  search.chan = chan;
  ent = HT_FIND(outbuf_table_s, table, &search);
  if (ent) {
    HT_REMOVE(outbuf_table_s, table, ent);
    free_outbuf_info_by_ent(ent, NULL);
  }
}

/* Set the scheduler running interval. */
static void
set_scheduler_run_interval(void)
{
  int old_sched_run_interval = sched_run_interval;
  sched_run_interval = kist_scheduler_run_interval();
  if (old_sched_run_interval != sched_run_interval) {
    log_info(LD_SCHED, "Scheduler KIST changing its running interval "
                       "from %" PRId32 " to %" PRId32,
             old_sched_run_interval, sched_run_interval);
  }
}

/* Return true iff the channel hasn't hit its kist-imposed write limit yet. */
static int
socket_can_write(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  if (SCHED_BUG(!ent, chan)) {
    return 1; // Just return true, saying that kist wouldn't limit the socket
  }

  /* We previously calculated a write limit for this socket. In the below
   * calculation, first determine how much room is left in bytes. Then divide
   * that by the amount of space a cell takes. If there's room for at least 1
   * cell, then KIST will allow the socket to write. */
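  /* Illustrative numbers only: with limit = 21168 and written = 16448, there
   * are 4720 bytes of room. Dividing by the space one cell takes on the wire
   * (CELL_MAX_NETWORK_SIZE plus TLS_PER_CELL_OVERHEAD, roughly 543 bytes
   * with a 514-byte cell) gives kist_limit_space = 8, so the socket may
   * still write this run. */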
  int64_t kist_limit_space =
    (int64_t) (ent->limit - ent->written) /
    (CELL_MAX_NETWORK_SIZE + TLS_PER_CELL_OVERHEAD);
  return kist_limit_space > 0;
}

/* Update the channel's socket kernel information. */
static void
update_socket_info(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  if (SCHED_BUG(!ent, chan)) {
    return; // Whelp. Entry didn't exist for some reason so nothing to do.
  }
  update_socket_info_impl(ent);
  log_debug(LD_SCHED, "chan=%" PRIu64 " updated socket info, limit: %" PRIu64
                      ", cwnd: %" PRIu32 ", unacked: %" PRIu32
                      ", notsent: %" PRIu32 ", mss: %" PRIu32,
            ent->chan->global_identifier, ent->limit, ent->cwnd, ent->unacked,
            ent->notsent, ent->mss);
}

/* Increment the channel's socket written value by the number of bytes. */
static void
update_socket_written(socket_table_t *table, channel_t *chan, size_t bytes)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  if (SCHED_BUG(!ent, chan)) {
    return; // Whelp. Entry didn't exist so nothing to do.
  }

  log_debug(LD_SCHED, "chan=%" PRIu64 " wrote %lu bytes, old was %" PRIi64,
            chan->global_identifier, (unsigned long) bytes, ent->written);

  ent->written += bytes;
}

/*
 * A naive KIST impl would write every single cell all the way to the kernel.
 * That would take a lot of system calls. A less bad KIST impl would write a
 * channel's outbuf to the kernel only when we are switching to a different
 * channel. But if we have two channels with equal priority, we end up writing
 * one cell for each and bouncing back and forth. This KIST impl avoids that
 * by only writing a channel's outbuf to the kernel if it has 8 cells or more
 * in it.
 *
 * Note: The number 8 has been picked for no particular reason except that it
 * is roughly 4096 bytes, which is a common buffering size. A TLS record can
 * hold up to 16KiB, so using 8 cells means that a relay will at most send a
 * TLS record of about 4KiB, or 1/4 of the maximum capacity of a TLS record.
 */
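/* Illustrative arithmetic for the threshold above: with CELL_MAX_NETWORK_SIZE
 * at 514 bytes, 8 cells is 8 * 514 = 4112 bytes of outbuf data, just over the
 * 4096-byte figure the note refers to. */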
MOCK_IMPL(int, channel_should_write_to_kernel,
          (outbuf_table_t *table, channel_t *chan))
{
  outbuf_table_add(table, chan);
  /* CELL_MAX_NETWORK_SIZE * 8 because we only want to write the outbuf to the
   * kernel if there are 8 or more cells waiting. */
  return channel_outbuf_length(chan) > (CELL_MAX_NETWORK_SIZE * 8);
}

/* Little helper function to write a channel's outbuf all the way to the
 * kernel. */
MOCK_IMPL(void, channel_write_to_kernel, (channel_t *chan))
{
  tor_assert(chan);
  log_debug(LD_SCHED, "Writing %lu bytes to kernel for chan %" PRIu64,
            (unsigned long)channel_outbuf_length(chan),
            chan->global_identifier);
  /* Note that 'connection_handle_write()' may change the scheduler state of
   * the channel during the scheduling loop with
   * 'connection_or_flushed_some()' -> 'scheduler_channel_wants_writes()'.
   * This side-effect will only occur if the channel is currently in the
   * 'SCHED_CHAN_WAITING_TO_WRITE' or 'SCHED_CHAN_IDLE' states, which KIST
   * rarely uses, so it should be fine unless KIST begins using these states
   * in the future. */
  connection_handle_write(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn), 0);
}

/* Return true iff the scheduler has work to perform. */
static int
have_work(void)
{
  smartlist_t *cp = get_channels_pending();
  IF_BUG_ONCE(!cp) {
    return 0; // channels_pending doesn't exist so... no work?
  }
  return smartlist_len(cp) > 0;
}

/* Function of the scheduler interface: free_all() */
static void
kist_free_all(void)
{
  free_all_socket_info();
}

/* Function of the scheduler interface: on_channel_free() */
static void
kist_on_channel_free_fn(const channel_t *chan)
{
  free_socket_info_by_chan(&socket_table, chan);
}

/* Function of the scheduler interface: on_new_consensus() */
static void
kist_scheduler_on_new_consensus(void)
{
  set_scheduler_run_interval();
}

/* Function of the scheduler interface: on_new_options() */
static void
kist_scheduler_on_new_options(void)
{
  sock_buf_size_factor = get_options()->KISTSockBufSizeFactor;

  /* Calls kist_scheduler_run_interval which calls get_options(). */
  set_scheduler_run_interval();
}

/* Function of the scheduler interface: init() */
static void
kist_scheduler_init(void)
{
  /* When initializing the scheduler, the last run could be 0 (because it is
   * declared static) or a value in the past that was set when the scheduler
   * was last used. In both cases, we want to initialize it to now so we
   * don't risk using the value 0, which doesn't play well with our monotonic
   * time interface.
   *
   * One side effect is that the first scheduler run will be at the next tick
   * that is in now + 10 msec (KIST_SCHED_RUN_INTERVAL_DEFAULT) by default. */
  monotime_get(&scheduler_last_run);

  kist_scheduler_on_new_options();
  IF_BUG_ONCE(sched_run_interval == 0) {
    log_warn(LD_SCHED, "We are initing the KIST scheduler and noticed the "
             "KISTSchedRunInterval is telling us to not use KIST. That's "
             "weird! We'll continue using KIST, but at %" PRId32 "ms.",
             KIST_SCHED_RUN_INTERVAL_DEFAULT);
    sched_run_interval = KIST_SCHED_RUN_INTERVAL_DEFAULT;
  }
}

/* Function of the scheduler interface: schedule() */
static void
kist_scheduler_schedule(void)
{
  struct monotime_t now;
  struct timeval next_run;
  int64_t diff;

  if (!have_work()) {
    return;
  }
  monotime_get(&now);

  /* If time is really monotonic, we can never have now be smaller than the
   * last scheduler run. The scheduler_last_run is initially set to 0.
   * Unfortunately, not all platforms guarantee monotonic time, so we log at
   * info level but don't make it more noisy. */
  diff = monotime_diff_msec(&scheduler_last_run, &now);
  if (diff < 0) {
    log_info(LD_SCHED, "Monotonic time between now and last run of scheduler "
                       "is negative: %" PRId64 ". Setting diff to 0.", diff);
    diff = 0;
  }
  if (diff < sched_run_interval) {
    next_run.tv_sec = 0;
    /* Convert the remaining time from msec to usec (multiply by 1000). This
     * will always be valid because diff cannot be negative and cannot be
     * bigger than sched_run_interval, so for the maximum allowed run
     * interval (100 ms) values can only go from 1000 usec (diff set to
     * interval - 1) to 100000 usec (diff set to 0). */
    next_run.tv_usec = (int) ((sched_run_interval - diff) * 1000);
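    /* Example with illustrative values: if sched_run_interval is 10 msec and
     * the scheduler last ran 3 msec ago, the event is scheduled to fire in
     * (10 - 3) * 1000 = 7000 usec. */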
    /* Re-adding an event reschedules it. It does not duplicate it. */
    scheduler_ev_add(&next_run);
  } else {
    scheduler_ev_active();
  }
}

/* Function of the scheduler interface: run() */
static void
kist_scheduler_run(void)
{
  /* Define variables */
  channel_t *chan = NULL; // current working channel
  /* The last distinct chan served in a sched loop. */
  channel_t *prev_chan = NULL;
  int flush_result; // temporarily store results from flush calls
  /* Channels to be re-added to pending at the end */
  smartlist_t *to_readd = NULL;
  smartlist_t *cp = get_channels_pending();

  outbuf_table_t outbuf_table = HT_INITIALIZER();

  /* For each pending channel, collect new kernel information */
  SMARTLIST_FOREACH_BEGIN(cp, const channel_t *, pchan) {
      init_socket_info(&socket_table, pchan);
      update_socket_info(&socket_table, pchan);
  } SMARTLIST_FOREACH_END(pchan);

  log_debug(LD_SCHED, "Running the scheduler. %d channels pending",
            smartlist_len(cp));

  /* The main scheduling loop. Loop until there are no more pending channels */
  while (smartlist_len(cp) > 0) {
    /* get best channel */
    chan = smartlist_pqueue_pop(cp, scheduler_compare_channels,
                                offsetof(channel_t, sched_heap_idx));
    if (SCHED_BUG(!chan, NULL)) {
      /* Some-freaking-how a NULL got into the channels_pending. That should
       * never happen, but it should be harmless to ignore it and keep looping.
       */
      continue;
    }
    outbuf_table_add(&outbuf_table, chan);

    /* if we have switched to a new channel, consider writing the previous
     * channel's outbuf to the kernel. */
    if (!prev_chan) {
      prev_chan = chan;
    }
    if (prev_chan != chan) {
      if (channel_should_write_to_kernel(&outbuf_table, prev_chan)) {
        channel_write_to_kernel(prev_chan);
        outbuf_table_remove(&outbuf_table, prev_chan);
      }
      prev_chan = chan;
    }

    /* Only flush and write if the per-socket limit hasn't been hit */
    if (socket_can_write(&socket_table, chan)) {
      /* flush to channel queue/outbuf */
      flush_result = (int)channel_flush_some_cells(chan, 1); // 1 for num cells
      /* XXX: While flushing cells, it is possible that the connection write
       * fails, leading to the channel being closed, which triggers a release
       * and frees its entry in the socket table. And because of an
       * engineering design issue, the error is not propagated back, so we
       * don't get an error at this point. So before we continue, make sure
       * the channel is open and if not just ignore it. See #23751. */
      if (!CHANNEL_IS_OPEN(chan)) {
        /* Channel isn't open so we put it back in IDLE mode. It is either
         * renegotiating its TLS session or about to be released. */
        scheduler_set_channel_state(chan, SCHED_CHAN_IDLE);
        continue;
      }
      /* flush_result has the # cells flushed */
      if (flush_result > 0) {
        update_socket_written(&socket_table, chan, flush_result *
                              (CELL_MAX_NETWORK_SIZE + TLS_PER_CELL_OVERHEAD));
      } else {
        /* XXX: This can happen because Tor sometimes opportunistically
         * flushes cells from the circuit to the outbuf, so the channel can
         * end up here without having anything to flush nor needing to write
         * to the kernel. Hopefully we'll fix that soon, but for now we have
         * to handle this case, which happens kind of often. */
        log_debug(LD_SCHED,
                  "We didn't flush anything on a chan that we think "
                  "can write and wants to write. The channel's state is '%s' "
                  "and in scheduler state '%s'. We're going to mark it as "
                  "waiting_for_cells (as that's most likely the issue) and "
                  "stop scheduling it this round.",
                  channel_state_to_string(chan->state),
                  get_scheduler_state_string(chan->scheduler_state));
        scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_FOR_CELLS);
        continue;
      }
    }

    /* Decide what to do with the channel now */

    if (!channel_more_to_flush(chan) &&
        !socket_can_write(&socket_table, chan)) {

      /* Case 1: no more cells to send, and cannot write */

      /*
       * You might think we should put the channel in SCHED_CHAN_IDLE. And
       * you're probably correct. While implementing KIST, we found that the
       * scheduling system would sometimes lose track of channels when we did
       * that. We suspect it has to do with the difference between "can't
       * write because socket/outbuf is full" and KIST's "can't write because
       * we've arbitrarily decided that that's enough for now." Sometimes
       * channels run out of cells at the same time they hit their
       * kist-imposed write limit and maybe the rest of Tor doesn't put the
       * channel back in pending when it is supposed to.
       *
       * This should be investigated again. It is as simple as changing
       * SCHED_CHAN_WAITING_FOR_CELLS to SCHED_CHAN_IDLE and seeing if Tor
       * starts having serious throughput issues. Best done in shadow/chutney.
       */
      scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_FOR_CELLS);
    } else if (!channel_more_to_flush(chan)) {

      /* Case 2: no more cells to send, but still open for writes */

      scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_FOR_CELLS);
    } else if (!socket_can_write(&socket_table, chan)) {

      /* Case 3: cells to send, but cannot write */

      /*
       * We want to write, but can't. If we left the channel in
       * channels_pending, we would never exit the scheduling loop. We need to
       * add it to a temporary list of channels to be added to channels_pending
       * after the scheduling loop is over. They can hopefully be taken care of
       * in the next scheduling round.
       */
      if (!to_readd) {
        to_readd = smartlist_new();
      }
      smartlist_add(to_readd, chan);
    } else {

      /* Case 4: cells to send, and still open for writes */

      scheduler_set_channel_state(chan, SCHED_CHAN_PENDING);
      if (!SCHED_BUG(chan->sched_heap_idx != -1, chan)) {
        smartlist_pqueue_add(cp, scheduler_compare_channels,
                             offsetof(channel_t, sched_heap_idx), chan);
      }
    }
  } /* End of main scheduling loop */

  /* Write the outbuf of any channels that still have data */
  HT_FOREACH_FN(outbuf_table_s, &outbuf_table, each_channel_write_to_kernel,
                NULL);
  /* We are done with it. */
  HT_FOREACH_FN(outbuf_table_s, &outbuf_table, free_outbuf_info_by_ent, NULL);
  HT_CLEAR(outbuf_table_s, &outbuf_table);

  log_debug(LD_SCHED, "len pending=%d, len to_readd=%d",
            smartlist_len(cp),
            (to_readd ? smartlist_len(to_readd) : -1));

  /* Re-add any channels we need to */
  if (to_readd) {
    SMARTLIST_FOREACH_BEGIN(to_readd, channel_t *, readd_chan) {
      scheduler_set_channel_state(readd_chan, SCHED_CHAN_PENDING);
      if (!smartlist_contains(cp, readd_chan)) {
        if (!SCHED_BUG(readd_chan->sched_heap_idx != -1, readd_chan)) {
          /* XXXX Note that the check above is in theory redundant with
           * the smartlist_contains check. But let's make sure we're
           * not messing anything up, and leave them both for now. */
          smartlist_pqueue_add(cp, scheduler_compare_channels,
                               offsetof(channel_t, sched_heap_idx),
                               readd_chan);
        }
      }
    } SMARTLIST_FOREACH_END(readd_chan);
    smartlist_free(to_readd);
  }

  monotime_get(&scheduler_last_run);
}

/*****************************************************************************
 * Externally called function implementations not called through scheduler_t
 *****************************************************************************/

/* Stores the kist scheduler function pointers. */
static scheduler_t kist_scheduler = {
  .type = SCHEDULER_KIST,
  .free_all = kist_free_all,
  .on_channel_free = kist_on_channel_free_fn,
  .init = kist_scheduler_init,
  .on_new_consensus = kist_scheduler_on_new_consensus,
  .schedule = kist_scheduler_schedule,
  .run = kist_scheduler_run,
  .on_new_options = kist_scheduler_on_new_options,
};

/* Return the KIST scheduler object. If it didn't exist, return a newly
 * allocated one, but init() is not called on it. */
scheduler_t *
get_kist_scheduler(void)
{
  return &kist_scheduler;
}

/* Check the torrc (and maybe consensus) for the configured KIST scheduler run
 * interval.
 * - If torrc > 0, then return the positive torrc value (should use KIST, and
 *   should use the set value).
 * - If torrc == 0, then look in the consensus for what the value should be.
 *   - If == 0, then return 0 (don't use KIST).
 *   - If > 0, then return the positive consensus value.
 *   - If the consensus doesn't say anything, return the default of 10
 *     milliseconds.
 */
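/* For example: with "KISTSchedRunInterval 0" in the torrc and no
 * KISTSchedRunInterval parameter in the consensus, this returns
 * KIST_SCHED_RUN_INTERVAL_DEFAULT (10 msec). */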
int
kist_scheduler_run_interval(void)
{
  int run_interval = get_options()->KISTSchedRunInterval;

  if (run_interval != 0) {
    log_debug(LD_SCHED, "Found KISTSchedRunInterval=%" PRId32 " in torrc. "
                        "Using that.", run_interval);
    return run_interval;
  }

  log_debug(LD_SCHED, "KISTSchedRunInterval=0, turning to the consensus.");

  /* Will either be the consensus value or the default. Note that 0 can be
   * returned which means the consensus wants us to NOT use KIST. */
  return networkstatus_get_param(NULL, "KISTSchedRunInterval",
                                 KIST_SCHED_RUN_INTERVAL_DEFAULT,
                                 KIST_SCHED_RUN_INTERVAL_MIN,
                                 KIST_SCHED_RUN_INTERVAL_MAX);
}

/* Set KISTLite mode, that is, KIST without kernel support. */
void
scheduler_kist_set_lite_mode(void)
{
  kist_lite_mode = 1;
  kist_scheduler.type = SCHEDULER_KIST_LITE;
  log_info(LD_SCHED,
           "Setting KIST scheduler without kernel support (KISTLite mode)");
}

/* Set KIST mode, that is, KIST with kernel support. */
void
scheduler_kist_set_full_mode(void)
{
  kist_lite_mode = 0;
  kist_scheduler.type = SCHEDULER_KIST;
  log_info(LD_SCHED,
           "Setting KIST scheduler with kernel support (KIST mode)");
}

#ifdef HAVE_KIST_SUPPORT

/* Return true iff the scheduler subsystem should use KIST. */
int
scheduler_can_use_kist(void)
{
  if (kist_no_kernel_support) {
    /* We have no kernel support so we can't use KIST. */
    return 0;
  }

  /* We do have the support; now check the run interval, which the consensus
   * can use to disable KIST. */
  int run_interval = kist_scheduler_run_interval();
  log_debug(LD_SCHED, "Determined KIST sched_run_interval should be "
                      "%" PRId32 ". Can%s use KIST.",
            run_interval, (run_interval > 0 ? "" : " not"));
  return run_interval > 0;
}

#else /* !defined(HAVE_KIST_SUPPORT) */

int
scheduler_can_use_kist(void)
{
  return 0;
}

#endif /* defined(HAVE_KIST_SUPPORT) */