/* Copyright (c) 2017-2021, The Tor Project, Inc. */
/* See LICENSE for licensing information */

/**
 * @file scheduler_kist.c
 * @brief Implements the KIST cell scheduler.
 **/

#define SCHEDULER_KIST_PRIVATE

#include "core/or/or.h"
#include "lib/buf/buffers.h"
#include "app/config/config.h"
#include "core/mainloop/connection.h"
#include "feature/nodelist/networkstatus.h"
#define CHANNEL_OBJECT_PRIVATE
#include "core/or/channel.h"
#include "core/or/channeltls.h"
#define SCHEDULER_PRIVATE
#include "core/or/scheduler.h"
#include "lib/math/fp.h"

#include "core/or/or_connection_st.h"

#ifdef HAVE_SYS_IOCTL_H
#include <sys/ioctl.h>
#endif

#ifdef HAVE_KIST_SUPPORT
/* Kernel interface needed for KIST. */
#include <netinet/tcp.h>
#include <linux/sockios.h>
#endif /* HAVE_KIST_SUPPORT */

/*****************************************************************************
 * Data structures and supporting functions
 *****************************************************************************/

/* Socket_table hash table stuff. The socket_table keeps track of the
 * per-socket limit information imposed and used by KIST. */

static uint32_t
socket_table_ent_hash(const socket_table_ent_t *ent)
{
  return (uint32_t)ent->chan->global_identifier;
}

static unsigned
socket_table_ent_eq(const socket_table_ent_t *a, const socket_table_ent_t *b)
{
  return a->chan == b->chan;
}

typedef HT_HEAD(socket_table_s, socket_table_ent_t) socket_table_t;

static socket_table_t socket_table = HT_INITIALIZER();

HT_PROTOTYPE(socket_table_s, socket_table_ent_t, node, socket_table_ent_hash,
             socket_table_ent_eq);
HT_GENERATE2(socket_table_s, socket_table_ent_t, node, socket_table_ent_hash,
             socket_table_ent_eq, 0.6, tor_reallocarray, tor_free_);

/* outbuf_table hash table stuff. The outbuf_table keeps track of which
 * channels have data sitting in their outbuf so the kist scheduler can force
 * a write from outbuf to kernel periodically during a run and at the end of a
 * run. */

typedef struct outbuf_table_ent_t {
  HT_ENTRY(outbuf_table_ent_t) node;
  channel_t *chan;
} outbuf_table_ent_t;

static uint32_t
outbuf_table_ent_hash(const outbuf_table_ent_t *ent)
{
  return (uint32_t)ent->chan->global_identifier;
}

static unsigned
outbuf_table_ent_eq(const outbuf_table_ent_t *a, const outbuf_table_ent_t *b)
{
  return a->chan->global_identifier == b->chan->global_identifier;
}

HT_PROTOTYPE(outbuf_table_s, outbuf_table_ent_t, node, outbuf_table_ent_hash,
             outbuf_table_ent_eq);
HT_GENERATE2(outbuf_table_s, outbuf_table_ent_t, node, outbuf_table_ent_hash,
             outbuf_table_ent_eq, 0.6, tor_reallocarray, tor_free_);

/*****************************************************************************
 * Other internal data
 *****************************************************************************/

/* Store the last time the scheduler was run so we can decide when to next run
 * the scheduler based on it. */
static monotime_t scheduler_last_run;
/* This is a factor for the extra_space calculation in kist per-socket limits.
 * It is the number of extra congestion windows we want to write to the kernel.
 */
static double sock_buf_size_factor = 1.0;
/* How often the scheduler runs. */
STATIC int sched_run_interval = KIST_SCHED_RUN_INTERVAL_DEFAULT;
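/* For illustration only: these knobs map to torrc options that are parsed in
 * config.c and read via get_options() below. A sketch of a typical
 * configuration (values are examples, not recommendations) might be:
 *
 *   Schedulers KIST,KISTLite,Vanilla
 *   KISTSchedRunInterval 10
 *   KISTSockBufSizeFactor 1.0
 */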

#ifdef HAVE_KIST_SUPPORT
/* Indicate if KIST lite mode is on or off. We can disable it at runtime.
 * Important to have because of the possible KISTLite -> KIST transition. */
static unsigned int kist_lite_mode = 0;
/* Indicate if we don't have the kernel support. This can happen if the kernel
 * changed and no longer recognizes the values passed to the syscalls needed
 * by KIST. In that case, fall back to the naive approach. */
static unsigned int kist_no_kernel_support = 0;
#else /* !defined(HAVE_KIST_SUPPORT) */
static unsigned int kist_lite_mode = 1;
#endif /* defined(HAVE_KIST_SUPPORT) */

/*****************************************************************************
 * Internally called function implementations
 *****************************************************************************/

/* Little helper function to get the length of a channel's output buffer. */
static inline size_t
channel_outbuf_length(channel_t *chan)
{
  tor_assert(chan);
  /* In theory, this cannot happen because we cannot schedule a channel
   * without a connection that has its outbuf initialized. Just in case, treat
   * it as a bug so we can understand a bit more about why it happened. */
  if (SCHED_BUG(BASE_CHAN_TO_TLS(chan)->conn == NULL, chan)) {
    return 0;
  }
  return buf_datalen(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn)->outbuf);
}

/* Little helper function for HT_FOREACH_FN. */
static int
each_channel_write_to_kernel(outbuf_table_ent_t *ent, void *data)
{
  (void) data; /* Make compiler happy. */
  channel_write_to_kernel(ent->chan);
  return 0; /* Returning non-zero removes the element from the table. */
}

/* Free the given outbuf table entry ent. */
static int
free_outbuf_info_by_ent(outbuf_table_ent_t *ent, void *data)
{
  (void) data; /* Make compiler happy. */
  log_debug(LD_SCHED, "Freeing outbuf table entry from chan=%" PRIu64,
            ent->chan->global_identifier);
  tor_free(ent);
  return 1; /* So HT_FOREACH_FN will remove the element. */
}

/* Free the given socket table entry ent. */
static int
free_socket_info_by_ent(socket_table_ent_t *ent, void *data)
{
  (void) data; /* Make compiler happy. */
  log_debug(LD_SCHED, "Freeing socket table entry from chan=%" PRIu64,
            ent->chan->global_identifier);
  tor_free(ent);
  return 1; /* So HT_FOREACH_FN will remove the element. */
}

/* Clean up socket_table. Probably because the KIST sched impl is going away. */
static void
free_all_socket_info(void)
{
  HT_FOREACH_FN(socket_table_s, &socket_table, free_socket_info_by_ent, NULL);
  HT_CLEAR(socket_table_s, &socket_table);
}

static socket_table_ent_t *
socket_table_search(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t search, *ent = NULL;
  search.chan = chan;
  ent = HT_FIND(socket_table_s, table, &search);
  return ent;
}

/* Free a socket entry in table for the given chan. */
static void
free_socket_info_by_chan(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  if (!ent)
    return;
  log_debug(LD_SCHED, "scheduler free socket info for chan=%" PRIu64,
            chan->global_identifier);
  HT_REMOVE(socket_table_s, table, ent);
  free_socket_info_by_ent(ent, NULL);
}

/* Perform system calls for the given socket in order to calculate KIST's
 * per-socket limit as documented in the function body. */
MOCK_IMPL(void,
update_socket_info_impl, (socket_table_ent_t *ent))
{
#ifdef HAVE_KIST_SUPPORT
  int64_t tcp_space, extra_space;
  tor_assert(ent);
  tor_assert(ent->chan);
  const tor_socket_t sock =
    TO_CONN(CONST_BASE_CHAN_TO_TLS(ent->chan)->conn)->s;
  struct tcp_info tcp;
  socklen_t tcp_info_len = sizeof(tcp);

  if (kist_no_kernel_support || kist_lite_mode) {
    goto fallback;
  }

  /* Gather information */
  if (getsockopt(sock, SOL_TCP, TCP_INFO, (void *)&(tcp), &tcp_info_len) < 0) {
    if (errno == EINVAL) {
      /* Oops, this option is not provided by the kernel, we'll have to
       * disable KIST entirely. This can happen if tor was built on a machine
       * that had the support and the kernel it now runs on doesn't, or if
       * the kernel was updated and lost the support. */
      log_notice(LD_SCHED, "Looks like our kernel doesn't have the support "
                           "for KIST anymore. We will fall back to the naive "
                           "approach. Remove KIST from the Schedulers list "
                           "to disable.");
      kist_no_kernel_support = 1;
    }
    goto fallback;
  }
  if (ioctl(sock, SIOCOUTQNSD, &(ent->notsent)) < 0) {
    if (errno == EINVAL) {
      log_notice(LD_SCHED, "Looks like our kernel doesn't have the support "
                           "for KIST anymore. We will fall back to the naive "
                           "approach. Remove KIST from the Schedulers list "
                           "to disable.");
      /* Same reason as the above. */
      kist_no_kernel_support = 1;
    }
    goto fallback;
  }
  ent->cwnd = tcp.tcpi_snd_cwnd;
  ent->unacked = tcp.tcpi_unacked;
  ent->mss = tcp.tcpi_snd_mss;

  /* In order to reduce outbound kernel queuing delays and thus improve Tor's
   * ability to prioritize circuits, KIST wants to set a socket write limit
   * that is near the amount that the socket would be able to immediately send
   * into the Internet.
   *
   * We first calculate how much the socket could send immediately (assuming
   * completely full packets) according to the congestion window and the
   * number of unacked packets.
   *
   * Then we add a little extra space in a controlled way. We do this so that
   * when the kernel gets ACKs back for data currently sitting in the "TCP
   * space", it will already have some more data to send immediately. It will
   * not have to wait for the scheduler to run again. The amount of extra
   * space is a factor of the current congestion window. With the suggested
   * sock_buf_size_factor value of 1.0, we allow at most 2*cwnd bytes to sit
   * in the kernel: 1 cwnd on the wire waiting for ACKs and 1 cwnd ready and
   * waiting to be sent when those ACKs finally come.
   *
   * In the below diagram, we see some bytes in the TCP-space (denoted by '*')
   * that have been sent onto the wire and are waiting for ACKs. We have a
   * little more room in "TCP space" that we can fill with data that will be
   * immediately sent. We also see the "extra space" KIST calculates. The sum
   * of the empty "TCP space" and the "extra space" is the kist-imposed write
   * limit for this socket.
   *
   * <----------------kernel-outbound-socket-queue----------------|
   * <*********---------------------------------------------------|
   *          |----TCP-space-----|----extra-space-----|
   *          |------------------|
   *                             ^ ((cwnd - unacked) * mss) bytes
   *          |--------------------|
   *                               ^ ((cwnd * mss) * factor) bytes
   */
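
  /* A worked example with illustrative numbers (not taken from a real
   * kernel): with cwnd = 10, unacked = 4, mss = 1448,
   * sock_buf_size_factor = 1.0, notsent = 2000 and an empty outbuf, the
   * computation below gives
   *
   *   tcp_space   = (10 - 4) * 1448          =  8688 bytes
   *   extra_space = (10 * 1448) * 1.0 - 2000 = 12480 bytes
   *   limit       = 8688 + 12480             = 21168 bytes
   *
   * so KIST would allow roughly 21 KB more to be written to this socket
   * during this scheduling run. */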

  /* These values from the kernel are uint32_t; they will always fit into an
   * int64_t tcp_space variable, but if the congestion window cwnd is smaller
   * than the unacked packets, the remaining TCP space is set to 0. */
  if (ent->cwnd >= ent->unacked) {
    tcp_space = (ent->cwnd - ent->unacked) * (int64_t)(ent->mss);
  } else {
    tcp_space = 0;
  }

  /* The clamp_double_to_int64 makes sure the first part fits into an int64_t.
   * In fact, if sock_buf_size_factor is still forced to be >= 0 in config.c,
   * then it will be positive for sure. Then we subtract a uint32_t. Getting a
   * negative value is OK; see below for how it is handled. */
  extra_space =
    clamp_double_to_int64(
      (ent->cwnd * (int64_t)ent->mss) * sock_buf_size_factor) -
    ent->notsent - (int64_t)channel_outbuf_length((channel_t *) ent->chan);
  if ((tcp_space + extra_space) < 0) {
    /* This means that the "notsent" queue is just too big so we shouldn't put
     * more in the kernel for now. */
    ent->limit = 0;
  } else {
    /* The positive sum of two int64_t will always fit into a uint64_t.
     * And we know this will always be positive, since we checked above. */
    ent->limit = (uint64_t)tcp_space + (uint64_t)extra_space;
  }
  return;

#else /* !defined(HAVE_KIST_SUPPORT) */
  goto fallback;
#endif /* defined(HAVE_KIST_SUPPORT) */

 fallback:
  /* If all of a sudden we don't have kist support, we just zero out all the
   * variables for this socket since we don't know what they should be. We
   * also allow the socket to write as much as it can from the estimated
   * number of cells the lower layer can accept, effectively returning it to
   * Vanilla scheduler behavior. */
  ent->cwnd = ent->unacked = ent->mss = ent->notsent = 0;
  /* This function calls the specialized channel object (currently channeltls)
   * and asks how many cells it can write on the outbuf, which we then
   * multiply by the size of the cells for this channel. The cast is because
   * this function requires a non-const channel object, meh. */
  ent->limit = channel_num_cells_writeable((channel_t *) ent->chan) *
               (get_cell_network_size(ent->chan->wide_circ_ids) +
                TLS_PER_CELL_OVERHEAD);
}

/* Given a socket that isn't in the table, add it.
 * Given a socket that is in the table, re-initialize the values that need to
 * be reset every scheduling run.
 */
static void
init_socket_info(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  if (!ent) {
    log_debug(LD_SCHED, "scheduler init socket info for chan=%" PRIu64,
              chan->global_identifier);
    ent = tor_malloc_zero(sizeof(*ent));
    ent->chan = chan;
    HT_INSERT(socket_table_s, table, ent);
  }
  ent->written = 0;
}

/* Add chan to the outbuf table if it isn't already in it. If it is, then
 * don't do anything. */
static void
outbuf_table_add(outbuf_table_t *table, channel_t *chan)
{
  outbuf_table_ent_t search, *ent;
  search.chan = chan;
  ent = HT_FIND(outbuf_table_s, table, &search);
  if (!ent) {
    log_debug(LD_SCHED, "scheduler init outbuf info for chan=%" PRIu64,
              chan->global_identifier);
    ent = tor_malloc_zero(sizeof(*ent));
    ent->chan = chan;
    HT_INSERT(outbuf_table_s, table, ent);
  }
}

static void
outbuf_table_remove(outbuf_table_t *table, channel_t *chan)
{
  outbuf_table_ent_t search, *ent;
  search.chan = chan;
  ent = HT_FIND(outbuf_table_s, table, &search);
  if (ent) {
    HT_REMOVE(outbuf_table_s, table, ent);
    free_outbuf_info_by_ent(ent, NULL);
  }
}

/* Set the scheduler running interval. */
static void
set_scheduler_run_interval(void)
{
  int old_sched_run_interval = sched_run_interval;
  sched_run_interval = kist_scheduler_run_interval();
  if (old_sched_run_interval != sched_run_interval) {
    log_info(LD_SCHED, "Scheduler KIST changing its running interval "
                       "from %" PRId32 " to %" PRId32,
             old_sched_run_interval, sched_run_interval);
  }
}

/* Return true iff the channel hasn't hit its kist-imposed write limit yet. */
static int
socket_can_write(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  if (SCHED_BUG(!ent, chan)) {
    return 1; // Just return true, saying that kist wouldn't limit the socket
  }

  /* We previously calculated a write limit for this socket. In the below
   * calculation, first determine how much room is left in bytes. Then divide
   * that by the amount of space a cell takes. If there's room for at least 1
   * cell, then KIST will allow the socket to write. */
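  /* Illustrative numbers only: with limit = 21168 and written = 16448, there
   * are 4720 bytes of room. Dividing by the space one cell takes on the wire
   * (CELL_MAX_NETWORK_SIZE plus TLS_PER_CELL_OVERHEAD, roughly 543 bytes
   * with a 514-byte cell) gives kist_limit_space = 8, so the socket may
   * still write this run. */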
  int64_t kist_limit_space =
    (int64_t) (ent->limit - ent->written) /
    (CELL_MAX_NETWORK_SIZE + TLS_PER_CELL_OVERHEAD);
  return kist_limit_space > 0;
}

/* Update the channel's socket kernel information. */
static void
update_socket_info(socket_table_t *table, const channel_t *chan)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  if (SCHED_BUG(!ent, chan)) {
    return; // Whelp. Entry didn't exist for some reason so nothing to do.
  }
  update_socket_info_impl(ent);
  log_debug(LD_SCHED, "chan=%" PRIu64 " updated socket info, limit: %" PRIu64
                      ", cwnd: %" PRIu32 ", unacked: %" PRIu32
                      ", notsent: %" PRIu32 ", mss: %" PRIu32,
            ent->chan->global_identifier, ent->limit, ent->cwnd, ent->unacked,
            ent->notsent, ent->mss);
}

/* Increment the channel's socket written value by the number of bytes. */
static void
update_socket_written(socket_table_t *table, channel_t *chan, size_t bytes)
{
  socket_table_ent_t *ent = NULL;
  ent = socket_table_search(table, chan);
  if (SCHED_BUG(!ent, chan)) {
    return; // Whelp. Entry didn't exist so nothing to do.
  }

  log_debug(LD_SCHED, "chan=%" PRIu64 " wrote %lu bytes, old was %" PRIi64,
            chan->global_identifier, (unsigned long) bytes, ent->written);

  ent->written += bytes;
}

/*
 * A naive KIST impl would write every single cell all the way to the kernel.
 * That would take a lot of system calls. A less bad KIST impl would write a
 * channel's outbuf to the kernel only when we are switching to a different
 * channel. But if we have two channels with equal priority, we end up writing
 * one cell for each and bouncing back and forth. This KIST impl avoids that
 * by only writing a channel's outbuf to the kernel if it has 8 cells or more
 * in it.
 *
 * Note: The number 8 has been picked for no particular reason except that it
 * is roughly 4096 bytes, which is a common buffering size. A TLS record can
 * hold up to 16KiB, so using 8 cells means that a relay will at most send a
 * TLS record of about 4KiB, or 1/4 of the maximum capacity of a TLS record.
 */
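/* Illustrative arithmetic for the threshold above: with CELL_MAX_NETWORK_SIZE
 * at 514 bytes, 8 cells is 8 * 514 = 4112 bytes of outbuf data, just over the
 * 4096-byte figure the note refers to. */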
MOCK_IMPL(int, channel_should_write_to_kernel,
          (outbuf_table_t *table, channel_t *chan))
{
  outbuf_table_add(table, chan);
  /* CELL_MAX_NETWORK_SIZE * 8 because we only want to write the outbuf to the
   * kernel if there are 8 or more cells waiting. */
  return channel_outbuf_length(chan) > (CELL_MAX_NETWORK_SIZE * 8);
}

/* Little helper function to write a channel's outbuf all the way to the
 * kernel. */
MOCK_IMPL(void, channel_write_to_kernel, (channel_t *chan))
{
  tor_assert(chan);
  log_debug(LD_SCHED, "Writing %lu bytes to kernel for chan %" PRIu64,
            (unsigned long)channel_outbuf_length(chan),
            chan->global_identifier);
  /* Note that 'connection_handle_write()' may change the scheduler state of
   * the channel during the scheduling loop with
   * 'connection_or_flushed_some()' -> 'scheduler_channel_wants_writes()'.
   * This side-effect will only occur if the channel is currently in the
   * 'SCHED_CHAN_WAITING_TO_WRITE' or 'SCHED_CHAN_IDLE' states, which KIST
   * rarely uses, so it should be fine unless KIST begins using these states
   * in the future. */
  connection_handle_write(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn), 0);
}

/* Return true iff the scheduler has work to perform. */
static int
have_work(void)
{
  smartlist_t *cp = get_channels_pending();
  IF_BUG_ONCE(!cp) {
    return 0; // channels_pending doesn't exist so... no work?
  }
  return smartlist_len(cp) > 0;
}

/* Function of the scheduler interface: free_all() */
static void
kist_free_all(void)
{
  free_all_socket_info();
}

/* Function of the scheduler interface: on_channel_free() */
static void
kist_on_channel_free_fn(const channel_t *chan)
{
  free_socket_info_by_chan(&socket_table, chan);
}

/* Function of the scheduler interface: on_new_consensus() */
static void
kist_scheduler_on_new_consensus(void)
{
  set_scheduler_run_interval();
}

/* Function of the scheduler interface: on_new_options() */
static void
kist_scheduler_on_new_options(void)
{
  sock_buf_size_factor = get_options()->KISTSockBufSizeFactor;

  /* Calls kist_scheduler_run_interval which calls get_options(). */
  set_scheduler_run_interval();
}

/* Function of the scheduler interface: init() */
static void
kist_scheduler_init(void)
{
  /* When initializing the scheduler, the last run could be 0 (because it is
   * declared static) or a value in the past that was set when the scheduler
   * was last used. In both cases, we want to initialize it to now so we
   * don't risk using the value 0, which doesn't play well with our monotonic
   * time interface.
   *
   * One side effect is that the first scheduler run will be at the next tick
   * that is in now + 10 msec (KIST_SCHED_RUN_INTERVAL_DEFAULT) by default. */
  monotime_get(&scheduler_last_run);

  kist_scheduler_on_new_options();
  IF_BUG_ONCE(sched_run_interval == 0) {
    log_warn(LD_SCHED, "We are initing the KIST scheduler and noticed the "
             "KISTSchedRunInterval is telling us to not use KIST. That's "
             "weird! We'll continue using KIST, but at %" PRId32 "ms.",
             KIST_SCHED_RUN_INTERVAL_DEFAULT);
    sched_run_interval = KIST_SCHED_RUN_INTERVAL_DEFAULT;
  }
}

/* Function of the scheduler interface: schedule() */
static void
kist_scheduler_schedule(void)
{
  struct monotime_t now;
  struct timeval next_run;
  int64_t diff;

  if (!have_work()) {
    return;
  }
  monotime_get(&now);

  /* If time is really monotonic, we can never have now be smaller than the
   * last scheduler run. The scheduler_last_run is initially set to 0.
   * Unfortunately, not all platforms guarantee monotonic time, so we log at
   * info level but don't make it more noisy. */
  diff = monotime_diff_msec(&scheduler_last_run, &now);
  if (diff < 0) {
    log_info(LD_SCHED, "Monotonic time between now and last run of scheduler "
                       "is negative: %" PRId64 ". Setting diff to 0.", diff);
    diff = 0;
  }
  if (diff < sched_run_interval) {
    next_run.tv_sec = 0;
    /* Convert the remaining time from msec to usec (multiply by 1000). This
     * will always be valid because diff cannot be negative and cannot be
     * bigger than sched_run_interval, so for the maximum allowed run
     * interval (100 ms) values can only go from 1000 usec (diff set to
     * interval - 1) to 100000 usec (diff set to 0). */
    next_run.tv_usec = (int) ((sched_run_interval - diff) * 1000);
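    /* Example with illustrative values: if sched_run_interval is 10 msec and
     * the scheduler last ran 3 msec ago, the event is scheduled to fire in
     * (10 - 3) * 1000 = 7000 usec. */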
    /* Re-adding an event reschedules it. It does not duplicate it. */
    scheduler_ev_add(&next_run);
  } else {
    scheduler_ev_active();
  }
}

/* Function of the scheduler interface: run() */
static void
kist_scheduler_run(void)
{
  /* Define variables */
  channel_t *chan = NULL; // current working channel
  /* The last distinct chan served in a sched loop. */
  channel_t *prev_chan = NULL;
  int flush_result; // temporarily store results from flush calls
  /* Channels to be re-added to pending at the end */
  smartlist_t *to_readd = NULL;
  smartlist_t *cp = get_channels_pending();

  outbuf_table_t outbuf_table = HT_INITIALIZER();

  /* For each pending channel, collect new kernel information */
  SMARTLIST_FOREACH_BEGIN(cp, const channel_t *, pchan) {
      init_socket_info(&socket_table, pchan);
      update_socket_info(&socket_table, pchan);
  } SMARTLIST_FOREACH_END(pchan);

  log_debug(LD_SCHED, "Running the scheduler. %d channels pending",
            smartlist_len(cp));

  /* The main scheduling loop. Loop until there are no more pending channels */
  while (smartlist_len(cp) > 0) {
    /* get best channel */
    chan = smartlist_pqueue_pop(cp, scheduler_compare_channels,
                                offsetof(channel_t, sched_heap_idx));
    if (SCHED_BUG(!chan, NULL)) {
      /* Some-freaking-how a NULL got into the channels_pending. That should
       * never happen, but it should be harmless to ignore it and keep looping.
       */
      continue;
    }
    outbuf_table_add(&outbuf_table, chan);

    /* if we have switched to a new channel, consider writing the previous
     * channel's outbuf to the kernel. */
    if (!prev_chan) {
      prev_chan = chan;
    }
    if (prev_chan != chan) {
      if (channel_should_write_to_kernel(&outbuf_table, prev_chan)) {
        channel_write_to_kernel(prev_chan);
        outbuf_table_remove(&outbuf_table, prev_chan);
      }
      prev_chan = chan;
    }

    /* Only flush and write if the per-socket limit hasn't been hit */
    if (socket_can_write(&socket_table, chan)) {
      /* flush to channel queue/outbuf */
      flush_result = (int)channel_flush_some_cells(chan, 1); // 1 for num cells
      /* XXX: While flushing cells, it is possible that the connection write
       * fails, leading to the channel being closed, which triggers a release
       * and frees its entry in the socket table. And because of an
       * engineering design issue, the error is not propagated back, so we
       * don't get an error at this point. So before we continue, make sure
       * the channel is open and if not just ignore it. See #23751. */
      if (!CHANNEL_IS_OPEN(chan)) {
        /* Channel isn't open so we put it back in IDLE mode. It is either
         * renegotiating its TLS session or about to be released. */
        scheduler_set_channel_state(chan, SCHED_CHAN_IDLE);
        continue;
      }
      /* flush_result has the # cells flushed */
      if (flush_result > 0) {
        update_socket_written(&socket_table, chan, flush_result *
                              (CELL_MAX_NETWORK_SIZE + TLS_PER_CELL_OVERHEAD));
      } else {
        /* XXX: This can happen because Tor sometimes opportunistically
         * flushes cells from the circuit to the outbuf, so the channel can
         * end up here without having anything to flush nor needing to write
         * to the kernel. Hopefully we'll fix that soon, but for now we have
         * to handle this case, which happens kind of often. */
        log_debug(LD_SCHED,
                  "We didn't flush anything on a chan that we think "
                  "can write and wants to write. The channel's state is '%s' "
                  "and in scheduler state '%s'. We're going to mark it as "
                  "waiting_for_cells (as that's most likely the issue) and "
                  "stop scheduling it this round.",
                  channel_state_to_string(chan->state),
                  get_scheduler_state_string(chan->scheduler_state));
        scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_FOR_CELLS);
        continue;
      }
    }

    /* Decide what to do with the channel now */

    if (!channel_more_to_flush(chan) &&
        !socket_can_write(&socket_table, chan)) {

      /* Case 1: no more cells to send, and cannot write */

      /*
       * You might think we should put the channel in SCHED_CHAN_IDLE. And
       * you're probably correct. While implementing KIST, we found that the
       * scheduling system would sometimes lose track of channels when we did
       * that. We suspect it has to do with the difference between "can't
       * write because socket/outbuf is full" and KIST's "can't write because
       * we've arbitrarily decided that that's enough for now." Sometimes
       * channels run out of cells at the same time they hit their
       * kist-imposed write limit and maybe the rest of Tor doesn't put the
       * channel back in pending when it is supposed to.
       *
       * This should be investigated again. It is as simple as changing
       * SCHED_CHAN_WAITING_FOR_CELLS to SCHED_CHAN_IDLE and seeing if Tor
       * starts having serious throughput issues. Best done in shadow/chutney.
       */
      scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_FOR_CELLS);
    } else if (!channel_more_to_flush(chan)) {

      /* Case 2: no more cells to send, but still open for writes */

      scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_FOR_CELLS);
    } else if (!socket_can_write(&socket_table, chan)) {

      /* Case 3: cells to send, but cannot write */

      /*
       * We want to write, but can't. If we left the channel in
       * channels_pending, we would never exit the scheduling loop. We need to
       * add it to a temporary list of channels to be added to channels_pending
       * after the scheduling loop is over. They can hopefully be taken care of
       * in the next scheduling round.
       */
      if (!to_readd) {
        to_readd = smartlist_new();
      }
      smartlist_add(to_readd, chan);
    } else {

      /* Case 4: cells to send, and still open for writes */

      scheduler_set_channel_state(chan, SCHED_CHAN_PENDING);
      if (!SCHED_BUG(chan->sched_heap_idx != -1, chan)) {
        smartlist_pqueue_add(cp, scheduler_compare_channels,
                             offsetof(channel_t, sched_heap_idx), chan);
      }
    }
  } /* End of main scheduling loop */

  /* Write the outbuf of any channels that still have data */
  HT_FOREACH_FN(outbuf_table_s, &outbuf_table, each_channel_write_to_kernel,
                NULL);
  /* We are done with it. */
  HT_FOREACH_FN(outbuf_table_s, &outbuf_table, free_outbuf_info_by_ent, NULL);
  HT_CLEAR(outbuf_table_s, &outbuf_table);

  log_debug(LD_SCHED, "len pending=%d, len to_readd=%d",
            smartlist_len(cp),
            (to_readd ? smartlist_len(to_readd) : -1));

  /* Re-add any channels we need to */
  if (to_readd) {
    SMARTLIST_FOREACH_BEGIN(to_readd, channel_t *, readd_chan) {
      scheduler_set_channel_state(readd_chan, SCHED_CHAN_PENDING);
      if (!smartlist_contains(cp, readd_chan)) {
        if (!SCHED_BUG(readd_chan->sched_heap_idx != -1, readd_chan)) {
          /* XXXX Note that the check above is in theory redundant with
           * the smartlist_contains check. But let's make sure we're
           * not messing anything up, and leave them both for now. */
          smartlist_pqueue_add(cp, scheduler_compare_channels,
                               offsetof(channel_t, sched_heap_idx),
                               readd_chan);
        }
      }
    } SMARTLIST_FOREACH_END(readd_chan);
    smartlist_free(to_readd);
  }

  monotime_get(&scheduler_last_run);
}

/*****************************************************************************
 * Externally called function implementations not called through scheduler_t
 *****************************************************************************/

/* Stores the kist scheduler function pointers. */
static scheduler_t kist_scheduler = {
  .type = SCHEDULER_KIST,
  .free_all = kist_free_all,
  .on_channel_free = kist_on_channel_free_fn,
  .init = kist_scheduler_init,
  .on_new_consensus = kist_scheduler_on_new_consensus,
  .schedule = kist_scheduler_schedule,
  .run = kist_scheduler_run,
  .on_new_options = kist_scheduler_on_new_options,
};

/* Return the KIST scheduler object. If it didn't exist, return a newly
 * allocated one, but init() is not called on it. */
scheduler_t *
get_kist_scheduler(void)
{
  return &kist_scheduler;
}

/* Check the torrc (and maybe consensus) for the configured KIST scheduler run
 * interval.
 * - If torrc > 0, then return the positive torrc value (should use KIST, and
 *   should use the set value).
 * - If torrc == 0, then look in the consensus for what the value should be.
 *   - If == 0, then return 0 (don't use KIST).
 *   - If > 0, then return the positive consensus value.
 *   - If the consensus doesn't say anything, return the default of 10
 *     milliseconds.
 */
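/* For example: with "KISTSchedRunInterval 0" in the torrc and no
 * KISTSchedRunInterval parameter in the consensus, this returns
 * KIST_SCHED_RUN_INTERVAL_DEFAULT (10 msec). */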
int
kist_scheduler_run_interval(void)
{
  int run_interval = get_options()->KISTSchedRunInterval;

  if (run_interval != 0) {
    log_debug(LD_SCHED, "Found KISTSchedRunInterval=%" PRId32 " in torrc. "
                        "Using that.", run_interval);
    return run_interval;
  }

  log_debug(LD_SCHED, "KISTSchedRunInterval=0, turning to the consensus.");

  /* Will either be the consensus value or the default. Note that 0 can be
   * returned which means the consensus wants us to NOT use KIST. */
  return networkstatus_get_param(NULL, "KISTSchedRunInterval",
                                 KIST_SCHED_RUN_INTERVAL_DEFAULT,
                                 KIST_SCHED_RUN_INTERVAL_MIN,
                                 KIST_SCHED_RUN_INTERVAL_MAX);
}

/* Set KISTLite mode, that is, KIST without kernel support. */
void
scheduler_kist_set_lite_mode(void)
{
  kist_lite_mode = 1;
  kist_scheduler.type = SCHEDULER_KIST_LITE;
  log_info(LD_SCHED,
           "Setting KIST scheduler without kernel support (KISTLite mode)");
}

/* Set KIST mode, that is, KIST with kernel support. */
void
scheduler_kist_set_full_mode(void)
{
  kist_lite_mode = 0;
  kist_scheduler.type = SCHEDULER_KIST;
  log_info(LD_SCHED,
           "Setting KIST scheduler with kernel support (KIST mode)");
}

#ifdef HAVE_KIST_SUPPORT

/* Return true iff the scheduler subsystem should use KIST. */
int
scheduler_can_use_kist(void)
{
  if (kist_no_kernel_support) {
    /* We have no kernel support so we can't use KIST. */
    return 0;
  }

  /* We do have the support; now check the run interval, which the consensus
   * can use to disable KIST. */
  int run_interval = kist_scheduler_run_interval();
  log_debug(LD_SCHED, "Determined KIST sched_run_interval should be "
                      "%" PRId32 ". Can%s use KIST.",
            run_interval, (run_interval > 0 ? "" : " not"));
  return run_interval > 0;
}

#else /* !defined(HAVE_KIST_SUPPORT) */

int
scheduler_can_use_kist(void)
{
  return 0;
}

#endif /* defined(HAVE_KIST_SUPPORT) */