LCOV - code coverage report
Current view: top level - lib/string - util_string.c (source / functions) Hit Total Coverage
Test: lcov.info Lines: 172 176 97.7 %
Date: 2021-11-24 03:28:48 Functions: 30 31 96.8 %

          Line data    Source code
       1             : /* Copyright (c) 2003-2004, Roger Dingledine
       2             :  * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
       3             :  * Copyright (c) 2007-2021, The Tor Project, Inc. */
       4             : /* See LICENSE for licensing information */
       5             : 
       6             : /**
       7             :  * \file util_string.c
       8             :  * \brief Non-standard string functions used throughout Tor.
       9             :  **/
      10             : 
      11             : #include "lib/string/util_string.h"
      12             : #include "lib/string/compat_ctype.h"
      13             : #include "lib/err/torerr.h"
      14             : #include "lib/ctime/di_ops.h"
      15             : #include "lib/defs/digest_sizes.h"
      16             : 
      17             : #include <string.h>
      18             : #include <stdlib.h>
      19             : 
      20             : /** Given <b>hlen</b> bytes at <b>haystack</b> and <b>nlen</b> bytes at
      21             :  * <b>needle</b>, return a pointer to the first occurrence of the needle
      22             :  * within the haystack, or NULL if there is no such occurrence.
      23             :  *
      24             :  * This function is <em>not</em> timing-safe.
      25             :  *
      26             :  * Requires that <b>nlen</b> be greater than zero.
      27             :  */
      28             : const void *
      29       85432 : tor_memmem(const void *_haystack, size_t hlen,
      30             :            const void *_needle, size_t nlen)
      31             : {
      32             : #if defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2)
      33       85432 :   raw_assert(nlen);
      34       85432 :   return memmem(_haystack, hlen, _needle, nlen);
      35             : #else
      36             :   /* This isn't as fast as the GLIBC implementation, but it doesn't need to
      37             :    * be. */
      38             :   const char *p, *last_possible_start;
      39             :   const char *haystack = (const char*)_haystack;
      40             :   const char *needle = (const char*)_needle;
      41             :   char first;
      42             :   raw_assert(nlen);
      43             : 
      44             :   if (nlen > hlen)
      45             :     return NULL;
      46             : 
      47             :   p = haystack;
      48             :   /* Last position at which the needle could start. */
      49             :   last_possible_start = haystack + hlen - nlen;
      50             :   first = *(const char*)needle;
      51             :   while ((p = memchr(p, first, last_possible_start + 1 - p))) {
      52             :     if (fast_memeq(p, needle, nlen))
      53             :       return p;
      54             :     if (++p > last_possible_start) {
      55             :       /* This comparison shouldn't be necessary, since if p was previously
      56             :        * equal to last_possible_start, the next memchr call would be
      57             :        * "memchr(p, first, 0)", which will return NULL. But it clarifies the
      58             :        * logic. */
      59             :       return NULL;
      60             :     }
      61             :   }
      62             :   return NULL;
      63             : #endif /* defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2) */
      64             : }
      65             : 
      66             : const void *
      67       84111 : tor_memstr(const void *haystack, size_t hlen, const char *needle)
      68             : {
      69       84111 :   return tor_memmem(haystack, hlen, needle, strlen(needle));
      70             : }
      71             : 
      72             : /** Return true iff the 'len' bytes at 'mem' are all zero. */
      73             : int
      74       87161 : fast_mem_is_zero(const char *mem, size_t len)
      75             : {
      76       87161 :   static const char ZERO[] = {
      77             :     0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
      78             :   };
      79       89032 :   while (len >= sizeof(ZERO)) {
      80             :     /* It's safe to use fast_memcmp here, since the very worst thing an
      81             :      * attacker could learn is how many initial bytes of a secret were zero */
      82       55713 :     if (fast_memcmp(mem, ZERO, sizeof(ZERO)))
      83             :       return 0;
      84        1871 :     len -= sizeof(ZERO);
      85        1871 :     mem += sizeof(ZERO);
      86             :   }
      87             :   /* Deal with leftover bytes. */
      88       33319 :   if (len)
      89       32332 :     return fast_memeq(mem, ZERO, len);
      90             : 
      91             :   return 1;
      92             : }
      93             : 
      94             : /** Return true iff the DIGEST_LEN bytes in digest are all zero. */
      95             : int
      96       46374 : tor_digest_is_zero(const char *digest)
      97             : {
      98       46374 :   return safe_mem_is_zero(digest, DIGEST_LEN);
      99             : }
     100             : 
     101             : /** Return true iff the DIGEST256_LEN bytes in digest are all zero. */
     102             : int
     103         129 : tor_digest256_is_zero(const char *digest)
     104             : {
     105         129 :   return safe_mem_is_zero(digest, DIGEST256_LEN);
     106             : }
     107             : 
     108             : /** Remove from the string <b>s</b> every character which appears in
     109             :  * <b>strip</b>. */
     110             : void
     111         139 : tor_strstrip(char *s, const char *strip)
     112             : {
     113         139 :   char *readp = s;
     114        5274 :   while (*readp) {
     115        5135 :     if (strchr(strip, *readp)) {
     116         740 :       ++readp;
     117             :     } else {
     118        4395 :       *s++ = *readp++;
     119             :     }
     120             :   }
     121         139 :   *s = '\0';
     122         139 : }
     123             : 
     124             : /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
     125             :  * lowercase. */
     126             : void
     127        3042 : tor_strlower(char *s)
     128             : {
     129       39534 :   while (*s) {
     130       36492 :     *s = TOR_TOLOWER(*s);
     131       36492 :     ++s;
     132             :   }
     133        3042 : }
     134             : 
     135             : /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
     136             :  * lowercase. */
     137             : void
     138          34 : tor_strupper(char *s)
     139             : {
     140        1367 :   while (*s) {
     141        1333 :     *s = TOR_TOUPPER(*s);
     142        1333 :     ++s;
     143             :   }
     144          34 : }
     145             : 
     146             : /** Replaces <b>old</b> with <b>replacement</b> in <b>s</b> */
     147             : void
     148           4 : tor_strreplacechar(char *s, char find, char replacement)
     149             : {
     150           8 :   for (s = strchr(s, find); s; s = strchr(s + 1, find)) {
     151           4 :     *s = replacement;
     152             :   }
     153           4 : }
     154             : 
     155             : /** Return 1 if every character in <b>s</b> is printable, else return 0.
     156             :  */
     157             : int
     158           2 : tor_strisprint(const char *s)
     159             : {
     160          11 :   while (*s) {
     161          10 :     if (!TOR_ISPRINT(*s))
     162             :       return 0;
     163           9 :     s++;
     164             :   }
     165             :   return 1;
     166             : }
     167             : 
     168             : /** Return 1 if no character in <b>s</b> is uppercase, else return 0.
     169             :  */
     170             : int
     171           6 : tor_strisnonupper(const char *s)
     172             : {
     173          63 :   while (*s) {
     174          58 :     if (TOR_ISUPPER(*s))
     175             :       return 0;
     176          57 :     s++;
     177             :   }
     178             :   return 1;
     179             : }
     180             : 
     181             : /** Return true iff every character in <b>s</b> is whitespace space; else
     182             :  * return false. */
     183             : int
     184          35 : tor_strisspace(const char *s)
     185             : {
     186          70 :   while (*s) {
     187          38 :     if (!TOR_ISSPACE(*s))
     188             :       return 0;
     189          35 :     s++;
     190             :   }
     191             :   return 1;
     192             : }
     193             : 
     194             : /** As strcmp, except that either string may be NULL.  The NULL string is
     195             :  * considered to be before any non-NULL string. */
     196             : int
     197       39267 : strcmp_opt(const char *s1, const char *s2)
     198             : {
     199       39267 :   if (!s1) {
     200        4423 :     if (!s2)
     201             :       return 0;
     202             :     else
     203           9 :       return -1;
     204       34844 :   } else if (!s2) {
     205             :     return 1;
     206             :   } else {
     207       34733 :     return strcmp(s1, s2);
     208             :   }
     209             : }
     210             : 
     211             : /** Compares the first strlen(s2) characters of s1 with s2.  Returns as for
     212             :  * strcmp.
     213             :  */
     214             : int
     215      316689 : strcmpstart(const char *s1, const char *s2)
     216             : {
     217      316689 :   size_t n = strlen(s2);
     218      316689 :   return strncmp(s1, s2, n);
     219             : }
     220             : 
     221             : /** Compares the first strlen(s2) characters of s1 with s2.  Returns as for
     222             :  * strcasecmp.
     223             :  */
     224             : int
     225       27735 : strcasecmpstart(const char *s1, const char *s2)
     226             : {
     227       27735 :   size_t n = strlen(s2);
     228       27735 :   return strncasecmp(s1, s2, n);
     229             : }
     230             : 
     231             : /** Compare the value of the string <b>prefix</b> with the start of the
     232             :  * <b>memlen</b>-byte memory chunk at <b>mem</b>.  Return as for strcmp.
     233             :  *
     234             :  * [As fast_memcmp(mem, prefix, strlen(prefix)) but returns -1 if memlen is
     235             :  * less than strlen(prefix).]
     236             :  */
     237             : int
     238           0 : fast_memcmpstart(const void *mem, size_t memlen,
     239             :                 const char *prefix)
     240             : {
     241           0 :   size_t plen = strlen(prefix);
     242           0 :   if (memlen < plen)
     243             :     return -1;
     244           0 :   return fast_memcmp(mem, prefix, plen);
     245             : }
     246             : 
     247             : /** Compares the last strlen(s2) characters of s1 with s2.  Returns as for
     248             :  * strcmp.
     249             :  */
     250             : int
     251        1085 : strcmpend(const char *s1, const char *s2)
     252             : {
     253        1085 :   size_t n1 = strlen(s1), n2 = strlen(s2);
     254        1085 :   if (n2>n1)
     255           4 :     return strcmp(s1,s2);
     256             :   else
     257        1081 :     return strncmp(s1+(n1-n2), s2, n2);
     258             : }
     259             : 
     260             : /** Compares the last strlen(s2) characters of s1 with s2.  Returns as for
     261             :  * strcasecmp.
     262             :  */
     263             : int
     264         768 : strcasecmpend(const char *s1, const char *s2)
     265             : {
     266         768 :   size_t n1 = strlen(s1), n2 = strlen(s2);
     267         768 :   if (n2>n1) /* then they can't be the same; figure out which is bigger */
     268         435 :     return strcasecmp(s1,s2);
     269             :   else
     270         333 :     return strncasecmp(s1+(n1-n2), s2, n2);
     271             : }
     272             : 
     273             : /** Return a pointer to the first char of s that is not whitespace and
     274             :  * not a comment, or to the terminating NUL if no such character exists.
     275             :  */
     276             : const char *
     277      235463 : eat_whitespace(const char *s)
     278             : {
     279      235463 :   raw_assert(s);
     280             : 
     281      275682 :   while (1) {
     282      275682 :     switch (*s) {
     283      235463 :     case '\0':
     284             :     default:
     285      235463 :       return s;
     286       39982 :     case ' ':
     287             :     case '\t':
     288             :     case '\n':
     289             :     case '\r':
     290       39982 :       ++s;
     291       39982 :       break;
     292         237 :     case '#':
     293         237 :       ++s;
     294         237 :       while (*s && *s != '\n')
     295         641 :         ++s;
     296             :     }
     297             :   }
     298             : }
     299             : 
     300             : /** Return a pointer to the first char of s that is not whitespace and
     301             :  * not a comment, or to the terminating NUL if no such character exists.
     302             :  */
     303             : const char *
     304      537501 : eat_whitespace_eos(const char *s, const char *eos)
     305             : {
     306      537501 :   raw_assert(s);
     307      537501 :   raw_assert(eos && s <= eos);
     308             : 
     309      677749 :   while (s < eos) {
     310      612848 :     switch (*s) {
     311             :     case '\0':
     312             :     default:
     313             :       return s;
     314      140010 :     case ' ':
     315             :     case '\t':
     316             :     case '\n':
     317             :     case '\r':
     318      140010 :       ++s;
     319      140010 :       break;
     320         238 :     case '#':
     321         238 :       ++s;
     322         775 :       while (s < eos && *s && *s != '\n')
     323         537 :         ++s;
     324             :     }
     325             :   }
     326             :   return s;
     327             : }
     328             : 
     329             : /** Return a pointer to the first char of s that is not a space or a tab
     330             :  * or a \\r, or to the terminating NUL if no such character exists. */
     331             : const char *
     332         243 : eat_whitespace_no_nl(const char *s)
     333             : {
     334         412 :   while (*s == ' ' || *s == '\t' || *s == '\r')
     335         169 :     ++s;
     336         243 :   return s;
     337             : }
     338             : 
     339             : /** As eat_whitespace_no_nl, but stop at <b>eos</b> whether we have
     340             :  * found a non-whitespace character or not. */
     341             : const char *
     342      182695 : eat_whitespace_eos_no_nl(const char *s, const char *eos)
     343             : {
     344      305200 :   while (s < eos && (*s == ' ' || *s == '\t' || *s == '\r'))
     345      122505 :     ++s;
     346      182695 :   return s;
     347             : }
     348             : 
     349             : /** Return a pointer to the first char of s that is whitespace or <b>#</b>,
     350             :  * or to the terminating NUL if no such character exists.
     351             :  */
     352             : const char *
     353      202113 : find_whitespace(const char *s)
     354             : {
     355             :   /* tor_assert(s); */
     356     4183585 :   while (1) {
     357     2192849 :     switch (*s)
     358             :     {
     359      202113 :     case '\0':
     360             :     case '#':
     361             :     case ' ':
     362             :     case '\r':
     363             :     case '\n':
     364             :     case '\t':
     365      202113 :       return s;
     366     1990736 :     default:
     367     1990736 :       ++s;
     368             :     }
     369             :   }
     370             : }
     371             : 
     372             : /** As find_whitespace, but stop at <b>eos</b> whether we have found a
     373             :  * whitespace or not. */
     374             : const char *
     375      184256 : find_whitespace_eos(const char *s, const char *eos)
     376             : {
     377             :   /* tor_assert(s); */
     378     1639215 :   while (s < eos) {
     379     1582595 :     switch (*s)
     380             :     {
     381             :     case '\0':
     382             :     case '#':
     383             :     case ' ':
     384             :     case '\r':
     385             :     case '\n':
     386             :     case '\t':
     387             :       return s;
     388     1454959 :     default:
     389     1454959 :       ++s;
     390             :     }
     391             :   }
     392             :   return s;
     393             : }
     394             : 
     395             : /** Return the first occurrence of <b>needle</b> in <b>haystack</b> that
     396             :  * occurs at the start of a line (that is, at the beginning of <b>haystack</b>
     397             :  * or immediately after a newline).  Return NULL if no such string is found.
     398             :  */
     399             : const char *
     400         711 : find_str_at_start_of_line(const char *haystack, const char *needle)
     401             : {
     402         711 :   size_t needle_len = strlen(needle);
     403             : 
     404         733 :   do {
     405         733 :     if (!strncmp(haystack, needle, needle_len))
     406         705 :       return haystack;
     407             : 
     408          28 :     haystack = strchr(haystack, '\n');
     409          28 :     if (!haystack)
     410             :       return NULL;
     411             :     else
     412          23 :       ++haystack;
     413          23 :   } while (*haystack);
     414             : 
     415             :   return NULL;
     416             : }
     417             : 
     418             : /** Returns true if <b>string</b> could be a C identifier.
     419             :     A C identifier must begin with a letter or an underscore and the
     420             :     rest of its characters can be letters, numbers or underscores. No
     421             :     length limit is imposed. */
     422             : int
     423         244 : string_is_C_identifier(const char *string)
     424             : {
     425         244 :   size_t iter;
     426         244 :   size_t length = strlen(string);
     427         244 :   if (!length)
     428             :     return 0;
     429             : 
     430        1288 :   for (iter = 0; iter < length ; iter++) {
     431        1130 :     if (iter == 0) {
     432         243 :       if (!(TOR_ISALPHA(string[iter]) ||
     433          87 :             string[iter] == '_'))
     434             :         return 0;
     435             :     } else {
     436         941 :       if (!(TOR_ISALPHA(string[iter]) ||
     437         159 :             TOR_ISDIGIT(string[iter]) ||
     438             :             string[iter] == '_'))
     439             :         return 0;
     440             :     }
     441             :   }
     442             : 
     443             :   return 1;
     444             : }
     445             : 
     446             : /** A byte with the top <b>x</b> bits set. */
     447             : #define TOP_BITS(x) ((uint8_t)(0xFF << (8 - (x))))
     448             : /** A byte with the lowest <b>x</b> bits set. */
     449             : #define LOW_BITS(x) ((uint8_t)(0xFF >> (8 - (x))))
     450             : 
     451             : /** Given the leading byte <b>b</b>, return the total number of bytes in the
     452             :  * UTF-8 character. Returns 0 if it's an invalid leading byte.
     453             :  */
     454             : static uint8_t
     455        1343 : bytes_in_char(uint8_t b)
     456             : {
     457        1343 :   if ((TOP_BITS(1) & b) == 0x00)
     458             :     return 1; // a 1-byte UTF-8 char, aka ASCII
     459          34 :   if ((TOP_BITS(3) & b) == TOP_BITS(2))
     460             :     return 2; // a 2-byte UTF-8 char
     461          27 :   if ((TOP_BITS(4) & b) == TOP_BITS(3))
     462             :     return 3; // a 3-byte UTF-8 char
     463          14 :   if ((TOP_BITS(5) & b) == TOP_BITS(4))
     464           9 :     return 4; // a 4-byte UTF-8 char
     465             : 
     466             :   // Invalid: either the top 2 bits are 10, or the top 5 bits are 11111.
     467             :   return 0;
     468             : }
     469             : 
     470             : /** Returns true iff <b>b</b> is a UTF-8 continuation byte. */
     471             : static bool
     472          53 : is_continuation_byte(uint8_t b)
     473             : {
     474          53 :   uint8_t top2bits = b & TOP_BITS(2);
     475          53 :   return top2bits == TOP_BITS(1);
     476             : }
     477             : 
     478             : /** Returns true iff the <b>len</b> bytes in <b>c</b> are a valid UTF-8
     479             :  * character.
     480             :  */
     481             : static bool
     482        1334 : validate_char(const uint8_t *c, uint8_t len)
     483             : {
     484        1334 :   if (len == 1)
     485             :     return true; // already validated this is an ASCII char earlier.
     486             : 
     487          25 :   uint8_t mask = LOW_BITS(7 - len); // bitmask for the leading byte.
     488          25 :   uint32_t codepoint = c[0] & mask;
     489             : 
     490          25 :   mask = LOW_BITS(6); // bitmask for continuation bytes.
     491          74 :   for (uint8_t i = 1; i < len; i++) {
     492          53 :     if (!is_continuation_byte(c[i]))
     493             :       return false;
     494          49 :     codepoint <<= 6;
     495          49 :     codepoint |= (c[i] & mask);
     496             :   }
     497             : 
     498          21 :   if (len == 2 && codepoint <= 0x7f)
     499             :     return false; // Invalid, overly long encoding, should have fit in 1 byte.
     500             : 
     501          20 :   if (len == 3 && codepoint <= 0x7ff)
     502             :     return false; // Invalid, overly long encoding, should have fit in 2 bytes.
     503             : 
     504          19 :   if (len == 4 && codepoint <= 0xffff)
     505             :     return false; // Invalid, overly long encoding, should have fit in 3 bytes.
     506             : 
     507          18 :   if (codepoint >= 0xd800 && codepoint <= 0xdfff)
     508             :     return false; // Invalid, reserved for UTF-16 surrogate pairs.
     509             : 
     510          13 :   return codepoint <= 0x10ffff; // Check if within maximum.
     511             : }
     512             : 
     513             : /** Returns true iff the first <b>len</b> bytes in <b>str</b> are a
     514             :     valid UTF-8 string. */
     515             : int
     516         119 : string_is_utf8(const char *str, size_t len)
     517             : {
     518             :   // If str is NULL, don't try to read it
     519         119 :   if (!str) {
     520             :     // We could test for this case, but the low-level logs would produce
     521             :     // confusing test output.
     522             :     // LCOV_EXCL_START
     523             :     if (len) {
     524             :       // Use the low-level logging function, so that the log module can
     525             :       // validate UTF-8 (if needed in future code)
     526             :       tor_log_err_sigsafe(
     527             :         "BUG: string_is_utf8() called with NULL str but non-zero len.");
     528             :       // Since it's a bug, we should probably reject this string
     529             :       return false;
     530             :     }
     531             :     // LCOV_EXCL_STOP
     532             :     return true;
     533             :   }
     534             : 
     535        1438 :   for (size_t i = 0; i < len;) {
     536        1343 :     uint8_t num_bytes = bytes_in_char(str[i]);
     537        1343 :     if (num_bytes == 0) // Invalid leading byte found.
     538             :       return false;
     539             : 
     540        1338 :     size_t next_char = i + num_bytes;
     541        1338 :     if (next_char > len)
     542             :       return false;
     543             : 
     544             :     // Validate the continuation bytes in this multi-byte character,
     545             :     // and advance to the next character in the string.
     546        1334 :     if (!validate_char((const uint8_t*)&str[i], num_bytes))
     547             :       return false;
     548             :     i = next_char;
     549             :   }
     550             :   return true;
     551             : }
     552             : 
     553             : /** As string_is_utf8(), but returns false if the string begins with a UTF-8
     554             :  * byte order mark (BOM).
     555             :  */
     556             : int
     557           4 : string_is_utf8_no_bom(const char *str, size_t len)
     558             : {
     559           4 :   if (str && len >= 3 && (!strcmpstart(str, "\uFEFF") ||
     560           2 :                           !strcmpstart(str, "\uFFFE"))) {
     561             :     return false;
     562             :   }
     563           1 :   return string_is_utf8(str, len);
     564             : }

Generated by: LCOV version 1.14