Tor  0.4.7.0-alpha-dev
util_string.c
Go to the documentation of this file.
1 /* Copyright (c) 2003-2004, Roger Dingledine
2  * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
3  * Copyright (c) 2007-2021, The Tor Project, Inc. */
4 /* See LICENSE for licensing information */
5 
6 /**
7  * \file util_string.c
8  * \brief Non-standard string functions used throughout Tor.
9  **/
10 
11 #include "lib/string/util_string.h"
13 #include "lib/err/torerr.h"
14 #include "lib/ctime/di_ops.h"
15 #include "lib/defs/digest_sizes.h"
16 
17 #include <string.h>
18 #include <stdlib.h>
19 
20 /** Given <b>hlen</b> bytes at <b>haystack</b> and <b>nlen</b> bytes at
21  * <b>needle</b>, return a pointer to the first occurrence of the needle
22  * within the haystack, or NULL if there is no such occurrence.
23  *
24  * This function is <em>not</em> timing-safe.
25  *
26  * Requires that <b>nlen</b> be greater than zero.
27  */
28 const void *
29 tor_memmem(const void *_haystack, size_t hlen,
30  const void *_needle, size_t nlen)
31 {
32 #if defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2)
33  raw_assert(nlen);
34  return memmem(_haystack, hlen, _needle, nlen);
35 #else
36  /* This isn't as fast as the GLIBC implementation, but it doesn't need to
37  * be. */
38  const char *p, *last_possible_start;
39  const char *haystack = (const char*)_haystack;
40  const char *needle = (const char*)_needle;
41  char first;
42  raw_assert(nlen);
43 
44  if (nlen > hlen)
45  return NULL;
46 
47  p = haystack;
48  /* Last position at which the needle could start. */
49  last_possible_start = haystack + hlen - nlen;
50  first = *(const char*)needle;
51  while ((p = memchr(p, first, last_possible_start + 1 - p))) {
52  if (fast_memeq(p, needle, nlen))
53  return p;
54  if (++p > last_possible_start) {
55  /* This comparison shouldn't be necessary, since if p was previously
56  * equal to last_possible_start, the next memchr call would be
57  * "memchr(p, first, 0)", which will return NULL. But it clarifies the
58  * logic. */
59  return NULL;
60  }
61  }
62  return NULL;
63 #endif /* defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2) */
64 }
65 
66 const void *
67 tor_memstr(const void *haystack, size_t hlen, const char *needle)
68 {
69  return tor_memmem(haystack, hlen, needle, strlen(needle));
70 }
71 
72 /** Return true iff the 'len' bytes at 'mem' are all zero. */
73 int
74 fast_mem_is_zero(const char *mem, size_t len)
75 {
76  static const char ZERO[] = {
77  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
78  };
79  while (len >= sizeof(ZERO)) {
80  /* It's safe to use fast_memcmp here, since the very worst thing an
81  * attacker could learn is how many initial bytes of a secret were zero */
82  if (fast_memcmp(mem, ZERO, sizeof(ZERO)))
83  return 0;
84  len -= sizeof(ZERO);
85  mem += sizeof(ZERO);
86  }
87  /* Deal with leftover bytes. */
88  if (len)
89  return fast_memeq(mem, ZERO, len);
90 
91  return 1;
92 }
93 
94 /** Return true iff the DIGEST_LEN bytes in digest are all zero. */
95 int
96 tor_digest_is_zero(const char *digest)
97 {
98  return safe_mem_is_zero(digest, DIGEST_LEN);
99 }
100 
101 /** Return true iff the DIGEST256_LEN bytes in digest are all zero. */
102 int
103 tor_digest256_is_zero(const char *digest)
104 {
105  return safe_mem_is_zero(digest, DIGEST256_LEN);
106 }
107 
108 /** Remove from the string <b>s</b> every character which appears in
109  * <b>strip</b>. */
110 void
111 tor_strstrip(char *s, const char *strip)
112 {
113  char *readp = s;
114  while (*readp) {
115  if (strchr(strip, *readp)) {
116  ++readp;
117  } else {
118  *s++ = *readp++;
119  }
120  }
121  *s = '\0';
122 }
123 
124 /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
125  * lowercase. */
126 void
127 tor_strlower(char *s)
128 {
129  while (*s) {
130  *s = TOR_TOLOWER(*s);
131  ++s;
132  }
133 }
134 
135 /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
136  * lowercase. */
137 void
138 tor_strupper(char *s)
139 {
140  while (*s) {
141  *s = TOR_TOUPPER(*s);
142  ++s;
143  }
144 }
145 
146 /** Replaces <b>old</b> with <b>replacement</b> in <b>s</b> */
147 void
148 tor_strreplacechar(char *s, char find, char replacement)
149 {
150  for (s = strchr(s, find); s; s = strchr(s + 1, find)) {
151  *s = replacement;
152  }
153 }
154 
155 /** Return 1 if every character in <b>s</b> is printable, else return 0.
156  */
157 int
158 tor_strisprint(const char *s)
159 {
160  while (*s) {
161  if (!TOR_ISPRINT(*s))
162  return 0;
163  s++;
164  }
165  return 1;
166 }
167 
168 /** Return 1 if no character in <b>s</b> is uppercase, else return 0.
169  */
170 int
171 tor_strisnonupper(const char *s)
172 {
173  while (*s) {
174  if (TOR_ISUPPER(*s))
175  return 0;
176  s++;
177  }
178  return 1;
179 }
180 
181 /** Return true iff every character in <b>s</b> is whitespace space; else
182  * return false. */
183 int
184 tor_strisspace(const char *s)
185 {
186  while (*s) {
187  if (!TOR_ISSPACE(*s))
188  return 0;
189  s++;
190  }
191  return 1;
192 }
193 
194 /** As strcmp, except that either string may be NULL. The NULL string is
195  * considered to be before any non-NULL string. */
196 int
197 strcmp_opt(const char *s1, const char *s2)
198 {
199  if (!s1) {
200  if (!s2)
201  return 0;
202  else
203  return -1;
204  } else if (!s2) {
205  return 1;
206  } else {
207  return strcmp(s1, s2);
208  }
209 }
210 
211 /** Compares the first strlen(s2) characters of s1 with s2. Returns as for
212  * strcmp.
213  */
214 int
215 strcmpstart(const char *s1, const char *s2)
216 {
217  size_t n = strlen(s2);
218  return strncmp(s1, s2, n);
219 }
220 
221 /** Compares the first strlen(s2) characters of s1 with s2. Returns as for
222  * strcasecmp.
223  */
224 int
225 strcasecmpstart(const char *s1, const char *s2)
226 {
227  size_t n = strlen(s2);
228  return strncasecmp(s1, s2, n);
229 }
230 
231 /** Compare the value of the string <b>prefix</b> with the start of the
232  * <b>memlen</b>-byte memory chunk at <b>mem</b>. Return as for strcmp.
233  *
234  * [As fast_memcmp(mem, prefix, strlen(prefix)) but returns -1 if memlen is
235  * less than strlen(prefix).]
236  */
237 int
238 fast_memcmpstart(const void *mem, size_t memlen,
239  const char *prefix)
240 {
241  size_t plen = strlen(prefix);
242  if (memlen < plen)
243  return -1;
244  return fast_memcmp(mem, prefix, plen);
245 }
246 
247 /** Compares the last strlen(s2) characters of s1 with s2. Returns as for
248  * strcmp.
249  */
250 int
251 strcmpend(const char *s1, const char *s2)
252 {
253  size_t n1 = strlen(s1), n2 = strlen(s2);
254  if (n2>n1)
255  return strcmp(s1,s2);
256  else
257  return strncmp(s1+(n1-n2), s2, n2);
258 }
259 
260 /** Compares the last strlen(s2) characters of s1 with s2. Returns as for
261  * strcasecmp.
262  */
263 int
264 strcasecmpend(const char *s1, const char *s2)
265 {
266  size_t n1 = strlen(s1), n2 = strlen(s2);
267  if (n2>n1) /* then they can't be the same; figure out which is bigger */
268  return strcasecmp(s1,s2);
269  else
270  return strncasecmp(s1+(n1-n2), s2, n2);
271 }
272 
273 /** Return a pointer to the first char of s that is not whitespace and
274  * not a comment, or to the terminating NUL if no such character exists.
275  */
276 const char *
277 eat_whitespace(const char *s)
278 {
279  raw_assert(s);
280 
281  while (1) {
282  switch (*s) {
283  case '\0':
284  default:
285  return s;
286  case ' ':
287  case '\t':
288  case '\n':
289  case '\r':
290  ++s;
291  break;
292  case '#':
293  ++s;
294  while (*s && *s != '\n')
295  ++s;
296  }
297  }
298 }
299 
300 /** Return a pointer to the first char of s that is not whitespace and
301  * not a comment, or to the terminating NUL if no such character exists.
302  */
303 const char *
304 eat_whitespace_eos(const char *s, const char *eos)
305 {
306  raw_assert(s);
307  raw_assert(eos && s <= eos);
308 
309  while (s < eos) {
310  switch (*s) {
311  case '\0':
312  default:
313  return s;
314  case ' ':
315  case '\t':
316  case '\n':
317  case '\r':
318  ++s;
319  break;
320  case '#':
321  ++s;
322  while (s < eos && *s && *s != '\n')
323  ++s;
324  }
325  }
326  return s;
327 }
328 
329 /** Return a pointer to the first char of s that is not a space or a tab
330  * or a \\r, or to the terminating NUL if no such character exists. */
331 const char *
332 eat_whitespace_no_nl(const char *s)
333 {
334  while (*s == ' ' || *s == '\t' || *s == '\r')
335  ++s;
336  return s;
337 }
338 
339 /** As eat_whitespace_no_nl, but stop at <b>eos</b> whether we have
340  * found a non-whitespace character or not. */
341 const char *
342 eat_whitespace_eos_no_nl(const char *s, const char *eos)
343 {
344  while (s < eos && (*s == ' ' || *s == '\t' || *s == '\r'))
345  ++s;
346  return s;
347 }
348 
349 /** Return a pointer to the first char of s that is whitespace or <b>#</b>,
350  * or to the terminating NUL if no such character exists.
351  */
352 const char *
353 find_whitespace(const char *s)
354 {
355  /* tor_assert(s); */
356  while (1) {
357  switch (*s)
358  {
359  case '\0':
360  case '#':
361  case ' ':
362  case '\r':
363  case '\n':
364  case '\t':
365  return s;
366  default:
367  ++s;
368  }
369  }
370 }
371 
372 /** As find_whitespace, but stop at <b>eos</b> whether we have found a
373  * whitespace or not. */
374 const char *
375 find_whitespace_eos(const char *s, const char *eos)
376 {
377  /* tor_assert(s); */
378  while (s < eos) {
379  switch (*s)
380  {
381  case '\0':
382  case '#':
383  case ' ':
384  case '\r':
385  case '\n':
386  case '\t':
387  return s;
388  default:
389  ++s;
390  }
391  }
392  return s;
393 }
394 
395 /** Return the first occurrence of <b>needle</b> in <b>haystack</b> that
396  * occurs at the start of a line (that is, at the beginning of <b>haystack</b>
397  * or immediately after a newline). Return NULL if no such string is found.
398  */
399 const char *
400 find_str_at_start_of_line(const char *haystack, const char *needle)
401 {
402  size_t needle_len = strlen(needle);
403 
404  do {
405  if (!strncmp(haystack, needle, needle_len))
406  return haystack;
407 
408  haystack = strchr(haystack, '\n');
409  if (!haystack)
410  return NULL;
411  else
412  ++haystack;
413  } while (*haystack);
414 
415  return NULL;
416 }
417 
418 /** Returns true if <b>string</b> could be a C identifier.
419  A C identifier must begin with a letter or an underscore and the
420  rest of its characters can be letters, numbers or underscores. No
421  length limit is imposed. */
422 int
423 string_is_C_identifier(const char *string)
424 {
425  size_t iter;
426  size_t length = strlen(string);
427  if (!length)
428  return 0;
429 
430  for (iter = 0; iter < length ; iter++) {
431  if (iter == 0) {
432  if (!(TOR_ISALPHA(string[iter]) ||
433  string[iter] == '_'))
434  return 0;
435  } else {
436  if (!(TOR_ISALPHA(string[iter]) ||
437  TOR_ISDIGIT(string[iter]) ||
438  string[iter] == '_'))
439  return 0;
440  }
441  }
442 
443  return 1;
444 }
445 
446 /** A byte with the top <b>x</b> bits set. */
447 #define TOP_BITS(x) ((uint8_t)(0xFF << (8 - (x))))
448 /** A byte with the lowest <b>x</b> bits set. */
449 #define LOW_BITS(x) ((uint8_t)(0xFF >> (8 - (x))))
450 
451 /** Given the leading byte <b>b</b>, return the total number of bytes in the
452  * UTF-8 character. Returns 0 if it's an invalid leading byte.
453  */
454 static uint8_t
455 bytes_in_char(uint8_t b)
456 {
457  if ((TOP_BITS(1) & b) == 0x00)
458  return 1; // a 1-byte UTF-8 char, aka ASCII
459  if ((TOP_BITS(3) & b) == TOP_BITS(2))
460  return 2; // a 2-byte UTF-8 char
461  if ((TOP_BITS(4) & b) == TOP_BITS(3))
462  return 3; // a 3-byte UTF-8 char
463  if ((TOP_BITS(5) & b) == TOP_BITS(4))
464  return 4; // a 4-byte UTF-8 char
465 
466  // Invalid: either the top 2 bits are 10, or the top 5 bits are 11111.
467  return 0;
468 }
469 
470 /** Returns true iff <b>b</b> is a UTF-8 continuation byte. */
471 static bool
473 {
474  uint8_t top2bits = b & TOP_BITS(2);
475  return top2bits == TOP_BITS(1);
476 }
477 
478 /** Returns true iff the <b>len</b> bytes in <b>c</b> are a valid UTF-8
479  * character.
480  */
481 static bool
482 validate_char(const uint8_t *c, uint8_t len)
483 {
484  if (len == 1)
485  return true; // already validated this is an ASCII char earlier.
486 
487  uint8_t mask = LOW_BITS(7 - len); // bitmask for the leading byte.
488  uint32_t codepoint = c[0] & mask;
489 
490  mask = LOW_BITS(6); // bitmask for continuation bytes.
491  for (uint8_t i = 1; i < len; i++) {
492  if (!is_continuation_byte(c[i]))
493  return false;
494  codepoint <<= 6;
495  codepoint |= (c[i] & mask);
496  }
497 
498  if (len == 2 && codepoint <= 0x7f)
499  return false; // Invalid, overly long encoding, should have fit in 1 byte.
500 
501  if (len == 3 && codepoint <= 0x7ff)
502  return false; // Invalid, overly long encoding, should have fit in 2 bytes.
503 
504  if (len == 4 && codepoint <= 0xffff)
505  return false; // Invalid, overly long encoding, should have fit in 3 bytes.
506 
507  if (codepoint >= 0xd800 && codepoint <= 0xdfff)
508  return false; // Invalid, reserved for UTF-16 surrogate pairs.
509 
510  return codepoint <= 0x10ffff; // Check if within maximum.
511 }
512 
513 /** Returns true iff the first <b>len</b> bytes in <b>str</b> are a
514  valid UTF-8 string. */
515 int
516 string_is_utf8(const char *str, size_t len)
517 {
518  // If str is NULL, don't try to read it
519  if (!str) {
520  // We could test for this case, but the low-level logs would produce
521  // confusing test output.
522  // LCOV_EXCL_START
523  if (len) {
524  // Use the low-level logging function, so that the log module can
525  // validate UTF-8 (if needed in future code)
527  "BUG: string_is_utf8() called with NULL str but non-zero len.");
528  // Since it's a bug, we should probably reject this string
529  return false;
530  }
531  // LCOV_EXCL_STOP
532  return true;
533  }
534 
535  for (size_t i = 0; i < len;) {
536  uint8_t num_bytes = bytes_in_char(str[i]);
537  if (num_bytes == 0) // Invalid leading byte found.
538  return false;
539 
540  size_t next_char = i + num_bytes;
541  if (next_char > len)
542  return false;
543 
544  // Validate the continuation bytes in this multi-byte character,
545  // and advance to the next character in the string.
546  if (!validate_char((const uint8_t*)&str[i], num_bytes))
547  return false;
548  i = next_char;
549  }
550  return true;
551 }
552 
553 /** As string_is_utf8(), but returns false if the string begins with a UTF-8
554  * byte order mark (BOM).
555  */
556 int
557 string_is_utf8_no_bom(const char *str, size_t len)
558 {
559  if (str && len >= 3 && (!strcmpstart(str, "\uFEFF") ||
560  !strcmpstart(str, "\uFFFE"))) {
561  return false;
562  }
563  return string_is_utf8(str, len);
564 }
Locale-independent character-type inspection (header)
int safe_mem_is_zero(const void *mem, size_t sz)
Definition: di_ops.c:224
Headers for di_ops.c.
#define fast_memeq(a, b, c)
Definition: di_ops.h:35
#define fast_memcmp(a, b, c)
Definition: di_ops.h:28
Definitions for common sizes of cryptographic digests.
#define DIGEST_LEN
Definition: digest_sizes.h:20
#define DIGEST256_LEN
Definition: digest_sizes.h:23
void tor_log_err_sigsafe(const char *m,...)
Definition: torerr.c:70
Headers for torerr.c.
int tor_strisspace(const char *s)
Definition: util_string.c:184
const char * eat_whitespace_eos_no_nl(const char *s, const char *eos)
Definition: util_string.c:342
int strcasecmpstart(const char *s1, const char *s2)
Definition: util_string.c:225
void tor_strlower(char *s)
Definition: util_string.c:127
int strcmpstart(const char *s1, const char *s2)
Definition: util_string.c:215
int strcmpend(const char *s1, const char *s2)
Definition: util_string.c:251
const char * find_whitespace_eos(const char *s, const char *eos)
Definition: util_string.c:375
static uint8_t bytes_in_char(uint8_t b)
Definition: util_string.c:455
const char * find_whitespace(const char *s)
Definition: util_string.c:353
void tor_strupper(char *s)
Definition: util_string.c:138
int fast_memcmpstart(const void *mem, size_t memlen, const char *prefix)
Definition: util_string.c:238
const void * tor_memmem(const void *_haystack, size_t hlen, const void *_needle, size_t nlen)
Definition: util_string.c:29
int strcasecmpend(const char *s1, const char *s2)
Definition: util_string.c:264
int tor_strisprint(const char *s)
Definition: util_string.c:158
const char * eat_whitespace_eos(const char *s, const char *eos)
Definition: util_string.c:304
int strcmp_opt(const char *s1, const char *s2)
Definition: util_string.c:197
const char * eat_whitespace(const char *s)
Definition: util_string.c:277
int string_is_utf8_no_bom(const char *str, size_t len)
Definition: util_string.c:557
int tor_digest256_is_zero(const char *digest)
Definition: util_string.c:103
int fast_mem_is_zero(const char *mem, size_t len)
Definition: util_string.c:74
static bool validate_char(const uint8_t *c, uint8_t len)
Definition: util_string.c:482
int string_is_C_identifier(const char *string)
Definition: util_string.c:423
void tor_strstrip(char *s, const char *strip)
Definition: util_string.c:111
const char * eat_whitespace_no_nl(const char *s)
Definition: util_string.c:332
int tor_strisnonupper(const char *s)
Definition: util_string.c:171
const char * find_str_at_start_of_line(const char *haystack, const char *needle)
Definition: util_string.c:400
int string_is_utf8(const char *str, size_t len)
Definition: util_string.c:516
int tor_digest_is_zero(const char *digest)
Definition: util_string.c:96
void tor_strreplacechar(char *s, char find, char replacement)
Definition: util_string.c:148
static bool is_continuation_byte(uint8_t b)
Definition: util_string.c:472
#define LOW_BITS(x)
Definition: util_string.c:449
#define TOP_BITS(x)
Definition: util_string.c:447
Header for util_string.c.