Line data Source code
1 : /* Copyright (c) 2003-2004, Roger Dingledine
2 : * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
3 : * Copyright (c) 2007-2021, The Tor Project, Inc. */
4 : /* See LICENSE for licensing information */
5 :
6 : /**
7 : * \file util_string.c
8 : * \brief Non-standard string functions used throughout Tor.
9 : **/
10 :
11 : #include "lib/string/util_string.h"
12 : #include "lib/string/compat_ctype.h"
13 : #include "lib/err/torerr.h"
14 : #include "lib/ctime/di_ops.h"
15 : #include "lib/defs/digest_sizes.h"
16 :
17 : #include <string.h>
18 : #include <stdlib.h>
19 :
20 : /** Given <b>hlen</b> bytes at <b>haystack</b> and <b>nlen</b> bytes at
21 : * <b>needle</b>, return a pointer to the first occurrence of the needle
22 : * within the haystack, or NULL if there is no such occurrence.
23 : *
24 : * This function is <em>not</em> timing-safe.
25 : *
26 : * Requires that <b>nlen</b> be greater than zero.
27 : */
28 : const void *
29 85432 : tor_memmem(const void *_haystack, size_t hlen,
30 : const void *_needle, size_t nlen)
31 : {
32 : #if defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2)
33 85432 : raw_assert(nlen);
34 85432 : return memmem(_haystack, hlen, _needle, nlen);
35 : #else
36 : /* This isn't as fast as the GLIBC implementation, but it doesn't need to
37 : * be. */
38 : const char *p, *last_possible_start;
39 : const char *haystack = (const char*)_haystack;
40 : const char *needle = (const char*)_needle;
41 : char first;
42 : raw_assert(nlen);
43 :
44 : if (nlen > hlen)
45 : return NULL;
46 :
47 : p = haystack;
48 : /* Last position at which the needle could start. */
49 : last_possible_start = haystack + hlen - nlen;
50 : first = *(const char*)needle;
51 : while ((p = memchr(p, first, last_possible_start + 1 - p))) {
52 : if (fast_memeq(p, needle, nlen))
53 : return p;
54 : if (++p > last_possible_start) {
55 : /* This comparison shouldn't be necessary, since if p was previously
56 : * equal to last_possible_start, the next memchr call would be
57 : * "memchr(p, first, 0)", which will return NULL. But it clarifies the
58 : * logic. */
59 : return NULL;
60 : }
61 : }
62 : return NULL;
63 : #endif /* defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2) */
64 : }
65 :
66 : const void *
67 84111 : tor_memstr(const void *haystack, size_t hlen, const char *needle)
68 : {
69 84111 : return tor_memmem(haystack, hlen, needle, strlen(needle));
70 : }
71 :
72 : /** Return true iff the 'len' bytes at 'mem' are all zero. */
73 : int
74 87161 : fast_mem_is_zero(const char *mem, size_t len)
75 : {
76 87161 : static const char ZERO[] = {
77 : 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
78 : };
79 89032 : while (len >= sizeof(ZERO)) {
80 : /* It's safe to use fast_memcmp here, since the very worst thing an
81 : * attacker could learn is how many initial bytes of a secret were zero */
82 55713 : if (fast_memcmp(mem, ZERO, sizeof(ZERO)))
83 : return 0;
84 1871 : len -= sizeof(ZERO);
85 1871 : mem += sizeof(ZERO);
86 : }
87 : /* Deal with leftover bytes. */
88 33319 : if (len)
89 32332 : return fast_memeq(mem, ZERO, len);
90 :
91 : return 1;
92 : }
93 :
94 : /** Return true iff the DIGEST_LEN bytes in digest are all zero. */
95 : int
96 46374 : tor_digest_is_zero(const char *digest)
97 : {
98 46374 : return safe_mem_is_zero(digest, DIGEST_LEN);
99 : }
100 :
101 : /** Return true iff the DIGEST256_LEN bytes in digest are all zero. */
102 : int
103 129 : tor_digest256_is_zero(const char *digest)
104 : {
105 129 : return safe_mem_is_zero(digest, DIGEST256_LEN);
106 : }
107 :
108 : /** Remove from the string <b>s</b> every character which appears in
109 : * <b>strip</b>. */
110 : void
111 139 : tor_strstrip(char *s, const char *strip)
112 : {
113 139 : char *readp = s;
114 5274 : while (*readp) {
115 5135 : if (strchr(strip, *readp)) {
116 740 : ++readp;
117 : } else {
118 4395 : *s++ = *readp++;
119 : }
120 : }
121 139 : *s = '\0';
122 139 : }
123 :
124 : /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
125 : * lowercase. */
126 : void
127 3042 : tor_strlower(char *s)
128 : {
129 39534 : while (*s) {
130 36492 : *s = TOR_TOLOWER(*s);
131 36492 : ++s;
132 : }
133 3042 : }
134 :
135 : /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
136 : * lowercase. */
137 : void
138 34 : tor_strupper(char *s)
139 : {
140 1367 : while (*s) {
141 1333 : *s = TOR_TOUPPER(*s);
142 1333 : ++s;
143 : }
144 34 : }
145 :
146 : /** Replaces <b>old</b> with <b>replacement</b> in <b>s</b> */
147 : void
148 4 : tor_strreplacechar(char *s, char find, char replacement)
149 : {
150 8 : for (s = strchr(s, find); s; s = strchr(s + 1, find)) {
151 4 : *s = replacement;
152 : }
153 4 : }
154 :
155 : /** Return 1 if every character in <b>s</b> is printable, else return 0.
156 : */
157 : int
158 2 : tor_strisprint(const char *s)
159 : {
160 11 : while (*s) {
161 10 : if (!TOR_ISPRINT(*s))
162 : return 0;
163 9 : s++;
164 : }
165 : return 1;
166 : }
167 :
168 : /** Return 1 if no character in <b>s</b> is uppercase, else return 0.
169 : */
170 : int
171 6 : tor_strisnonupper(const char *s)
172 : {
173 63 : while (*s) {
174 58 : if (TOR_ISUPPER(*s))
175 : return 0;
176 57 : s++;
177 : }
178 : return 1;
179 : }
180 :
181 : /** Return true iff every character in <b>s</b> is whitespace space; else
182 : * return false. */
183 : int
184 35 : tor_strisspace(const char *s)
185 : {
186 70 : while (*s) {
187 38 : if (!TOR_ISSPACE(*s))
188 : return 0;
189 35 : s++;
190 : }
191 : return 1;
192 : }
193 :
194 : /** As strcmp, except that either string may be NULL. The NULL string is
195 : * considered to be before any non-NULL string. */
196 : int
197 39267 : strcmp_opt(const char *s1, const char *s2)
198 : {
199 39267 : if (!s1) {
200 4423 : if (!s2)
201 : return 0;
202 : else
203 9 : return -1;
204 34844 : } else if (!s2) {
205 : return 1;
206 : } else {
207 34733 : return strcmp(s1, s2);
208 : }
209 : }
210 :
211 : /** Compares the first strlen(s2) characters of s1 with s2. Returns as for
212 : * strcmp.
213 : */
214 : int
215 316689 : strcmpstart(const char *s1, const char *s2)
216 : {
217 316689 : size_t n = strlen(s2);
218 316689 : return strncmp(s1, s2, n);
219 : }
220 :
221 : /** Compares the first strlen(s2) characters of s1 with s2. Returns as for
222 : * strcasecmp.
223 : */
224 : int
225 27735 : strcasecmpstart(const char *s1, const char *s2)
226 : {
227 27735 : size_t n = strlen(s2);
228 27735 : return strncasecmp(s1, s2, n);
229 : }
230 :
231 : /** Compare the value of the string <b>prefix</b> with the start of the
232 : * <b>memlen</b>-byte memory chunk at <b>mem</b>. Return as for strcmp.
233 : *
234 : * [As fast_memcmp(mem, prefix, strlen(prefix)) but returns -1 if memlen is
235 : * less than strlen(prefix).]
236 : */
237 : int
238 0 : fast_memcmpstart(const void *mem, size_t memlen,
239 : const char *prefix)
240 : {
241 0 : size_t plen = strlen(prefix);
242 0 : if (memlen < plen)
243 : return -1;
244 0 : return fast_memcmp(mem, prefix, plen);
245 : }
246 :
247 : /** Compares the last strlen(s2) characters of s1 with s2. Returns as for
248 : * strcmp.
249 : */
250 : int
251 1085 : strcmpend(const char *s1, const char *s2)
252 : {
253 1085 : size_t n1 = strlen(s1), n2 = strlen(s2);
254 1085 : if (n2>n1)
255 4 : return strcmp(s1,s2);
256 : else
257 1081 : return strncmp(s1+(n1-n2), s2, n2);
258 : }
259 :
260 : /** Compares the last strlen(s2) characters of s1 with s2. Returns as for
261 : * strcasecmp.
262 : */
263 : int
264 768 : strcasecmpend(const char *s1, const char *s2)
265 : {
266 768 : size_t n1 = strlen(s1), n2 = strlen(s2);
267 768 : if (n2>n1) /* then they can't be the same; figure out which is bigger */
268 435 : return strcasecmp(s1,s2);
269 : else
270 333 : return strncasecmp(s1+(n1-n2), s2, n2);
271 : }
272 :
273 : /** Return a pointer to the first char of s that is not whitespace and
274 : * not a comment, or to the terminating NUL if no such character exists.
275 : */
276 : const char *
277 235463 : eat_whitespace(const char *s)
278 : {
279 235463 : raw_assert(s);
280 :
281 275682 : while (1) {
282 275682 : switch (*s) {
283 235463 : case '\0':
284 : default:
285 235463 : return s;
286 39982 : case ' ':
287 : case '\t':
288 : case '\n':
289 : case '\r':
290 39982 : ++s;
291 39982 : break;
292 237 : case '#':
293 237 : ++s;
294 237 : while (*s && *s != '\n')
295 641 : ++s;
296 : }
297 : }
298 : }
299 :
300 : /** Return a pointer to the first char of s that is not whitespace and
301 : * not a comment, or to the terminating NUL if no such character exists.
302 : */
303 : const char *
304 537501 : eat_whitespace_eos(const char *s, const char *eos)
305 : {
306 537501 : raw_assert(s);
307 537501 : raw_assert(eos && s <= eos);
308 :
309 677749 : while (s < eos) {
310 612848 : switch (*s) {
311 : case '\0':
312 : default:
313 : return s;
314 140010 : case ' ':
315 : case '\t':
316 : case '\n':
317 : case '\r':
318 140010 : ++s;
319 140010 : break;
320 238 : case '#':
321 238 : ++s;
322 775 : while (s < eos && *s && *s != '\n')
323 537 : ++s;
324 : }
325 : }
326 : return s;
327 : }
328 :
329 : /** Return a pointer to the first char of s that is not a space or a tab
330 : * or a \\r, or to the terminating NUL if no such character exists. */
331 : const char *
332 243 : eat_whitespace_no_nl(const char *s)
333 : {
334 412 : while (*s == ' ' || *s == '\t' || *s == '\r')
335 169 : ++s;
336 243 : return s;
337 : }
338 :
339 : /** As eat_whitespace_no_nl, but stop at <b>eos</b> whether we have
340 : * found a non-whitespace character or not. */
341 : const char *
342 182695 : eat_whitespace_eos_no_nl(const char *s, const char *eos)
343 : {
344 305200 : while (s < eos && (*s == ' ' || *s == '\t' || *s == '\r'))
345 122505 : ++s;
346 182695 : return s;
347 : }
348 :
349 : /** Return a pointer to the first char of s that is whitespace or <b>#</b>,
350 : * or to the terminating NUL if no such character exists.
351 : */
352 : const char *
353 202113 : find_whitespace(const char *s)
354 : {
355 : /* tor_assert(s); */
356 4183585 : while (1) {
357 2192849 : switch (*s)
358 : {
359 202113 : case '\0':
360 : case '#':
361 : case ' ':
362 : case '\r':
363 : case '\n':
364 : case '\t':
365 202113 : return s;
366 1990736 : default:
367 1990736 : ++s;
368 : }
369 : }
370 : }
371 :
372 : /** As find_whitespace, but stop at <b>eos</b> whether we have found a
373 : * whitespace or not. */
374 : const char *
375 184256 : find_whitespace_eos(const char *s, const char *eos)
376 : {
377 : /* tor_assert(s); */
378 1639215 : while (s < eos) {
379 1582595 : switch (*s)
380 : {
381 : case '\0':
382 : case '#':
383 : case ' ':
384 : case '\r':
385 : case '\n':
386 : case '\t':
387 : return s;
388 1454959 : default:
389 1454959 : ++s;
390 : }
391 : }
392 : return s;
393 : }
394 :
395 : /** Return the first occurrence of <b>needle</b> in <b>haystack</b> that
396 : * occurs at the start of a line (that is, at the beginning of <b>haystack</b>
397 : * or immediately after a newline). Return NULL if no such string is found.
398 : */
399 : const char *
400 711 : find_str_at_start_of_line(const char *haystack, const char *needle)
401 : {
402 711 : size_t needle_len = strlen(needle);
403 :
404 733 : do {
405 733 : if (!strncmp(haystack, needle, needle_len))
406 705 : return haystack;
407 :
408 28 : haystack = strchr(haystack, '\n');
409 28 : if (!haystack)
410 : return NULL;
411 : else
412 23 : ++haystack;
413 23 : } while (*haystack);
414 :
415 : return NULL;
416 : }
417 :
418 : /** Returns true if <b>string</b> could be a C identifier.
419 : A C identifier must begin with a letter or an underscore and the
420 : rest of its characters can be letters, numbers or underscores. No
421 : length limit is imposed. */
422 : int
423 244 : string_is_C_identifier(const char *string)
424 : {
425 244 : size_t iter;
426 244 : size_t length = strlen(string);
427 244 : if (!length)
428 : return 0;
429 :
430 1288 : for (iter = 0; iter < length ; iter++) {
431 1130 : if (iter == 0) {
432 243 : if (!(TOR_ISALPHA(string[iter]) ||
433 87 : string[iter] == '_'))
434 : return 0;
435 : } else {
436 941 : if (!(TOR_ISALPHA(string[iter]) ||
437 159 : TOR_ISDIGIT(string[iter]) ||
438 : string[iter] == '_'))
439 : return 0;
440 : }
441 : }
442 :
443 : return 1;
444 : }
445 :
446 : /** A byte with the top <b>x</b> bits set. */
447 : #define TOP_BITS(x) ((uint8_t)(0xFF << (8 - (x))))
448 : /** A byte with the lowest <b>x</b> bits set. */
449 : #define LOW_BITS(x) ((uint8_t)(0xFF >> (8 - (x))))
450 :
451 : /** Given the leading byte <b>b</b>, return the total number of bytes in the
452 : * UTF-8 character. Returns 0 if it's an invalid leading byte.
453 : */
454 : static uint8_t
455 1343 : bytes_in_char(uint8_t b)
456 : {
457 1343 : if ((TOP_BITS(1) & b) == 0x00)
458 : return 1; // a 1-byte UTF-8 char, aka ASCII
459 34 : if ((TOP_BITS(3) & b) == TOP_BITS(2))
460 : return 2; // a 2-byte UTF-8 char
461 27 : if ((TOP_BITS(4) & b) == TOP_BITS(3))
462 : return 3; // a 3-byte UTF-8 char
463 14 : if ((TOP_BITS(5) & b) == TOP_BITS(4))
464 9 : return 4; // a 4-byte UTF-8 char
465 :
466 : // Invalid: either the top 2 bits are 10, or the top 5 bits are 11111.
467 : return 0;
468 : }
469 :
470 : /** Returns true iff <b>b</b> is a UTF-8 continuation byte. */
471 : static bool
472 53 : is_continuation_byte(uint8_t b)
473 : {
474 53 : uint8_t top2bits = b & TOP_BITS(2);
475 53 : return top2bits == TOP_BITS(1);
476 : }
477 :
478 : /** Returns true iff the <b>len</b> bytes in <b>c</b> are a valid UTF-8
479 : * character.
480 : */
481 : static bool
482 1334 : validate_char(const uint8_t *c, uint8_t len)
483 : {
484 1334 : if (len == 1)
485 : return true; // already validated this is an ASCII char earlier.
486 :
487 25 : uint8_t mask = LOW_BITS(7 - len); // bitmask for the leading byte.
488 25 : uint32_t codepoint = c[0] & mask;
489 :
490 25 : mask = LOW_BITS(6); // bitmask for continuation bytes.
491 74 : for (uint8_t i = 1; i < len; i++) {
492 53 : if (!is_continuation_byte(c[i]))
493 : return false;
494 49 : codepoint <<= 6;
495 49 : codepoint |= (c[i] & mask);
496 : }
497 :
498 21 : if (len == 2 && codepoint <= 0x7f)
499 : return false; // Invalid, overly long encoding, should have fit in 1 byte.
500 :
501 20 : if (len == 3 && codepoint <= 0x7ff)
502 : return false; // Invalid, overly long encoding, should have fit in 2 bytes.
503 :
504 19 : if (len == 4 && codepoint <= 0xffff)
505 : return false; // Invalid, overly long encoding, should have fit in 3 bytes.
506 :
507 18 : if (codepoint >= 0xd800 && codepoint <= 0xdfff)
508 : return false; // Invalid, reserved for UTF-16 surrogate pairs.
509 :
510 13 : return codepoint <= 0x10ffff; // Check if within maximum.
511 : }
512 :
513 : /** Returns true iff the first <b>len</b> bytes in <b>str</b> are a
514 : valid UTF-8 string. */
515 : int
516 119 : string_is_utf8(const char *str, size_t len)
517 : {
518 : // If str is NULL, don't try to read it
519 119 : if (!str) {
520 : // We could test for this case, but the low-level logs would produce
521 : // confusing test output.
522 : // LCOV_EXCL_START
523 : if (len) {
524 : // Use the low-level logging function, so that the log module can
525 : // validate UTF-8 (if needed in future code)
526 : tor_log_err_sigsafe(
527 : "BUG: string_is_utf8() called with NULL str but non-zero len.");
528 : // Since it's a bug, we should probably reject this string
529 : return false;
530 : }
531 : // LCOV_EXCL_STOP
532 : return true;
533 : }
534 :
535 1438 : for (size_t i = 0; i < len;) {
536 1343 : uint8_t num_bytes = bytes_in_char(str[i]);
537 1343 : if (num_bytes == 0) // Invalid leading byte found.
538 : return false;
539 :
540 1338 : size_t next_char = i + num_bytes;
541 1338 : if (next_char > len)
542 : return false;
543 :
544 : // Validate the continuation bytes in this multi-byte character,
545 : // and advance to the next character in the string.
546 1334 : if (!validate_char((const uint8_t*)&str[i], num_bytes))
547 : return false;
548 : i = next_char;
549 : }
550 : return true;
551 : }
552 :
553 : /** As string_is_utf8(), but returns false if the string begins with a UTF-8
554 : * byte order mark (BOM).
555 : */
556 : int
557 4 : string_is_utf8_no_bom(const char *str, size_t len)
558 : {
559 4 : if (str && len >= 3 && (!strcmpstart(str, "\uFEFF") ||
560 2 : !strcmpstart(str, "\uFFFE"))) {
561 : return false;
562 : }
563 1 : return string_is_utf8(str, len);
564 : }
|