diff options
Diffstat (limited to 'src/lib/string')
-rw-r--r-- | src/lib/string/.may_include | 10 | ||||
-rw-r--r-- | src/lib/string/compat_ctype.c | 72 | ||||
-rw-r--r-- | src/lib/string/compat_ctype.h | 67 | ||||
-rw-r--r-- | src/lib/string/compat_string.c | 74 | ||||
-rw-r--r-- | src/lib/string/compat_string.h | 62 | ||||
-rw-r--r-- | src/lib/string/include.am | 27 | ||||
-rw-r--r-- | src/lib/string/parse_int.c | 131 | ||||
-rw-r--r-- | src/lib/string/parse_int.h | 25 | ||||
-rw-r--r-- | src/lib/string/printf.c | 167 | ||||
-rw-r--r-- | src/lib/string/printf.h | 30 | ||||
-rw-r--r-- | src/lib/string/scanf.c | 317 | ||||
-rw-r--r-- | src/lib/string/scanf.h | 24 | ||||
-rw-r--r-- | src/lib/string/util_string.c | 543 | ||||
-rw-r--r-- | src/lib/string/util_string.h | 57 |
14 files changed, 1606 insertions, 0 deletions
diff --git a/src/lib/string/.may_include b/src/lib/string/.may_include new file mode 100644 index 0000000000..ec5c769831 --- /dev/null +++ b/src/lib/string/.may_include @@ -0,0 +1,10 @@ +orconfig.h +lib/cc/*.h +lib/defs/*.h +lib/err/*.h +lib/malloc/*.h +lib/ctime/*.h +lib/string/*.h + +strlcat.c +strlcpy.c diff --git a/src/lib/string/compat_ctype.c b/src/lib/string/compat_ctype.c new file mode 100644 index 0000000000..f5d82be3ae --- /dev/null +++ b/src/lib/string/compat_ctype.c @@ -0,0 +1,72 @@ +/* Copyright (c) 2003-2004, Roger Dingledine + * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. + * Copyright (c) 2007-2019, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file compat_ctype.c + * \brief Locale-independent character-type inspection (backend) + **/ + +#include "lib/string/compat_ctype.h" + +/** + * Tables to implement ctypes-replacement TOR_IS*() functions. Each table + * has 256 bits to look up whether a character is in some set or not. This + * fails on non-ASCII platforms, but it is hard to find a platform whose + * character set is not a superset of ASCII nowadays. */ + +/**@{*/ +const uint32_t TOR_ISALPHA_TABLE[8] = + { 0, 0, 0x7fffffe, 0x7fffffe, 0, 0, 0, 0 }; +const uint32_t TOR_ISALNUM_TABLE[8] = + { 0, 0x3ff0000, 0x7fffffe, 0x7fffffe, 0, 0, 0, 0 }; +const uint32_t TOR_ISSPACE_TABLE[8] = { 0x3e00, 0x1, 0, 0, 0, 0, 0, 0 }; +const uint32_t TOR_ISXDIGIT_TABLE[8] = + { 0, 0x3ff0000, 0x7e, 0x7e, 0, 0, 0, 0 }; +const uint32_t TOR_ISDIGIT_TABLE[8] = { 0, 0x3ff0000, 0, 0, 0, 0, 0, 0 }; +const uint32_t TOR_ISPRINT_TABLE[8] = + { 0, 0xffffffff, 0xffffffff, 0x7fffffff, 0, 0, 0, 0x0 }; +const uint32_t TOR_ISUPPER_TABLE[8] = { 0, 0, 0x7fffffe, 0, 0, 0, 0, 0 }; +const uint32_t TOR_ISLOWER_TABLE[8] = { 0, 0, 0, 0x7fffffe, 0, 0, 0, 0 }; + +/** Upper-casing and lowercasing tables to map characters to upper/lowercase + * equivalents. Used by tor_toupper() and tor_tolower(). */ +/**@{*/ +const uint8_t TOR_TOUPPER_TABLE[256] = { + 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31, + 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47, + 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63, + 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79, + 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95, + 96,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79, + 80,81,82,83,84,85,86,87,88,89,90,123,124,125,126,127, + 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143, + 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159, + 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175, + 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191, + 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207, + 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223, + 224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239, + 240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255, +}; +const uint8_t TOR_TOLOWER_TABLE[256] = { + 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31, + 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47, + 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63, + 64,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111, + 112,113,114,115,116,117,118,119,120,121,122,91,92,93,94,95, + 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111, + 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127, + 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143, + 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159, + 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175, + 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191, + 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207, + 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223, + 224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239, + 240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255, +}; +/**@}*/ diff --git a/src/lib/string/compat_ctype.h b/src/lib/string/compat_ctype.h new file mode 100644 index 0000000000..dbddd356c1 --- /dev/null +++ b/src/lib/string/compat_ctype.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2003-2004, Roger Dingledine + * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. + * Copyright (c) 2007-2019, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file compat_ctype.h + * \brief Locale-independent character-type inspection (header) + **/ + +#ifndef TOR_COMPAT_CTYPE_H +#define TOR_COMPAT_CTYPE_H + +#include "orconfig.h" +#include "lib/cc/torint.h" + +/* Much of the time when we're checking ctypes, we're doing spec compliance, + * which all assumes we're doing ASCII. */ +#define DECLARE_CTYPE_FN(name) \ + static int TOR_##name(char c); \ + extern const uint32_t TOR_##name##_TABLE[]; \ + static inline int TOR_##name(char c) { \ + uint8_t u = c; \ + return !!(TOR_##name##_TABLE[(u >> 5) & 7] & (1u << (u & 31))); \ + } +DECLARE_CTYPE_FN(ISALPHA) +DECLARE_CTYPE_FN(ISALNUM) +DECLARE_CTYPE_FN(ISSPACE) +DECLARE_CTYPE_FN(ISDIGIT) +DECLARE_CTYPE_FN(ISXDIGIT) +DECLARE_CTYPE_FN(ISPRINT) +DECLARE_CTYPE_FN(ISLOWER) +DECLARE_CTYPE_FN(ISUPPER) +extern const uint8_t TOR_TOUPPER_TABLE[]; +extern const uint8_t TOR_TOLOWER_TABLE[]; +#define TOR_TOLOWER(c) (TOR_TOLOWER_TABLE[(uint8_t)c]) +#define TOR_TOUPPER(c) (TOR_TOUPPER_TABLE[(uint8_t)c]) + +static inline int hex_decode_digit(char c); + +/** Helper: given a hex digit, return its value, or -1 if it isn't hex. */ +static inline int +hex_decode_digit(char c) +{ + switch (c) { + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + case 'A': case 'a': return 10; + case 'B': case 'b': return 11; + case 'C': case 'c': return 12; + case 'D': case 'd': return 13; + case 'E': case 'e': return 14; + case 'F': case 'f': return 15; + default: + return -1; + } +} + +#endif /* !defined(TOR_COMPAT_CTYPE_H) */ diff --git a/src/lib/string/compat_string.c b/src/lib/string/compat_string.c new file mode 100644 index 0000000000..e125c921a4 --- /dev/null +++ b/src/lib/string/compat_string.c @@ -0,0 +1,74 @@ +/* Copyright (c) 2003-2004, Roger Dingledine + * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. + * Copyright (c) 2007-2019, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file compat_string.c + * \brief Useful string-processing functions that some platforms don't + * provide. + **/ + +#include "lib/string/compat_string.h" +#include "lib/err/torerr.h" + +/* Inline the strl functions if the platform doesn't have them. */ +#ifndef HAVE_STRLCPY +#include "strlcpy.c" +#endif +#ifndef HAVE_STRLCAT +#include "strlcat.c" +#endif + +#include <stdlib.h> +#include <string.h> + +/** Helper for tor_strtok_r_impl: Advances cp past all characters in + * <b>sep</b>, and returns its new value. */ +static char * +strtok_helper(char *cp, const char *sep) +{ + if (sep[1]) { + while (*cp && strchr(sep, *cp)) + ++cp; + } else { + while (*cp && *cp == *sep) + ++cp; + } + return cp; +} + +/** Implementation of strtok_r for platforms whose coders haven't figured out + * how to write one. Hey, retrograde libc developers! You can use this code + * here for free! */ +char * +tor_strtok_r_impl(char *str, const char *sep, char **lasts) +{ + char *cp, *start; + raw_assert(*sep); + if (str) { + str = strtok_helper(str, sep); + if (!*str) + return NULL; + start = cp = *lasts = str; + } else if (!*lasts || !**lasts) { + return NULL; + } else { + start = cp = *lasts; + } + + if (sep[1]) { + while (*cp && !strchr(sep, *cp)) + ++cp; + } else { + cp = strchr(cp, *sep); + } + + if (!cp || !*cp) { + *lasts = NULL; + } else { + *cp++ = '\0'; + *lasts = strtok_helper(cp, sep); + } + return start; +} diff --git a/src/lib/string/compat_string.h b/src/lib/string/compat_string.h new file mode 100644 index 0000000000..a0e37bb6dc --- /dev/null +++ b/src/lib/string/compat_string.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2003-2004, Roger Dingledine + * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. + * Copyright (c) 2007-2019, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file compat_string.h + * \brief Header for compat_string.c + **/ + +#ifndef TOR_COMPAT_STRING_H +#define TOR_COMPAT_STRING_H + +#include "orconfig.h" +#include "lib/cc/compat_compiler.h" + +#include <stddef.h> + +/* ===== String compatibility */ +#ifdef _WIN32 +/* Windows doesn't have str(n)casecmp, but mingw defines it: only define it + * ourselves if it's missing. */ +#ifndef HAVE_STRNCASECMP +static inline int strncasecmp(const char *a, const char *b, size_t n); +static inline int strncasecmp(const char *a, const char *b, size_t n) { + return _strnicmp(a,b,n); +} +#endif +#ifndef HAVE_STRCASECMP +static inline int strcasecmp(const char *a, const char *b); +static inline int strcasecmp(const char *a, const char *b) { + return _stricmp(a,b); +} +#endif +#endif + +#if defined __APPLE__ +/* On OSX 10.9 and later, the overlap-checking code for strlcat would + * appear to have a severe bug that can sometimes cause aborts in Tor. + * Instead, use the non-checking variants. This is sad. + * + * See https://trac.torproject.org/projects/tor/ticket/15205 + */ +#undef strlcat +#undef strlcpy +#endif /* defined __APPLE__ */ + +#ifndef HAVE_STRLCAT +size_t strlcat(char *dst, const char *src, size_t siz); +#endif +#ifndef HAVE_STRLCPY +size_t strlcpy(char *dst, const char *src, size_t siz); +#endif + +char *tor_strtok_r_impl(char *str, const char *sep, char **lasts); +#ifdef HAVE_STRTOK_R +#define tor_strtok_r(str, sep, lasts) strtok_r(str, sep, lasts) +#else +#define tor_strtok_r(str, sep, lasts) tor_strtok_r_impl(str, sep, lasts) +#endif + +#endif diff --git a/src/lib/string/include.am b/src/lib/string/include.am new file mode 100644 index 0000000000..edd74b8a3e --- /dev/null +++ b/src/lib/string/include.am @@ -0,0 +1,27 @@ + +noinst_LIBRARIES += src/lib/libtor-string.a + +if UNITTESTS_ENABLED +noinst_LIBRARIES += src/lib/libtor-string-testing.a +endif + +src_lib_libtor_string_a_SOURCES = \ + src/lib/string/compat_ctype.c \ + src/lib/string/compat_string.c \ + src/lib/string/util_string.c \ + src/lib/string/parse_int.c \ + src/lib/string/printf.c \ + src/lib/string/scanf.c + +src_lib_libtor_string_testing_a_SOURCES = \ + $(src_lib_libtor_string_a_SOURCES) +src_lib_libtor_string_testing_a_CPPFLAGS = $(AM_CPPFLAGS) $(TEST_CPPFLAGS) +src_lib_libtor_string_testing_a_CFLAGS = $(AM_CFLAGS) $(TEST_CFLAGS) + +noinst_HEADERS += \ + src/lib/string/compat_ctype.h \ + src/lib/string/compat_string.h \ + src/lib/string/util_string.h \ + src/lib/string/parse_int.h \ + src/lib/string/printf.h \ + src/lib/string/scanf.h diff --git a/src/lib/string/parse_int.c b/src/lib/string/parse_int.c new file mode 100644 index 0000000000..fbdd554a47 --- /dev/null +++ b/src/lib/string/parse_int.c @@ -0,0 +1,131 @@ +/* Copyright (c) 2003, Roger Dingledine + * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. + * Copyright (c) 2007-2019, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file parse_int.c + * \brief Convert strings into the integers they encode, with bounds checking. + **/ + +#include "lib/string/parse_int.h" + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +/* Helper: common code to check whether the result of a strtol or strtoul or + * strtoll is correct. */ +#define CHECK_STRTOX_RESULT() \ + /* Did an overflow occur? */ \ + if (errno == ERANGE) \ + goto err; \ + /* Was at least one character converted? */ \ + if (endptr == s) \ + goto err; \ + /* Were there unexpected unconverted characters? */ \ + if (!next && *endptr) \ + goto err; \ + /* Illogical (max, min) inputs? */ \ + if (max < min) \ + goto err; \ + /* Is r within limits? */ \ + if (r < min || r > max) \ + goto err; \ + if (ok) *ok = 1; \ + if (next) *next = endptr; \ + return r; \ + err: \ + if (ok) *ok = 0; \ + if (next) *next = endptr; \ + return 0 + +/** Extract a long from the start of <b>s</b>, in the given numeric + * <b>base</b>. If <b>base</b> is 0, <b>s</b> is parsed as a decimal, + * octal, or hex number in the syntax of a C integer literal. If + * there is unconverted data and <b>next</b> is provided, set + * *<b>next</b> to the first unconverted character. An error has + * occurred if no characters are converted; or if there are + * unconverted characters and <b>next</b> is NULL; or if the parsed + * value is not between <b>min</b> and <b>max</b>. When no error + * occurs, return the parsed value and set *<b>ok</b> (if provided) to + * 1. When an error occurs, return 0 and set *<b>ok</b> (if provided) + * to 0. + */ +long +tor_parse_long(const char *s, int base, long min, long max, + int *ok, char **next) +{ + char *endptr; + long r; + + if (base < 0) { + if (ok) + *ok = 0; + return 0; + } + + errno = 0; + r = strtol(s, &endptr, base); + CHECK_STRTOX_RESULT(); +} + +/** As tor_parse_long(), but return an unsigned long. */ +unsigned long +tor_parse_ulong(const char *s, int base, unsigned long min, + unsigned long max, int *ok, char **next) +{ + char *endptr; + unsigned long r; + + if (base < 0) { + if (ok) + *ok = 0; + return 0; + } + + errno = 0; + r = strtoul(s, &endptr, base); + CHECK_STRTOX_RESULT(); +} + +/** As tor_parse_long(), but return a double. */ +double +tor_parse_double(const char *s, double min, double max, int *ok, char **next) +{ + char *endptr; + double r; + + errno = 0; + r = strtod(s, &endptr); + CHECK_STRTOX_RESULT(); +} + +/** As tor_parse_long, but return a uint64_t. Only base 10 is guaranteed to + * work for now. */ +uint64_t +tor_parse_uint64(const char *s, int base, uint64_t min, + uint64_t max, int *ok, char **next) +{ + char *endptr; + uint64_t r; + + if (base < 0) { + if (ok) + *ok = 0; + return 0; + } + + errno = 0; +#ifdef HAVE_STRTOULL + r = (uint64_t)strtoull(s, &endptr, base); +#elif defined(_WIN32) + r = (uint64_t)_strtoui64(s, &endptr, base); +#elif SIZEOF_LONG == 8 + r = (uint64_t)strtoul(s, &endptr, base); +#else +#error "I don't know how to parse 64-bit numbers." +#endif /* defined(HAVE_STRTOULL) || ... */ + + CHECK_STRTOX_RESULT(); +} diff --git a/src/lib/string/parse_int.h b/src/lib/string/parse_int.h new file mode 100644 index 0000000000..925547942e --- /dev/null +++ b/src/lib/string/parse_int.h @@ -0,0 +1,25 @@ +/* Copyright (c) 2003, Roger Dingledine + * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. + * Copyright (c) 2007-2019, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file parse_int.h + * \brief Header for parse_int.c + **/ + +#ifndef TOR_PARSE_INT_H +#define TOR_PARSE_INT_H + +#include "lib/cc/torint.h" + +long tor_parse_long(const char *s, int base, long min, + long max, int *ok, char **next); +unsigned long tor_parse_ulong(const char *s, int base, unsigned long min, + unsigned long max, int *ok, char **next); +double tor_parse_double(const char *s, double min, double max, int *ok, + char **next); +uint64_t tor_parse_uint64(const char *s, int base, uint64_t min, + uint64_t max, int *ok, char **next); + +#endif diff --git a/src/lib/string/printf.c b/src/lib/string/printf.c new file mode 100644 index 0000000000..a5cb71ce09 --- /dev/null +++ b/src/lib/string/printf.c @@ -0,0 +1,167 @@ +/* Copyright (c) 2003-2004, Roger Dingledine + * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. + * Copyright (c) 2007-2019, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file printf.c + * \brief Compatibility wrappers around snprintf and its friends + **/ + +#include "lib/string/printf.h" +#include "lib/err/torerr.h" +#include "lib/cc/torint.h" +#include "lib/malloc/malloc.h" + +#include <stdlib.h> +#include <stdio.h> + +/** Replacement for snprintf. Differs from platform snprintf in two + * ways: First, always NUL-terminates its output. Second, always + * returns -1 if the result is truncated. (Note that this return + * behavior does <i>not</i> conform to C99; it just happens to be + * easier to emulate "return -1" with conformant implementations than + * it is to emulate "return number that would be written" with + * non-conformant implementations.) */ +int +tor_snprintf(char *str, size_t size, const char *format, ...) +{ + va_list ap; + int r; + va_start(ap,format); + r = tor_vsnprintf(str,size,format,ap); + va_end(ap); + return r; +} + +/** Replacement for vsnprintf; behavior differs as tor_snprintf differs from + * snprintf. + */ +int +tor_vsnprintf(char *str, size_t size, const char *format, va_list args) +{ + int r; + if (size == 0) + return -1; /* no place for the NUL */ + if (size > SIZE_T_CEILING) + return -1; +#ifdef _WIN32 + r = _vsnprintf(str, size, format, args); +#else + r = vsnprintf(str, size, format, args); +#endif + str[size-1] = '\0'; + if (r < 0 || r >= (ssize_t)size) + return -1; + return r; +} + +/** + * Portable asprintf implementation. Does a printf() into a newly malloc'd + * string. Sets *<b>strp</b> to this string, and returns its length (not + * including the terminating NUL character). + * + * You can treat this function as if its implementation were something like + <pre> + char buf[_INFINITY_]; + tor_snprintf(buf, sizeof(buf), fmt, args); + *strp = tor_strdup(buf); + return strlen(*strp): + </pre> + * Where _INFINITY_ is an imaginary constant so big that any string can fit + * into it. + */ +int +tor_asprintf(char **strp, const char *fmt, ...) +{ + int r; + va_list args; + va_start(args, fmt); + r = tor_vasprintf(strp, fmt, args); + va_end(args); + if (!*strp || r < 0) { + /* LCOV_EXCL_START */ + raw_assert_unreached_msg("Internal error in asprintf"); + /* LCOV_EXCL_STOP */ + } + return r; +} + +/** + * Portable vasprintf implementation. Does a printf() into a newly malloc'd + * string. Differs from regular vasprintf in the same ways that + * tor_asprintf() differs from regular asprintf. + */ +int +tor_vasprintf(char **strp, const char *fmt, va_list args) +{ + /* use a temporary variable in case *strp is in args. */ + char *strp_tmp=NULL; +#ifdef HAVE_VASPRINTF + /* If the platform gives us one, use it. */ + int r = vasprintf(&strp_tmp, fmt, args); + if (r < 0) + *strp = NULL; // LCOV_EXCL_LINE -- no cross-platform way to force this + else + *strp = strp_tmp; + return r; +#elif defined(HAVE__VSCPRINTF) + /* On Windows, _vsnprintf won't tell us the length of the string if it + * overflows, so we need to use _vcsprintf to tell how much to allocate */ + int len, r; + va_list tmp_args; + va_copy(tmp_args, args); + len = _vscprintf(fmt, tmp_args); + va_end(tmp_args); + if (len < 0) { + *strp = NULL; + return -1; + } + strp_tmp = tor_malloc(len + 1); + r = _vsnprintf(strp_tmp, len+1, fmt, args); + if (r != len) { + tor_free(strp_tmp); + *strp = NULL; + return -1; + } + *strp = strp_tmp; + return len; +#else + /* Everywhere else, we have a decent vsnprintf that tells us how many + * characters we need. We give it a try on a short buffer first, since + * it might be nice to avoid the second vsnprintf call. + */ + /* XXXX This code spent a number of years broken (see bug 30651). It is + * possible that no Tor users actually run on systems without vasprintf() or + * _vscprintf(). If so, we should consider removing this code. */ + char buf[128]; + int len, r; + va_list tmp_args; + va_copy(tmp_args, args); + /* Use vsnprintf to retrieve needed length. tor_vsnprintf() is not an + * option here because it will simply return -1 if buf is not large enough + * to hold the complete string. + */ + len = vsnprintf(buf, sizeof(buf), fmt, tmp_args); + va_end(tmp_args); + buf[sizeof(buf) - 1] = '\0'; + if (len < 0) { + *strp = NULL; + return -1; + } + if (len < (int)sizeof(buf)) { + *strp = tor_strdup(buf); + return len; + } + strp_tmp = tor_malloc(len+1); + /* use of tor_vsnprintf() will ensure string is null terminated */ + r = tor_vsnprintf(strp_tmp, len+1, fmt, args); + if (r != len) { + tor_free(strp_tmp); + *strp = NULL; + return -1; + } + *strp = strp_tmp; + return len; +#endif /* defined(HAVE_VASPRINTF) || ... */ +} diff --git a/src/lib/string/printf.h b/src/lib/string/printf.h new file mode 100644 index 0000000000..2cc13d6bee --- /dev/null +++ b/src/lib/string/printf.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2003-2004, Roger Dingledine + * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. + * Copyright (c) 2007-2019, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file printf.h + * \brief Header for printf.c + **/ + +#ifndef TOR_UTIL_PRINTF_H +#define TOR_UTIL_PRINTF_H + +#include "orconfig.h" +#include "lib/cc/compat_compiler.h" + +#include <stdarg.h> +#include <stddef.h> + +int tor_snprintf(char *str, size_t size, const char *format, ...) + CHECK_PRINTF(3,4); +int tor_vsnprintf(char *str, size_t size, const char *format, va_list args) + CHECK_PRINTF(3,0); + +int tor_asprintf(char **strp, const char *fmt, ...) + CHECK_PRINTF(2,3); +int tor_vasprintf(char **strp, const char *fmt, va_list args) + CHECK_PRINTF(2,0); + +#endif /* !defined(TOR_UTIL_STRING_H) */ diff --git a/src/lib/string/scanf.c b/src/lib/string/scanf.c new file mode 100644 index 0000000000..1bc39b5182 --- /dev/null +++ b/src/lib/string/scanf.c @@ -0,0 +1,317 @@ +/* Copyright (c) 2003-2004, Roger Dingledine + * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. + * Copyright (c) 2007-2019, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file scanf.c + * \brief Locale-independent minimal implementation of sscanf(). + **/ + +#include "lib/string/scanf.h" +#include "lib/string/compat_ctype.h" +#include "lib/cc/torint.h" +#include "lib/err/torerr.h" + +#include <stdlib.h> + +#define MAX_SCANF_WIDTH 9999 + +/** Helper: given an ASCII-encoded decimal digit, return its numeric value. + * NOTE: requires that its input be in-bounds. */ +static int +digit_to_num(char d) +{ + int num = ((int)d) - (int)'0'; + raw_assert(num <= 9 && num >= 0); + return num; +} + +/** Helper: Read an unsigned int from *<b>bufp</b> of up to <b>width</b> + * characters. (Handle arbitrary width if <b>width</b> is less than 0.) On + * success, store the result in <b>out</b>, advance bufp to the next + * character, and return 0. On failure, return -1. */ +static int +scan_unsigned(const char **bufp, unsigned long *out, int width, unsigned base) +{ + unsigned long result = 0; + int scanned_so_far = 0; + const int hex = base==16; + raw_assert(base == 10 || base == 16); + if (!bufp || !*bufp || !out) + return -1; + if (width<0) + width=MAX_SCANF_WIDTH; + + while (**bufp && (hex?TOR_ISXDIGIT(**bufp):TOR_ISDIGIT(**bufp)) + && scanned_so_far < width) { + unsigned digit = hex?hex_decode_digit(*(*bufp)++):digit_to_num(*(*bufp)++); + // Check for overflow beforehand, without actually causing any overflow + // This preserves functionality on compilers that don't wrap overflow + // (i.e. that trap or optimise away overflow) + // result * base + digit > ULONG_MAX + // result * base > ULONG_MAX - digit + if (result > (ULONG_MAX - digit)/base) + return -1; /* Processing this digit would overflow */ + result = result * base + digit; + ++scanned_so_far; + } + + if (!scanned_so_far) /* No actual digits scanned */ + return -1; + + *out = result; + return 0; +} + +/** Helper: Read an signed int from *<b>bufp</b> of up to <b>width</b> + * characters. (Handle arbitrary width if <b>width</b> is less than 0.) On + * success, store the result in <b>out</b>, advance bufp to the next + * character, and return 0. On failure, return -1. */ +static int +scan_signed(const char **bufp, long *out, int width) +{ + int neg = 0; + unsigned long result = 0; + + if (!bufp || !*bufp || !out) + return -1; + if (width<0) + width=MAX_SCANF_WIDTH; + + if (**bufp == '-') { + neg = 1; + ++*bufp; + --width; + } + + if (scan_unsigned(bufp, &result, width, 10) < 0) + return -1; + + if (neg && result > 0) { + if (result > ((unsigned long)LONG_MAX) + 1) + return -1; /* Underflow */ + else if (result == ((unsigned long)LONG_MAX) + 1) + *out = LONG_MIN; + else { + /* We once had a far more clever no-overflow conversion here, but + * some versions of GCC apparently ran it into the ground. Now + * we just check for LONG_MIN explicitly. + */ + *out = -(long)result; + } + } else { + if (result > LONG_MAX) + return -1; /* Overflow */ + *out = (long)result; + } + + return 0; +} + +/** Helper: Read a decimal-formatted double from *<b>bufp</b> of up to + * <b>width</b> characters. (Handle arbitrary width if <b>width</b> is less + * than 0.) On success, store the result in <b>out</b>, advance bufp to the + * next character, and return 0. On failure, return -1. */ +static int +scan_double(const char **bufp, double *out, int width) +{ + int neg = 0; + double result = 0; + int scanned_so_far = 0; + + if (!bufp || !*bufp || !out) + return -1; + if (width<0) + width=MAX_SCANF_WIDTH; + + if (**bufp == '-') { + neg = 1; + ++*bufp; + } + + while (**bufp && TOR_ISDIGIT(**bufp) && scanned_so_far < width) { + const int digit = digit_to_num(*(*bufp)++); + result = result * 10 + digit; + ++scanned_so_far; + } + if (**bufp == '.') { + double fracval = 0, denominator = 1; + ++*bufp; + ++scanned_so_far; + while (**bufp && TOR_ISDIGIT(**bufp) && scanned_so_far < width) { + const int digit = digit_to_num(*(*bufp)++); + fracval = fracval * 10 + digit; + denominator *= 10; + ++scanned_so_far; + } + result += fracval / denominator; + } + + if (!scanned_so_far) /* No actual digits scanned */ + return -1; + + *out = neg ? -result : result; + return 0; +} + +/** Helper: copy up to <b>width</b> non-space characters from <b>bufp</b> to + * <b>out</b>. Make sure <b>out</b> is nul-terminated. Advance <b>bufp</b> + * to the next non-space character or the EOS. */ +static int +scan_string(const char **bufp, char *out, int width) +{ + int scanned_so_far = 0; + if (!bufp || !out || width < 0) + return -1; + while (**bufp && ! TOR_ISSPACE(**bufp) && scanned_so_far < width) { + *out++ = *(*bufp)++; + ++scanned_so_far; + } + *out = '\0'; + return 0; +} + +/** Locale-independent, minimal, no-surprises scanf variant, accepting only a + * restricted pattern format. For more info on what it supports, see + * tor_sscanf() documentation. */ +int +tor_vsscanf(const char *buf, const char *pattern, va_list ap) +{ + int n_matched = 0; + + while (*pattern) { + if (*pattern != '%') { + if (*buf == *pattern) { + ++buf; + ++pattern; + continue; + } else { + return n_matched; + } + } else { + int width = -1; + int longmod = 0; + ++pattern; + if (TOR_ISDIGIT(*pattern)) { + width = digit_to_num(*pattern++); + while (TOR_ISDIGIT(*pattern)) { + width *= 10; + width += digit_to_num(*pattern++); + if (width > MAX_SCANF_WIDTH) + return -1; + } + if (!width) /* No zero-width things. */ + return -1; + } + if (*pattern == 'l') { + longmod = 1; + ++pattern; + } + if (*pattern == 'u' || *pattern == 'x') { + unsigned long u; + const int base = (*pattern == 'u') ? 10 : 16; + if (!*buf) + return n_matched; + if (scan_unsigned(&buf, &u, width, base)<0) + return n_matched; + if (longmod) { + unsigned long *out = va_arg(ap, unsigned long *); + *out = u; + } else { + unsigned *out = va_arg(ap, unsigned *); + if (u > UINT_MAX) + return n_matched; + *out = (unsigned) u; + } + ++pattern; + ++n_matched; + } else if (*pattern == 'f') { + double *d = va_arg(ap, double *); + if (!longmod) + return -1; /* float not supported */ + if (!*buf) + return n_matched; + if (scan_double(&buf, d, width)<0) + return n_matched; + ++pattern; + ++n_matched; + } else if (*pattern == 'd') { + long lng=0; + if (scan_signed(&buf, &lng, width)<0) + return n_matched; + if (longmod) { + long *out = va_arg(ap, long *); + *out = lng; + } else { + int *out = va_arg(ap, int *); +#if LONG_MAX > INT_MAX + if (lng < INT_MIN || lng > INT_MAX) + return n_matched; +#endif + *out = (int)lng; + } + ++pattern; + ++n_matched; + } else if (*pattern == 's') { + char *s = va_arg(ap, char *); + if (longmod) + return -1; + if (width < 0) + return -1; + if (scan_string(&buf, s, width)<0) + return n_matched; + ++pattern; + ++n_matched; + } else if (*pattern == 'c') { + char *ch = va_arg(ap, char *); + if (longmod) + return -1; + if (width != -1) + return -1; + if (!*buf) + return n_matched; + *ch = *buf++; + ++pattern; + ++n_matched; + } else if (*pattern == '%') { + if (*buf != '%') + return n_matched; + if (longmod) + return -1; + ++buf; + ++pattern; + } else { + return -1; /* Unrecognized pattern component. */ + } + } + } + + return n_matched; +} + +/** Minimal sscanf replacement: parse <b>buf</b> according to <b>pattern</b> + * and store the results in the corresponding argument fields. Differs from + * sscanf in that: + * <ul><li>It only handles %u, %lu, %x, %lx, %[NUM]s, %d, %ld, %lf, and %c. + * <li>It only handles decimal inputs for %lf. (12.3, not 1.23e1) + * <li>It does not handle arbitrarily long widths. + * <li>Numbers do not consume any space characters. + * <li>It is locale-independent. + * <li>%u and %x do not consume any space. + * <li>It returns -1 on malformed patterns.</ul> + * + * (As with other locale-independent functions, we need this to parse data that + * is in ASCII without worrying that the C library's locale-handling will make + * miscellaneous characters look like numbers, spaces, and so on.) + */ +int +tor_sscanf(const char *buf, const char *pattern, ...) +{ + int r; + va_list ap; + va_start(ap, pattern); + r = tor_vsscanf(buf, pattern, ap); + va_end(ap); + return r; +} diff --git a/src/lib/string/scanf.h b/src/lib/string/scanf.h new file mode 100644 index 0000000000..6673173de5 --- /dev/null +++ b/src/lib/string/scanf.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2003-2004, Roger Dingledine + * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. + * Copyright (c) 2007-2019, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file scanf.h + * \brief Header for scanf.c + **/ + +#ifndef TOR_UTIL_SCANF_H +#define TOR_UTIL_SCANF_H + +#include "orconfig.h" +#include "lib/cc/compat_compiler.h" + +#include <stdarg.h> + +int tor_vsscanf(const char *buf, const char *pattern, va_list ap) \ + CHECK_SCANF(2, 0); +int tor_sscanf(const char *buf, const char *pattern, ...) + CHECK_SCANF(2, 3); + +#endif /* !defined(TOR_UTIL_STRING_H) */ diff --git a/src/lib/string/util_string.c b/src/lib/string/util_string.c new file mode 100644 index 0000000000..f934f66f02 --- /dev/null +++ b/src/lib/string/util_string.c @@ -0,0 +1,543 @@ +/* Copyright (c) 2003-2004, Roger Dingledine + * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. + * Copyright (c) 2007-2019, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file util_string.c + * \brief Non-standard string functions used throughout Tor. + **/ + +#include "lib/string/util_string.h" +#include "lib/string/compat_ctype.h" +#include "lib/err/torerr.h" +#include "lib/ctime/di_ops.h" +#include "lib/defs/digest_sizes.h" + +#include <string.h> +#include <stdlib.h> + +/** Given <b>hlen</b> bytes at <b>haystack</b> and <b>nlen</b> bytes at + * <b>needle</b>, return a pointer to the first occurrence of the needle + * within the haystack, or NULL if there is no such occurrence. + * + * This function is <em>not</em> timing-safe. + * + * Requires that <b>nlen</b> be greater than zero. + */ +const void * +tor_memmem(const void *_haystack, size_t hlen, + const void *_needle, size_t nlen) +{ +#if defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2) + raw_assert(nlen); + return memmem(_haystack, hlen, _needle, nlen); +#else + /* This isn't as fast as the GLIBC implementation, but it doesn't need to + * be. */ + const char *p, *last_possible_start; + const char *haystack = (const char*)_haystack; + const char *needle = (const char*)_needle; + char first; + raw_assert(nlen); + + if (nlen > hlen) + return NULL; + + p = haystack; + /* Last position at which the needle could start. */ + last_possible_start = haystack + hlen - nlen; + first = *(const char*)needle; + while ((p = memchr(p, first, last_possible_start + 1 - p))) { + if (fast_memeq(p, needle, nlen)) + return p; + if (++p > last_possible_start) { + /* This comparison shouldn't be necessary, since if p was previously + * equal to last_possible_start, the next memchr call would be + * "memchr(p, first, 0)", which will return NULL. But it clarifies the + * logic. */ + return NULL; + } + } + return NULL; +#endif /* defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2) */ +} + +const void * +tor_memstr(const void *haystack, size_t hlen, const char *needle) +{ + return tor_memmem(haystack, hlen, needle, strlen(needle)); +} + +/** Return true iff the 'len' bytes at 'mem' are all zero. */ +int +tor_mem_is_zero(const char *mem, size_t len) +{ + static const char ZERO[] = { + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + }; + while (len >= sizeof(ZERO)) { + /* It's safe to use fast_memcmp here, since the very worst thing an + * attacker could learn is how many initial bytes of a secret were zero */ + if (fast_memcmp(mem, ZERO, sizeof(ZERO))) + return 0; + len -= sizeof(ZERO); + mem += sizeof(ZERO); + } + /* Deal with leftover bytes. */ + if (len) + return fast_memeq(mem, ZERO, len); + + return 1; +} + +/** Return true iff the DIGEST_LEN bytes in digest are all zero. */ +int +tor_digest_is_zero(const char *digest) +{ + static const uint8_t ZERO_DIGEST[] = { + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 + }; + return tor_memeq(digest, ZERO_DIGEST, DIGEST_LEN); +} + +/** Return true iff the DIGEST256_LEN bytes in digest are all zero. */ +int +tor_digest256_is_zero(const char *digest) +{ + return tor_mem_is_zero(digest, DIGEST256_LEN); +} + +/** Remove from the string <b>s</b> every character which appears in + * <b>strip</b>. */ +void +tor_strstrip(char *s, const char *strip) +{ + char *readp = s; + while (*readp) { + if (strchr(strip, *readp)) { + ++readp; + } else { + *s++ = *readp++; + } + } + *s = '\0'; +} + +/** Convert all alphabetic characters in the nul-terminated string <b>s</b> to + * lowercase. */ +void +tor_strlower(char *s) +{ + while (*s) { + *s = TOR_TOLOWER(*s); + ++s; + } +} + +/** Convert all alphabetic characters in the nul-terminated string <b>s</b> to + * lowercase. */ +void +tor_strupper(char *s) +{ + while (*s) { + *s = TOR_TOUPPER(*s); + ++s; + } +} + +/** Return 1 if every character in <b>s</b> is printable, else return 0. + */ +int +tor_strisprint(const char *s) +{ + while (*s) { + if (!TOR_ISPRINT(*s)) + return 0; + s++; + } + return 1; +} + +/** Return 1 if no character in <b>s</b> is uppercase, else return 0. + */ +int +tor_strisnonupper(const char *s) +{ + while (*s) { + if (TOR_ISUPPER(*s)) + return 0; + s++; + } + return 1; +} + +/** Return true iff every character in <b>s</b> is whitespace space; else + * return false. */ +int +tor_strisspace(const char *s) +{ + while (*s) { + if (!TOR_ISSPACE(*s)) + return 0; + s++; + } + return 1; +} + +/** As strcmp, except that either string may be NULL. The NULL string is + * considered to be before any non-NULL string. */ +int +strcmp_opt(const char *s1, const char *s2) +{ + if (!s1) { + if (!s2) + return 0; + else + return -1; + } else if (!s2) { + return 1; + } else { + return strcmp(s1, s2); + } +} + +/** Compares the first strlen(s2) characters of s1 with s2. Returns as for + * strcmp. + */ +int +strcmpstart(const char *s1, const char *s2) +{ + size_t n = strlen(s2); + return strncmp(s1, s2, n); +} + +/** Compare the s1_len-byte string <b>s1</b> with <b>s2</b>, + * without depending on a terminating nul in s1. Sorting order is first by + * length, then lexically; return values are as for strcmp. + */ +int +strcmp_len(const char *s1, const char *s2, size_t s1_len) +{ + size_t s2_len = strlen(s2); + if (s1_len < s2_len) + return -1; + if (s1_len > s2_len) + return 1; + return fast_memcmp(s1, s2, s2_len); +} + +/** Compares the first strlen(s2) characters of s1 with s2. Returns as for + * strcasecmp. + */ +int +strcasecmpstart(const char *s1, const char *s2) +{ + size_t n = strlen(s2); + return strncasecmp(s1, s2, n); +} + +/** Compare the value of the string <b>prefix</b> with the start of the + * <b>memlen</b>-byte memory chunk at <b>mem</b>. Return as for strcmp. + * + * [As fast_memcmp(mem, prefix, strlen(prefix)) but returns -1 if memlen is + * less than strlen(prefix).] + */ +int +fast_memcmpstart(const void *mem, size_t memlen, + const char *prefix) +{ + size_t plen = strlen(prefix); + if (memlen < plen) + return -1; + return fast_memcmp(mem, prefix, plen); +} + +/** Compares the last strlen(s2) characters of s1 with s2. Returns as for + * strcmp. + */ +int +strcmpend(const char *s1, const char *s2) +{ + size_t n1 = strlen(s1), n2 = strlen(s2); + if (n2>n1) + return strcmp(s1,s2); + else + return strncmp(s1+(n1-n2), s2, n2); +} + +/** Compares the last strlen(s2) characters of s1 with s2. Returns as for + * strcasecmp. + */ +int +strcasecmpend(const char *s1, const char *s2) +{ + size_t n1 = strlen(s1), n2 = strlen(s2); + if (n2>n1) /* then they can't be the same; figure out which is bigger */ + return strcasecmp(s1,s2); + else + return strncasecmp(s1+(n1-n2), s2, n2); +} + +/** Return a pointer to the first char of s that is not whitespace and + * not a comment, or to the terminating NUL if no such character exists. + */ +const char * +eat_whitespace(const char *s) +{ + raw_assert(s); + + while (1) { + switch (*s) { + case '\0': + default: + return s; + case ' ': + case '\t': + case '\n': + case '\r': + ++s; + break; + case '#': + ++s; + while (*s && *s != '\n') + ++s; + } + } +} + +/** Return a pointer to the first char of s that is not whitespace and + * not a comment, or to the terminating NUL if no such character exists. + */ +const char * +eat_whitespace_eos(const char *s, const char *eos) +{ + raw_assert(s); + raw_assert(eos && s <= eos); + + while (s < eos) { + switch (*s) { + case '\0': + default: + return s; + case ' ': + case '\t': + case '\n': + case '\r': + ++s; + break; + case '#': + ++s; + while (s < eos && *s && *s != '\n') + ++s; + } + } + return s; +} + +/** Return a pointer to the first char of s that is not a space or a tab + * or a \\r, or to the terminating NUL if no such character exists. */ +const char * +eat_whitespace_no_nl(const char *s) +{ + while (*s == ' ' || *s == '\t' || *s == '\r') + ++s; + return s; +} + +/** As eat_whitespace_no_nl, but stop at <b>eos</b> whether we have + * found a non-whitespace character or not. */ +const char * +eat_whitespace_eos_no_nl(const char *s, const char *eos) +{ + while (s < eos && (*s == ' ' || *s == '\t' || *s == '\r')) + ++s; + return s; +} + +/** Return a pointer to the first char of s that is whitespace or <b>#</b>, + * or to the terminating NUL if no such character exists. + */ +const char * +find_whitespace(const char *s) +{ + /* tor_assert(s); */ + while (1) { + switch (*s) + { + case '\0': + case '#': + case ' ': + case '\r': + case '\n': + case '\t': + return s; + default: + ++s; + } + } +} + +/** As find_whitespace, but stop at <b>eos</b> whether we have found a + * whitespace or not. */ +const char * +find_whitespace_eos(const char *s, const char *eos) +{ + /* tor_assert(s); */ + while (s < eos) { + switch (*s) + { + case '\0': + case '#': + case ' ': + case '\r': + case '\n': + case '\t': + return s; + default: + ++s; + } + } + return s; +} + +/** Return the first occurrence of <b>needle</b> in <b>haystack</b> that + * occurs at the start of a line (that is, at the beginning of <b>haystack</b> + * or immediately after a newline). Return NULL if no such string is found. + */ +const char * +find_str_at_start_of_line(const char *haystack, const char *needle) +{ + size_t needle_len = strlen(needle); + + do { + if (!strncmp(haystack, needle, needle_len)) + return haystack; + + haystack = strchr(haystack, '\n'); + if (!haystack) + return NULL; + else + ++haystack; + } while (*haystack); + + return NULL; +} + +/** Returns true if <b>string</b> could be a C identifier. + A C identifier must begin with a letter or an underscore and the + rest of its characters can be letters, numbers or underscores. No + length limit is imposed. */ +int +string_is_C_identifier(const char *string) +{ + size_t iter; + size_t length = strlen(string); + if (!length) + return 0; + + for (iter = 0; iter < length ; iter++) { + if (iter == 0) { + if (!(TOR_ISALPHA(string[iter]) || + string[iter] == '_')) + return 0; + } else { + if (!(TOR_ISALPHA(string[iter]) || + TOR_ISDIGIT(string[iter]) || + string[iter] == '_')) + return 0; + } + } + + return 1; +} + +/** A byte with the top <b>x</b> bits set. */ +#define TOP_BITS(x) ((uint8_t)(0xFF << (8 - (x)))) +/** A byte with the lowest <b>x</b> bits set. */ +#define LOW_BITS(x) ((uint8_t)(0xFF >> (8 - (x)))) + +/** Given the leading byte <b>b</b>, return the total number of bytes in the + * UTF-8 character. Returns 0 if it's an invalid leading byte. + */ +static uint8_t +bytes_in_char(uint8_t b) +{ + if ((TOP_BITS(1) & b) == 0x00) + return 1; // a 1-byte UTF-8 char, aka ASCII + if ((TOP_BITS(3) & b) == TOP_BITS(2)) + return 2; // a 2-byte UTF-8 char + if ((TOP_BITS(4) & b) == TOP_BITS(3)) + return 3; // a 3-byte UTF-8 char + if ((TOP_BITS(5) & b) == TOP_BITS(4)) + return 4; // a 4-byte UTF-8 char + + // Invalid: either the top 2 bits are 10, or the top 5 bits are 11111. + return 0; +} + +/** Returns true iff <b>b</b> is a UTF-8 continuation byte. */ +static bool +is_continuation_byte(uint8_t b) +{ + uint8_t top2bits = b & TOP_BITS(2); + return top2bits == TOP_BITS(1); +} + +/** Returns true iff the <b>len</b> bytes in <b>c</b> are a valid UTF-8 + * character. + */ +static bool +validate_char(const uint8_t *c, uint8_t len) +{ + if (len == 1) + return true; // already validated this is an ASCII char earlier. + + uint8_t mask = LOW_BITS(7 - len); // bitmask for the leading byte. + uint32_t codepoint = c[0] & mask; + + mask = LOW_BITS(6); // bitmask for continuation bytes. + for (uint8_t i = 1; i < len; i++) { + if (!is_continuation_byte(c[i])) + return false; + codepoint <<= 6; + codepoint |= (c[i] & mask); + } + + if (len == 2 && codepoint <= 0x7f) + return false; // Invalid, overly long encoding, should have fit in 1 byte. + + if (len == 3 && codepoint <= 0x7ff) + return false; // Invalid, overly long encoding, should have fit in 2 bytes. + + if (len == 4 && codepoint <= 0xffff) + return false; // Invalid, overly long encoding, should have fit in 3 bytes. + + if (codepoint >= 0xd800 && codepoint <= 0xdfff) + return false; // Invalid, reserved for UTF-16 surrogate pairs. + + return codepoint <= 0x10ffff; // Check if within maximum. +} + +/** Returns true iff the first <b>len</b> bytes in <b>str</b> are a + valid UTF-8 string. */ +int +string_is_utf8(const char *str, size_t len) +{ + for (size_t i = 0; i < len;) { + uint8_t num_bytes = bytes_in_char(str[i]); + if (num_bytes == 0) // Invalid leading byte found. + return false; + + size_t next_char = i + num_bytes; + if (next_char > len) + return false; + + // Validate the continuation bytes in this multi-byte character, + // and advance to the next character in the string. + if (!validate_char((const uint8_t*)&str[i], num_bytes)) + return false; + i = next_char; + } + return true; +} diff --git a/src/lib/string/util_string.h b/src/lib/string/util_string.h new file mode 100644 index 0000000000..d9fbf8c61e --- /dev/null +++ b/src/lib/string/util_string.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2003-2004, Roger Dingledine + * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. + * Copyright (c) 2007-2019, The Tor Project, Inc. */ +/* See LICENSE for licensing information */ + +/** + * \file util_string.h + * \brief Header for util_string.c + **/ + +#ifndef TOR_UTIL_STRING_H +#define TOR_UTIL_STRING_H + +#include "orconfig.h" +#include "lib/cc/compat_compiler.h" + +#include <stddef.h> + +const void *tor_memmem(const void *haystack, size_t hlen, const void *needle, + size_t nlen); +const void *tor_memstr(const void *haystack, size_t hlen, + const char *needle); +int tor_mem_is_zero(const char *mem, size_t len); +int tor_digest_is_zero(const char *digest); +int tor_digest256_is_zero(const char *digest); + +/** Allowable characters in a hexadecimal string. */ +#define HEX_CHARACTERS "0123456789ABCDEFabcdef" +void tor_strlower(char *s); +void tor_strupper(char *s); +int tor_strisprint(const char *s); +int tor_strisnonupper(const char *s); +int tor_strisspace(const char *s); +int strcmp_opt(const char *s1, const char *s2); +int strcmpstart(const char *s1, const char *s2); +int strcmp_len(const char *s1, const char *s2, size_t len); +int strcasecmpstart(const char *s1, const char *s2); +int strcmpend(const char *s1, const char *s2); +int strcasecmpend(const char *s1, const char *s2); +int fast_memcmpstart(const void *mem, size_t memlen, const char *prefix); + +void tor_strstrip(char *s, const char *strip); + +const char *eat_whitespace(const char *s); +const char *eat_whitespace_eos(const char *s, const char *eos); +const char *eat_whitespace_no_nl(const char *s); +const char *eat_whitespace_eos_no_nl(const char *s, const char *eos); +const char *find_whitespace(const char *s); +const char *find_whitespace_eos(const char *s, const char *eos); +const char *find_str_at_start_of_line(const char *haystack, + const char *needle); + +int string_is_C_identifier(const char *string); + +int string_is_utf8(const char *str, size_t len); + +#endif /* !defined(TOR_UTIL_STRING_H) */ |