summaryrefslogtreecommitdiff
path: root/src/lib/string
diff options
context:
space:
mode:
author: cypherpunks <cypherpunks@torproject.org> 2018-08-29 03:22:30 +0000
committer: cypherpunks <cypherpunks@torproject.org> 2018-09-03 13:54:43 +0000
commit: d32b08af6f38d76d609edfddd44159446b5f25b6 (patch)
tree: a2fee3b622a53b557b4acdc299b962d179c3f509 /src/lib/string
parent: 1c62adb65baa99c92f937318c452955306301643 (diff)
download: tor-d32b08af6f38d76d609edfddd44159446b5f25b6.tar.gz
tor-d32b08af6f38d76d609edfddd44159446b5f25b6.zip
string: add string_is_utf8() helper
Ticket #27373.
Diffstat (limited to 'src/lib/string')
-rw-r--r-- src/lib/string/util_string.c 90
-rw-r--r-- src/lib/string/util_string.h 2
2 files changed, 92 insertions, 0 deletions
diff --git a/src/lib/string/util_string.c b/src/lib/string/util_string.c
index a6b0a3d68a..b2b85d151d 100644
--- a/src/lib/string/util_string.c
+++ b/src/lib/string/util_string.c
@@ -451,3 +451,93 @@ string_is_C_identifier(const char *string)
return 1;
}
+
+/** Mask with the <b>x</b> most-significant bits of a byte set; <b>x</b>
+ * must be in the range 0..8. */
+#define TOP_BITS(x) ((uint8_t)~(0xFF >> (x)))
+/** Mask with the <b>x</b> least-significant bits of a byte set; <b>x</b>
+ * must be in the range 0..8. */
+#define LOW_BITS(x) ((uint8_t)((1u << (x)) - 1u))
+
+/** Treat <b>b</b> as the first byte of a UTF-8 character and return how
+ * many bytes (1 to 4) that character occupies in total. Returns 0 if
+ * <b>b</b> cannot start a character.
+ */
+static uint8_t
+bytes_in_char(uint8_t b)
+{
+  // Top bit clear: a 1-byte character, i.e. plain ASCII.
+  if (!(b & TOP_BITS(1)))
+    return 1;
+
+  // The leading byte of an n-byte character (n = 2, 3, 4) has its top n
+  // bits set and the bit after them clear.
+  for (uint8_t n = 2; n <= 4; n++) {
+    if ((b & TOP_BITS(n + 1)) == TOP_BITS(n))
+      return n;
+  }
+
+  // Not a leading byte: the top 2 bits are 10, or the top 5 bits are 11111.
+  return 0;
+}
+
+/** Returns true iff <b>b</b> has the form 10xxxxxx, i.e. it is a UTF-8
+ * continuation byte. */
+static bool
+is_continuation_byte(uint8_t b)
+{
+  const uint8_t prefix_mask = TOP_BITS(2); // 11000000
+  const uint8_t marker = TOP_BITS(1);      // 10000000
+
+  return (b & prefix_mask) == marker;
+}
+
+/** Decode the <b>len</b>-byte sequence at <b>c</b> and return true iff it
+ * is a valid UTF-8 character: every byte after the first is a continuation
+ * byte, the encoding is not overly long, and the decoded code point is
+ * neither a UTF-16 surrogate nor greater than 0x10ffff. A <b>len</b> of 1
+ * is accepted without looking at the byte.
+ */
+static bool
+validate_char(const uint8_t *c, uint8_t len)
+{
+  // A length of 1 means the byte was already classified as ASCII.
+  if (len == 1)
+    return true;
+
+  // The leading byte carries the top bits of the code point in its low
+  // (7 - len) bits.
+  uint32_t cp = c[0] & LOW_BITS(7 - len);
+
+  // Every following byte must be a continuation byte and supplies six
+  // more bits.
+  const uint8_t payload = LOW_BITS(6);
+  for (const uint8_t *p = c + 1; p < c + len; p++) {
+    if (!is_continuation_byte(*p))
+      return false;
+    cp = (cp << 6) | (uint32_t)(*p & payload);
+  }
+
+  // Smallest code point that actually needs <b>len</b> bytes; anything
+  // below it is an overly long encoding.
+  uint32_t lowest;
+  switch (len) {
+    case 2:
+      lowest = 0x80; // Smaller values fit in 1 byte.
+      break;
+    case 3:
+      lowest = 0x800; // Smaller values fit in 2 bytes.
+      break;
+    case 4:
+      lowest = 0x10000; // Smaller values fit in 3 bytes.
+      break;
+    default:
+      lowest = 0;
+      break;
+  }
+  if (cp < lowest)
+    return false;
+
+  // 0xd800..0xdfff is reserved for UTF-16 surrogate pairs.
+  const bool is_surrogate = (cp >= 0xd800 && cp <= 0xdfff);
+
+  // Finally, the code point must not exceed the maximum.
+  return !is_surrogate && cp <= 0x10ffff;
+}
+
+/** Returns true iff the first <b>len</b> bytes in <b>str</b> are a
+ * valid UTF-8 string.
+ *
+ * Exactly <b>len</b> bytes are examined; a NUL byte is treated like any
+ * other ASCII character and does not end the scan. A zero-length string is
+ * valid, including when <b>str</b> is NULL. A NULL <b>str</b> with a
+ * nonzero <b>len</b> is rejected rather than dereferenced.
+ */
+int
+string_is_utf8(const char *str, size_t len)
+{
+  // Never read through a NULL pointer: only the empty string is acceptable.
+  if (!str)
+    return len == 0;
+
+  for (size_t i = 0; i < len;) {
+    // Plain char may be signed, so convert to a byte value explicitly.
+    uint8_t num_bytes = bytes_in_char((uint8_t)str[i]);
+    if (num_bytes == 0) // Invalid leading byte found.
+      return false;
+
+    // Reject a character that would run past the end of the input.
+    // len - i is nonzero here and cannot wrap, unlike i + num_bytes.
+    if (num_bytes > len - i)
+      return false;
+
+    // Validate the continuation bytes in this multi-byte character,
+    // and advance to the next character in the string.
+    if (!validate_char((const uint8_t*)&str[i], num_bytes))
+      return false;
+    i += num_bytes;
+  }
+  return true;
+}
diff --git a/src/lib/string/util_string.h b/src/lib/string/util_string.h
index 471613462a..746ece0d33 100644
--- a/src/lib/string/util_string.h
+++ b/src/lib/string/util_string.h
@@ -52,4 +52,6 @@ const char *find_str_at_start_of_line(const char *haystack,
int string_is_C_identifier(const char *string);
+int string_is_utf8(const char *str, size_t len);
+
#endif /* !defined(TOR_UTIL_STRING_H) */