summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author Nick Mathewson <nickm@torproject.org> 2020-01-07 10:16:15 -0500
committer Nick Mathewson <nickm@torproject.org> 2020-01-07 10:16:15 -0500
commit e231cd5b61afcb6640a7e17506bf33ddc6b1d2fe (patch)
tree 92f602c40653ea0b478999707171cdfc356b27be /src
parent 17a1ae025ac65d68bcfff2971fa6153daed7e220 (diff)
parent 1fd27155d452e4144d05c518ceae3313704c2110 (diff)
download tor-e231cd5b61afcb6640a7e17506bf33ddc6b1d2fe.tar.gz
tor-e231cd5b61afcb6640a7e17506bf33ddc6b1d2fe.zip
Merge branch 'ticket32845_squashed'
Diffstat (limited to 'src')
-rw-r--r-- src/lib/string/util_string.c 21
-rw-r--r-- src/test/test_util.c 35
2 files changed, 53 insertions, 3 deletions
diff --git a/src/lib/string/util_string.c b/src/lib/string/util_string.c
index f5061a11d2..93b3eb09f2 100644
--- a/src/lib/string/util_string.c
+++ b/src/lib/string/util_string.c
@@ -506,6 +506,23 @@ validate_char(const uint8_t *c, uint8_t len)
int
string_is_utf8(const char *str, size_t len)
{
+ // If str is NULL, don't try to read it
+ if (!str) {
+ // We could test for this case, but the low-level logs would produce
+ // confusing test output.
+ // LCOV_EXCL_START
+ if (len) {
+ // Use the low-level logging function, so that the log module can
+ // validate UTF-8 (if needed in future code)
+ tor_log_err_sigsafe(
+ "BUG: string_is_utf8() called with NULL str but non-zero len.");
+ // Since it's a bug, we should probably reject this string
+ return false;
+ }
+ // LCOV_EXCL_STOP
+ return true;
+ }
+
for (size_t i = 0; i < len;) {
uint8_t num_bytes = bytes_in_char(str[i]);
if (num_bytes == 0) // Invalid leading byte found.
@@ -530,8 +547,8 @@ string_is_utf8(const char *str, size_t len)
int
string_is_utf8_no_bom(const char *str, size_t len)
{
- if (len >= 3 && (!strcmpstart(str, "\uFEFF") ||
- !strcmpstart(str, "\uFFFE"))) {
+ if (str && len >= 3 && (!strcmpstart(str, "\uFEFF") ||
+ !strcmpstart(str, "\uFFFE"))) {
return false;
}
return string_is_utf8(str, len);
diff --git a/src/test/test_util.c b/src/test/test_util.c
index 7f7e157c17..92623ea0cd 100644
--- a/src/test/test_util.c
+++ b/src/test/test_util.c
@@ -4104,10 +4104,43 @@ test_util_string_is_utf8(void *ptr)
tt_int_op(0, OP_EQ, string_is_utf8("\xed\xbf\xbf", 3));
tt_int_op(1, OP_EQ, string_is_utf8("\xee\x80\x80", 3));
- // The maximum legal codepoint, 10FFFF.
+ // The minimum legal codepoint, 0x00.
+ tt_int_op(1, OP_EQ, string_is_utf8("\0", 1));
+
+ // The maximum legal codepoint, 0x10FFFF.
tt_int_op(1, OP_EQ, string_is_utf8("\xf4\x8f\xbf\xbf", 4));
tt_int_op(0, OP_EQ, string_is_utf8("\xf4\x90\x80\x80", 4));
+ /* Test cases that vary between programming languages /
+ * UTF-8 implementations.
+ * Source: POC||GTFO 19, page 43
+ * https://www.alchemistowl.org/pocorgtfo/
+ */
+
+ // Invalid (in most implementations)
+ // surrogate
+ tt_int_op(0, OP_EQ, string_is_utf8("\xed\xa0\x81", 3));
+ // nullsurrog
+ tt_int_op(0, OP_EQ, string_is_utf8("\x30\x00\xed\xa0\x81", 5));
+ // threehigh
+ tt_int_op(0, OP_EQ, string_is_utf8("\xed\xbf\xbf", 3));
+ // fourhigh
+ tt_int_op(0, OP_EQ, string_is_utf8("\xf4\x90\xbf\xbf", 4));
+ // fivebyte
+ tt_int_op(0, OP_EQ, string_is_utf8("\xfb\x80\x80\x80\x80", 5));
+ // sixbyte
+ tt_int_op(0, OP_EQ, string_is_utf8("\xfd\x80\x80\x80\x80", 5));
+ // sixhigh
+ tt_int_op(0, OP_EQ, string_is_utf8("\xfd\xbf\xbf\xbf\xbf", 5));
+
+ // Valid (in most implementations)
+ // fourbyte
+ tt_int_op(1, OP_EQ, string_is_utf8("\xf0\x90\x8d\x88", 4));
+ // fourbyte2
+ tt_int_op(1, OP_EQ, string_is_utf8("\xf0\xbf\xbf\xbf", 4));
+ // nullbyte
+ tt_int_op(1, OP_EQ, string_is_utf8("\x30\x31\x32\x00\x33", 5));
+
done:
;
}