summary | refs | log | tree | commit | diff
path: root/src/test/test_util.c
diff options
context:
space:
mode:
author teor <teor@torproject.org> 2020-01-07 17:05:48 +1000
committer teor <teor@torproject.org> 2020-01-07 17:05:48 +1000
commit d62dbb676242dbdfdd121828c97e12e737aa596c (patch)
tree aa1c6c42fe523d29eb8a33f64f15b6309331c207 /src/test/test_util.c
parent 1b63eea66cbb8793a3cff05de8d856ce3b93fc17 (diff)
download tor-d62dbb676242dbdfdd121828c97e12e737aa596c.tar.gz
tor-d62dbb676242dbdfdd121828c97e12e737aa596c.zip
string: Add extra UTF-8 test cases
These test cases are validated differently by some programming languages, because those languages have incorrect UTF-8 implementations. We want to make sure that tor validates them correctly. Closes ticket 32845.
Diffstat (limited to 'src/test/test_util.c')
-rw-r--r-- src/test/test_util.c 35
1 files changed, 34 insertions, 1 deletions
diff --git a/src/test/test_util.c b/src/test/test_util.c
index 7f7e157c17..92623ea0cd 100644
--- a/src/test/test_util.c
+++ b/src/test/test_util.c
@@ -4104,10 +4104,43 @@ test_util_string_is_utf8(void *ptr)
tt_int_op(0, OP_EQ, string_is_utf8("\xed\xbf\xbf", 3));
tt_int_op(1, OP_EQ, string_is_utf8("\xee\x80\x80", 3));
- // The maximum legal codepoint, 10FFFF.
+ // The minimum legal codepoint, 0x00.
+ tt_int_op(1, OP_EQ, string_is_utf8("\0", 1));
+
+ // The maximum legal codepoint, 0x10FFFF.
tt_int_op(1, OP_EQ, string_is_utf8("\xf4\x8f\xbf\xbf", 4));
tt_int_op(0, OP_EQ, string_is_utf8("\xf4\x90\x80\x80", 4));
+ /* Test cases that vary between programming languages /
+ * UTF-8 implementations.
+ * Source: POC||GTFO 19, page 43
+ * https://www.alchemistowl.org/pocorgtfo/
+ */
+
+ // Invalid (in most implementations)
+ // surrogate
+ tt_int_op(0, OP_EQ, string_is_utf8("\xed\xa0\x81", 3));
+ // nullsurrog
+ tt_int_op(0, OP_EQ, string_is_utf8("\x30\x00\xed\xa0\x81", 5));
+ // threehigh
+ tt_int_op(0, OP_EQ, string_is_utf8("\xed\xbf\xbf", 3));
+ // fourhigh
+ tt_int_op(0, OP_EQ, string_is_utf8("\xf4\x90\xbf\xbf", 4));
+ // fivebyte
+ tt_int_op(0, OP_EQ, string_is_utf8("\xfb\x80\x80\x80\x80", 5));
+ // sixbyte
+ tt_int_op(0, OP_EQ, string_is_utf8("\xfd\x80\x80\x80\x80", 5));
+ // sixhigh
+ tt_int_op(0, OP_EQ, string_is_utf8("\xfd\xbf\xbf\xbf\xbf", 5));
+
+ // Valid (in most implementations)
+ // fourbyte
+ tt_int_op(1, OP_EQ, string_is_utf8("\xf0\x90\x8d\x88", 4));
+ // fourbyte2
+ tt_int_op(1, OP_EQ, string_is_utf8("\xf0\xbf\xbf\xbf", 4));
+ // nullbyte
+ tt_int_op(1, OP_EQ, string_is_utf8("\x30\x31\x32\x00\x33", 5));
+
done:
;
}