diff options
Diffstat (limited to 'runtime/utf_test.cc')
-rw-r--r-- | runtime/utf_test.cc | 201 |
1 files changed, 192 insertions, 9 deletions
diff --git a/runtime/utf_test.cc b/runtime/utf_test.cc index 94a6ea57e2..64e4eb798b 100644 --- a/runtime/utf_test.cc +++ b/runtime/utf_test.cc @@ -48,7 +48,7 @@ static const uint8_t kAllSequences[] = { }; // A test string that contains a UTF-8 encoding of a surrogate pair -// (code point = U+10400) +// (code point = U+10400). static const uint8_t kSurrogateEncoding[] = { 0xed, 0xa0, 0x81, 0xed, 0xb0, 0x80, @@ -66,13 +66,13 @@ TEST_F(UtfTest, GetUtf16FromUtf8) { EXPECT_EQ(0, GetTrailingUtf16Char(pair)); EXPECT_ARRAY_POSITION(1, ptr, start); - // Two byte sequence + // Two byte sequence. pair = GetUtf16FromUtf8(&ptr); EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair)); EXPECT_EQ(0, GetTrailingUtf16Char(pair)); EXPECT_ARRAY_POSITION(3, ptr, start); - // Three byte sequence + // Three byte sequence. pair = GetUtf16FromUtf8(&ptr); EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair)); EXPECT_EQ(0, GetTrailingUtf16Char(pair)); @@ -84,7 +84,7 @@ TEST_F(UtfTest, GetUtf16FromUtf8) { EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair)); EXPECT_ARRAY_POSITION(10, ptr, start); - // Null terminator + // Null terminator. 
pair = GetUtf16FromUtf8(&ptr); EXPECT_EQ(0, GetLeadingUtf16Char(pair)); EXPECT_EQ(0, GetTrailingUtf16Char(pair)); @@ -117,7 +117,8 @@ static void AssertConversion(const std::vector<uint16_t> input, ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size())); std::vector<uint8_t> output(expected.size()); - ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), &input[0], input.size()); + ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(), + &input[0], input.size()); EXPECT_EQ(expected, output); } @@ -139,10 +140,10 @@ TEST_F(UtfTest, CountAndConvertUtf8Bytes) { AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f }); AssertConversion({ - 0xd802, 0xdc02, // Surrogate pair - 0xdef0, 0xdcff, // Three byte encodings - 0x0101, 0x0000, // Two byte encodings - 'p' , 'p' // One byte encoding + 0xd802, 0xdc02, // Surrogate pair. + 0xdef0, 0xdcff, // Three byte encodings. + 0x0101, 0x0000, // Two byte encodings. + 'p' , 'p' // One byte encoding. }, { 0xf0, 0x90, 0xa0, 0x82, 0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf, @@ -160,4 +161,186 @@ TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) { AssertConversion({ 'h', 0xdc00, 0xdc00, 'e' }, { 'h', 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80, 'e' }); } +// Old versions of functions, here to compare answers with optimized versions. + +size_t CountModifiedUtf8Chars_reference(const char* utf8) { + size_t len = 0; + int ic; + while ((ic = *utf8++) != '\0') { + len++; + if ((ic & 0x80) == 0) { + // one-byte encoding + continue; + } + // two- or three-byte encoding + utf8++; + if ((ic & 0x20) == 0) { + // two-byte encoding + continue; + } + utf8++; + if ((ic & 0x10) == 0) { + // three-byte encoding + continue; + } + + // four-byte encoding: needs to be converted into a surrogate + // pair. 
+ utf8++; + len++; + } + return len; +} + +static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) { + size_t result = 0; + while (char_count--) { + const uint16_t ch = *chars++; + if (ch > 0 && ch <= 0x7f) { + ++result; + } else if (ch >= 0xd800 && ch <= 0xdbff) { + if (char_count > 0) { + const uint16_t ch2 = *chars; + // If we find a properly paired surrogate, we emit it as a 4 byte + // UTF sequence. If we find an unpaired leading or trailing surrogate, + // we emit it as a 3 byte sequence like we would have done earlier. + if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { + chars++; + char_count--; + + result += 4; + } else { + result += 3; + } + } else { + // This implies we found an unpaired leading surrogate at the end + // of a string. + result += 3; + } + } else if (ch > 0x7ff) { + result += 3; + } else { + result += 2; + } + } + return result; +} + +static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t* utf16_in, + size_t char_count) { + while (char_count--) { + const uint16_t ch = *utf16_in++; + if (ch > 0 && ch <= 0x7f) { + *utf8_out++ = ch; + } else { + // Char_count == 0 here implies we've encountered an unpaired + // surrogate and we have no choice but to encode it as a 3-byte UTF + // sequence. Note that unpaired surrogates can occur as a part of + // "normal" operation. + if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) { + const uint16_t ch2 = *utf16_in; + + // Check if the other half of the pair is within the expected + // range. If it isn't, we will have to emit both "halves" as + // separate 3 byte sequences. + if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { + utf16_in++; + char_count--; + const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00; + *utf8_out++ = (code_point >> 18) | 0xf0; + *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80; + *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80; + *utf8_out++ = (code_point & 0x3f) | 0x80; + continue; + } + } + + if (ch > 0x07ff) { + // Three byte encoding. 
+ *utf8_out++ = (ch >> 12) | 0xe0; + *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80; + *utf8_out++ = (ch & 0x3f) | 0x80; + } else /*(ch > 0x7f || ch == 0)*/ { + // Two byte encoding. + *utf8_out++ = (ch >> 6) | 0xc0; + *utf8_out++ = (ch & 0x3f) | 0x80; + } + } + } +} + +// Exhaustive test of converting a single code point to UTF-16, then UTF-8, and back again. + +static void codePointToSurrogatePair(uint32_t code_point, uint16_t &first, uint16_t &second) { + first = (code_point >> 10) + 0xd7c0; + second = (code_point & 0x03ff) + 0xdc00; +} + +static void testConversions(uint16_t *buf, int char_count) { + char bytes_test[8], bytes_reference[8]; + uint16_t out_buf_test[4], out_buf_reference[4]; + int byte_count_test, byte_count_reference; + int char_count_test, char_count_reference; + + // Calculate the number of utf-8 bytes for the utf-16 chars. + byte_count_reference = CountUtf8Bytes_reference(buf, char_count); + byte_count_test = CountUtf8Bytes(buf, char_count); + EXPECT_EQ(byte_count_reference, byte_count_test); + + // Convert the utf-16 string to utf-8 bytes. + ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count); + ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count); + for (int i = 0; i < byte_count_test; ++i) { + EXPECT_EQ(bytes_reference[i], bytes_test[i]); + } + + // Calculate the number of utf-16 chars from the utf-8 bytes. + bytes_reference[byte_count_reference] = 0; // Reference function needs null termination. + char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference); + char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test); + EXPECT_EQ(char_count, char_count_reference); + EXPECT_EQ(char_count, char_count_test); + + // Convert the utf-8 bytes back to utf-16 chars. + // Does not need copied _reference version of the function because the original + // function with the old API is retained for debug/testing code. 
+ ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference); + ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test); + for (int i = 0; i < char_count_test; ++i) { + EXPECT_EQ(buf[i], out_buf_reference[i]); + EXPECT_EQ(buf[i], out_buf_test[i]); + } +} + +TEST_F(UtfTest, ExhaustiveBidirectionalCodePointCheck) { + for (int codePoint = 0; codePoint <= 0x10ffff; ++codePoint) { + uint16_t buf[4]; + if (codePoint <= 0xffff) { + if (codePoint >= 0xd800 && codePoint <= 0xdfff) { + // According to the Unicode standard, no character will ever + // be assigned to these code points, and they can not be encoded + // into either utf-16 or utf-8. + continue; + } + buf[0] = 'h'; + buf[1] = codePoint; + buf[2] = 'e'; + testConversions(buf, 2); + testConversions(buf, 3); + testConversions(buf + 1, 1); + testConversions(buf + 1, 2); + } else { + buf[0] = 'h'; + codePointToSurrogatePair(codePoint, buf[1], buf[2]); + buf[3] = 'e'; + testConversions(buf, 2); + testConversions(buf, 3); + testConversions(buf, 4); + testConversions(buf + 1, 1); + testConversions(buf + 1, 2); + testConversions(buf + 1, 3); + } + } +} + } // namespace art |