diff options
author | 2022-12-16 11:16:45 +0000 | |
---|---|---|
committer | 2022-12-19 08:25:47 +0000 | |
commit | 419484b8d944122049f2517bf9a2e482d26575b5 (patch) | |
tree | f945d37932372b8d359937cb42992c688a7d3d70 | |
parent | 88a7d206e7b50661a8256d844acca3d1a2dc5922 (diff) |
Clean up Modified UTF-8 encoding.
The function `String::GetUtfLength()` is used exclusively in
`String::ToModifiedUtf8()`, so rename it to better reflect
the intent, namely to `String::GetModifiedUtf8Length()`.
Similarly, rename the function `CountUtf8Bytes()` used only
in the above function to `CountModifiedUtf8BytesInUtf16()`.
Rename a copy of the function used in tests.
Reimplement `ConvertUtf16ToModifiedUtf8()` and the renamed
`CountModifiedUtf8BytesInUtf16()` with equivalent code using
the helper template function `ConvertUtf16ToUtf8<>()` and
flag the fact that despite the name referencing Modified
UTF-8, the generated encoding can contain 4-byte sequences.
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: m libctstiagent
Bug: 192935764
Change-Id: I942a3b77ee3af0194ee8315e0631ec217a0af67d
-rw-r--r-- | libdexfile/dex/utf.cc | 77 | ||||
-rw-r--r-- | libdexfile/dex/utf.h | 14 | ||||
-rw-r--r-- | libdexfile/dex/utf_test.cc | 8 | ||||
-rw-r--r-- | runtime/mirror/object_test.cc | 2 | ||||
-rw-r--r-- | runtime/mirror/string-inl.h | 4 | ||||
-rw-r--r-- | runtime/mirror/string.cc | 2 | ||||
-rw-r--r-- | runtime/mirror/string.h | 2 | ||||
-rw-r--r-- | test/906-iterate-heap/iterate_heap.cc | 2 | ||||
-rw-r--r-- | test/913-heaps/heaps.cc | 2 | ||||
-rw-r--r-- | test/ti-agent/ti_macros.h | 6 | ||||
-rw-r--r-- | test/ti-agent/ti_utf.h | 128 |
11 files changed, 97 insertions, 150 deletions
diff --git a/libdexfile/dex/utf.cc b/libdexfile/dex/utf.cc index bfc704d4a6..9692a26827 100644 --- a/libdexfile/dex/utf.cc +++ b/libdexfile/dex/utf.cc @@ -133,45 +133,11 @@ void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count, } // String contains non-ASCII characters. - while (char_count--) { - const uint16_t ch = *utf16_in++; - if (ch > 0 && ch <= 0x7f) { - *utf8_out++ = ch; - } else { - // Char_count == 0 here implies we've encountered an unpaired - // surrogate and we have no choice but to encode it as 3-byte UTF - // sequence. Note that unpaired surrogates can occur as a part of - // "normal" operation. - if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) { - const uint16_t ch2 = *utf16_in; - - // Check if the other half of the pair is within the expected - // range. If it isn't, we will have to emit both "halves" as - // separate 3 byte sequences. - if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { - utf16_in++; - char_count--; - const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00; - *utf8_out++ = (code_point >> 18) | 0xf0; - *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80; - *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80; - *utf8_out++ = (code_point & 0x3f) | 0x80; - continue; - } - } - - if (ch > 0x07ff) { - // Three byte encoding. - *utf8_out++ = (ch >> 12) | 0xe0; - *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80; - *utf8_out++ = (ch & 0x3f) | 0x80; - } else /*(ch > 0x7f || ch == 0)*/ { - // Two byte encoding. - *utf8_out++ = (ch >> 6) | 0xc0; - *utf8_out++ = (ch & 0x3f) | 0x80; - } - } - } + // FIXME: We should not emit 4-byte sequences. 
Bug: 192935764 + auto append = [&](char c) { *utf8_out++ = c; }; + ConvertUtf16ToUtf8</*kUseShortZero=*/ false, + /*kUse4ByteSequence=*/ true, + /*kReplaceBadSurrogates=*/ false>(utf16_in, char_count, append); } int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length) { @@ -240,34 +206,13 @@ int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t } } -size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) { +size_t CountModifiedUtf8BytesInUtf16(const uint16_t* chars, size_t char_count) { + // FIXME: We should not emit 4-byte sequences. Bug: 192935764 size_t result = 0; - const uint16_t *end = chars + char_count; - while (chars < end) { - const uint16_t ch = *chars++; - if (LIKELY(ch != 0 && ch < 0x80)) { - result++; - continue; - } - if (ch < 0x800) { - result += 2; - continue; - } - if (ch >= 0xd800 && ch < 0xdc00) { - if (chars < end) { - const uint16_t ch2 = *chars; - // If we find a properly paired surrogate, we emit it as a 4 byte - // UTF sequence. If we find an unpaired leading or trailing surrogate, - // we emit it as a 3 byte sequence like would have done earlier. - if (ch2 >= 0xdc00 && ch2 < 0xe000) { - chars++; - result += 4; - continue; - } - } - } - result += 3; - } + auto append = [&](char c ATTRIBUTE_UNUSED) { ++result; }; + ConvertUtf16ToUtf8</*kUseShortZero=*/ false, + /*kUse4ByteSequence=*/ true, + /*kReplaceBadSurrogates=*/ false>(chars, char_count, append); return result; } diff --git a/libdexfile/dex/utf.h b/libdexfile/dex/utf.h index 35cbf78463..d372bff662 100644 --- a/libdexfile/dex/utf.h +++ b/libdexfile/dex/utf.h @@ -41,12 +41,6 @@ size_t CountModifiedUtf8Chars(const char* utf8); size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count); /* - * Returns the number of modified UTF-8 bytes needed to represent the given - * UTF-16 string. - */ -size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count); - -/* * Convert from Modified UTF-8 to UTF-16. 
*/ void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_out, const char* utf8_in); @@ -85,8 +79,14 @@ template <bool kUseShortZero, bool kUse4ByteSequence, bool kReplaceBadSurrogates void ConvertUtf16ToUtf8(const uint16_t* utf16, size_t char_count, Append&& append); /* + * Returns the number of modified UTF-8 bytes needed to represent the given + * UTF-16 string. + */ +size_t CountModifiedUtf8BytesInUtf16(const uint16_t* chars, size_t char_count); + +/* * Convert from UTF-16 to Modified UTF-8. Note that the output is _not_ - * NUL-terminated. You probably need to call CountUtf8Bytes before calling + * NUL-terminated. You probably need to call CountModifiedUtf8BytesInUtf16 before calling * this anyway, so if you want a NUL-terminated string, you know where to * put the NUL byte. */ diff --git a/libdexfile/dex/utf_test.cc b/libdexfile/dex/utf_test.cc index 919259e4d3..85c74d285c 100644 --- a/libdexfile/dex/utf_test.cc +++ b/libdexfile/dex/utf_test.cc @@ -117,7 +117,7 @@ TEST_F(UtfTest, CountModifiedUtf8Chars) { static void AssertConversion(const std::vector<uint16_t>& input, const std::vector<uint8_t>& expected) { - ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size())); + ASSERT_EQ(expected.size(), CountModifiedUtf8BytesInUtf16(&input[0], input.size())); std::vector<uint8_t> output(expected.size()); ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(), @@ -229,7 +229,7 @@ size_t CountModifiedUtf8Chars_reference(const char* utf8) { return len; } -static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) { +static size_t CountModifiedUtf8BytesInUtf16_reference(const uint16_t* chars, size_t char_count) { size_t result = 0; while (char_count--) { const uint16_t ch = *chars++; @@ -320,8 +320,8 @@ static void testConversions(uint16_t *buf, int char_count) { int char_count_test, char_count_reference; // Calculate the number of utf-8 bytes for the utf-16 chars. 
- byte_count_reference = CountUtf8Bytes_reference(buf, char_count); - byte_count_test = CountUtf8Bytes(buf, char_count); + byte_count_reference = CountModifiedUtf8BytesInUtf16_reference(buf, char_count); + byte_count_test = CountModifiedUtf8BytesInUtf16(buf, char_count); EXPECT_EQ(byte_count_reference, byte_count_test); // Convert the utf-16 string to utf-8 bytes. diff --git a/runtime/mirror/object_test.cc b/runtime/mirror/object_test.cc index 7edeacbb05..6f42c5b8b8 100644 --- a/runtime/mirror/object_test.cc +++ b/runtime/mirror/object_test.cc @@ -528,7 +528,7 @@ TEST_F(ObjectTest, StringLength) { StackHandleScope<1> hs(soa.Self()); Handle<String> string(hs.NewHandle(String::AllocFromModifiedUtf8(soa.Self(), "android"))); EXPECT_EQ(string->GetLength(), 7); - EXPECT_EQ(string->GetUtfLength(), 7); + EXPECT_EQ(string->GetModifiedUtf8Length(), 7); } TEST_F(ObjectTest, DescriptorCompare) { diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h index 548b194943..883a45c054 100644 --- a/runtime/mirror/string-inl.h +++ b/runtime/mirror/string-inl.h @@ -83,11 +83,11 @@ inline int32_t String::GetHashCode() { return result; } -inline int32_t String::GetUtfLength() { +inline int32_t String::GetModifiedUtf8Length() { if (IsCompressed()) { return GetLength(); } else { - return CountUtf8Bytes(GetValue(), GetLength()); + return CountModifiedUtf8BytesInUtf16(GetValue(), GetLength()); } } diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc index 4961776d41..a21c967f1f 100644 --- a/runtime/mirror/string.cc +++ b/runtime/mirror/string.cc @@ -313,7 +313,7 @@ std::string String::ToModifiedUtf8() { if (IsCompressed()) { return std::string(reinterpret_cast<const char*>(GetValueCompressed()), GetLength()); } else { - size_t byte_count = GetUtfLength(); + size_t byte_count = GetModifiedUtf8Length(); std::string result(byte_count, static_cast<char>(0)); ConvertUtf16ToModifiedUtf8(&result[0], byte_count, GetValue(), GetLength()); return result; diff --git 
a/runtime/mirror/string.h b/runtime/mirror/string.h index 3ce2d4c369..2b0f6f3d19 100644 --- a/runtime/mirror/string.h +++ b/runtime/mirror/string.h @@ -114,7 +114,7 @@ class MANAGED String final : public Object { // Computes and returns the hash code. int32_t ComputeHashCode() REQUIRES_SHARED(Locks::mutator_lock_); - int32_t GetUtfLength() REQUIRES_SHARED(Locks::mutator_lock_); + int32_t GetModifiedUtf8Length() REQUIRES_SHARED(Locks::mutator_lock_); uint16_t CharAt(int32_t index) REQUIRES_SHARED(Locks::mutator_lock_); diff --git a/test/906-iterate-heap/iterate_heap.cc b/test/906-iterate-heap/iterate_heap.cc index 521f9a6c72..f0a6624ef2 100644 --- a/test/906-iterate-heap/iterate_heap.cc +++ b/test/906-iterate-heap/iterate_heap.cc @@ -198,7 +198,7 @@ extern "C" JNIEXPORT jstring JNICALL Java_art_Test906_iterateThroughHeapString( void* user_data) { FindStringCallbacks* p = reinterpret_cast<FindStringCallbacks*>(user_data); if (*tag_ptr == p->tag_to_find) { - size_t utf_byte_count = ti::CountUtf8Bytes(value, value_length); + size_t utf_byte_count = ti::CountModifiedUtf8BytesInUtf16(value, value_length); std::unique_ptr<char[]> mod_utf(new char[utf_byte_count + 1]); memset(mod_utf.get(), 0, utf_byte_count + 1); ti::ConvertUtf16ToModifiedUtf8(mod_utf.get(), utf_byte_count, value, value_length); diff --git a/test/913-heaps/heaps.cc b/test/913-heaps/heaps.cc index 98ea9066d7..311b029a9b 100644 --- a/test/913-heaps/heaps.cc +++ b/test/913-heaps/heaps.cc @@ -592,7 +592,7 @@ extern "C" JNIEXPORT jobjectArray JNICALL Java_art_Test913_followReferencesStrin void* user_data) { FindStringCallbacks* p = reinterpret_cast<FindStringCallbacks*>(user_data); if (*tag_ptr != 0) { - size_t utf_byte_count = ti::CountUtf8Bytes(value, value_length); + size_t utf_byte_count = ti::CountModifiedUtf8BytesInUtf16(value, value_length); std::unique_ptr<char[]> mod_utf(new char[utf_byte_count + 1]); memset(mod_utf.get(), 0, utf_byte_count + 1); ti::ConvertUtf16ToModifiedUtf8(mod_utf.get(), 
utf_byte_count, value, value_length); diff --git a/test/ti-agent/ti_macros.h b/test/ti-agent/ti_macros.h index a871270dcf..abd54e0e78 100644 --- a/test/ti-agent/ti_macros.h +++ b/test/ti-agent/ti_macros.h @@ -21,4 +21,10 @@ #define UNREACHABLE __builtin_unreachable +#ifndef NDEBUG +#define ALWAYS_INLINE +#else +#define ALWAYS_INLINE __attribute__ ((always_inline)) +#endif + #endif // ART_TEST_TI_AGENT_TI_MACROS_H_ diff --git a/test/ti-agent/ti_utf.h b/test/ti-agent/ti_utf.h index 341e1066c3..15fe22ce5a 100644 --- a/test/ti-agent/ti_utf.h +++ b/test/ti-agent/ti_utf.h @@ -21,6 +21,7 @@ #include <string.h> #include "android-base/logging.h" +#include "ti_macros.h" namespace art { namespace ti { @@ -104,6 +105,56 @@ inline uint32_t GetUtf16FromUtf8(const char** utf8_data_in) { return surrogate_pair; } +// Note: This is a copy of the code in `libdexfile`. +template <bool kUseShortZero, bool kUse4ByteSequence, bool kReplaceBadSurrogates, typename Append> +inline void ConvertUtf16ToUtf8(const uint16_t* utf16, size_t char_count, Append&& append) { + static_assert(kUse4ByteSequence || !kReplaceBadSurrogates); + + // Use local helpers instead of macros from `libicu` to avoid the dependency on `libicu`. 
+ auto is_lead = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xfc00u) == 0xd800u; }; + auto is_trail = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xfc00u) == 0xdc00u; }; + auto is_surrogate = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xf800u) == 0xd800u; }; + auto is_surrogate_lead = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0x0400u) == 0u; }; + auto get_supplementary = [](uint16_t lead, uint16_t trail) ALWAYS_INLINE { + constexpr uint32_t offset = (0xd800u << 10) + 0xdc00u - 0x10000u; + return (static_cast<uint32_t>(lead) << 10) + static_cast<uint32_t>(trail) - offset; + }; + + for (size_t i = 0u; i < char_count; ++i) { + auto has_trail = [&]() { return i + 1u != char_count && is_trail(utf16[i + 1u]); }; + + uint16_t ch = utf16[i]; + if (ch < 0x80u && (kUseShortZero || ch != 0u)) { + // One byte. + append(ch); + } else if (ch < 0x800u) { + // Two bytes. + append((ch >> 6) | 0xc0); + append((ch & 0x3f) | 0x80); + } else if (kReplaceBadSurrogates + ? is_surrogate(ch) + : kUse4ByteSequence && is_lead(ch) && has_trail()) { + if (kReplaceBadSurrogates && (!is_surrogate_lead(ch) || !has_trail())) { + append('?'); + } else { + // We have a *valid* surrogate pair. + uint32_t code_point = get_supplementary(ch, utf16[i + 1u]); + ++i; // Consume the leading surrogate. + // Four bytes. + append((code_point >> 18) | 0xf0); + append(((code_point >> 12) & 0x3f) | 0x80); + append(((code_point >> 6) & 0x3f) | 0x80); + append((code_point & 0x3f) | 0x80); + } + } else { + // Three bytes. + append((ch >> 12) | 0xe0); + append(((ch >> 6) & 0x3f) | 0x80); + append((ch & 0x3f) | 0x80); + } + } +} + inline void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count, const uint16_t* utf16_in, @@ -118,75 +169,20 @@ inline void ConvertUtf16ToModifiedUtf8(char* utf8_out, } // String contains non-ASCII characters. 
- while (char_count--) { - const uint16_t ch = *utf16_in++; - if (ch > 0 && ch <= 0x7f) { - *utf8_out++ = ch; - } else { - // Char_count == 0 here implies we've encountered an unpaired - // surrogate and we have no choice but to encode it as 3-byte UTF - // sequence. Note that unpaired surrogates can occur as a part of - // "normal" operation. - if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) { - const uint16_t ch2 = *utf16_in; - - // Check if the other half of the pair is within the expected - // range. If it isn't, we will have to emit both "halves" as - // separate 3 byte sequences. - if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { - utf16_in++; - char_count--; - const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00; - *utf8_out++ = (code_point >> 18) | 0xf0; - *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80; - *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80; - *utf8_out++ = (code_point & 0x3f) | 0x80; - continue; - } - } - - if (ch > 0x07ff) { - // Three byte encoding. - *utf8_out++ = (ch >> 12) | 0xe0; - *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80; - *utf8_out++ = (ch & 0x3f) | 0x80; - } else /*(ch > 0x7f || ch == 0)*/ { - // Two byte encoding. - *utf8_out++ = (ch >> 6) | 0xc0; - *utf8_out++ = (ch & 0x3f) | 0x80; - } - } - } + // FIXME: We should not emit 4-byte sequences. Bug: 192935764 + auto append = [&](char c) { *utf8_out++ = c; }; + ConvertUtf16ToUtf8</*kUseShortZero=*/ false, + /*kUse4ByteSequence=*/ true, + /*kReplaceBadSurrogates=*/ false>(utf16_in, char_count, append); } -inline size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) { +inline size_t CountModifiedUtf8BytesInUtf16(const uint16_t* chars, size_t char_count) { + // FIXME: We should not emit 4-byte sequences. 
Bug: 192935764 size_t result = 0; - const uint16_t *end = chars + char_count; - while (chars < end) { - const uint16_t ch = *chars++; - if (LIKELY(ch != 0 && ch < 0x80)) { - result++; - continue; - } - if (ch < 0x800) { - result += 2; - continue; - } - if (ch >= 0xd800 && ch < 0xdc00) { - if (chars < end) { - const uint16_t ch2 = *chars; - // If we find a properly paired surrogate, we emit it as a 4 byte - // UTF sequence. If we find an unpaired leading or trailing surrogate, - // we emit it as a 3 byte sequence like would have done earlier. - if (ch2 >= 0xdc00 && ch2 < 0xe000) { - chars++; - result += 4; - continue; - } - } - } - result += 3; - } + auto append = [&](char c ATTRIBUTE_UNUSED) { ++result; }; + ConvertUtf16ToUtf8</*kUseShortZero=*/ false, + /*kUse4ByteSequence=*/ true, + /*kReplaceBadSurrogates=*/ false>(chars, char_count, append); return result; } |