summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Vladimir Marko <vmarko@google.com> 2022-12-16 11:16:45 +0000
committer Vladimír Marko <vmarko@google.com> 2022-12-19 08:25:47 +0000
commit 419484b8d944122049f2517bf9a2e482d26575b5 (patch)
tree f945d37932372b8d359937cb42992c688a7d3d70
parent 88a7d206e7b50661a8256d844acca3d1a2dc5922 (diff)
Clean up Modified UTF-8 encoding.
The function `String::GetUtfLength()` is used exclusively in `String::ToModifiedUtf8()`, so rename it to better reflect the intent, namely to `String::GetModifiedUtf8Length()`. Similarly, rename the function `CountUtf8Bytes()` used only in the above function to `CountModifiedUtf8BytesInUtf16()`. Rename a copy of the function used in tests. Reimplement `ConvertUtf16ToModifiedUtf8()` and the renamed `CountModifiedUtf8BytesInUtf16()` with equivalent code using the helper template function `ConvertUtf16ToUtf8<>()` and flag the fact that despite the name referencing Modified UTF-8, the generated encoding can contain 4-byte sequences.
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: m libctstiagent
Bug: 192935764
Change-Id: I942a3b77ee3af0194ee8315e0631ec217a0af67d
-rw-r--r-- libdexfile/dex/utf.cc 77
-rw-r--r-- libdexfile/dex/utf.h 14
-rw-r--r-- libdexfile/dex/utf_test.cc 8
-rw-r--r-- runtime/mirror/object_test.cc 2
-rw-r--r-- runtime/mirror/string-inl.h 4
-rw-r--r-- runtime/mirror/string.cc 2
-rw-r--r-- runtime/mirror/string.h 2
-rw-r--r-- test/906-iterate-heap/iterate_heap.cc 2
-rw-r--r-- test/913-heaps/heaps.cc 2
-rw-r--r-- test/ti-agent/ti_macros.h 6
-rw-r--r-- test/ti-agent/ti_utf.h 128
11 files changed, 97 insertions, 150 deletions
diff --git a/libdexfile/dex/utf.cc b/libdexfile/dex/utf.cc
index bfc704d4a6..9692a26827 100644
--- a/libdexfile/dex/utf.cc
+++ b/libdexfile/dex/utf.cc
@@ -133,45 +133,11 @@ void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
}
// String contains non-ASCII characters.
- while (char_count--) {
- const uint16_t ch = *utf16_in++;
- if (ch > 0 && ch <= 0x7f) {
- *utf8_out++ = ch;
- } else {
- // Char_count == 0 here implies we've encountered an unpaired
- // surrogate and we have no choice but to encode it as 3-byte UTF
- // sequence. Note that unpaired surrogates can occur as a part of
- // "normal" operation.
- if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
- const uint16_t ch2 = *utf16_in;
-
- // Check if the other half of the pair is within the expected
- // range. If it isn't, we will have to emit both "halves" as
- // separate 3 byte sequences.
- if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
- utf16_in++;
- char_count--;
- const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
- *utf8_out++ = (code_point >> 18) | 0xf0;
- *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
- *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
- *utf8_out++ = (code_point & 0x3f) | 0x80;
- continue;
- }
- }
-
- if (ch > 0x07ff) {
- // Three byte encoding.
- *utf8_out++ = (ch >> 12) | 0xe0;
- *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80;
- *utf8_out++ = (ch & 0x3f) | 0x80;
- } else /*(ch > 0x7f || ch == 0)*/ {
- // Two byte encoding.
- *utf8_out++ = (ch >> 6) | 0xc0;
- *utf8_out++ = (ch & 0x3f) | 0x80;
- }
- }
- }
+ // FIXME: We should not emit 4-byte sequences. Bug: 192935764
+ auto append = [&](char c) { *utf8_out++ = c; };
+ ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
+ /*kUse4ByteSequence=*/ true,
+ /*kReplaceBadSurrogates=*/ false>(utf16_in, char_count, append);
}
int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length) {
@@ -240,34 +206,13 @@ int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t
}
}
-size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
+size_t CountModifiedUtf8BytesInUtf16(const uint16_t* chars, size_t char_count) {
+ // FIXME: We should not emit 4-byte sequences. Bug: 192935764
size_t result = 0;
- const uint16_t *end = chars + char_count;
- while (chars < end) {
- const uint16_t ch = *chars++;
- if (LIKELY(ch != 0 && ch < 0x80)) {
- result++;
- continue;
- }
- if (ch < 0x800) {
- result += 2;
- continue;
- }
- if (ch >= 0xd800 && ch < 0xdc00) {
- if (chars < end) {
- const uint16_t ch2 = *chars;
- // If we find a properly paired surrogate, we emit it as a 4 byte
- // UTF sequence. If we find an unpaired leading or trailing surrogate,
- // we emit it as a 3 byte sequence like would have done earlier.
- if (ch2 >= 0xdc00 && ch2 < 0xe000) {
- chars++;
- result += 4;
- continue;
- }
- }
- }
- result += 3;
- }
+ auto append = [&](char c ATTRIBUTE_UNUSED) { ++result; };
+ ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
+ /*kUse4ByteSequence=*/ true,
+ /*kReplaceBadSurrogates=*/ false>(chars, char_count, append);
return result;
}
diff --git a/libdexfile/dex/utf.h b/libdexfile/dex/utf.h
index 35cbf78463..d372bff662 100644
--- a/libdexfile/dex/utf.h
+++ b/libdexfile/dex/utf.h
@@ -41,12 +41,6 @@ size_t CountModifiedUtf8Chars(const char* utf8);
size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count);
/*
- * Returns the number of modified UTF-8 bytes needed to represent the given
- * UTF-16 string.
- */
-size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count);
-
-/*
* Convert from Modified UTF-8 to UTF-16.
*/
void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_out, const char* utf8_in);
@@ -85,8 +79,14 @@ template <bool kUseShortZero, bool kUse4ByteSequence, bool kReplaceBadSurrogates
void ConvertUtf16ToUtf8(const uint16_t* utf16, size_t char_count, Append&& append);
/*
+ * Returns the number of modified UTF-8 bytes needed to represent the given
+ * UTF-16 string.
+ */
+size_t CountModifiedUtf8BytesInUtf16(const uint16_t* chars, size_t char_count);
+
+/*
* Convert from UTF-16 to Modified UTF-8. Note that the output is _not_
- * NUL-terminated. You probably need to call CountUtf8Bytes before calling
+ * NUL-terminated. You probably need to call CountModifiedUtf8BytesInUtf16 before calling
* this anyway, so if you want a NUL-terminated string, you know where to
* put the NUL byte.
*/
diff --git a/libdexfile/dex/utf_test.cc b/libdexfile/dex/utf_test.cc
index 919259e4d3..85c74d285c 100644
--- a/libdexfile/dex/utf_test.cc
+++ b/libdexfile/dex/utf_test.cc
@@ -117,7 +117,7 @@ TEST_F(UtfTest, CountModifiedUtf8Chars) {
static void AssertConversion(const std::vector<uint16_t>& input,
const std::vector<uint8_t>& expected) {
- ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
+ ASSERT_EQ(expected.size(), CountModifiedUtf8BytesInUtf16(&input[0], input.size()));
std::vector<uint8_t> output(expected.size());
ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(),
@@ -229,7 +229,7 @@ size_t CountModifiedUtf8Chars_reference(const char* utf8) {
return len;
}
-static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
+static size_t CountModifiedUtf8BytesInUtf16_reference(const uint16_t* chars, size_t char_count) {
size_t result = 0;
while (char_count--) {
const uint16_t ch = *chars++;
@@ -320,8 +320,8 @@ static void testConversions(uint16_t *buf, int char_count) {
int char_count_test, char_count_reference;
// Calculate the number of utf-8 bytes for the utf-16 chars.
- byte_count_reference = CountUtf8Bytes_reference(buf, char_count);
- byte_count_test = CountUtf8Bytes(buf, char_count);
+ byte_count_reference = CountModifiedUtf8BytesInUtf16_reference(buf, char_count);
+ byte_count_test = CountModifiedUtf8BytesInUtf16(buf, char_count);
EXPECT_EQ(byte_count_reference, byte_count_test);
// Convert the utf-16 string to utf-8 bytes.
diff --git a/runtime/mirror/object_test.cc b/runtime/mirror/object_test.cc
index 7edeacbb05..6f42c5b8b8 100644
--- a/runtime/mirror/object_test.cc
+++ b/runtime/mirror/object_test.cc
@@ -528,7 +528,7 @@ TEST_F(ObjectTest, StringLength) {
StackHandleScope<1> hs(soa.Self());
Handle<String> string(hs.NewHandle(String::AllocFromModifiedUtf8(soa.Self(), "android")));
EXPECT_EQ(string->GetLength(), 7);
- EXPECT_EQ(string->GetUtfLength(), 7);
+ EXPECT_EQ(string->GetModifiedUtf8Length(), 7);
}
TEST_F(ObjectTest, DescriptorCompare) {
diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h
index 548b194943..883a45c054 100644
--- a/runtime/mirror/string-inl.h
+++ b/runtime/mirror/string-inl.h
@@ -83,11 +83,11 @@ inline int32_t String::GetHashCode() {
return result;
}
-inline int32_t String::GetUtfLength() {
+inline int32_t String::GetModifiedUtf8Length() {
if (IsCompressed()) {
return GetLength();
} else {
- return CountUtf8Bytes(GetValue(), GetLength());
+ return CountModifiedUtf8BytesInUtf16(GetValue(), GetLength());
}
}
diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc
index 4961776d41..a21c967f1f 100644
--- a/runtime/mirror/string.cc
+++ b/runtime/mirror/string.cc
@@ -313,7 +313,7 @@ std::string String::ToModifiedUtf8() {
if (IsCompressed()) {
return std::string(reinterpret_cast<const char*>(GetValueCompressed()), GetLength());
} else {
- size_t byte_count = GetUtfLength();
+ size_t byte_count = GetModifiedUtf8Length();
std::string result(byte_count, static_cast<char>(0));
ConvertUtf16ToModifiedUtf8(&result[0], byte_count, GetValue(), GetLength());
return result;
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index 3ce2d4c369..2b0f6f3d19 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -114,7 +114,7 @@ class MANAGED String final : public Object {
// Computes and returns the hash code.
int32_t ComputeHashCode() REQUIRES_SHARED(Locks::mutator_lock_);
- int32_t GetUtfLength() REQUIRES_SHARED(Locks::mutator_lock_);
+ int32_t GetModifiedUtf8Length() REQUIRES_SHARED(Locks::mutator_lock_);
uint16_t CharAt(int32_t index) REQUIRES_SHARED(Locks::mutator_lock_);
diff --git a/test/906-iterate-heap/iterate_heap.cc b/test/906-iterate-heap/iterate_heap.cc
index 521f9a6c72..f0a6624ef2 100644
--- a/test/906-iterate-heap/iterate_heap.cc
+++ b/test/906-iterate-heap/iterate_heap.cc
@@ -198,7 +198,7 @@ extern "C" JNIEXPORT jstring JNICALL Java_art_Test906_iterateThroughHeapString(
void* user_data) {
FindStringCallbacks* p = reinterpret_cast<FindStringCallbacks*>(user_data);
if (*tag_ptr == p->tag_to_find) {
- size_t utf_byte_count = ti::CountUtf8Bytes(value, value_length);
+ size_t utf_byte_count = ti::CountModifiedUtf8BytesInUtf16(value, value_length);
std::unique_ptr<char[]> mod_utf(new char[utf_byte_count + 1]);
memset(mod_utf.get(), 0, utf_byte_count + 1);
ti::ConvertUtf16ToModifiedUtf8(mod_utf.get(), utf_byte_count, value, value_length);
diff --git a/test/913-heaps/heaps.cc b/test/913-heaps/heaps.cc
index 98ea9066d7..311b029a9b 100644
--- a/test/913-heaps/heaps.cc
+++ b/test/913-heaps/heaps.cc
@@ -592,7 +592,7 @@ extern "C" JNIEXPORT jobjectArray JNICALL Java_art_Test913_followReferencesStrin
void* user_data) {
FindStringCallbacks* p = reinterpret_cast<FindStringCallbacks*>(user_data);
if (*tag_ptr != 0) {
- size_t utf_byte_count = ti::CountUtf8Bytes(value, value_length);
+ size_t utf_byte_count = ti::CountModifiedUtf8BytesInUtf16(value, value_length);
std::unique_ptr<char[]> mod_utf(new char[utf_byte_count + 1]);
memset(mod_utf.get(), 0, utf_byte_count + 1);
ti::ConvertUtf16ToModifiedUtf8(mod_utf.get(), utf_byte_count, value, value_length);
diff --git a/test/ti-agent/ti_macros.h b/test/ti-agent/ti_macros.h
index a871270dcf..abd54e0e78 100644
--- a/test/ti-agent/ti_macros.h
+++ b/test/ti-agent/ti_macros.h
@@ -21,4 +21,10 @@
#define UNREACHABLE __builtin_unreachable
+#ifndef NDEBUG
+#define ALWAYS_INLINE
+#else
+#define ALWAYS_INLINE __attribute__ ((always_inline))
+#endif
+
#endif // ART_TEST_TI_AGENT_TI_MACROS_H_
diff --git a/test/ti-agent/ti_utf.h b/test/ti-agent/ti_utf.h
index 341e1066c3..15fe22ce5a 100644
--- a/test/ti-agent/ti_utf.h
+++ b/test/ti-agent/ti_utf.h
@@ -21,6 +21,7 @@
#include <string.h>
#include "android-base/logging.h"
+#include "ti_macros.h"
namespace art {
namespace ti {
@@ -104,6 +105,56 @@ inline uint32_t GetUtf16FromUtf8(const char** utf8_data_in) {
return surrogate_pair;
}
+// Note: This is a copy of the code in `libdexfile`.
+template <bool kUseShortZero, bool kUse4ByteSequence, bool kReplaceBadSurrogates, typename Append>
+inline void ConvertUtf16ToUtf8(const uint16_t* utf16, size_t char_count, Append&& append) {
+ static_assert(kUse4ByteSequence || !kReplaceBadSurrogates);
+
+ // Use local helpers instead of macros from `libicu` to avoid the dependency on `libicu`.
+ auto is_lead = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xfc00u) == 0xd800u; };
+ auto is_trail = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xfc00u) == 0xdc00u; };
+ auto is_surrogate = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xf800u) == 0xd800u; };
+ auto is_surrogate_lead = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0x0400u) == 0u; };
+ auto get_supplementary = [](uint16_t lead, uint16_t trail) ALWAYS_INLINE {
+ constexpr uint32_t offset = (0xd800u << 10) + 0xdc00u - 0x10000u;
+ return (static_cast<uint32_t>(lead) << 10) + static_cast<uint32_t>(trail) - offset;
+ };
+
+ for (size_t i = 0u; i < char_count; ++i) {
+ auto has_trail = [&]() { return i + 1u != char_count && is_trail(utf16[i + 1u]); };
+
+ uint16_t ch = utf16[i];
+ if (ch < 0x80u && (kUseShortZero || ch != 0u)) {
+ // One byte.
+ append(ch);
+ } else if (ch < 0x800u) {
+ // Two bytes.
+ append((ch >> 6) | 0xc0);
+ append((ch & 0x3f) | 0x80);
+ } else if (kReplaceBadSurrogates
+ ? is_surrogate(ch)
+ : kUse4ByteSequence && is_lead(ch) && has_trail()) {
+ if (kReplaceBadSurrogates && (!is_surrogate_lead(ch) || !has_trail())) {
+ append('?');
+ } else {
+ // We have a *valid* surrogate pair.
+ uint32_t code_point = get_supplementary(ch, utf16[i + 1u]);
+ ++i; // Consume the leading surrogate.
+ // Four bytes.
+ append((code_point >> 18) | 0xf0);
+ append(((code_point >> 12) & 0x3f) | 0x80);
+ append(((code_point >> 6) & 0x3f) | 0x80);
+ append((code_point & 0x3f) | 0x80);
+ }
+ } else {
+ // Three bytes.
+ append((ch >> 12) | 0xe0);
+ append(((ch >> 6) & 0x3f) | 0x80);
+ append((ch & 0x3f) | 0x80);
+ }
+ }
+}
+
inline void ConvertUtf16ToModifiedUtf8(char* utf8_out,
size_t byte_count,
const uint16_t* utf16_in,
@@ -118,75 +169,20 @@ inline void ConvertUtf16ToModifiedUtf8(char* utf8_out,
}
// String contains non-ASCII characters.
- while (char_count--) {
- const uint16_t ch = *utf16_in++;
- if (ch > 0 && ch <= 0x7f) {
- *utf8_out++ = ch;
- } else {
- // Char_count == 0 here implies we've encountered an unpaired
- // surrogate and we have no choice but to encode it as 3-byte UTF
- // sequence. Note that unpaired surrogates can occur as a part of
- // "normal" operation.
- if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
- const uint16_t ch2 = *utf16_in;
-
- // Check if the other half of the pair is within the expected
- // range. If it isn't, we will have to emit both "halves" as
- // separate 3 byte sequences.
- if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
- utf16_in++;
- char_count--;
- const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
- *utf8_out++ = (code_point >> 18) | 0xf0;
- *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
- *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
- *utf8_out++ = (code_point & 0x3f) | 0x80;
- continue;
- }
- }
-
- if (ch > 0x07ff) {
- // Three byte encoding.
- *utf8_out++ = (ch >> 12) | 0xe0;
- *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80;
- *utf8_out++ = (ch & 0x3f) | 0x80;
- } else /*(ch > 0x7f || ch == 0)*/ {
- // Two byte encoding.
- *utf8_out++ = (ch >> 6) | 0xc0;
- *utf8_out++ = (ch & 0x3f) | 0x80;
- }
- }
- }
+ // FIXME: We should not emit 4-byte sequences. Bug: 192935764
+ auto append = [&](char c) { *utf8_out++ = c; };
+ ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
+ /*kUse4ByteSequence=*/ true,
+ /*kReplaceBadSurrogates=*/ false>(utf16_in, char_count, append);
}
-inline size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
+inline size_t CountModifiedUtf8BytesInUtf16(const uint16_t* chars, size_t char_count) {
+ // FIXME: We should not emit 4-byte sequences. Bug: 192935764
size_t result = 0;
- const uint16_t *end = chars + char_count;
- while (chars < end) {
- const uint16_t ch = *chars++;
- if (LIKELY(ch != 0 && ch < 0x80)) {
- result++;
- continue;
- }
- if (ch < 0x800) {
- result += 2;
- continue;
- }
- if (ch >= 0xd800 && ch < 0xdc00) {
- if (chars < end) {
- const uint16_t ch2 = *chars;
- // If we find a properly paired surrogate, we emit it as a 4 byte
- // UTF sequence. If we find an unpaired leading or trailing surrogate,
- // we emit it as a 3 byte sequence like would have done earlier.
- if (ch2 >= 0xdc00 && ch2 < 0xe000) {
- chars++;
- result += 4;
- continue;
- }
- }
- }
- result += 3;
- }
+ auto append = [&](char c ATTRIBUTE_UNUSED) { ++result; };
+ ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
+ /*kUse4ByteSequence=*/ true,
+ /*kReplaceBadSurrogates=*/ false>(chars, char_count, append);
return result;
}