Clean up Modified UTF-8 encoding.

The function `String::GetUtfLength()` is used exclusively in `String::ToModifiedUtf8()`, so rename it to better reflect the intent, namely to `String::GetModifiedUtf8Length()`. Similarly, rename the function `CountUtf8Bytes()` used only in the above function to `CountModifiedUtf8BytesInUtf16()`. Rename a copy of the function used in tests. Reimplement `ConvertUtf16ToModifiedUtf8()` and the renamed `CountModifiedUtf8BytesInUtf16()` with equivalent code using the helper template function `ConvertUtf16ToUtf8<>()` and flag the fact that despite the name referencing Modified UTF-8, the generated encoding can contain 4-byte sequences.

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: m libctstiagent
Bug: 192935764
Change-Id: I942a3b77ee3af0194ee8315e0631ec217a0af67d
author: Vladimir Marko <vmarko@google.com> 2022-12-16 11:16:45 +0000
committer: Vladimír Marko <vmarko@google.com> 2022-12-19 08:25:47 +0000
commit: 419484b8d944122049f2517bf9a2e482d26575b5 (patch)
tree: f945d37932372b8d359937cb42992c688a7d3d70
parent: 88a7d206e7b50661a8256d844acca3d1a2dc5922 (diff)
11 files changed, 97 insertions, 150 deletions
diff --git a/libdexfile/dex/utf.cc b/libdexfile/dex/utf.cc
index bfc704d4a6..9692a26827 100644
--- a/libdexfile/dex/utf.cc
+++ b/libdexfile/dex/utf.cc
@@ -133,45 +133,11 @@ void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
   }
 
   // String contains non-ASCII characters.
-  while (char_count--) {
-    const uint16_t ch = *utf16_in++;
-    if (ch > 0 && ch <= 0x7f) {
-      *utf8_out++ = ch;
-    } else {
-      // Char_count == 0 here implies we've encountered an unpaired
-      // surrogate and we have no choice but to encode it as 3-byte UTF
-      // sequence. Note that unpaired surrogates can occur as a part of
-      // "normal" operation.
-      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
-        const uint16_t ch2 = *utf16_in;
-
-        // Check if the other half of the pair is within the expected
-        // range. If it isn't, we will have to emit both "halves" as
-        // separate 3 byte sequences.
-        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
-          utf16_in++;
-          char_count--;
-          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
-          *utf8_out++ = (code_point >> 18) | 0xf0;
-          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
-          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
-          *utf8_out++ = (code_point & 0x3f) | 0x80;
-          continue;
-        }
-      }
-
-      if (ch > 0x07ff) {
-        // Three byte encoding.
-        *utf8_out++ = (ch >> 12) | 0xe0;
-        *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80;
-        *utf8_out++ = (ch & 0x3f) | 0x80;
-      } else /*(ch > 0x7f || ch == 0)*/ {
-        // Two byte encoding.
-        *utf8_out++ = (ch >> 6) | 0xc0;
-        *utf8_out++ = (ch & 0x3f) | 0x80;
-      }
-    }
-  }
+  // FIXME: We should not emit 4-byte sequences. Bug: 192935764
+  auto append = [&](char c) { *utf8_out++ = c; };
+  ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
+                     /*kUse4ByteSequence=*/ true,
+                     /*kReplaceBadSurrogates=*/ false>(utf16_in, char_count, append);
 }
 
 int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length) {
@@ -240,34 +206,13 @@ int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t
   }
 }
 
-size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
+size_t CountModifiedUtf8BytesInUtf16(const uint16_t* chars, size_t char_count) {
+  // FIXME: We should not emit 4-byte sequences. Bug: 192935764
   size_t result = 0;
-  const uint16_t *end = chars + char_count;
-  while (chars < end) {
-    const uint16_t ch = *chars++;
-    if (LIKELY(ch != 0 && ch < 0x80)) {
-      result++;
-      continue;
-    }
-    if (ch < 0x800) {
-      result += 2;
-      continue;
-    }
-    if (ch >= 0xd800 && ch < 0xdc00) {
-      if (chars < end) {
-        const uint16_t ch2 = *chars;
-        // If we find a properly paired surrogate, we emit it as a 4 byte
-        // UTF sequence. If we find an unpaired leading or trailing surrogate,
-        // we emit it as a 3 byte sequence like would have done earlier.
-        if (ch2 >= 0xdc00 && ch2 < 0xe000) {
-          chars++;
-          result += 4;
-          continue;
-        }
-      }
-    }
-    result += 3;
-  }
+  auto append = [&](char c ATTRIBUTE_UNUSED) { ++result; };
+  ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
+                     /*kUse4ByteSequence=*/ true,
+                     /*kReplaceBadSurrogates=*/ false>(chars, char_count, append);
   return result;
 }
 
diff --git a/libdexfile/dex/utf.h b/libdexfile/dex/utf.h
index 35cbf78463..d372bff662 100644
--- a/libdexfile/dex/utf.h
+++ b/libdexfile/dex/utf.h
@@ -41,12 +41,6 @@ size_t CountModifiedUtf8Chars(const char* utf8);
 size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count);
 
 /*
- * Returns the number of modified UTF-8 bytes needed to represent the given
- * UTF-16 string.
- */
-size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count);
-
-/*
  * Convert from Modified UTF-8 to UTF-16.
  */
 void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_out, const char* utf8_in);
@@ -85,8 +79,14 @@ template <bool kUseShortZero, bool kUse4ByteSequence, bool kReplaceBadSurrogates
 void ConvertUtf16ToUtf8(const uint16_t* utf16, size_t char_count, Append&& append);
 
 /*
+ * Returns the number of modified UTF-8 bytes needed to represent the given
+ * UTF-16 string.
+ */
+size_t CountModifiedUtf8BytesInUtf16(const uint16_t* chars, size_t char_count);
+
+/*
  * Convert from UTF-16 to Modified UTF-8. Note that the output is _not_
- * NUL-terminated. You probably need to call CountUtf8Bytes before calling
+ * NUL-terminated. You probably need to call CountModifiedUtf8BytesInUtf16 before calling
  * this anyway, so if you want a NUL-terminated string, you know where to
  * put the NUL byte.
  */
diff --git a/libdexfile/dex/utf_test.cc b/libdexfile/dex/utf_test.cc
index 919259e4d3..85c74d285c 100644
--- a/libdexfile/dex/utf_test.cc
+++ b/libdexfile/dex/utf_test.cc
@@ -117,7 +117,7 @@ TEST_F(UtfTest, CountModifiedUtf8Chars) {
 
 static void AssertConversion(const std::vector<uint16_t>& input,
                              const std::vector<uint8_t>& expected) {
-  ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
+  ASSERT_EQ(expected.size(), CountModifiedUtf8BytesInUtf16(&input[0], input.size()));
 
   std::vector<uint8_t> output(expected.size());
   ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(),
@@ -229,7 +229,7 @@ size_t CountModifiedUtf8Chars_reference(const char* utf8) {
   return len;
 }
 
-static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
+static size_t CountModifiedUtf8BytesInUtf16_reference(const uint16_t* chars, size_t char_count) {
   size_t result = 0;
   while (char_count--) {
     const uint16_t ch = *chars++;
@@ -320,8 +320,8 @@ static void testConversions(uint16_t *buf, int char_count) {
   int char_count_test, char_count_reference;
 
   // Calculate the number of utf-8 bytes for the utf-16 chars.
-  byte_count_reference = CountUtf8Bytes_reference(buf, char_count);
-  byte_count_test = CountUtf8Bytes(buf, char_count);
+  byte_count_reference = CountModifiedUtf8BytesInUtf16_reference(buf, char_count);
+  byte_count_test = CountModifiedUtf8BytesInUtf16(buf, char_count);
   EXPECT_EQ(byte_count_reference, byte_count_test);
 
   // Convert the utf-16 string to utf-8 bytes.
diff --git a/runtime/mirror/object_test.cc b/runtime/mirror/object_test.cc
index 7edeacbb05..6f42c5b8b8 100644
--- a/runtime/mirror/object_test.cc
+++ b/runtime/mirror/object_test.cc
@@ -528,7 +528,7 @@ TEST_F(ObjectTest, StringLength) {
   StackHandleScope<1> hs(soa.Self());
   Handle<String> string(hs.NewHandle(String::AllocFromModifiedUtf8(soa.Self(), "android")));
   EXPECT_EQ(string->GetLength(), 7);
-  EXPECT_EQ(string->GetUtfLength(), 7);
+  EXPECT_EQ(string->GetModifiedUtf8Length(), 7);
 }
 
 TEST_F(ObjectTest, DescriptorCompare) {
diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h
index 548b194943..883a45c054 100644
--- a/runtime/mirror/string-inl.h
+++ b/runtime/mirror/string-inl.h
@@ -83,11 +83,11 @@ inline int32_t String::GetHashCode() {
   return result;
 }
 
-inline int32_t String::GetUtfLength() {
+inline int32_t String::GetModifiedUtf8Length() {
   if (IsCompressed()) {
     return GetLength();
   } else {
-    return CountUtf8Bytes(GetValue(), GetLength());
+    return CountModifiedUtf8BytesInUtf16(GetValue(), GetLength());
   }
 }
 
diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc
index 4961776d41..a21c967f1f 100644
--- a/runtime/mirror/string.cc
+++ b/runtime/mirror/string.cc
@@ -313,7 +313,7 @@ std::string String::ToModifiedUtf8() {
   if (IsCompressed()) {
     return std::string(reinterpret_cast<const char*>(GetValueCompressed()), GetLength());
   } else {
-    size_t byte_count = GetUtfLength();
+    size_t byte_count = GetModifiedUtf8Length();
     std::string result(byte_count, static_cast<char>(0));
     ConvertUtf16ToModifiedUtf8(&result[0], byte_count, GetValue(), GetLength());
     return result;
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index 3ce2d4c369..2b0f6f3d19 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -114,7 +114,7 @@ class MANAGED String final : public Object {
   // Computes and returns the hash code.
   int32_t ComputeHashCode() REQUIRES_SHARED(Locks::mutator_lock_);
 
-  int32_t GetUtfLength() REQUIRES_SHARED(Locks::mutator_lock_);
+  int32_t GetModifiedUtf8Length() REQUIRES_SHARED(Locks::mutator_lock_);
 
   uint16_t CharAt(int32_t index) REQUIRES_SHARED(Locks::mutator_lock_);
 
diff --git a/test/906-iterate-heap/iterate_heap.cc b/test/906-iterate-heap/iterate_heap.cc
index 521f9a6c72..f0a6624ef2 100644
--- a/test/906-iterate-heap/iterate_heap.cc
+++ b/test/906-iterate-heap/iterate_heap.cc
@@ -198,7 +198,7 @@ extern "C" JNIEXPORT jstring JNICALL Java_art_Test906_iterateThroughHeapString(
                                             void* user_data) {
       FindStringCallbacks* p = reinterpret_cast<FindStringCallbacks*>(user_data);
       if (*tag_ptr == p->tag_to_find) {
-        size_t utf_byte_count = ti::CountUtf8Bytes(value, value_length);
+        size_t utf_byte_count = ti::CountModifiedUtf8BytesInUtf16(value, value_length);
         std::unique_ptr<char[]> mod_utf(new char[utf_byte_count + 1]);
         memset(mod_utf.get(), 0, utf_byte_count + 1);
         ti::ConvertUtf16ToModifiedUtf8(mod_utf.get(), utf_byte_count, value, value_length);
diff --git a/test/913-heaps/heaps.cc b/test/913-heaps/heaps.cc
index 98ea9066d7..311b029a9b 100644
--- a/test/913-heaps/heaps.cc
+++ b/test/913-heaps/heaps.cc
@@ -592,7 +592,7 @@ extern "C" JNIEXPORT jobjectArray JNICALL Java_art_Test913_followReferencesStrin
                                             void* user_data) {
       FindStringCallbacks* p = reinterpret_cast<FindStringCallbacks*>(user_data);
       if (*tag_ptr != 0) {
-        size_t utf_byte_count = ti::CountUtf8Bytes(value, value_length);
+        size_t utf_byte_count = ti::CountModifiedUtf8BytesInUtf16(value, value_length);
         std::unique_ptr<char[]> mod_utf(new char[utf_byte_count + 1]);
         memset(mod_utf.get(), 0, utf_byte_count + 1);
         ti::ConvertUtf16ToModifiedUtf8(mod_utf.get(), utf_byte_count, value, value_length);
diff --git a/test/ti-agent/ti_macros.h b/test/ti-agent/ti_macros.h
index a871270dcf..abd54e0e78 100644
--- a/test/ti-agent/ti_macros.h
+++ b/test/ti-agent/ti_macros.h
@@ -21,4 +21,10 @@
 
 #define UNREACHABLE  __builtin_unreachable
 
+#ifndef NDEBUG
+#define ALWAYS_INLINE
+#else
+#define ALWAYS_INLINE  __attribute__ ((always_inline))
+#endif
+
 #endif  // ART_TEST_TI_AGENT_TI_MACROS_H_
diff --git a/test/ti-agent/ti_utf.h b/test/ti-agent/ti_utf.h
index 341e1066c3..15fe22ce5a 100644
--- a/test/ti-agent/ti_utf.h
+++ b/test/ti-agent/ti_utf.h
@@ -21,6 +21,7 @@
 #include <string.h>
 
 #include "android-base/logging.h"
+#include "ti_macros.h"
 
 namespace art {
 namespace ti {
@@ -104,6 +105,56 @@ inline uint32_t GetUtf16FromUtf8(const char** utf8_data_in) {
   return surrogate_pair;
 }
 
+// Note: This is a copy of the code in `libdexfile`.
+template <bool kUseShortZero, bool kUse4ByteSequence, bool kReplaceBadSurrogates, typename Append>
+inline void ConvertUtf16ToUtf8(const uint16_t* utf16, size_t char_count, Append&& append) {
+  static_assert(kUse4ByteSequence || !kReplaceBadSurrogates);
+
+  // Use local helpers instead of macros from `libicu` to avoid the dependency on `libicu`.
+  auto is_lead = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xfc00u) == 0xd800u; };
+  auto is_trail = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xfc00u) == 0xdc00u; };
+  auto is_surrogate = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0xf800u) == 0xd800u; };
+  auto is_surrogate_lead = [](uint16_t ch) ALWAYS_INLINE { return (ch & 0x0400u) == 0u; };
+  auto get_supplementary = [](uint16_t lead, uint16_t trail) ALWAYS_INLINE {
+    constexpr uint32_t offset = (0xd800u << 10) + 0xdc00u - 0x10000u;
+    return (static_cast<uint32_t>(lead) << 10) + static_cast<uint32_t>(trail) - offset;
+  };
+
+  for (size_t i = 0u; i < char_count; ++i) {
+    auto has_trail = [&]() { return i + 1u != char_count && is_trail(utf16[i + 1u]); };
+
+    uint16_t ch = utf16[i];
+    if (ch < 0x80u && (kUseShortZero || ch != 0u)) {
+      // One byte.
+      append(ch);
+    } else if (ch < 0x800u) {
+      // Two bytes.
+      append((ch >> 6) | 0xc0);
+      append((ch & 0x3f) | 0x80);
+    } else if (kReplaceBadSurrogates
+                   ? is_surrogate(ch)
+                   : kUse4ByteSequence && is_lead(ch) && has_trail()) {
+      if (kReplaceBadSurrogates && (!is_surrogate_lead(ch) || !has_trail())) {
+        append('?');
+      } else {
+        // We have a *valid* surrogate pair.
+        uint32_t code_point = get_supplementary(ch, utf16[i + 1u]);
+        ++i;  //  Consume the leading surrogate.
+        // Four bytes.
+        append((code_point >> 18) | 0xf0);
+        append(((code_point >> 12) & 0x3f) | 0x80);
+        append(((code_point >> 6) & 0x3f) | 0x80);
+        append((code_point & 0x3f) | 0x80);
+      }
+    } else {
+      // Three bytes.
+      append((ch >> 12) | 0xe0);
+      append(((ch >> 6) & 0x3f) | 0x80);
+      append((ch & 0x3f) | 0x80);
+    }
+  }
+}
+
 inline void ConvertUtf16ToModifiedUtf8(char* utf8_out,
                                        size_t byte_count,
                                        const uint16_t* utf16_in,
@@ -118,75 +169,20 @@ inline void ConvertUtf16ToModifiedUtf8(char* utf8_out,
   }
 
   // String contains non-ASCII characters.
-  while (char_count--) {
-    const uint16_t ch = *utf16_in++;
-    if (ch > 0 && ch <= 0x7f) {
-      *utf8_out++ = ch;
-    } else {
-      // Char_count == 0 here implies we've encountered an unpaired
-      // surrogate and we have no choice but to encode it as 3-byte UTF
-      // sequence. Note that unpaired surrogates can occur as a part of
-      // "normal" operation.
-      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
-        const uint16_t ch2 = *utf16_in;
-
-        // Check if the other half of the pair is within the expected
-        // range. If it isn't, we will have to emit both "halves" as
-        // separate 3 byte sequences.
-        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
-          utf16_in++;
-          char_count--;
-          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
-          *utf8_out++ = (code_point >> 18) | 0xf0;
-          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
-          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
-          *utf8_out++ = (code_point & 0x3f) | 0x80;
-          continue;
-        }
-      }
-
-      if (ch > 0x07ff) {
-        // Three byte encoding.
-        *utf8_out++ = (ch >> 12) | 0xe0;
-        *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80;
-        *utf8_out++ = (ch & 0x3f) | 0x80;
-      } else /*(ch > 0x7f || ch == 0)*/ {
-        // Two byte encoding.
-        *utf8_out++ = (ch >> 6) | 0xc0;
-        *utf8_out++ = (ch & 0x3f) | 0x80;
-      }
-    }
-  }
+  // FIXME: We should not emit 4-byte sequences. Bug: 192935764
+  auto append = [&](char c) { *utf8_out++ = c; };
+  ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
+                     /*kUse4ByteSequence=*/ true,
+                     /*kReplaceBadSurrogates=*/ false>(utf16_in, char_count, append);
 }
 
-inline size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
+inline size_t CountModifiedUtf8BytesInUtf16(const uint16_t* chars, size_t char_count) {
+  // FIXME: We should not emit 4-byte sequences. Bug: 192935764
   size_t result = 0;
-  const uint16_t *end = chars + char_count;
-  while (chars < end) {
-    const uint16_t ch = *chars++;
-    if (LIKELY(ch != 0 && ch < 0x80)) {
-      result++;
-      continue;
-    }
-    if (ch < 0x800) {
-      result += 2;
-      continue;
-    }
-    if (ch >= 0xd800 && ch < 0xdc00) {
-      if (chars < end) {
-        const uint16_t ch2 = *chars;
-        // If we find a properly paired surrogate, we emit it as a 4 byte
-        // UTF sequence. If we find an unpaired leading or trailing surrogate,
-        // we emit it as a 3 byte sequence like would have done earlier.
-        if (ch2 >= 0xdc00 && ch2 < 0xe000) {
-          chars++;
-          result += 4;
-          continue;
-        }
-      }
-    }
-    result += 3;
-  }
+  auto append = [&](char c ATTRIBUTE_UNUSED) { ++result; };
+  ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
+                     /*kUse4ByteSequence=*/ true,
+                     /*kReplaceBadSurrogates=*/ false>(chars, char_count, append);
   return result;
 }
author	Vladimir Marko <vmarko@google.com>	2022-12-16 11:16:45 +0000
committer	Vladimír Marko <vmarko@google.com>	2022-12-19 08:25:47 +0000
commit	419484b8d944122049f2517bf9a2e482d26575b5 (patch)
tree	f945d37932372b8d359937cb42992c688a7d3d70
parent	88a7d206e7b50661a8256d844acca3d1a2dc5922 (diff)