Revert "Do not create 4-byte sequences in `ConvertUtf16ToModifiedUtf8()`"

This reverts commit e0a4f373dc4a738a2f26965a67d31239cbbd4f6a.

Reason for revert: DroidMonitor-triggered revert due to breakage https://android-build.googleplex.com/builds/tests/view?invocationId=I44000009917256156&testResultId=TR23027520914949881, bug b/193363191

Bug: 193363191
Change-Id: I2e7fe6de3f63864b58729d2c76e9ab5c95aa7983

commit: 1b9d442dc906d0158300c5178683f417fa59b026 [log] [tgz]
author: Chuck Liao <chuckliao@google.com> Mon Jul 12 01:25:23 2021 +0000
committer: Chuck Liao <chuckliao@google.com> Mon Jul 12 02:58:46 2021 +0000
tree: dd52852cd7a16af7f5480c5f6c9213079a319dc8
parent: 52eebc756c96fbf71ad77b6d30bdff8a67723569 [diff]
diff --git a/libdexfile/dex/utf.cc b/libdexfile/dex/utf.cc
index 76fb49f..bfc704d 100644
--- a/libdexfile/dex/utf.cc
+++ b/libdexfile/dex/utf.cc

@@ -121,10 +121,8 @@
   }
 }
 
-void ConvertUtf16ToModifiedUtf8(char* utf8_out,
-                                size_t byte_count,
-                                const uint16_t* utf16_in,
-                                size_t char_count) {
+void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
+                                const uint16_t* utf16_in, size_t char_count) {
   if (LIKELY(byte_count == char_count)) {
     // Common case where all characters are ASCII.
     const uint16_t *utf16_end = utf16_in + char_count;
@@ -140,6 +138,28 @@
     if (ch > 0 && ch <= 0x7f) {
       *utf8_out++ = ch;
     } else {
+      // Char_count == 0 here implies we've encountered an unpaired
+      // surrogate and we have no choice but to encode it as 3-byte UTF
+      // sequence. Note that unpaired surrogates can occur as a part of
+      // "normal" operation.
+      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
+        const uint16_t ch2 = *utf16_in;
+
+        // Check if the other half of the pair is within the expected
+        // range. If it isn't, we will have to emit both "halves" as
+        // separate 3 byte sequences.
+        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
+          utf16_in++;
+          char_count--;
+          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
+          *utf8_out++ = (code_point >> 18) | 0xf0;
+          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
+          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
+          *utf8_out++ = (code_point & 0x3f) | 0x80;
+          continue;
+        }
+      }
+
       if (ch > 0x07ff) {
         // Three byte encoding.
         *utf8_out++ = (ch >> 12) | 0xe0;
@@ -220,7 +240,7 @@
   }
 }
 
-size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t char_count) {
+size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
   size_t result = 0;
   const uint16_t *end = chars + char_count;
   while (chars < end) {
@@ -233,6 +253,19 @@
       result += 2;
       continue;
     }
+    if (ch >= 0xd800 && ch < 0xdc00) {
+      if (chars < end) {
+        const uint16_t ch2 = *chars;
+        // If we find a properly paired surrogate, we emit it as a 4 byte
+        // UTF sequence. If we find an unpaired leading or trailing surrogate,
+        // we emit it as a 3 byte sequence, as we would have done earlier.
+        if (ch2 >= 0xdc00 && ch2 < 0xe000) {
+          chars++;
+          result += 4;
+          continue;
+        }
+      }
+    }
     result += 3;
   }
   return result;

diff --git a/libdexfile/dex/utf.h b/libdexfile/dex/utf.h
index 55d025d..e3dc7f9 100644
--- a/libdexfile/dex/utf.h
+++ b/libdexfile/dex/utf.h

@@ -43,7 +43,7 @@
  * Returns the number of modified UTF-8 bytes needed to represent the given
  * UTF-16 string.
  */
-size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t char_count);
+size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count);
 
 /*
  * Convert from Modified UTF-8 to UTF-16.
@@ -67,14 +67,12 @@
 
 /*
  * Convert from UTF-16 to Modified UTF-8. Note that the output is _not_
- * NUL-terminated. You probably need to call CountModifiedUtf8Bytes before
- * calling this anyway, so if you want a NUL-terminated string, you know
- * where to put the NUL byte.
+ * NUL-terminated. You probably need to call CountUtf8Bytes before calling
+ * this anyway, so if you want a NUL-terminated string, you know where to
+ * put the NUL byte.
  */
-void ConvertUtf16ToModifiedUtf8(char* utf8_out,
-                                size_t byte_count,
-                                const uint16_t* utf16_in,
-                                size_t char_count);
+void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
+                                const uint16_t* utf16_in, size_t char_count);
 
 /*
  * The java.lang.String hashCode() algorithm.

diff --git a/libdexfile/dex/utf_test.cc b/libdexfile/dex/utf_test.cc
index fc24d54..919259e 100644
--- a/libdexfile/dex/utf_test.cc
+++ b/libdexfile/dex/utf_test.cc

@@ -117,7 +117,7 @@
 
 static void AssertConversion(const std::vector<uint16_t>& input,
                              const std::vector<uint8_t>& expected) {
-  ASSERT_EQ(expected.size(), CountModifiedUtf8Bytes(&input[0], input.size()));
+  ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
 
   std::vector<uint8_t> output(expected.size());
   ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(),
@@ -126,8 +126,8 @@
 }
 
 TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
-  // Surrogate pairs will be converted into two three-byte sequences.
-  AssertConversion({ 0xd801, 0xdc00 }, { 0xed, 0xa0, 0x81, 0xed, 0xb0, 0x80 });
+  // Surrogate pairs will be converted into 4 byte sequences.
+  AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });
 
   // Three byte encodings that are below & above the leading surrogate
   // range respectively.
@@ -143,12 +143,12 @@
   AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });
 
   AssertConversion({
-      0xd802, 0xdc02,  // Surrogate pair - three byte encodings.
+      0xd802, 0xdc02,  // Surrogate pair.
       0xdef0, 0xdcff,  // Three byte encodings.
       0x0101, 0x0000,  // Two byte encodings.
       'p'   , 'p'      // One byte encoding.
     }, {
-      0xed, 0xa0, 0x82, 0xed, 0xb0, 0x82,
+      0xf0, 0x90, 0xa0, 0x82,
       0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
       0xc4, 0x81, 0xc0, 0x80,
       0x70, 0x70
@@ -229,12 +229,31 @@
   return len;
 }
 
-static size_t CountModifiedUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
+static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
   size_t result = 0;
   while (char_count--) {
     const uint16_t ch = *chars++;
     if (ch > 0 && ch <= 0x7f) {
       ++result;
+    } else if (ch >= 0xd800 && ch <= 0xdbff) {
+      if (char_count > 0) {
+        const uint16_t ch2 = *chars;
+        // If we find a properly paired surrogate, we emit it as a 4 byte
+        // UTF sequence. If we find an unpaired leading or trailing surrogate,
+        // we emit it as a 3 byte sequence, as we would have done earlier.
+        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
+          chars++;
+          char_count--;
+
+          result += 4;
+        } else {
+          result += 3;
+        }
+      } else {
+        // This implies we found an unpaired leading surrogate at the end
+        // of a string.
+        result += 3;
+      }
     } else if (ch > 0x7ff) {
       result += 3;
     } else {
@@ -251,6 +270,28 @@
     if (ch > 0 && ch <= 0x7f) {
       *utf8_out++ = ch;
     } else {
+      // Char_count == 0 here implies we've encountered an unpaired
+      // surrogate and we have no choice but to encode it as 3-byte UTF
+      // sequence. Note that unpaired surrogates can occur as a part of
+      // "normal" operation.
+      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
+        const uint16_t ch2 = *utf16_in;
+
+        // Check if the other half of the pair is within the expected
+        // range. If it isn't, we will have to emit both "halves" as
+        // separate 3 byte sequences.
+        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
+          utf16_in++;
+          char_count--;
+          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
+          *utf8_out++ = (code_point >> 18) | 0xf0;
+          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
+          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
+          *utf8_out++ = (code_point & 0x3f) | 0x80;
+          continue;
+        }
+      }
+
       if (ch > 0x07ff) {
         // Three byte encoding.
         *utf8_out++ = (ch >> 12) | 0xe0;
@@ -272,40 +313,39 @@
   second = (code_point & 0x03ff) + 0xdc00;
 }
 
-static void testConversions(uint16_t *buf, size_t char_count) {
+static void testConversions(uint16_t *buf, int char_count) {
+  char bytes_test[8] = { 0 }, bytes_reference[8] = { 0 };
+  uint16_t out_buf_test[4] = { 0 }, out_buf_reference[4] = { 0 };
+  int byte_count_test, byte_count_reference;
+  int char_count_test, char_count_reference;
+
   // Calculate the number of utf-8 bytes for the utf-16 chars.
-  size_t byte_count_reference = CountModifiedUtf8Bytes_reference(buf, char_count);
-  size_t byte_count_test = CountModifiedUtf8Bytes(buf, char_count);
-  ASSERT_EQ(byte_count_reference, byte_count_test);
+  byte_count_reference = CountUtf8Bytes_reference(buf, char_count);
+  byte_count_test = CountUtf8Bytes(buf, char_count);
+  EXPECT_EQ(byte_count_reference, byte_count_test);
 
   // Convert the utf-16 string to utf-8 bytes.
-  char bytes_test[8], bytes_reference[9];
-  ASSERT_LT(byte_count_reference, arraysize(bytes_reference));
   ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count);
-  ASSERT_LE(byte_count_test, arraysize(bytes_test));
   ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count);
-  for (size_t i = 0; i < byte_count_test; ++i) {
-    ASSERT_EQ(bytes_reference[i], bytes_test[i]);
+  for (int i = 0; i < byte_count_test; ++i) {
+    EXPECT_EQ(bytes_reference[i], bytes_test[i]);
   }
 
   // Calculate the number of utf-16 chars from the utf-8 bytes.
   bytes_reference[byte_count_reference] = 0;  // Reference function needs null termination.
-  size_t char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
-  size_t char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
-  ASSERT_EQ(char_count, char_count_reference);
-  ASSERT_EQ(char_count, char_count_test);
+  char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
+  char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
+  EXPECT_EQ(char_count, char_count_reference);
+  EXPECT_EQ(char_count, char_count_test);
 
   // Convert the utf-8 bytes back to utf-16 chars.
   // Does not need copied _reference version of the function because the original
   // function with the old API is retained for debug/testing code.
-  uint16_t out_buf_test[4], out_buf_reference[4];
-  ASSERT_LE(char_count_reference, arraysize(out_buf_reference));
   ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference);
-  ASSERT_LE(char_count_test, arraysize(out_buf_test));
   ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test);
-  for (size_t i = 0; i < char_count_test; ++i) {
-    ASSERT_EQ(buf[i], out_buf_reference[i]);
-    ASSERT_EQ(buf[i], out_buf_test[i]);
+  for (int i = 0; i < char_count_test; ++i) {
+    EXPECT_EQ(buf[i], out_buf_reference[i]);
+    EXPECT_EQ(buf[i], out_buf_test[i]);
   }
 }
 

diff --git a/runtime/jni/jni_internal.cc b/runtime/jni/jni_internal.cc
index cac6eb8..45fbf00 100644
--- a/runtime/jni/jni_internal.cc
+++ b/runtime/jni/jni_internal.cc

@@ -2022,7 +2022,7 @@
         buf[length] = '\0';
       } else {
         const jchar* chars = s->GetValue();
-        size_t bytes = CountModifiedUtf8Bytes(chars + start, length);
+        size_t bytes = CountUtf8Bytes(chars + start, length);
         ConvertUtf16ToModifiedUtf8(buf, bytes, chars + start, length);
         buf[bytes] = '\0';
       }

diff --git a/runtime/jni/jni_internal_test.cc b/runtime/jni/jni_internal_test.cc
index 4c675f4..ed84397 100644
--- a/runtime/jni/jni_internal_test.cc
+++ b/runtime/jni/jni_internal_test.cc

@@ -1510,13 +1510,13 @@
   EXPECT_NE(s, nullptr);
   EXPECT_EQ(2, env_->GetStringLength(s));
 
-  // The surrogate pair gets encoded into two 3-byte sequences...
-  EXPECT_EQ(6, env_->GetStringUTFLength(s));
+  // The surrogate pair gets encoded into a 4 byte UTF sequence..
+  EXPECT_EQ(4, env_->GetStringUTFLength(s));
   const char* chars = env_->GetStringUTFChars(s, nullptr);
-  EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80", chars);
+  EXPECT_STREQ("\xf0\x90\x90\x80", chars);
   env_->ReleaseStringUTFChars(s, chars);
 
-  // ... and it is stored as the two surrogates in the utf-16 representation.
+  // .. but is stored as is in the utf-16 representation.
   const jchar* jchars = env_->GetStringChars(s, nullptr);
   EXPECT_EQ(0xd801, jchars[0]);
   EXPECT_EQ(0xdc00, jchars[1]);
@@ -1527,9 +1527,7 @@
   EXPECT_NE(s, nullptr);
 
   // The 4 byte sequence {0xf0, 0x9f, 0x8f, 0xa0} is converted into a surrogate
-  // pair {0xd83c, 0xdfe0} which is then converted into a two three byte
-  // sequences {0xed 0xa0, 0xbc} and {0xed, 0xbf, 0xa0}, one for each half of
-  // the surrogate pair.
+  // pair {0xd83c, 0xdfe0}.
   EXPECT_EQ(5, env_->GetStringLength(s));
   jchars = env_->GetStringChars(s, nullptr);
   // The first surrogate pair, encoded as such in the input.
@@ -1540,9 +1538,9 @@
   EXPECT_EQ(0xdfe0, jchars[4]);
   env_->ReleaseStringChars(s, jchars);
 
-  EXPECT_EQ(13, env_->GetStringUTFLength(s));
+  EXPECT_EQ(9, env_->GetStringUTFLength(s));
   chars = env_->GetStringUTFChars(s, nullptr);
-  EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80 \xed\xa0\xbc\xed\xbf\xa0", chars);
+  EXPECT_STREQ("\xf0\x90\x90\x80 \xf0\x9f\x8f\xa0", chars);
   env_->ReleaseStringUTFChars(s, chars);
 
   // A string with 1, 2, 3 and 4 byte UTF sequences with spaces
@@ -1550,7 +1548,7 @@
   s = env_->NewStringUTF("\x24 \xc2\xa2 \xe2\x82\xac \xf0\x9f\x8f\xa0");
   EXPECT_NE(s, nullptr);
   EXPECT_EQ(8, env_->GetStringLength(s));
-  EXPECT_EQ(15, env_->GetStringUTFLength(s));
+  EXPECT_EQ(13, env_->GetStringUTFLength(s));
 }
 
 TEST_F(JniInternalTest, NewStringUTF_Validation) {
@@ -1859,13 +1857,12 @@
   env_->ReleaseStringCritical(s, chars);
 
   if (mirror::kUseStringCompression) {
-    // is_copy_16 has to be JNI_FALSE because "\xed\xa0\x81\xed\xb0\x80" is incompressible
+    // is_copy has to be JNI_FALSE because "\xed\xa0\x81\xed\xb0\x80" is incompressible
     jboolean is_copy_16 = JNI_TRUE;
     jstring s_16 = env_->NewStringUTF("\xed\xa0\x81\xed\xb0\x80");
     chars = env_->GetStringCritical(s_16, &is_copy_16);
-    EXPECT_EQ(JNI_FALSE, is_copy_16);
     EXPECT_EQ(2, env_->GetStringLength(s_16));
-    EXPECT_EQ(6, env_->GetStringUTFLength(s_16));
+    EXPECT_EQ(4, env_->GetStringUTFLength(s_16));
     env_->ReleaseStringCritical(s_16, chars);
   }
 }

diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h
index 11eff0f..dd28036 100644
--- a/runtime/mirror/string-inl.h
+++ b/runtime/mirror/string-inl.h

@@ -88,7 +88,7 @@
   if (IsCompressed()) {
     return GetLength();
   } else {
-    return CountModifiedUtf8Bytes(GetValue(), GetLength());
+    return CountUtf8Bytes(GetValue(), GetLength());
   }
 }
 

diff --git a/test/181-proxy-non-bmp/expected-stderr.txt b/test/181-proxy-non-bmp/expected-stderr.txt
deleted file mode 100644
index e69de29..0000000
--- a/test/181-proxy-non-bmp/expected-stderr.txt
+++ /dev/null


diff --git a/test/181-proxy-non-bmp/expected-stdout.txt b/test/181-proxy-non-bmp/expected-stdout.txt
deleted file mode 100644
index 24fce9b..0000000
--- a/test/181-proxy-non-bmp/expected-stdout.txt
+++ /dev/null

@@ -1 +0,0 @@
-Invoke public abstract void pkg𐀀.PackageTestInterface.interfaceMethod()

diff --git a/test/181-proxy-non-bmp/info.txt b/test/181-proxy-non-bmp/info.txt
deleted file mode 100644
index a13a8f9..0000000
--- a/test/181-proxy-non-bmp/info.txt
+++ /dev/null

@@ -1,6 +0,0 @@
-Regression test for bad handling of package name containing a character outside
-the BMP plane. For a proxy class with a non-public interface in such a package,
-this caused the package name comparison to fail because the dex file encoding
-had two three-byte sequences while the descriptor was encoded with a four-byte
-sequence, leading to IAE when calling a proxy method via the interface.
-Bug: 192935764

diff --git a/test/181-proxy-non-bmp/src/Main.java b/test/181-proxy-non-bmp/src/Main.java
deleted file mode 100644
index e938599..0000000
--- a/test/181-proxy-non-bmp/src/Main.java
+++ /dev/null

@@ -1,23 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import pkg𐀀.PackageTest;
-
-public class Main {
-    public static void main(String[] args) {
-        PackageTest.main();
-    }
-}

diff --git "a/test/181-proxy-non-bmp/src/pkg\360\220\200\200/PackageTest.java" "b/test/181-proxy-non-bmp/src/pkg\360\220\200\200/PackageTest.java"
deleted file mode 100644
index e65d92f..0000000
--- "a/test/181-proxy-non-bmp/src/pkg\360\220\200\200/PackageTest.java"
+++ /dev/null

@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package pkg𐀀;
-
-import java.lang.reflect.InvocationHandler;
-import java.lang.reflect.Constructor;
-import java.lang.reflect.Method;
-import java.lang.reflect.Proxy;
-
-public class PackageTest {
-    public static void main() {
-        InvocationHandler handler = new PackageInvocationHandler();
-        Class<?> proxyClass = Proxy.getProxyClass(
-                PackageTestInterface.class.getClassLoader(), PackageTestInterface.class);
-        try {
-            Constructor<?> ctor = proxyClass.getConstructor(InvocationHandler.class);
-            Object proxy = ctor.newInstance(handler);
-            PackageTestInterface asInterface = (PackageTestInterface) proxy;
-            asInterface.interfaceMethod();
-        } catch (Exception e) {
-            System.out.println("failed: " + e);
-        }
-    }
-}
-
-interface PackageTestInterface {
-    public void interfaceMethod();
-}
-
-class PackageInvocationHandler implements InvocationHandler {
-    public Object invoke(Object proxy, Method method, Object[] args) {
-        System.out.println("Invoke " + method);
-        return null;
-    }
-}

diff --git a/test/906-iterate-heap/iterate_heap.cc b/test/906-iterate-heap/iterate_heap.cc
index cc941d8..521f9a6 100644
--- a/test/906-iterate-heap/iterate_heap.cc
+++ b/test/906-iterate-heap/iterate_heap.cc

@@ -198,7 +198,7 @@
                                             void* user_data) {
       FindStringCallbacks* p = reinterpret_cast<FindStringCallbacks*>(user_data);
       if (*tag_ptr == p->tag_to_find) {
-        size_t utf_byte_count = ti::CountModifiedUtf8Bytes(value, value_length);
+        size_t utf_byte_count = ti::CountUtf8Bytes(value, value_length);
         std::unique_ptr<char[]> mod_utf(new char[utf_byte_count + 1]);
         memset(mod_utf.get(), 0, utf_byte_count + 1);
         ti::ConvertUtf16ToModifiedUtf8(mod_utf.get(), utf_byte_count, value, value_length);

diff --git a/test/913-heaps/heaps.cc b/test/913-heaps/heaps.cc
index 10355c9..28a737d 100644
--- a/test/913-heaps/heaps.cc
+++ b/test/913-heaps/heaps.cc

@@ -586,7 +586,7 @@
                                             void* user_data) {
       FindStringCallbacks* p = reinterpret_cast<FindStringCallbacks*>(user_data);
       if (*tag_ptr != 0) {
-        size_t utf_byte_count = ti::CountModifiedUtf8Bytes(value, value_length);
+        size_t utf_byte_count = ti::CountUtf8Bytes(value, value_length);
         std::unique_ptr<char[]> mod_utf(new char[utf_byte_count + 1]);
         memset(mod_utf.get(), 0, utf_byte_count + 1);
         ti::ConvertUtf16ToModifiedUtf8(mod_utf.get(), utf_byte_count, value, value_length);

diff --git a/test/ti-agent/ti_utf.h b/test/ti-agent/ti_utf.h
index d646c07..341e106 100644
--- a/test/ti-agent/ti_utf.h
+++ b/test/ti-agent/ti_utf.h

@@ -123,6 +123,28 @@
     if (ch > 0 && ch <= 0x7f) {
       *utf8_out++ = ch;
     } else {
+      // Char_count == 0 here implies we've encountered an unpaired
+      // surrogate and we have no choice but to encode it as 3-byte UTF
+      // sequence. Note that unpaired surrogates can occur as a part of
+      // "normal" operation.
+      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
+        const uint16_t ch2 = *utf16_in;
+
+        // Check if the other half of the pair is within the expected
+        // range. If it isn't, we will have to emit both "halves" as
+        // separate 3 byte sequences.
+        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
+          utf16_in++;
+          char_count--;
+          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
+          *utf8_out++ = (code_point >> 18) | 0xf0;
+          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
+          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
+          *utf8_out++ = (code_point & 0x3f) | 0x80;
+          continue;
+        }
+      }
+
       if (ch > 0x07ff) {
         // Three byte encoding.
         *utf8_out++ = (ch >> 12) | 0xe0;
@@ -137,7 +159,7 @@
   }
 }
 
-inline size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t char_count) {
+inline size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
   size_t result = 0;
   const uint16_t *end = chars + char_count;
   while (chars < end) {
@@ -150,6 +172,19 @@
       result += 2;
       continue;
     }
+    if (ch >= 0xd800 && ch < 0xdc00) {
+      if (chars < end) {
+        const uint16_t ch2 = *chars;
+        // If we find a properly paired surrogate, we emit it as a 4 byte
+        // UTF sequence. If we find an unpaired leading or trailing surrogate,
+        // we emit it as a 3 byte sequence, as we would have done earlier.
+        if (ch2 >= 0xdc00 && ch2 < 0xe000) {
+          chars++;
+          result += 4;
+          continue;
+        }
+      }
+    }
     result += 3;
   }
   return result;

diff --git a/test/utils/regen-test-files b/test/utils/regen-test-files
index 12d85b8..f9cb056 100755
--- a/test/utils/regen-test-files
+++ b/test/utils/regen-test-files

@@ -555,9 +555,6 @@
     # Ignore test with a copy of `sun.misc.Unsafe`.
     if os.path.isfile(os.path.join(run_test_path, "src", "sun", "misc", "Unsafe.java")):
       return False
-    # Ignore test with a non-ascii package name `pkg𐀀`. b/193141629
-    if os.path.isdir(os.path.join(run_test_path, "src", "pkg𐀀")):
-      return False
     # Ignore tests with Hidden API specs.
     if os.path.isfile(os.path.join(run_test_path, "hiddenapi-flags.csv")):
       return False
commit	1b9d442dc906d0158300c5178683f417fa59b026	[log] [tgz]
author	Chuck Liao <chuckliao@google.com>	Mon Jul 12 01:25:23 2021 +0000
committer	Chuck Liao <chuckliao@google.com>	Mon Jul 12 02:58:46 2021 +0000
tree	dd52852cd7a16af7f5480c5f6c9213079a319dc8
parent	52eebc756c96fbf71ad77b6d30bdff8a67723569 [diff]