Do not create 4-byte sequences in `ConvertUtf16ToModifiedUtf8()`. This encoding was different from the encoding in dex files and this caused a wrong `Class::IsInSamePackage()` result for proxy classes if the package name contained at least one character outside the BMP plane. This essentially reverts commit e16dad1d6388b0305f13e2171308a77f42e7c682 but keeps all tests, only updating expectations. Also rename `CountUtf8Bytes()` to `CountModifiedUtf8Bytes()` to make the format explicit in the function name. Test: New test 181-proxy-non-bmp Test: m test-art-host-gtest Test: testrunner.py --host --optimizing Bug: 192935764 Bug: 193141629 Change-Id: I8e6e8b08174c7a6d3ff84b0e1e929056b3947785

commit: e0a4f373dc4a738a2f26965a67d31239cbbd4f6a [log] [tgz]
author: Vladimir Marko <vmarko@google.com> Thu Jul 08 11:22:53 2021 +0100
committer: Treehugger Robot <treehugger-gerrit@google.com> Fri Jul 09 23:17:15 2021 +0000
tree: b097c82983074f167ef1a14ab529750fcc842b9f
parent: d920b7b01f92676eec2aa504ef6cf13c3f3283fc [diff]
diff --git a/libdexfile/dex/utf.cc b/libdexfile/dex/utf.cc
index bfc704d..76fb49f 100644
--- a/libdexfile/dex/utf.cc
+++ b/libdexfile/dex/utf.cc

@@ -121,8 +121,10 @@
   }
 }
 
-void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
-                                const uint16_t* utf16_in, size_t char_count) {
+void ConvertUtf16ToModifiedUtf8(char* utf8_out,
+                                size_t byte_count,
+                                const uint16_t* utf16_in,
+                                size_t char_count) {
   if (LIKELY(byte_count == char_count)) {
     // Common case where all characters are ASCII.
     const uint16_t *utf16_end = utf16_in + char_count;
@@ -138,28 +140,6 @@
     if (ch > 0 && ch <= 0x7f) {
       *utf8_out++ = ch;
     } else {
-      // Char_count == 0 here implies we've encountered an unpaired
-      // surrogate and we have no choice but to encode it as 3-byte UTF
-      // sequence. Note that unpaired surrogates can occur as a part of
-      // "normal" operation.
-      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
-        const uint16_t ch2 = *utf16_in;
-
-        // Check if the other half of the pair is within the expected
-        // range. If it isn't, we will have to emit both "halves" as
-        // separate 3 byte sequences.
-        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
-          utf16_in++;
-          char_count--;
-          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
-          *utf8_out++ = (code_point >> 18) | 0xf0;
-          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
-          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
-          *utf8_out++ = (code_point & 0x3f) | 0x80;
-          continue;
-        }
-      }
-
       if (ch > 0x07ff) {
         // Three byte encoding.
         *utf8_out++ = (ch >> 12) | 0xe0;
@@ -240,7 +220,7 @@
   }
 }
 
-size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
+size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t char_count) {
   size_t result = 0;
   const uint16_t *end = chars + char_count;
   while (chars < end) {
@@ -253,19 +233,6 @@
       result += 2;
       continue;
     }
-    if (ch >= 0xd800 && ch < 0xdc00) {
-      if (chars < end) {
-        const uint16_t ch2 = *chars;
-        // If we find a properly paired surrogate, we emit it as a 4 byte
-        // UTF sequence. If we find an unpaired leading or trailing surrogate,
-        // we emit it as a 3 byte sequence like would have done earlier.
-        if (ch2 >= 0xdc00 && ch2 < 0xe000) {
-          chars++;
-          result += 4;
-          continue;
-        }
-      }
-    }
     result += 3;
   }
   return result;

diff --git a/libdexfile/dex/utf.h b/libdexfile/dex/utf.h
index e3dc7f9..55d025d 100644
--- a/libdexfile/dex/utf.h
+++ b/libdexfile/dex/utf.h

@@ -43,7 +43,7 @@
  * Returns the number of modified UTF-8 bytes needed to represent the given
  * UTF-16 string.
  */
-size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count);
+size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t char_count);
 
 /*
  * Convert from Modified UTF-8 to UTF-16.
@@ -67,12 +67,14 @@
 
 /*
  * Convert from UTF-16 to Modified UTF-8. Note that the output is _not_
- * NUL-terminated. You probably need to call CountUtf8Bytes before calling
- * this anyway, so if you want a NUL-terminated string, you know where to
- * put the NUL byte.
+ * NUL-terminated. You probably need to call CountModifiedUtf8Bytes before
+ * calling this anyway, so if you want a NUL-terminated string, you know
+ * where to put the NUL byte.
  */
-void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
-                                const uint16_t* utf16_in, size_t char_count);
+void ConvertUtf16ToModifiedUtf8(char* utf8_out,
+                                size_t byte_count,
+                                const uint16_t* utf16_in,
+                                size_t char_count);
 
 /*
  * The java.lang.String hashCode() algorithm.

diff --git a/libdexfile/dex/utf_test.cc b/libdexfile/dex/utf_test.cc
index 919259e..fc24d54 100644
--- a/libdexfile/dex/utf_test.cc
+++ b/libdexfile/dex/utf_test.cc

@@ -117,7 +117,7 @@
 
 static void AssertConversion(const std::vector<uint16_t>& input,
                              const std::vector<uint8_t>& expected) {
-  ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
+  ASSERT_EQ(expected.size(), CountModifiedUtf8Bytes(&input[0], input.size()));
 
   std::vector<uint8_t> output(expected.size());
   ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(),
@@ -126,8 +126,8 @@
 }
 
 TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
-  // Surrogate pairs will be converted into 4 byte sequences.
-  AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });
+  // Surrogate pairs will be converted into two three-byte sequences.
+  AssertConversion({ 0xd801, 0xdc00 }, { 0xed, 0xa0, 0x81, 0xed, 0xb0, 0x80 });
 
   // Three byte encodings that are below & above the leading surrogate
   // range respectively.
@@ -143,12 +143,12 @@
   AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });
 
   AssertConversion({
-      0xd802, 0xdc02,  // Surrogate pair.
+      0xd802, 0xdc02,  // Surrogate pair - three byte encodings.
       0xdef0, 0xdcff,  // Three byte encodings.
       0x0101, 0x0000,  // Two byte encodings.
       'p'   , 'p'      // One byte encoding.
     }, {
-      0xf0, 0x90, 0xa0, 0x82,
+      0xed, 0xa0, 0x82, 0xed, 0xb0, 0x82,
       0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
       0xc4, 0x81, 0xc0, 0x80,
       0x70, 0x70
@@ -229,31 +229,12 @@
   return len;
 }
 
-static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
+static size_t CountModifiedUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
   size_t result = 0;
   while (char_count--) {
     const uint16_t ch = *chars++;
     if (ch > 0 && ch <= 0x7f) {
       ++result;
-    } else if (ch >= 0xd800 && ch <= 0xdbff) {
-      if (char_count > 0) {
-        const uint16_t ch2 = *chars;
-        // If we find a properly paired surrogate, we emit it as a 4 byte
-        // UTF sequence. If we find an unpaired leading or trailing surrogate,
-        // we emit it as a 3 byte sequence like would have done earlier.
-        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
-          chars++;
-          char_count--;
-
-          result += 4;
-        } else {
-          result += 3;
-        }
-      } else {
-        // This implies we found an unpaired trailing surrogate at the end
-        // of a string.
-        result += 3;
-      }
     } else if (ch > 0x7ff) {
       result += 3;
     } else {
@@ -270,28 +251,6 @@
     if (ch > 0 && ch <= 0x7f) {
       *utf8_out++ = ch;
     } else {
-      // Char_count == 0 here implies we've encountered an unpaired
-      // surrogate and we have no choice but to encode it as 3-byte UTF
-      // sequence. Note that unpaired surrogates can occur as a part of
-      // "normal" operation.
-      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
-        const uint16_t ch2 = *utf16_in;
-
-        // Check if the other half of the pair is within the expected
-        // range. If it isn't, we will have to emit both "halves" as
-        // separate 3 byte sequences.
-        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
-          utf16_in++;
-          char_count--;
-          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
-          *utf8_out++ = (code_point >> 18) | 0xf0;
-          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
-          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
-          *utf8_out++ = (code_point & 0x3f) | 0x80;
-          continue;
-        }
-      }
-
       if (ch > 0x07ff) {
         // Three byte encoding.
         *utf8_out++ = (ch >> 12) | 0xe0;
@@ -313,39 +272,40 @@
   second = (code_point & 0x03ff) + 0xdc00;
 }
 
-static void testConversions(uint16_t *buf, int char_count) {
-  char bytes_test[8] = { 0 }, bytes_reference[8] = { 0 };
-  uint16_t out_buf_test[4] = { 0 }, out_buf_reference[4] = { 0 };
-  int byte_count_test, byte_count_reference;
-  int char_count_test, char_count_reference;
-
+static void testConversions(uint16_t *buf, size_t char_count) {
   // Calculate the number of utf-8 bytes for the utf-16 chars.
-  byte_count_reference = CountUtf8Bytes_reference(buf, char_count);
-  byte_count_test = CountUtf8Bytes(buf, char_count);
-  EXPECT_EQ(byte_count_reference, byte_count_test);
+  size_t byte_count_reference = CountModifiedUtf8Bytes_reference(buf, char_count);
+  size_t byte_count_test = CountModifiedUtf8Bytes(buf, char_count);
+  ASSERT_EQ(byte_count_reference, byte_count_test);
 
   // Convert the utf-16 string to utf-8 bytes.
+  char bytes_test[8], bytes_reference[9];
+  ASSERT_LT(byte_count_reference, arraysize(bytes_reference));
   ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count);
+  ASSERT_LE(byte_count_test, arraysize(bytes_test));
   ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count);
-  for (int i = 0; i < byte_count_test; ++i) {
-    EXPECT_EQ(bytes_reference[i], bytes_test[i]);
+  for (size_t i = 0; i < byte_count_test; ++i) {
+    ASSERT_EQ(bytes_reference[i], bytes_test[i]);
   }
 
   // Calculate the number of utf-16 chars from the utf-8 bytes.
   bytes_reference[byte_count_reference] = 0;  // Reference function needs null termination.
-  char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
-  char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
-  EXPECT_EQ(char_count, char_count_reference);
-  EXPECT_EQ(char_count, char_count_test);
+  size_t char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
+  size_t char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
+  ASSERT_EQ(char_count, char_count_reference);
+  ASSERT_EQ(char_count, char_count_test);
 
   // Convert the utf-8 bytes back to utf-16 chars.
   // Does not need copied _reference version of the function because the original
   // function with the old API is retained for debug/testing code.
+  uint16_t out_buf_test[4], out_buf_reference[4];
+  ASSERT_LE(char_count_reference, arraysize(out_buf_reference));
   ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference);
+  ASSERT_LE(char_count_test, arraysize(out_buf_test));
   ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test);
-  for (int i = 0; i < char_count_test; ++i) {
-    EXPECT_EQ(buf[i], out_buf_reference[i]);
-    EXPECT_EQ(buf[i], out_buf_test[i]);
+  for (size_t i = 0; i < char_count_test; ++i) {
+    ASSERT_EQ(buf[i], out_buf_reference[i]);
+    ASSERT_EQ(buf[i], out_buf_test[i]);
   }
 }
 

diff --git a/runtime/jni/jni_internal.cc b/runtime/jni/jni_internal.cc
index 45fbf00..cac6eb8 100644
--- a/runtime/jni/jni_internal.cc
+++ b/runtime/jni/jni_internal.cc

@@ -2022,7 +2022,7 @@
         buf[length] = '\0';
       } else {
         const jchar* chars = s->GetValue();
-        size_t bytes = CountUtf8Bytes(chars + start, length);
+        size_t bytes = CountModifiedUtf8Bytes(chars + start, length);
         ConvertUtf16ToModifiedUtf8(buf, bytes, chars + start, length);
         buf[bytes] = '\0';
       }

diff --git a/runtime/jni/jni_internal_test.cc b/runtime/jni/jni_internal_test.cc
index ed84397..4c675f4 100644
--- a/runtime/jni/jni_internal_test.cc
+++ b/runtime/jni/jni_internal_test.cc

@@ -1510,13 +1510,13 @@
   EXPECT_NE(s, nullptr);
   EXPECT_EQ(2, env_->GetStringLength(s));
 
-  // The surrogate pair gets encoded into a 4 byte UTF sequence..
-  EXPECT_EQ(4, env_->GetStringUTFLength(s));
+  // The surrogate pair gets encoded into two 3-byte sequences...
+  EXPECT_EQ(6, env_->GetStringUTFLength(s));
   const char* chars = env_->GetStringUTFChars(s, nullptr);
-  EXPECT_STREQ("\xf0\x90\x90\x80", chars);
+  EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80", chars);
   env_->ReleaseStringUTFChars(s, chars);
 
-  // .. but is stored as is in the utf-16 representation.
+  // ... and it is stored as the two surrogates in the utf-16 representation.
   const jchar* jchars = env_->GetStringChars(s, nullptr);
   EXPECT_EQ(0xd801, jchars[0]);
   EXPECT_EQ(0xdc00, jchars[1]);
@@ -1527,7 +1527,9 @@
   EXPECT_NE(s, nullptr);
 
   // The 4 byte sequence {0xf0, 0x9f, 0x8f, 0xa0} is converted into a surrogate
-  // pair {0xd83c, 0xdfe0}.
+  // pair {0xd83c, 0xdfe0} which is then converted into two three-byte
+  // sequences {0xed, 0xa0, 0xbc} and {0xed, 0xbf, 0xa0}, one for each half of
+  // the surrogate pair.
   EXPECT_EQ(5, env_->GetStringLength(s));
   jchars = env_->GetStringChars(s, nullptr);
   // The first surrogate pair, encoded as such in the input.
@@ -1538,9 +1540,9 @@
   EXPECT_EQ(0xdfe0, jchars[4]);
   env_->ReleaseStringChars(s, jchars);
 
-  EXPECT_EQ(9, env_->GetStringUTFLength(s));
+  EXPECT_EQ(13, env_->GetStringUTFLength(s));
   chars = env_->GetStringUTFChars(s, nullptr);
-  EXPECT_STREQ("\xf0\x90\x90\x80 \xf0\x9f\x8f\xa0", chars);
+  EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80 \xed\xa0\xbc\xed\xbf\xa0", chars);
   env_->ReleaseStringUTFChars(s, chars);
 
   // A string with 1, 2, 3 and 4 byte UTF sequences with spaces
@@ -1548,7 +1550,7 @@
   s = env_->NewStringUTF("\x24 \xc2\xa2 \xe2\x82\xac \xf0\x9f\x8f\xa0");
   EXPECT_NE(s, nullptr);
   EXPECT_EQ(8, env_->GetStringLength(s));
-  EXPECT_EQ(13, env_->GetStringUTFLength(s));
+  EXPECT_EQ(15, env_->GetStringUTFLength(s));
 }
 
 TEST_F(JniInternalTest, NewStringUTF_Validation) {
@@ -1857,12 +1859,13 @@
   env_->ReleaseStringCritical(s, chars);
 
   if (mirror::kUseStringCompression) {
-    // is_copy has to be JNI_FALSE because "\xed\xa0\x81\xed\xb0\x80" is incompressible
+    // is_copy_16 has to be JNI_FALSE because "\xed\xa0\x81\xed\xb0\x80" is incompressible
     jboolean is_copy_16 = JNI_TRUE;
     jstring s_16 = env_->NewStringUTF("\xed\xa0\x81\xed\xb0\x80");
     chars = env_->GetStringCritical(s_16, &is_copy_16);
+    EXPECT_EQ(JNI_FALSE, is_copy_16);
     EXPECT_EQ(2, env_->GetStringLength(s_16));
-    EXPECT_EQ(4, env_->GetStringUTFLength(s_16));
+    EXPECT_EQ(6, env_->GetStringUTFLength(s_16));
     env_->ReleaseStringCritical(s_16, chars);
   }
 }

diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h
index dd28036..11eff0f 100644
--- a/runtime/mirror/string-inl.h
+++ b/runtime/mirror/string-inl.h

@@ -88,7 +88,7 @@
   if (IsCompressed()) {
     return GetLength();
   } else {
-    return CountUtf8Bytes(GetValue(), GetLength());
+    return CountModifiedUtf8Bytes(GetValue(), GetLength());
   }
 }
 

diff --git a/test/181-proxy-non-bmp/expected-stderr.txt b/test/181-proxy-non-bmp/expected-stderr.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/181-proxy-non-bmp/expected-stderr.txt


diff --git a/test/181-proxy-non-bmp/expected-stdout.txt b/test/181-proxy-non-bmp/expected-stdout.txt
new file mode 100644
index 0000000..24fce9b
--- /dev/null
+++ b/test/181-proxy-non-bmp/expected-stdout.txt

@@ -0,0 +1 @@
+Invoke public abstract void pkg𐀀.PackageTestInterface.interfaceMethod()

diff --git a/test/181-proxy-non-bmp/info.txt b/test/181-proxy-non-bmp/info.txt
new file mode 100644
index 0000000..a13a8f9
--- /dev/null
+++ b/test/181-proxy-non-bmp/info.txt

@@ -0,0 +1,6 @@
+Regression test for bad handling of package name containing a character outside
+the BMP plane. For a proxy class with a non-public interface in such a package,
+this caused the package name comparison to fail because the dex file encoding
+had two three-byte sequences while the descriptor was encoded with a four-byte
+sequence, leading to IAE when calling a proxy method via the interface.
+Bug: 192935764

diff --git a/test/181-proxy-non-bmp/src/Main.java b/test/181-proxy-non-bmp/src/Main.java
new file mode 100644
index 0000000..e938599
--- /dev/null
+++ b/test/181-proxy-non-bmp/src/Main.java

@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import pkg𐀀.PackageTest;
+
+public class Main {
+    public static void main(String[] args) {
+        PackageTest.main();
+    }
+}

diff --git "a/test/181-proxy-non-bmp/src/pkg\360\220\200\200/PackageTest.java" "b/test/181-proxy-non-bmp/src/pkg\360\220\200\200/PackageTest.java"
new file mode 100644
index 0000000..e65d92f
--- /dev/null
+++ "b/test/181-proxy-non-bmp/src/pkg\360\220\200\200/PackageTest.java"

@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package pkg𐀀;
+
+import java.lang.reflect.InvocationHandler;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Method;
+import java.lang.reflect.Proxy;
+
+public class PackageTest {
+    public static void main() {
+        InvocationHandler handler = new PackageInvocationHandler();
+        Class<?> proxyClass = Proxy.getProxyClass(
+                PackageTestInterface.class.getClassLoader(), PackageTestInterface.class);
+        try {
+            Constructor<?> ctor = proxyClass.getConstructor(InvocationHandler.class);
+            Object proxy = ctor.newInstance(handler);
+            PackageTestInterface asInterface = (PackageTestInterface) proxy;
+            asInterface.interfaceMethod();
+        } catch (Exception e) {
+            System.out.println("failed: " + e);
+        }
+    }
+}
+
+interface PackageTestInterface {
+    public void interfaceMethod();
+}
+
+class PackageInvocationHandler implements InvocationHandler {
+    public Object invoke(Object proxy, Method method, Object[] args) {
+        System.out.println("Invoke " + method);
+        return null;
+    }
+}

diff --git a/test/906-iterate-heap/iterate_heap.cc b/test/906-iterate-heap/iterate_heap.cc
index 521f9a6..cc941d8 100644
--- a/test/906-iterate-heap/iterate_heap.cc
+++ b/test/906-iterate-heap/iterate_heap.cc

@@ -198,7 +198,7 @@
                                             void* user_data) {
       FindStringCallbacks* p = reinterpret_cast<FindStringCallbacks*>(user_data);
       if (*tag_ptr == p->tag_to_find) {
-        size_t utf_byte_count = ti::CountUtf8Bytes(value, value_length);
+        size_t utf_byte_count = ti::CountModifiedUtf8Bytes(value, value_length);
         std::unique_ptr<char[]> mod_utf(new char[utf_byte_count + 1]);
         memset(mod_utf.get(), 0, utf_byte_count + 1);
         ti::ConvertUtf16ToModifiedUtf8(mod_utf.get(), utf_byte_count, value, value_length);

diff --git a/test/913-heaps/heaps.cc b/test/913-heaps/heaps.cc
index 28a737d..10355c9 100644
--- a/test/913-heaps/heaps.cc
+++ b/test/913-heaps/heaps.cc

@@ -586,7 +586,7 @@
                                             void* user_data) {
       FindStringCallbacks* p = reinterpret_cast<FindStringCallbacks*>(user_data);
       if (*tag_ptr != 0) {
-        size_t utf_byte_count = ti::CountUtf8Bytes(value, value_length);
+        size_t utf_byte_count = ti::CountModifiedUtf8Bytes(value, value_length);
         std::unique_ptr<char[]> mod_utf(new char[utf_byte_count + 1]);
         memset(mod_utf.get(), 0, utf_byte_count + 1);
         ti::ConvertUtf16ToModifiedUtf8(mod_utf.get(), utf_byte_count, value, value_length);

diff --git a/test/ti-agent/ti_utf.h b/test/ti-agent/ti_utf.h
index 341e106..d646c07 100644
--- a/test/ti-agent/ti_utf.h
+++ b/test/ti-agent/ti_utf.h

@@ -123,28 +123,6 @@
     if (ch > 0 && ch <= 0x7f) {
       *utf8_out++ = ch;
     } else {
-      // Char_count == 0 here implies we've encountered an unpaired
-      // surrogate and we have no choice but to encode it as 3-byte UTF
-      // sequence. Note that unpaired surrogates can occur as a part of
-      // "normal" operation.
-      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
-        const uint16_t ch2 = *utf16_in;
-
-        // Check if the other half of the pair is within the expected
-        // range. If it isn't, we will have to emit both "halves" as
-        // separate 3 byte sequences.
-        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
-          utf16_in++;
-          char_count--;
-          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
-          *utf8_out++ = (code_point >> 18) | 0xf0;
-          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
-          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
-          *utf8_out++ = (code_point & 0x3f) | 0x80;
-          continue;
-        }
-      }
-
       if (ch > 0x07ff) {
         // Three byte encoding.
         *utf8_out++ = (ch >> 12) | 0xe0;
@@ -159,7 +137,7 @@
   }
 }
 
-inline size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
+inline size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t char_count) {
   size_t result = 0;
   const uint16_t *end = chars + char_count;
   while (chars < end) {
@@ -172,19 +150,6 @@
       result += 2;
       continue;
     }
-    if (ch >= 0xd800 && ch < 0xdc00) {
-      if (chars < end) {
-        const uint16_t ch2 = *chars;
-        // If we find a properly paired surrogate, we emit it as a 4 byte
-        // UTF sequence. If we find an unpaired leading or trailing surrogate,
-        // we emit it as a 3 byte sequence like would have done earlier.
-        if (ch2 >= 0xdc00 && ch2 < 0xe000) {
-          chars++;
-          result += 4;
-          continue;
-        }
-      }
-    }
     result += 3;
   }
   return result;

diff --git a/test/utils/regen-test-files b/test/utils/regen-test-files
index f9cb056..12d85b8 100755
--- a/test/utils/regen-test-files
+++ b/test/utils/regen-test-files

@@ -555,6 +555,9 @@
     # Ignore test with a copy of `sun.misc.Unsafe`.
     if os.path.isfile(os.path.join(run_test_path, "src", "sun", "misc", "Unsafe.java")):
       return False
+    # Ignore test with a non-ascii package name `pkg𐀀`. b/193141629
+    if os.path.isdir(os.path.join(run_test_path, "src", "pkg𐀀")):
+      return False
     # Ignore tests with Hidden API specs.
     if os.path.isfile(os.path.join(run_test_path, "hiddenapi-flags.csv")):
       return False
commit	e0a4f373dc4a738a2f26965a67d31239cbbd4f6a	[log] [tgz]
author	Vladimir Marko <vmarko@google.com>	Thu Jul 08 11:22:53 2021 +0100
committer	Treehugger Robot <treehugger-gerrit@google.com>	Fri Jul 09 23:17:15 2021 +0000
tree	b097c82983074f167ef1a14ab529750fcc842b9f
parent	d920b7b01f92676eec2aa504ef6cf13c3f3283fc [diff]