Do not create 4-byte sequences in `ConvertUtf16ToModifiedUtf8()`

This encoding was different from the encoding in dex files and this caused a wrong `Class::IsInSamePackage()` result for proxy classes if the package name contained at least one character outside the BMP plane. This essentially reverts commit e16dad1d6388b0305f13e2171308a77f42e7c682 but keeps all tests, only updating expectations. Also rename `CountUtf8Bytes()` to `CountModifiedUtf8Bytes()` to make the format explicit in the function name. Test: New test 181-proxy-non-bmp Test: m test-art-host-gtest Test: testrunner.py --host --optimizing Bug: 192935764 Bug: 193141629 Change-Id: I8e6e8b08174c7a6d3ff84b0e1e929056b3947785
author: Vladimir Marko <vmarko@google.com> 2021-07-08 11:22:53 +0100
committer: Treehugger Robot <treehugger-gerrit@google.com> 2021-07-09 23:17:15 +0000
commit: e0a4f373dc4a738a2f26965a67d31239cbbd4f6a (patch)
tree: b097c82983074f167ef1a14ab529750fcc842b9f
parent: d920b7b01f92676eec2aa504ef6cf13c3f3283fc (diff)
15 files changed, 138 insertions, 159 deletions
diff --git a/libdexfile/dex/utf.cc b/libdexfile/dex/utf.cc
index bfc704d4a6..76fb49f7e7 100644
--- a/libdexfile/dex/utf.cc
+++ b/libdexfile/dex/utf.cc
@@ -121,8 +121,10 @@ void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, size_t out_chars,
   }
 }
 
-void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
-                                const uint16_t* utf16_in, size_t char_count) {
+void ConvertUtf16ToModifiedUtf8(char* utf8_out,
+                                size_t byte_count,
+                                const uint16_t* utf16_in,
+                                size_t char_count) {
   if (LIKELY(byte_count == char_count)) {
     // Common case where all characters are ASCII.
     const uint16_t *utf16_end = utf16_in + char_count;
@@ -138,28 +140,6 @@ void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
     if (ch > 0 && ch <= 0x7f) {
       *utf8_out++ = ch;
     } else {
-      // Char_count == 0 here implies we've encountered an unpaired
-      // surrogate and we have no choice but to encode it as 3-byte UTF
-      // sequence. Note that unpaired surrogates can occur as a part of
-      // "normal" operation.
-      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
-        const uint16_t ch2 = *utf16_in;
-
-        // Check if the other half of the pair is within the expected
-        // range. If it isn't, we will have to emit both "halves" as
-        // separate 3 byte sequences.
-        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
-          utf16_in++;
-          char_count--;
-          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
-          *utf8_out++ = (code_point >> 18) | 0xf0;
-          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
-          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
-          *utf8_out++ = (code_point & 0x3f) | 0x80;
-          continue;
-        }
-      }
-
       if (ch > 0x07ff) {
         // Three byte encoding.
         *utf8_out++ = (ch >> 12) | 0xe0;
@@ -240,7 +220,7 @@ int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t
   }
 }
 
-size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
+size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t char_count) {
   size_t result = 0;
   const uint16_t *end = chars + char_count;
   while (chars < end) {
@@ -253,19 +233,6 @@ size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
       result += 2;
       continue;
     }
-    if (ch >= 0xd800 && ch < 0xdc00) {
-      if (chars < end) {
-        const uint16_t ch2 = *chars;
-        // If we find a properly paired surrogate, we emit it as a 4 byte
-        // UTF sequence. If we find an unpaired leading or trailing surrogate,
-        // we emit it as a 3 byte sequence like would have done earlier.
-        if (ch2 >= 0xdc00 && ch2 < 0xe000) {
-          chars++;
-          result += 4;
-          continue;
-        }
-      }
-    }
     result += 3;
   }
   return result;
diff --git a/libdexfile/dex/utf.h b/libdexfile/dex/utf.h
index e3dc7f9474..55d025df42 100644
--- a/libdexfile/dex/utf.h
+++ b/libdexfile/dex/utf.h
@@ -43,7 +43,7 @@ size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count);
  * Returns the number of modified UTF-8 bytes needed to represent the given
  * UTF-16 string.
  */
-size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count);
+size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t char_count);
 
 /*
  * Convert from Modified UTF-8 to UTF-16.
@@ -67,12 +67,14 @@ int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t
 
 /*
  * Convert from UTF-16 to Modified UTF-8. Note that the output is _not_
- * NUL-terminated. You probably need to call CountUtf8Bytes before calling
- * this anyway, so if you want a NUL-terminated string, you know where to
- * put the NUL byte.
+ * NUL-terminated. You probably need to call CountModifiedUtf8Bytes before
+ * calling this anyway, so if you want a NUL-terminated string, you know
+ * where to put the NUL byte.
  */
-void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
-                                const uint16_t* utf16_in, size_t char_count);
+void ConvertUtf16ToModifiedUtf8(char* utf8_out,
+                                size_t byte_count,
+                                const uint16_t* utf16_in,
+                                size_t char_count);
 
 /*
  * The java.lang.String hashCode() algorithm.
diff --git a/libdexfile/dex/utf_test.cc b/libdexfile/dex/utf_test.cc
index 919259e4d3..fc24d545e7 100644
--- a/libdexfile/dex/utf_test.cc
+++ b/libdexfile/dex/utf_test.cc
@@ -117,7 +117,7 @@ TEST_F(UtfTest, CountModifiedUtf8Chars) {
 
 static void AssertConversion(const std::vector<uint16_t>& input,
                              const std::vector<uint8_t>& expected) {
-  ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
+  ASSERT_EQ(expected.size(), CountModifiedUtf8Bytes(&input[0], input.size()));
 
   std::vector<uint8_t> output(expected.size());
   ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(),
@@ -126,8 +126,8 @@ static void AssertConversion(const std::vector<uint16_t>& input,
 }
 
 TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
-  // Surrogate pairs will be converted into 4 byte sequences.
-  AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });
+  // Surrogate pairs will be converted into two three-byte sequences.
+  AssertConversion({ 0xd801, 0xdc00 }, { 0xed, 0xa0, 0x81, 0xed, 0xb0, 0x80 });
 
   // Three byte encodings that are below & above the leading surrogate
   // range respectively.
@@ -143,12 +143,12 @@ TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
   AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });
 
   AssertConversion({
-      0xd802, 0xdc02,  // Surrogate pair.
+      0xd802, 0xdc02,  // Surrogate pair - three byte encodings.
       0xdef0, 0xdcff,  // Three byte encodings.
       0x0101, 0x0000,  // Two byte encodings.
       'p'   , 'p'      // One byte encoding.
     }, {
-      0xf0, 0x90, 0xa0, 0x82,
+      0xed, 0xa0, 0x82, 0xed, 0xb0, 0x82,
       0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
       0xc4, 0x81, 0xc0, 0x80,
       0x70, 0x70
@@ -229,31 +229,12 @@ size_t CountModifiedUtf8Chars_reference(const char* utf8) {
   return len;
 }
 
-static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
+static size_t CountModifiedUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
   size_t result = 0;
   while (char_count--) {
     const uint16_t ch = *chars++;
     if (ch > 0 && ch <= 0x7f) {
       ++result;
-    } else if (ch >= 0xd800 && ch <= 0xdbff) {
-      if (char_count > 0) {
-        const uint16_t ch2 = *chars;
-        // If we find a properly paired surrogate, we emit it as a 4 byte
-        // UTF sequence. If we find an unpaired leading or trailing surrogate,
-        // we emit it as a 3 byte sequence like would have done earlier.
-        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
-          chars++;
-          char_count--;
-
-          result += 4;
-        } else {
-          result += 3;
-        }
-      } else {
-        // This implies we found an unpaired trailing surrogate at the end
-        // of a string.
-        result += 3;
-      }
     } else if (ch > 0x7ff) {
       result += 3;
     } else {
@@ -270,28 +251,6 @@ static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t*
     if (ch > 0 && ch <= 0x7f) {
       *utf8_out++ = ch;
     } else {
-      // Char_count == 0 here implies we've encountered an unpaired
-      // surrogate and we have no choice but to encode it as 3-byte UTF
-      // sequence. Note that unpaired surrogates can occur as a part of
-      // "normal" operation.
-      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
-        const uint16_t ch2 = *utf16_in;
-
-        // Check if the other half of the pair is within the expected
-        // range. If it isn't, we will have to emit both "halves" as
-        // separate 3 byte sequences.
-        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
-          utf16_in++;
-          char_count--;
-          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
-          *utf8_out++ = (code_point >> 18) | 0xf0;
-          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
-          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
-          *utf8_out++ = (code_point & 0x3f) | 0x80;
-          continue;
-        }
-      }
-
       if (ch > 0x07ff) {
         // Three byte encoding.
         *utf8_out++ = (ch >> 12) | 0xe0;
@@ -313,39 +272,40 @@ static void codePointToSurrogatePair(uint32_t code_point, uint16_t &first, uint1
   second = (code_point & 0x03ff) + 0xdc00;
 }
 
-static void testConversions(uint16_t *buf, int char_count) {
-  char bytes_test[8] = { 0 }, bytes_reference[8] = { 0 };
-  uint16_t out_buf_test[4] = { 0 }, out_buf_reference[4] = { 0 };
-  int byte_count_test, byte_count_reference;
-  int char_count_test, char_count_reference;
-
+static void testConversions(uint16_t *buf, size_t char_count) {
   // Calculate the number of utf-8 bytes for the utf-16 chars.
-  byte_count_reference = CountUtf8Bytes_reference(buf, char_count);
-  byte_count_test = CountUtf8Bytes(buf, char_count);
-  EXPECT_EQ(byte_count_reference, byte_count_test);
+  size_t byte_count_reference = CountModifiedUtf8Bytes_reference(buf, char_count);
+  size_t byte_count_test = CountModifiedUtf8Bytes(buf, char_count);
+  ASSERT_EQ(byte_count_reference, byte_count_test);
 
   // Convert the utf-16 string to utf-8 bytes.
+  char bytes_test[8], bytes_reference[9];
+  ASSERT_LT(byte_count_reference, arraysize(bytes_reference));
   ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count);
+  ASSERT_LE(byte_count_test, arraysize(bytes_test));
   ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count);
-  for (int i = 0; i < byte_count_test; ++i) {
-    EXPECT_EQ(bytes_reference[i], bytes_test[i]);
+  for (size_t i = 0; i < byte_count_test; ++i) {
+    ASSERT_EQ(bytes_reference[i], bytes_test[i]);
   }
 
   // Calculate the number of utf-16 chars from the utf-8 bytes.
   bytes_reference[byte_count_reference] = 0;  // Reference function needs null termination.
-  char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
-  char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
-  EXPECT_EQ(char_count, char_count_reference);
-  EXPECT_EQ(char_count, char_count_test);
+  size_t char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
+  size_t char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
+  ASSERT_EQ(char_count, char_count_reference);
+  ASSERT_EQ(char_count, char_count_test);
 
   // Convert the utf-8 bytes back to utf-16 chars.
   // Does not need copied _reference version of the function because the original
   // function with the old API is retained for debug/testing code.
+  uint16_t out_buf_test[4], out_buf_reference[4];
+  ASSERT_LE(char_count_reference, arraysize(out_buf_reference));
   ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference);
+  ASSERT_LE(char_count_test, arraysize(out_buf_test));
   ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test);
-  for (int i = 0; i < char_count_test; ++i) {
-    EXPECT_EQ(buf[i], out_buf_reference[i]);
-    EXPECT_EQ(buf[i], out_buf_test[i]);
+  for (size_t i = 0; i < char_count_test; ++i) {
+    ASSERT_EQ(buf[i], out_buf_reference[i]);
+    ASSERT_EQ(buf[i], out_buf_test[i]);
   }
 }
 
diff --git a/runtime/jni/jni_internal.cc b/runtime/jni/jni_internal.cc
index 45fbf003be..cac6eb8e6a 100644
--- a/runtime/jni/jni_internal.cc
+++ b/runtime/jni/jni_internal.cc
@@ -2022,7 +2022,7 @@ class JNI {
         buf[length] = '\0';
       } else {
         const jchar* chars = s->GetValue();
-        size_t bytes = CountUtf8Bytes(chars + start, length);
+        size_t bytes = CountModifiedUtf8Bytes(chars + start, length);
         ConvertUtf16ToModifiedUtf8(buf, bytes, chars + start, length);
         buf[bytes] = '\0';
       }
diff --git a/runtime/jni/jni_internal_test.cc b/runtime/jni/jni_internal_test.cc
index ed8439729e..4c675f4ba2 100644
--- a/runtime/jni/jni_internal_test.cc
+++ b/runtime/jni/jni_internal_test.cc
@@ -1510,13 +1510,13 @@ TEST_F(JniInternalTest, NewStringUTF) {
   EXPECT_NE(s, nullptr);
   EXPECT_EQ(2, env_->GetStringLength(s));
 
-  // The surrogate pair gets encoded into a 4 byte UTF sequence..
-  EXPECT_EQ(4, env_->GetStringUTFLength(s));
+  // The surrogate pair gets encoded into two 3-byte sequences...
+  EXPECT_EQ(6, env_->GetStringUTFLength(s));
   const char* chars = env_->GetStringUTFChars(s, nullptr);
-  EXPECT_STREQ("\xf0\x90\x90\x80", chars);
+  EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80", chars);
   env_->ReleaseStringUTFChars(s, chars);
 
-  // .. but is stored as is in the utf-16 representation.
+  // ... and it is stored as the two surrogates in the utf-16 representation.
   const jchar* jchars = env_->GetStringChars(s, nullptr);
   EXPECT_EQ(0xd801, jchars[0]);
   EXPECT_EQ(0xdc00, jchars[1]);
@@ -1527,7 +1527,9 @@ TEST_F(JniInternalTest, NewStringUTF) {
   EXPECT_NE(s, nullptr);
 
   // The 4 byte sequence {0xf0, 0x9f, 0x8f, 0xa0} is converted into a surrogate
-  // pair {0xd83c, 0xdfe0}.
+  // pair {0xd83c, 0xdfe0} which is then converted into two three-byte
+  // sequences {0xed, 0xa0, 0xbc} and {0xed, 0xbf, 0xa0}, one for each half of
+  // the surrogate pair.
   EXPECT_EQ(5, env_->GetStringLength(s));
   jchars = env_->GetStringChars(s, nullptr);
   // The first surrogate pair, encoded as such in the input.
@@ -1538,9 +1540,9 @@ TEST_F(JniInternalTest, NewStringUTF) {
   EXPECT_EQ(0xdfe0, jchars[4]);
   env_->ReleaseStringChars(s, jchars);
 
-  EXPECT_EQ(9, env_->GetStringUTFLength(s));
+  EXPECT_EQ(13, env_->GetStringUTFLength(s));
   chars = env_->GetStringUTFChars(s, nullptr);
-  EXPECT_STREQ("\xf0\x90\x90\x80 \xf0\x9f\x8f\xa0", chars);
+  EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80 \xed\xa0\xbc\xed\xbf\xa0", chars);
   env_->ReleaseStringUTFChars(s, chars);
 
   // A string with 1, 2, 3 and 4 byte UTF sequences with spaces
@@ -1548,7 +1550,7 @@ TEST_F(JniInternalTest, NewStringUTF) {
   s = env_->NewStringUTF("\x24 \xc2\xa2 \xe2\x82\xac \xf0\x9f\x8f\xa0");
   EXPECT_NE(s, nullptr);
   EXPECT_EQ(8, env_->GetStringLength(s));
-  EXPECT_EQ(13, env_->GetStringUTFLength(s));
+  EXPECT_EQ(15, env_->GetStringUTFLength(s));
 }
 
 TEST_F(JniInternalTest, NewStringUTF_Validation) {
@@ -1857,12 +1859,13 @@ TEST_F(JniInternalTest, GetStringCritical_ReleaseStringCritical) {
   env_->ReleaseStringCritical(s, chars);
 
   if (mirror::kUseStringCompression) {
-    // is_copy has to be JNI_FALSE because "\xed\xa0\x81\xed\xb0\x80" is incompressible
+    // is_copy_16 has to be JNI_FALSE because "\xed\xa0\x81\xed\xb0\x80" is incompressible
     jboolean is_copy_16 = JNI_TRUE;
     jstring s_16 = env_->NewStringUTF("\xed\xa0\x81\xed\xb0\x80");
     chars = env_->GetStringCritical(s_16, &is_copy_16);
+    EXPECT_EQ(JNI_FALSE, is_copy_16);
     EXPECT_EQ(2, env_->GetStringLength(s_16));
-    EXPECT_EQ(4, env_->GetStringUTFLength(s_16));
+    EXPECT_EQ(6, env_->GetStringUTFLength(s_16));
     env_->ReleaseStringCritical(s_16, chars);
   }
 }
diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h
index dd280364e1..11eff0f761 100644
--- a/runtime/mirror/string-inl.h
+++ b/runtime/mirror/string-inl.h
@@ -88,7 +88,7 @@ inline int32_t String::GetUtfLength() {
   if (IsCompressed()) {
     return GetLength();
   } else {
-    return CountUtf8Bytes(GetValue(), GetLength());
+    return CountModifiedUtf8Bytes(GetValue(), GetLength());
   }
 }
 
diff --git a/test/181-proxy-non-bmp/expected-stderr.txt b/test/181-proxy-non-bmp/expected-stderr.txt
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/test/181-proxy-non-bmp/expected-stderr.txt
diff --git a/test/181-proxy-non-bmp/expected-stdout.txt b/test/181-proxy-non-bmp/expected-stdout.txt
new file mode 100644
index 0000000000..24fce9b270
--- /dev/null
+++ b/test/181-proxy-non-bmp/expected-stdout.txt
@@ -0,0 +1 @@
+Invoke public abstract void pkg𐀀.PackageTestInterface.interfaceMethod()
diff --git a/test/181-proxy-non-bmp/info.txt b/test/181-proxy-non-bmp/info.txt
new file mode 100644
index 0000000000..a13a8f9ce9
--- /dev/null
+++ b/test/181-proxy-non-bmp/info.txt
@@ -0,0 +1,6 @@
+Regression test for bad handling of package name containing a character outside
+the BMP plane. For a proxy class with a non-public interface in such a package,
+this caused the package name comparison to fail because the dex file encoding
+had two three-byte sequences while the descriptor was encoded with a four-byte
+sequence, leading to IAE when calling a proxy method via the interface.
+Bug: 192935764
diff --git a/test/181-proxy-non-bmp/src/Main.java b/test/181-proxy-non-bmp/src/Main.java
new file mode 100644
index 0000000000..e93859978e
--- /dev/null
+++ b/test/181-proxy-non-bmp/src/Main.java
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import pkg𐀀.PackageTest;
+
+public class Main {
+    public static void main(String[] args) {
+        PackageTest.main();
+    }
+}
diff --git a/test/181-proxy-non-bmp/src/pkg𐀀/PackageTest.java b/test/181-proxy-non-bmp/src/pkg𐀀/PackageTest.java
new file mode 100644
index 0000000000..e65d92f4e1
--- /dev/null
+++ b/test/181-proxy-non-bmp/src/pkg𐀀/PackageTest.java
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package pkg𐀀;
+
+import java.lang.reflect.InvocationHandler;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Method;
+import java.lang.reflect.Proxy;
+
+public class PackageTest {
+    public static void main() {
+        InvocationHandler handler = new PackageInvocationHandler();
+        Class<?> proxyClass = Proxy.getProxyClass(
+                PackageTestInterface.class.getClassLoader(), PackageTestInterface.class);
+        try {
+            Constructor<?> ctor = proxyClass.getConstructor(InvocationHandler.class);
+            Object proxy = ctor.newInstance(handler);
+            PackageTestInterface asInterface = (PackageTestInterface) proxy;
+            asInterface.interfaceMethod();
+        } catch (Exception e) {
+            System.out.println("failed: " + e);
+        }
+    }
+}
+
+interface PackageTestInterface {
+    public void interfaceMethod();
+}
+
+class PackageInvocationHandler implements InvocationHandler {
+    public Object invoke(Object proxy, Method method, Object[] args) {
+        System.out.println("Invoke " + method);
+        return null;
+    }
+}
diff --git a/test/906-iterate-heap/iterate_heap.cc b/test/906-iterate-heap/iterate_heap.cc
index 521f9a6c72..cc941d86b1 100644
--- a/test/906-iterate-heap/iterate_heap.cc
+++ b/test/906-iterate-heap/iterate_heap.cc
@@ -198,7 +198,7 @@ extern "C" JNIEXPORT jstring JNICALL Java_art_Test906_iterateThroughHeapString(
                                             void* user_data) {
       FindStringCallbacks* p = reinterpret_cast<FindStringCallbacks*>(user_data);
       if (*tag_ptr == p->tag_to_find) {
-        size_t utf_byte_count = ti::CountUtf8Bytes(value, value_length);
+        size_t utf_byte_count = ti::CountModifiedUtf8Bytes(value, value_length);
         std::unique_ptr<char[]> mod_utf(new char[utf_byte_count + 1]);
         memset(mod_utf.get(), 0, utf_byte_count + 1);
         ti::ConvertUtf16ToModifiedUtf8(mod_utf.get(), utf_byte_count, value, value_length);
diff --git a/test/913-heaps/heaps.cc b/test/913-heaps/heaps.cc
index 28a737de0f..10355c9c1b 100644
--- a/test/913-heaps/heaps.cc
+++ b/test/913-heaps/heaps.cc
@@ -586,7 +586,7 @@ extern "C" JNIEXPORT jobjectArray JNICALL Java_art_Test913_followReferencesStrin
                                             void* user_data) {
       FindStringCallbacks* p = reinterpret_cast<FindStringCallbacks*>(user_data);
       if (*tag_ptr != 0) {
-        size_t utf_byte_count = ti::CountUtf8Bytes(value, value_length);
+        size_t utf_byte_count = ti::CountModifiedUtf8Bytes(value, value_length);
         std::unique_ptr<char[]> mod_utf(new char[utf_byte_count + 1]);
         memset(mod_utf.get(), 0, utf_byte_count + 1);
         ti::ConvertUtf16ToModifiedUtf8(mod_utf.get(), utf_byte_count, value, value_length);
diff --git a/test/ti-agent/ti_utf.h b/test/ti-agent/ti_utf.h
index 341e1066c3..d646c0725f 100644
--- a/test/ti-agent/ti_utf.h
+++ b/test/ti-agent/ti_utf.h
@@ -123,28 +123,6 @@ inline void ConvertUtf16ToModifiedUtf8(char* utf8_out,
     if (ch > 0 && ch <= 0x7f) {
       *utf8_out++ = ch;
     } else {
-      // Char_count == 0 here implies we've encountered an unpaired
-      // surrogate and we have no choice but to encode it as 3-byte UTF
-      // sequence. Note that unpaired surrogates can occur as a part of
-      // "normal" operation.
-      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
-        const uint16_t ch2 = *utf16_in;
-
-        // Check if the other half of the pair is within the expected
-        // range. If it isn't, we will have to emit both "halves" as
-        // separate 3 byte sequences.
-        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
-          utf16_in++;
-          char_count--;
-          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
-          *utf8_out++ = (code_point >> 18) | 0xf0;
-          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
-          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
-          *utf8_out++ = (code_point & 0x3f) | 0x80;
-          continue;
-        }
-      }
-
       if (ch > 0x07ff) {
         // Three byte encoding.
         *utf8_out++ = (ch >> 12) | 0xe0;
@@ -159,7 +137,7 @@ inline void ConvertUtf16ToModifiedUtf8(char* utf8_out,
   }
 }
 
-inline size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
+inline size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t char_count) {
   size_t result = 0;
   const uint16_t *end = chars + char_count;
   while (chars < end) {
@@ -172,19 +150,6 @@ inline size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
       result += 2;
       continue;
     }
-    if (ch >= 0xd800 && ch < 0xdc00) {
-      if (chars < end) {
-        const uint16_t ch2 = *chars;
-        // If we find a properly paired surrogate, we emit it as a 4 byte
-        // UTF sequence. If we find an unpaired leading or trailing surrogate,
-        // we emit it as a 3 byte sequence like would have done earlier.
-        if (ch2 >= 0xdc00 && ch2 < 0xe000) {
-          chars++;
-          result += 4;
-          continue;
-        }
-      }
-    }
     result += 3;
   }
   return result;
diff --git a/test/utils/regen-test-files b/test/utils/regen-test-files
index f9cb056817..12d85b834d 100755
--- a/test/utils/regen-test-files
+++ b/test/utils/regen-test-files
@@ -555,6 +555,9 @@ class Generator:
     # Ignore test with a copy of `sun.misc.Unsafe`.
     if os.path.isfile(os.path.join(run_test_path, "src", "sun", "misc", "Unsafe.java")):
       return False
+    # Ignore test with a non-ascii package name `pkg𐀀`. b/193141629
+    if os.path.isdir(os.path.join(run_test_path, "src", "pkg𐀀")):
+      return False
     # Ignore tests with Hidden API specs.
     if os.path.isfile(os.path.join(run_test_path, "hiddenapi-flags.csv")):
       return False
author	Vladimir Marko <vmarko@google.com>	2021-07-08 11:22:53 +0100
committer	Treehugger Robot <treehugger-gerrit@google.com>	2021-07-09 23:17:15 +0000
commit	e0a4f373dc4a738a2f26965a67d31239cbbd4f6a (patch)
tree	b097c82983074f167ef1a14ab529750fcc842b9f
parent	d920b7b01f92676eec2aa504ef6cf13c3f3283fc (diff)