Reland "Do not create 4-byte sequences in `ConvertUtf16ToModifiedUtf8()`"

This encoding was different from the encoding in dex files
and this caused several issues for classes with descriptors
containing at least one character outside the BMP plane.
For example `Class.forName(nonBmpClass.getName())` would
fail for such classes.

This reverts commit 1b9d442dc906d0158300c5178683f417fa59b026.

Reason for revert: Reland after
    https://android-review.googlesource.com/1771690 .
This change no longer affects the operation of JNI functions
dealing with String "UTF" encoding, documented as Modified
UTF-8 but producing an odd variation between Modified UTF-8
and normal UTF-8 in ART due to historical reasons.

Test: New test 183-non-bmp-package-name
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Bug: 192935764
Bug: 238888095
Change-Id: I7191e030c7b69e4ea6848fe82d0625683a9331d9
diff --git a/libdexfile/dex/utf.cc b/libdexfile/dex/utf.cc
index 9692a26..ee55a09 100644
--- a/libdexfile/dex/utf.cc
+++ b/libdexfile/dex/utf.cc
@@ -121,8 +121,10 @@
   }
 }
 
-void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
-                                const uint16_t* utf16_in, size_t char_count) {
+void ConvertUtf16ToModifiedUtf8(char* utf8_out,
+                                size_t byte_count,
+                                const uint16_t* utf16_in,
+                                size_t char_count) {
   if (LIKELY(byte_count == char_count)) {
     // Common case where all characters are ASCII.
     const uint16_t *utf16_end = utf16_in + char_count;
@@ -133,10 +135,9 @@
   }
 
   // String contains non-ASCII characters.
-  // FIXME: We should not emit 4-byte sequences. Bug: 192935764
   auto append = [&](char c) { *utf8_out++ = c; };
   ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
-                     /*kUse4ByteSequence=*/ true,
+                     /*kUse4ByteSequence=*/ false,
                      /*kReplaceBadSurrogates=*/ false>(utf16_in, char_count, append);
 }
 
@@ -207,11 +208,10 @@
 }
 
 size_t CountModifiedUtf8BytesInUtf16(const uint16_t* chars, size_t char_count) {
-  // FIXME: We should not emit 4-byte sequences. Bug: 192935764
   size_t result = 0;
   auto append = [&](char c ATTRIBUTE_UNUSED) { ++result; };
   ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
-                     /*kUse4ByteSequence=*/ true,
+                     /*kUse4ByteSequence=*/ false,
                      /*kReplaceBadSurrogates=*/ false>(chars, char_count, append);
   return result;
 }
diff --git a/libdexfile/dex/utf.h b/libdexfile/dex/utf.h
index d372bff..86ca211 100644
--- a/libdexfile/dex/utf.h
+++ b/libdexfile/dex/utf.h
@@ -90,8 +90,10 @@
  * this anyway, so if you want a NUL-terminated string, you know where to
  * put the NUL byte.
  */
-void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
-                                const uint16_t* utf16_in, size_t char_count);
+void ConvertUtf16ToModifiedUtf8(char* utf8_out,
+                                size_t byte_count,
+                                const uint16_t* utf16_in,
+                                size_t char_count);
 
 /*
  * The java.lang.String hashCode() algorithm.
diff --git a/libdexfile/dex/utf_test.cc b/libdexfile/dex/utf_test.cc
index 85c74d2..acf217f 100644
--- a/libdexfile/dex/utf_test.cc
+++ b/libdexfile/dex/utf_test.cc
@@ -126,8 +126,8 @@
 }
 
 TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
-  // Surrogate pairs will be converted into 4 byte sequences.
-  AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });
+  // Surrogate pairs will be converted into two three-byte sequences.
+  AssertConversion({ 0xd801, 0xdc00 }, { 0xed, 0xa0, 0x81, 0xed, 0xb0, 0x80 });
 
   // Three byte encodings that are below & above the leading surrogate
   // range respectively.
@@ -143,12 +143,12 @@
   AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });
 
   AssertConversion({
-      0xd802, 0xdc02,  // Surrogate pair.
+      0xd802, 0xdc02,  // Surrogate pair - three byte encodings.
       0xdef0, 0xdcff,  // Three byte encodings.
       0x0101, 0x0000,  // Two byte encodings.
       'p'   , 'p'      // One byte encoding.
     }, {
-      0xf0, 0x90, 0xa0, 0x82,
+      0xed, 0xa0, 0x82, 0xed, 0xb0, 0x82,
       0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
       0xc4, 0x81, 0xc0, 0x80,
       0x70, 0x70
@@ -235,25 +235,6 @@
     const uint16_t ch = *chars++;
     if (ch > 0 && ch <= 0x7f) {
       ++result;
-    } else if (ch >= 0xd800 && ch <= 0xdbff) {
-      if (char_count > 0) {
-        const uint16_t ch2 = *chars;
-        // If we find a properly paired surrogate, we emit it as a 4 byte
-        // UTF sequence. If we find an unpaired leading or trailing surrogate,
-        // we emit it as a 3 byte sequence like would have done earlier.
-        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
-          chars++;
-          char_count--;
-
-          result += 4;
-        } else {
-          result += 3;
-        }
-      } else {
-        // This implies we found an unpaired trailing surrogate at the end
-        // of a string.
-        result += 3;
-      }
     } else if (ch > 0x7ff) {
       result += 3;
     } else {
@@ -270,28 +251,6 @@
     if (ch > 0 && ch <= 0x7f) {
       *utf8_out++ = ch;
     } else {
-      // Char_count == 0 here implies we've encountered an unpaired
-      // surrogate and we have no choice but to encode it as 3-byte UTF
-      // sequence. Note that unpaired surrogates can occur as a part of
-      // "normal" operation.
-      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
-        const uint16_t ch2 = *utf16_in;
-
-        // Check if the other half of the pair is within the expected
-        // range. If it isn't, we will have to emit both "halves" as
-        // separate 3 byte sequences.
-        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
-          utf16_in++;
-          char_count--;
-          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
-          *utf8_out++ = (code_point >> 18) | 0xf0;
-          *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
-          *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
-          *utf8_out++ = (code_point & 0x3f) | 0x80;
-          continue;
-        }
-      }
-
       if (ch > 0x07ff) {
         // Three byte encoding.
         *utf8_out++ = (ch >> 12) | 0xe0;
@@ -313,39 +272,40 @@
   second = (code_point & 0x03ff) + 0xdc00;
 }
 
-static void testConversions(uint16_t *buf, int char_count) {
-  char bytes_test[8] = { 0 }, bytes_reference[8] = { 0 };
-  uint16_t out_buf_test[4] = { 0 }, out_buf_reference[4] = { 0 };
-  int byte_count_test, byte_count_reference;
-  int char_count_test, char_count_reference;
-
+static void testConversions(uint16_t *buf, size_t char_count) {
   // Calculate the number of utf-8 bytes for the utf-16 chars.
-  byte_count_reference = CountModifiedUtf8BytesInUtf16_reference(buf, char_count);
-  byte_count_test = CountModifiedUtf8BytesInUtf16(buf, char_count);
-  EXPECT_EQ(byte_count_reference, byte_count_test);
+  size_t byte_count_reference = CountModifiedUtf8BytesInUtf16_reference(buf, char_count);
+  size_t byte_count_test = CountModifiedUtf8BytesInUtf16(buf, char_count);
+  ASSERT_EQ(byte_count_reference, byte_count_test);
 
   // Convert the utf-16 string to utf-8 bytes.
+  char bytes_test[8], bytes_reference[9];
+  ASSERT_LT(byte_count_reference, arraysize(bytes_reference));
   ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count);
+  ASSERT_LE(byte_count_test, arraysize(bytes_test));
   ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count);
-  for (int i = 0; i < byte_count_test; ++i) {
-    EXPECT_EQ(bytes_reference[i], bytes_test[i]);
+  for (size_t i = 0; i < byte_count_test; ++i) {
+    ASSERT_EQ(bytes_reference[i], bytes_test[i]);
   }
 
   // Calculate the number of utf-16 chars from the utf-8 bytes.
   bytes_reference[byte_count_reference] = 0;  // Reference function needs null termination.
-  char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
-  char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
-  EXPECT_EQ(char_count, char_count_reference);
-  EXPECT_EQ(char_count, char_count_test);
+  size_t char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
+  size_t char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
+  ASSERT_EQ(char_count, char_count_reference);
+  ASSERT_EQ(char_count, char_count_test);
 
   // Convert the utf-8 bytes back to utf-16 chars.
   // Does not need copied _reference version of the function because the original
   // function with the old API is retained for debug/testing code.
+  uint16_t out_buf_test[4], out_buf_reference[4];
+  ASSERT_LE(char_count_reference, arraysize(out_buf_reference));
   ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference);
+  ASSERT_LE(char_count_test, arraysize(out_buf_test));
   ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test);
-  for (int i = 0; i < char_count_test; ++i) {
-    EXPECT_EQ(buf[i], out_buf_reference[i]);
-    EXPECT_EQ(buf[i], out_buf_test[i]);
+  for (size_t i = 0; i < char_count_test; ++i) {
+    ASSERT_EQ(buf[i], out_buf_reference[i]);
+    ASSERT_EQ(buf[i], out_buf_test[i]);
   }
 }
 
diff --git a/test/183-non-bmp-package-name/expected-stderr.txt b/test/183-non-bmp-package-name/expected-stderr.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/183-non-bmp-package-name/expected-stderr.txt
diff --git a/test/183-non-bmp-package-name/expected-stdout.txt b/test/183-non-bmp-package-name/expected-stdout.txt
new file mode 100644
index 0000000..9058e58
--- /dev/null
+++ b/test/183-non-bmp-package-name/expected-stdout.txt
@@ -0,0 +1,2 @@
+JNI_OnLoad called
+Invoke public abstract void pkg𐀀.PackageTestInterface.interfaceMethod()
diff --git a/test/183-non-bmp-package-name/info.txt b/test/183-non-bmp-package-name/info.txt
new file mode 100644
index 0000000..8b15b5a
--- /dev/null
+++ b/test/183-non-bmp-package-name/info.txt
@@ -0,0 +1,7 @@
+Regression test for bad handling of package name containing a character outside
+the BMP plane. For a proxy class with a non-public interface in such a package,
+this caused the package name comparison to fail because the dex file encoding
+had two three-byte sequences while the descriptor was encoded with a four-byte
+sequence, leading to IllegalArgumentException when calling a proxy method via
+the interface.
+Bug: 192935764
diff --git a/test/183-non-bmp-package-name/jni_find_class.cc b/test/183-non-bmp-package-name/jni_find_class.cc
new file mode 100644
index 0000000..8a9e967
--- /dev/null
+++ b/test/183-non-bmp-package-name/jni_find_class.cc
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "base/logging.h"
+#include "jni.h"
+#include "scoped_thread_state_change-inl.h"
+#include "thread-current-inl.h"
+
+namespace art {
+
+extern "C" JNIEXPORT jclass JNICALL Java_Main_jniFindClass(
+    JNIEnv* env, jclass, jstring class_name) {
+  CHECK(class_name != nullptr);
+  // FIXME: We should test consistency with `env->GetStringUTFChars(...)` here
+  // but the JNI uses the wrong encoding. Bug: 238888095
+  if ((true)) {
+    ScopedObjectAccess soa(Thread::Current());
+    std::string name = soa.Decode<mirror::String>(class_name)->ToModifiedUtf8();
+    jclass clazz = env->FindClass(name.c_str());
+    return clazz;
+  } else {
+    const char* name = env->GetStringUTFChars(class_name, nullptr);
+    CHECK(name != nullptr);
+    jclass clazz = env->FindClass(name);
+    env->ReleaseStringUTFChars(class_name, name);
+    return clazz;
+  }
+}
+
+}  // namespace art
+
diff --git a/test/183-non-bmp-package-name/src/Main.java b/test/183-non-bmp-package-name/src/Main.java
new file mode 100644
index 0000000..973da97
--- /dev/null
+++ b/test/183-non-bmp-package-name/src/Main.java
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import pkg𐀀.PackageTest;
+
+public class Main {
+    public static void main(String[] args) {
+        System.loadLibrary(args[0]);
+
+        PackageTest.main();
+    }
+
+    public static native Class<?> jniFindClass(String className);
+}
diff --git "a/test/183-non-bmp-package-name/src/pkg\360\220\200\200/PackageTest.java" "b/test/183-non-bmp-package-name/src/pkg\360\220\200\200/PackageTest.java"
new file mode 100644
index 0000000..2b3b968
--- /dev/null
+++ "b/test/183-non-bmp-package-name/src/pkg\360\220\200\200/PackageTest.java"
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package pkg𐀀;
+
+import java.lang.reflect.InvocationHandler;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Method;
+import java.lang.reflect.Proxy;
+
+public class PackageTest {
+    public static void main() {
+        testJniFindClass();
+        testClassForName();
+        testProxy();
+    }
+
+    private static void testJniFindClass() {
+        try {
+            // Cannot import "Main". Use reflection to call `Main.jniFindClass()`.
+            Class<?> mainClass = Class.forName("Main");
+            Method jniFindClass = mainClass.getMethod("jniFindClass", String.class);
+            Class<?> klass = (Class<?>) jniFindClass.invoke(null, "pkg𐀀/PackageTest");
+            if (klass != PackageTest.class) {
+              System.out.println("Unexpected class: " + klass);
+            }
+        } catch (InvocationTargetException ite) {
+            ite.getCause().printStackTrace(System.out);  // Unwrap the cause.
+        } catch (Throwable t) {
+            t.printStackTrace(System.out);
+        }
+    }
+
+    private static void testClassForName() {
+        try {
+            Class<?> klass = Class.forName("pkg𐀀.PackageTest");
+            if (klass != PackageTest.class) {
+              System.out.println("Unexpected class: " + klass);
+            }
+        } catch (Throwable t) {
+            t.printStackTrace(System.out);
+        }
+    }
+
+    private static void testProxy() {
+        try {
+            InvocationHandler handler = new PackageInvocationHandler();
+            Class<?> proxyClass = Proxy.getProxyClass(
+                    PackageTestInterface.class.getClassLoader(), PackageTestInterface.class);
+            Constructor<?> ctor = proxyClass.getConstructor(InvocationHandler.class);
+            Object proxy = ctor.newInstance(handler);
+            PackageTestInterface asInterface = (PackageTestInterface) proxy;
+            asInterface.interfaceMethod();
+        } catch (Throwable t) {
+            t.printStackTrace(System.out);
+        }
+    }
+}
+
+interface PackageTestInterface {
+    public void interfaceMethod();
+}
+
+class PackageInvocationHandler implements InvocationHandler {
+    public Object invoke(Object proxy, Method method, Object[] args) {
+        System.out.println("Invoke " + method);
+        return null;
+    }
+}
diff --git a/test/Android.bp b/test/Android.bp
index c5322cb..4111009 100644
--- a/test/Android.bp
+++ b/test/Android.bp
@@ -938,6 +938,7 @@
         "177-visibly-initialized-deadlock/visibly_initialized.cc",
         "178-app-image-native-method/native_methods.cc",
         "179-nonvirtual-jni/nonvirtual-call.cc",
+        "183-non-bmp-package-name/jni_find_class.cc",
         "1945-proxy-method-arguments/get_args.cc",
         "203-multi-checkpoint/multi_checkpoint.cc",
         "305-other-fault-handler/fault_handler.cc",
diff --git a/test/knownfailures.json b/test/knownfailures.json
index da318d9..98156f4 100644
--- a/test/knownfailures.json
+++ b/test/knownfailures.json
@@ -1131,6 +1131,7 @@
     },
     {
         "tests": ["178-app-image-native-method",
+                  "183-non-bmp-package-name",
                   "530-checker-peel-unroll",
                   "616-cha-unloading",
                   "674-hiddenapi",
diff --git a/test/run_test_build.py b/test/run_test_build.py
index 75cd64f..b81ef93 100755
--- a/test/run_test_build.py
+++ b/test/run_test_build.py
@@ -102,6 +102,7 @@
       "SMALI": args.smali.absolute(),
       "SOONG_ZIP": args.soong_zip.absolute(),
       "TEST_NAME": self.test_name,
+      "LANG": "en_US.UTF-8",  # Needed to pass UTF-8 command line arguments.
     }
 
   def bash(self, cmd):
diff --git a/test/ti-agent/ti_utf.h b/test/ti-agent/ti_utf.h
index 15fe22c..a585b6e 100644
--- a/test/ti-agent/ti_utf.h
+++ b/test/ti-agent/ti_utf.h
@@ -169,19 +169,17 @@
   }
 
   // String contains non-ASCII characters.
-  // FIXME: We should not emit 4-byte sequences. Bug: 192935764
   auto append = [&](char c) { *utf8_out++ = c; };
   ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
-                     /*kUse4ByteSequence=*/ true,
+                     /*kUse4ByteSequence=*/ false,
                      /*kReplaceBadSurrogates=*/ false>(utf16_in, char_count, append);
 }
 
 inline size_t CountModifiedUtf8BytesInUtf16(const uint16_t* chars, size_t char_count) {
-  // FIXME: We should not emit 4-byte sequences. Bug: 192935764
   size_t result = 0;
   auto append = [&](char c ATTRIBUTE_UNUSED) { ++result; };
   ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
-                     /*kUse4ByteSequence=*/ true,
+                     /*kUse4ByteSequence=*/ false,
                      /*kReplaceBadSurrogates=*/ false>(chars, char_count, append);
   return result;
 }
diff --git a/test/utils/regen-test-files b/test/utils/regen-test-files
index 5b72479..7b43d4a 100755
--- a/test/utils/regen-test-files
+++ b/test/utils/regen-test-files
@@ -399,6 +399,9 @@
     # Skip test with a copy of `sun.misc.Unsafe`.
     if os.path.isfile(os.path.join(run_test_path, "src", "sun", "misc", "Unsafe.java")):
       return False
+    # Ignore test with a non-ascii package name `pkg𐀀`. b/193141629
+    if os.path.isdir(os.path.join(run_test_path, "src", "pkg𐀀")):
+      return False
     # Skip tests with Hidden API specs.
     if os.path.isfile(os.path.join(run_test_path, "hiddenapi-flags.csv")):
       return False