Rewrite JNI NewStringUTF().

For historical reasons, NewStringUTF() accepts 4-byte UTF-8
sequences which are not valid Modified UTF-8. This can be
considered an extension of the JNI specification.

Rewrite the function to deal with the decoding explicitly,
so that we can later avoid dealing with the 4-byte sequences
in internal ART routines intended for Modified UTF-8.
Note that the DexFileVerifier rejects them in dex files.

Change the decoding to accept improperly encoded ASCII
characters as two-byte or three-byte sequences.

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Bug: 192935764
Change-Id: I3f91da990110e419c78138ce997123277bf3aaba
diff --git a/runtime/jni/jni_internal.cc b/runtime/jni/jni_internal.cc
index 45fbf00..b417bec 100644
--- a/runtime/jni/jni_internal.cc
+++ b/runtime/jni/jni_internal.cc
@@ -81,19 +81,17 @@
   va_list* args;
 };
 
-constexpr char kBadUtf8ReplacementChar[] = "?";
+constexpr char kBadUtf8ReplacementChar = '?';
 
-// This is a modified version of CountModifiedUtf8Chars() from utf.cc
+// This is a modified version of `CountModifiedUtf8Chars()` from utf.cc,
 // with extra checks and different output options.
 //
 // The `good` functor can process valid characters.
-// The `bad` functor is called when we find an invalid character and
-// returns true to abort processing, or false to continue processing.
+// The `bad` functor is called when we find an invalid character.
 //
-// When aborted, VisitModifiedUtf8Chars() returns 0, otherwise the
-// number of UTF-16 chars.
+// Returns the number of UTF-16 characters.
 template <typename GoodFunc, typename BadFunc>
-size_t VisitModifiedUtf8Chars(const char* utf8, size_t byte_count, GoodFunc good, BadFunc bad) {
+size_t VisitUtf8Chars(const char* utf8, size_t byte_count, GoodFunc good, BadFunc bad) {
   DCHECK_LE(byte_count, strlen(utf8));
   size_t len = 0;
   const char* end = utf8 + byte_count;
@@ -106,24 +104,15 @@
       len += 1u;
       continue;
     }
-    auto is_ascii = [utf8]() {
-      const char* ptr = utf8;  // Make a copy that can be modified by GetUtf16FromUtf8().
-      return mirror::String::IsASCII(dchecked_integral_cast<uint16_t>(GetUtf16FromUtf8(&ptr)));
-    };
-    // Note: Neither CountModifiedUtf8Chars() nor GetUtf16FromUtf8() checks whether
-    // the bit 0x40 is correctly set in the leading byte of a multi-byte sequence.
+    // Note: We do not check whether the bit 0x40 is correctly set in the leading byte of
+    // a multi-byte sequence. Nor do we verify the top two bits of continuation characters.
     if ((ic & 0x20) == 0) {
       // Two-byte encoding.
       if (static_cast<size_t>(end - utf8) < 2u) {
-        return bad() ? 0u : len + 1u;  // Reached end of sequence
+        bad();
+        return len + 1u;  // Reached end of sequence.
       }
-      if (mirror::kUseStringCompression && is_ascii()) {
-        if (bad()) {
-          return 0u;
-        }
-      } else {
-        good(utf8, 2u);
-      }
+      good(utf8, 2u);
       utf8 += 2u;
       len += 1u;
       continue;
@@ -131,24 +120,19 @@
     if ((ic & 0x10) == 0) {
       // Three-byte encoding.
       if (static_cast<size_t>(end - utf8) < 3u) {
-        return bad() ? 0u : len + 1u;  // Reached end of sequence
+        bad();
+        return len + 1u;  // Reached end of sequence
       }
-      if (mirror::kUseStringCompression && is_ascii()) {
-        if (bad()) {
-          return 0u;
-        }
-      } else {
-        good(utf8, 3u);
-      }
+      good(utf8, 3u);
       utf8 += 3u;
       len += 1u;
       continue;
     }
 
     // Four-byte encoding: needs to be converted into a surrogate pair.
-    // The decoded chars are never ASCII.
     if (static_cast<size_t>(end - utf8) < 4u) {
-      return bad() ? 0u : len + 1u;  // Reached end of sequence
+      bad();
+      return len + 1u;  // Reached end of sequence.
     }
     good(utf8, 4u);
     utf8 += 4u;
@@ -157,6 +141,79 @@
   return len;
 }
 
+ALWAYS_INLINE
+static inline uint16_t DecodeModifiedUtf8Character(const char* ptr, size_t length) {
+  switch (length) {
+    case 1:
+      return ptr[0];
+    case 2:
+      return ((ptr[0] & 0x1fu) << 6) | (ptr[1] & 0x3fu);
+    case 3:
+      return ((ptr[0] & 0x0fu) << 12) | ((ptr[1] & 0x3fu) << 6) | (ptr[2] & 0x3fu);
+    default:
+      LOG(FATAL) << "UNREACHABLE";  // 4-byte sequences are not valid Modified UTF-8.
+      UNREACHABLE();
+  }
+}
+
+class NewStringUTFVisitor {
+ public:
+  NewStringUTFVisitor(const char* utf, size_t utf8_length, int32_t count, bool has_bad_char)
+      : utf_(utf), utf8_length_(utf8_length), count_(count), has_bad_char_(has_bad_char) {}
+
+  void operator()(ObjPtr<mirror::Object> obj, size_t usable_size ATTRIBUTE_UNUSED) const
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    // Avoid AsString as object is not yet in live bitmap or allocation stack.
+    ObjPtr<mirror::String> string = ObjPtr<mirror::String>::DownCast(obj);
+    string->SetCount(count_);
+    DCHECK(!string->IsCompressed() || mirror::kUseStringCompression);
+    if (string->IsCompressed()) {
+      uint8_t* value_compressed = string->GetValueCompressed();
+      auto good = [&](const char* ptr, size_t length) {
+        uint16_t c = DecodeModifiedUtf8Character(ptr, length);
+        DCHECK(mirror::String::IsASCII(c));
+        *value_compressed++ = dchecked_integral_cast<uint8_t>(c);
+      };
+      auto bad = [&]() {
+        DCHECK(has_bad_char_);
+        *value_compressed++ = kBadUtf8ReplacementChar;
+      };
+      VisitUtf8Chars(utf_, utf8_length_, good, bad);
+    } else {
+      // Uncompressed.
+      uint16_t* value = string->GetValue();
+      auto good = [&](const char* ptr, size_t length) {
+        if (length != 4u) {
+          *value++ = DecodeModifiedUtf8Character(ptr, length);
+        } else {
+          const uint32_t code_point = ((ptr[0] & 0x0fu) << 18) |
+                                      ((ptr[1] & 0x3fu) << 12) |
+                                      ((ptr[2] & 0x3fu) << 6) |
+                                      (ptr[3] & 0x3fu);
+          // TODO: What do we do about values outside the range [U+10000, U+10FFFF]?
+          // The spec says they're invalid but nobody appears to check for them.
+          const uint32_t code_point_bits = code_point - 0x10000u;
+          *value++ = 0xd800u | ((code_point_bits >> 10) & 0x3ffu);
+          *value++ = 0xdc00u | (code_point_bits & 0x3ffu);
+        }
+      };
+      auto bad = [&]() {
+        DCHECK(has_bad_char_);
+        *value++ = kBadUtf8ReplacementChar;
+      };
+      VisitUtf8Chars(utf_, utf8_length_, good, bad);
+      DCHECK(!mirror::kUseStringCompression ||
+             !mirror::String::AllASCII(string->GetValue(), string->GetLength()));
+    }
+  }
+
+ private:
+  const char* utf_;
+  size_t utf8_length_;
+  const int32_t count_;
+  bool has_bad_char_;
+};
+
 }  // namespace
 
 // Consider turning this on when there is errors which could be related to JNI array copies such as
@@ -1905,6 +1962,9 @@
     return soa.AddLocalReference<jstring>(result);
   }
 
+  // For historical reasons, NewStringUTF() accepts 4-byte UTF-8
+  // sequences which are not valid Modified UTF-8. This can be
+  // considered an extension of the JNI specification.
   static jstring NewStringUTF(JNIEnv* env, const char* utf) {
     if (utf == nullptr) {
       return nullptr;
@@ -1912,27 +1972,48 @@
 
     // The input may come from an untrusted source, so we need to validate it.
     // We do not perform full validation, only as much as necessary to avoid reading
-    // beyond the terminating null character or breaking string compression invariants.
-    // CheckJNI performs stronger validation.
+    // beyond the terminating null character. CheckJNI performs stronger validation.
     size_t utf8_length = strlen(utf);
-    if (UNLIKELY(utf8_length > static_cast<uint32_t>(std::numeric_limits<int32_t>::max()))) {
-      // Converting the utf8_length to int32_t for String::AllocFromModifiedUtf8() would
-      // overflow. Throw OOME eagerly to avoid 2GiB allocation when trying to replace
-      // invalid sequences (even if such replacements could reduce the size below 2GiB).
+    bool compressible = mirror::kUseStringCompression;
+    bool has_bad_char = false;
+    size_t utf16_length = VisitUtf8Chars(
+        utf,
+        utf8_length,
+        /*good=*/ [&compressible](const char* ptr, size_t length) {
+          if (mirror::kUseStringCompression) {
+            switch (length) {
+              case 1:
+                DCHECK(mirror::String::IsASCII(*ptr));
+                break;
+              case 2:
+              case 3:
+                if (!mirror::String::IsASCII(DecodeModifiedUtf8Character(ptr, length))) {
+                  compressible = false;
+                }
+                break;
+              default:
+                // 4-byte sequences lead to uncompressible surroate pairs.
+                DCHECK_EQ(length, 4u);
+                compressible = false;
+                break;
+            }
+          }
+        },
+        /*bad=*/ [&has_bad_char]() {
+          static_assert(mirror::String::IsASCII(kBadUtf8ReplacementChar));  // Compressible.
+          has_bad_char = true;
+        });
+    if (UNLIKELY(utf16_length > static_cast<uint32_t>(std::numeric_limits<int32_t>::max()))) {
+      // Converting the utf16_length to int32_t would overflow. Explicitly throw an OOME.
       std::string error =
-          android::base::StringPrintf("NewStringUTF input is 2 GiB or more: %zu", utf8_length);
+          android::base::StringPrintf("NewStringUTF input has 2^31 or more characters: %zu",
+                                      utf16_length);
       ScopedObjectAccess soa(env);
       soa.Self()->ThrowOutOfMemoryError(error.c_str());
       return nullptr;
     }
-    std::optional<std::string> replacement_utf;
-    size_t utf16_length = VisitModifiedUtf8Chars(
-        utf,
-        utf8_length,
-        /*good=*/ [](const char* ptr ATTRIBUTE_UNUSED, size_t length ATTRIBUTE_UNUSED) {},
-        /*bad=*/ []() { return true; });  // Abort processing and return 0 for bad characters.
-    if (UNLIKELY(utf8_length != 0u && utf16_length == 0u)) {
-      // VisitModifiedUtf8Chars() aborted for a bad character.
+    if (UNLIKELY(has_bad_char)) {
+      // VisitUtf8Chars() found a bad character.
       android_errorWriteLog(0x534e4554, "172655291");  // Report to SafetyNet.
       // Report the error to logcat but avoid too much spam.
       static const uint64_t kMinDelay = UINT64_C(10000000000);  // 10s
@@ -1943,28 +2024,14 @@
           prev_bad_input_time.compare_exchange_strong(prev_time, now, std::memory_order_relaxed)) {
         LOG(ERROR) << "Invalid UTF-8 input to JNI::NewStringUTF()";
       }
-      // Copy the input to the `replacement_utf` and replace bad characters.
-      replacement_utf.emplace();
-      replacement_utf->reserve(utf8_length);
-      utf16_length = VisitModifiedUtf8Chars(
-          utf,
-          utf8_length,
-          /*good=*/ [&](const char* ptr, size_t length) {
-            replacement_utf->append(ptr, length);
-          },
-          /*bad=*/ [&]() {
-            replacement_utf->append(kBadUtf8ReplacementChar, sizeof(kBadUtf8ReplacementChar) - 1u);
-            return false;  // Continue processing.
-          });
-      utf = replacement_utf->c_str();
-      utf8_length = replacement_utf->length();
     }
-    DCHECK_LE(utf16_length, utf8_length);
-    DCHECK_LE(utf8_length, static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
+    const int32_t length_with_flag = mirror::String::GetFlaggedCount(utf16_length, compressible);
+    NewStringUTFVisitor visitor(utf, utf8_length, length_with_flag, has_bad_char);
 
     ScopedObjectAccess soa(env);
+    gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
     ObjPtr<mirror::String> result =
-        mirror::String::AllocFromModifiedUtf8(soa.Self(), utf16_length, utf, utf8_length);
+        mirror::String::Alloc(soa.Self(), length_with_flag, allocator_type, visitor);
     return soa.AddLocalReference<jstring>(result);
   }
 
diff --git a/runtime/jni/jni_internal_test.cc b/runtime/jni/jni_internal_test.cc
index ed84397..eca5aec 100644
--- a/runtime/jni/jni_internal_test.cc
+++ b/runtime/jni/jni_internal_test.cc
@@ -1612,7 +1612,6 @@
   env_->ReleaseStringChars(s, jchars);
 
   // Replace the leading two-byte sequence with a two-byte sequence that decodes as ASCII (0x40).
-  // The sequence shall be replaced if string compression is used.
   utf_src[0] = '\xc1';
   utf_src[1] = '\x80';
   s = base_env->NewStringUTF(env_, utf_src);
@@ -1620,7 +1619,7 @@
   ASSERT_EQ(mirror::String::GetFlaggedCount(kPageSize - 2u, /* compressible= */ true),
             env_->GetIntField(s, count_fid));
   jchars = env_->GetStringChars(s, nullptr);
-  EXPECT_EQ(mirror::kUseStringCompression ? '?' : '\x40', jchars[0]);
+  EXPECT_EQ('\x40', jchars[0]);
   for (size_t pos = 1; pos != kPageSize - 3u; ++pos) {
     ASSERT_EQ('x', jchars[pos]) << pos;
   }
@@ -1628,7 +1627,6 @@
   env_->ReleaseStringChars(s, jchars);
 
   // Replace the leading three bytes with a three-byte sequence that decodes as ASCII (0x40).
-  // The sequence shall be replaced if string compression is used.
   utf_src[0] = '\xe0';
   utf_src[1] = '\x81';
   utf_src[2] = '\x80';
@@ -1637,7 +1635,7 @@
   ASSERT_EQ(mirror::String::GetFlaggedCount(kPageSize - 3u, /* compressible= */ true),
             env_->GetIntField(s, count_fid));
   jchars = env_->GetStringChars(s, nullptr);
-  EXPECT_EQ(mirror::kUseStringCompression ? '?' : '\x40', jchars[0]);
+  EXPECT_EQ('\x40', jchars[0]);
   for (size_t pos = 1; pos != kPageSize - 4u; ++pos) {
     ASSERT_EQ('x', jchars[pos]) << pos;
   }
@@ -1651,12 +1649,28 @@
   ASSERT_EQ(mirror::String::GetFlaggedCount(kPageSize - 4u, /* compressible= */ false),
             env_->GetIntField(s, count_fid));
   jchars = env_->GetStringChars(s, nullptr);
-  EXPECT_EQ(mirror::kUseStringCompression ? '?' : '\x40', jchars[0]);
+  EXPECT_EQ('\x40', jchars[0]);
   for (size_t pos = 1; pos != kPageSize - 5u; ++pos) {
     ASSERT_EQ('x', jchars[pos]) << pos;
   }
   EXPECT_EQ('\0', jchars[kPageSize - 5u]);
   env_->ReleaseStringChars(s, jchars);
+
+  // Replace the last three characters with a three-byte sequence that decodes as 0.
+  // This is an incorrect encoding but `NewStringUTF()` is permissive.
+  utf_src[kPageSize - 4u] = '\xe0';
+  utf_src[kPageSize - 3u] = '\x80';
+  utf_src[kPageSize - 2u] = '\x80';
+  s = base_env->NewStringUTF(env_, utf_src);
+  ASSERT_EQ(mirror::String::GetFlaggedCount(kPageSize - 5u, /* compressible= */ false),
+            env_->GetIntField(s, count_fid));
+  jchars = env_->GetStringChars(s, nullptr);
+  EXPECT_EQ('\x40', jchars[0]);
+  for (size_t pos = 1; pos != kPageSize - 6u; ++pos) {
+    ASSERT_EQ('x', jchars[pos]) << pos;
+  }
+  EXPECT_EQ('\0', jchars[kPageSize - 6u]);
+  env_->ReleaseStringChars(s, jchars);
 }
 
 TEST_F(JniInternalTest, NewString) {
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index 338a1af..53fcf82 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -120,6 +120,13 @@
 
   ObjPtr<String> Intern() REQUIRES_SHARED(Locks::mutator_lock_);
 
+  template <bool kIsInstrumented = true, typename PreFenceVisitor>
+  ALWAYS_INLINE static ObjPtr<String> Alloc(Thread* self,
+                                            int32_t utf16_length_with_flag,
+                                            gc::AllocatorType allocator_type,
+                                            const PreFenceVisitor& pre_fence_visitor)
+      REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
+
   template <bool kIsInstrumented = true>
   ALWAYS_INLINE static ObjPtr<String> AllocFromByteArray(Thread* self,
                                                          int32_t byte_length,
@@ -253,13 +260,6 @@
     SetField32<false, false>(OFFSET_OF_OBJECT_MEMBER(String, hash_code_), new_hash_code);
   }
 
-  template <bool kIsInstrumented = true, typename PreFenceVisitor>
-  ALWAYS_INLINE static ObjPtr<String> Alloc(Thread* self,
-                                            int32_t utf16_length_with_flag,
-                                            gc::AllocatorType allocator_type,
-                                            const PreFenceVisitor& pre_fence_visitor)
-      REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
-
   // Field order required by test "ValidateFieldOrderOfJavaCppUnionClasses".
 
   // If string compression is enabled, count_ holds the StringCompressionFlag in the