Be more lenient with 4 byte UTF-8 sequences.

Accept 4 byte sequences and convert them into surrogate
pairs instead of expecting 2 separate 3 byte sequences
each encoding one half of a surrogate pair.

Note that in addition to supporting 4 byte sequences in
strings from JNI, we also tolerate them in dex files. This
is mainly for consistency, and there's no need to claim any
sort of official support.

bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
diff --git a/runtime/mirror/object_test.cc b/runtime/mirror/object_test.cc
index fb42d28..9b345a6 100644
--- a/runtime/mirror/object_test.cc
+++ b/runtime/mirror/object_test.cc
@@ -67,7 +67,7 @@
     ASSERT_TRUE(string->Equals(utf8_in) || (expected_utf16_length == 1 && strlen(utf8_in) == 0));
     ASSERT_TRUE(string->Equals(StringPiece(utf8_in)) || (expected_utf16_length == 1 && strlen(utf8_in) == 0));
     for (int32_t i = 0; i < expected_utf16_length; i++) {
-      EXPECT_EQ(utf16_expected[i], string->CharAt(i));
+      EXPECT_EQ(utf16_expected[i], string->UncheckedCharAt(i));
     }
     EXPECT_EQ(expected_hash, string->GetHashCode());
   }
@@ -424,6 +424,12 @@
   AssertString(1, "\xe1\x88\xb4",   "\x12\x34",                 0x1234);
   AssertString(1, "\xef\xbf\xbf",   "\xff\xff",                 0xffff);
   AssertString(3, "h\xe1\x88\xb4i", "\x00\x68\x12\x34\x00\x69", (31 * ((31 * 0x68) + 0x1234)) + 0x69);
+
+  // Test four-byte characters.
+  AssertString(2, "\xf0\x9f\x8f\xa0",  "\xd8\x3c\xdf\xe0", (31 * 0xd83c) + 0xdfe0);
+  AssertString(2, "\xf0\x9f\x9a\x80",  "\xd8\x3d\xde\x80", (31 * 0xd83d) + 0xde80);
+  AssertString(4, "h\xf0\x9f\x9a\x80i", "\x00\x68\xd8\x3d\xde\x80\x00\x69",
+               (31 * (31 * (31 * 0x68 +  0xd83d) + 0xde80) + 0x69));
 }
 
 TEST_F(ObjectTest, StringEqualsUtf8) {
diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h
index 14d7de2..4a95519 100644
--- a/runtime/mirror/string-inl.h
+++ b/runtime/mirror/string-inl.h
@@ -33,6 +33,10 @@
   return Class::ComputeClassSize(true, vtable_entries, 0, 1, 0, 1, 2);
 }
 
+inline uint16_t String::UncheckedCharAt(int32_t index) {
+  return GetCharArray()->Get(index + GetOffset());
+}
+
 inline CharArray* String::GetCharArray() {
   return GetFieldObject<CharArray>(ValueOffset());
 }
@@ -54,20 +58,6 @@
   return Runtime::Current()->GetInternTable()->InternWeak(this);
 }
 
-inline uint16_t String::CharAt(int32_t index) {
-  // TODO: do we need this? Equals is the only caller, and could
-  // bounds check itself.
-  DCHECK_GE(count_, 0);  // ensures the unsigned comparison is safe.
-  if (UNLIKELY(static_cast<uint32_t>(index) >= static_cast<uint32_t>(count_))) {
-    Thread* self = Thread::Current();
-    ThrowLocation throw_location = self->GetCurrentLocationForThrow();
-    self->ThrowNewExceptionF(throw_location, "Ljava/lang/StringIndexOutOfBoundsException;",
-                             "length=%i; index=%i", count_, index);
-    return 0;
-  }
-  return GetCharArray()->Get(index + GetOffset());
-}
-
 inline int32_t String::GetHashCode() {
   int32_t result = GetField32(OFFSET_OF_OBJECT_MEMBER(String, hash_code_));
   if (UNLIKELY(result == 0)) {
diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc
index e199d0e..e7c88c5 100644
--- a/runtime/mirror/string.cc
+++ b/runtime/mirror/string.cc
@@ -147,7 +147,7 @@
     // Note: don't short circuit on hash code as we're presumably here as the
     // hash code was already equal
     for (int32_t i = 0; i < that->GetLength(); ++i) {
-      if (this->CharAt(i) != that->CharAt(i)) {
+      if (this->UncheckedCharAt(i) != that->UncheckedCharAt(i)) {
         return false;
       }
     }
@@ -160,7 +160,7 @@
     return false;
   } else {
     for (int32_t i = 0; i < that_length; ++i) {
-      if (this->CharAt(i) != that_chars[that_offset + i]) {
+      if (this->UncheckedCharAt(i) != that_chars[that_offset + i]) {
         return false;
       }
     }
@@ -169,22 +169,52 @@
 }
 
 bool String::Equals(const char* modified_utf8) {
-  for (int32_t i = 0; i < GetLength(); ++i) {
-    uint16_t ch = GetUtf16FromUtf8(&modified_utf8);
-    if (ch == '\0' || ch != CharAt(i)) {
+  const int32_t length = GetLength();
+  int32_t i = 0;
+  while (i < length) {
+    const uint32_t ch = GetUtf16FromUtf8(&modified_utf8);
+    if (ch == '\0') {
       return false;
     }
+
+    if (GetLeadingUtf16Char(ch) != UncheckedCharAt(i++)) {
+      return false;
+    }
+
+    const uint16_t trailing = GetTrailingUtf16Char(ch);
+    if (trailing != 0) {
+      if (i == length) {
+        return false;
+      }
+
+      if (UncheckedCharAt(i++) != trailing) {
+        return false;
+      }
+    }
   }
   return *modified_utf8 == '\0';
 }
 
 bool String::Equals(const StringPiece& modified_utf8) {
+  const int32_t length = GetLength();
   const char* p = modified_utf8.data();
-  for (int32_t i = 0; i < GetLength(); ++i) {
-    uint16_t ch = GetUtf16FromUtf8(&p);
-    if (ch != CharAt(i)) {
+  for (int32_t i = 0; i < length; ++i) {
+    uint32_t ch = GetUtf16FromUtf8(&p);
+
+    if (GetLeadingUtf16Char(ch) != UncheckedCharAt(i)) {
       return false;
     }
+
+    const uint16_t trailing = GetTrailingUtf16Char(ch);
+    if (trailing != 0) {
+      if (i == (length - 1)) {
+        return false;
+      }
+
+      if (UncheckedCharAt(++i) != trailing) {
+        return false;
+      }
+    }
   }
   return true;
 }
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index 30b8aa3..6c22b9b 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -69,8 +69,6 @@
 
   int32_t GetUtfLength() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  uint16_t CharAt(int32_t index) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   String* Intern() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   static String* AllocFromUtf16(Thread* self,
@@ -86,9 +84,14 @@
                                        const char* utf8_data_in)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // TODO: This is only used in the interpreter to compare against
+  // entries from a dex files constant pool (ArtField names). Should
+  // we unify this with Equals(const StringPiece&); ?
   bool Equals(const char* modified_utf8) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  // TODO: do we need this overload? give it a more intention-revealing name.
+  // TODO: This is only used to compare DexCache.location with
+  // a dex_file's location (which is an std::string). Do we really
+  // need this in mirror::String just for that one usage ?
   bool Equals(const StringPiece& modified_utf8)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -127,6 +130,9 @@
   static void VisitRoots(RootCallback* callback, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // TODO: Make this private. It's only used on ObjectTest at the moment.
+  uint16_t UncheckedCharAt(int32_t index) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
  private:
   void SetHashCode(int32_t new_hash_code) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     // Hash code is invariant so use non-transactional mode. Also disable check as we may run inside