Be more lenient with 4-byte UTF-8 sequences.

Accept 4-byte sequences and convert them into surrogate pairs instead of expecting two separate 3-byte sequences, each encoding one half of a surrogate pair.

Note that in addition to supporting 4-byte sequences in strings from JNI, we also tolerate them in dex files. This is mainly for consistency; we do not claim any sort of official support for it.

bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
author: Narayan Kamath <narayan@google.com> 2015-01-29 20:06:46 +0000
committer: Narayan Kamath <narayan@google.com> 2015-02-12 11:54:37 +0000
commit: a5afcfc73141e5e378d79a326d02c5c2039fb025 (patch)
tree: 424add9558fb816c4f1d2f4edd128f4f2a086d9a /runtime/mirror/string.cc
parent: 5a3399deaf448c8434d9ba0916ff799b1b791d95 (diff)
1 files changed, 38 insertions, 8 deletions
diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc
index e199d0e2ef..e7c88c5425 100644
--- a/runtime/mirror/string.cc
+++ b/runtime/mirror/string.cc
@@ -147,7 +147,7 @@ bool String::Equals(String* that) {
     // Note: don't short circuit on hash code as we're presumably here as the
     // hash code was already equal
     for (int32_t i = 0; i < that->GetLength(); ++i) {
-      if (this->CharAt(i) != that->CharAt(i)) {
+      if (this->UncheckedCharAt(i) != that->UncheckedCharAt(i)) {
         return false;
       }
     }
@@ -160,7 +160,7 @@ bool String::Equals(const uint16_t* that_chars, int32_t that_offset, int32_t tha
     return false;
   } else {
     for (int32_t i = 0; i < that_length; ++i) {
-      if (this->CharAt(i) != that_chars[that_offset + i]) {
+      if (this->UncheckedCharAt(i) != that_chars[that_offset + i]) {
         return false;
       }
     }
@@ -169,22 +169,52 @@ bool String::Equals(const uint16_t* that_chars, int32_t that_offset, int32_t tha
 }
 
 bool String::Equals(const char* modified_utf8) {
-  for (int32_t i = 0; i < GetLength(); ++i) {
-    uint16_t ch = GetUtf16FromUtf8(&modified_utf8);
-    if (ch == '\0' || ch != CharAt(i)) {
+  const int32_t length = GetLength();
+  int32_t i = 0;
+  while (i < length) {
+    const uint32_t ch = GetUtf16FromUtf8(&modified_utf8);
+    if (ch == '\0') {
       return false;
     }
+
+    if (GetLeadingUtf16Char(ch) != UncheckedCharAt(i++)) {
+      return false;
+    }
+
+    const uint16_t trailing = GetTrailingUtf16Char(ch);
+    if (trailing != 0) {
+      if (i == length) {
+        return false;
+      }
+
+      if (UncheckedCharAt(i++) != trailing) {
+        return false;
+      }
+    }
   }
   return *modified_utf8 == '\0';
 }
 
 bool String::Equals(const StringPiece& modified_utf8) {
+  const int32_t length = GetLength();
   const char* p = modified_utf8.data();
-  for (int32_t i = 0; i < GetLength(); ++i) {
-    uint16_t ch = GetUtf16FromUtf8(&p);
-    if (ch != CharAt(i)) {
+  for (int32_t i = 0; i < length; ++i) {
+    uint32_t ch = GetUtf16FromUtf8(&p);
+
+    if (GetLeadingUtf16Char(ch) != UncheckedCharAt(i)) {
       return false;
     }
+
+    const uint16_t trailing = GetTrailingUtf16Char(ch);
+    if (trailing != 0) {
+      if (i == (length - 1)) {
+        return false;
+      }
+
+      if (UncheckedCharAt(++i) != trailing) {
+        return false;
+      }
+    }
   }
   return true;
 }
author	Narayan Kamath <narayan@google.com>	2015-01-29 20:06:46 +0000
committer	Narayan Kamath <narayan@google.com>	2015-02-12 11:54:37 +0000
commit	a5afcfc73141e5e378d79a326d02c5c2039fb025 (patch)
tree	424add9558fb816c4f1d2f4edd128f4f2a086d9a /runtime/mirror/string.cc
parent	5a3399deaf448c8434d9ba0916ff799b1b791d95 (diff)