Be more lenient with 4-byte UTF-8 sequences.

Accept 4-byte sequences and convert them into surrogate pairs, instead of expecting two separate 3-byte sequences that each encode one half of a surrogate pair. Note that in addition to supporting 4-byte sequences in strings from JNI, we also tolerate them in dex files. This is mainly for consistency, and there's no need to claim any sort of official support. bug: 18848397 bug: https://code.google.com/p/android/issues/detail?id=81341 Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
author: Narayan Kamath <narayan@google.com> 2015-01-29 20:06:46 +0000
committer: Narayan Kamath <narayan@google.com> 2015-02-12 11:54:37 +0000
commit: a5afcfc73141e5e378d79a326d02c5c2039fb025 (patch)
tree: 424add9558fb816c4f1d2f4edd128f4f2a086d9a /runtime/utf.cc
parent: 5a3399deaf448c8434d9ba0916ff799b1b791d95 (diff)
1 files changed, 38 insertions, 6 deletions
diff --git a/runtime/utf.cc b/runtime/utf.cc
index 7ff296bf0c..39c8d153d5 100644
--- a/runtime/utf.cc
+++ b/runtime/utf.cc
@@ -38,15 +38,30 @@ size_t CountModifiedUtf8Chars(const char* utf8) {
       // two-byte encoding
       continue;
     }
-    // three-byte encoding
     utf8++;
+    if ((ic & 0x10) == 0) {
+      // three-byte encoding
+      continue;
+    }
+
+    // four-byte encoding: needs to be converted into a surrogate
+    // pair.
+    utf8++;
+    len++;
   }
   return len;
 }
 
 void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
   while (*utf8_data_in != '\0') {
-    *utf16_data_out++ = GetUtf16FromUtf8(&utf8_data_in);
+    const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
+    const uint16_t leading = GetLeadingUtf16Char(ch);
+    const uint16_t trailing = GetTrailingUtf16Char(ch);
+
+    *utf16_data_out++ = leading;
+    if (trailing != 0) {
+      *utf16_data_out++ = trailing;
+    }
   }
 }
 
@@ -102,12 +117,29 @@ int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t
       return 1;
     }
 
-    int c1 = GetUtf16FromUtf8(&utf8);
-    int c2 = *utf16++;
+    const uint32_t pair = GetUtf16FromUtf8(&utf8);
+
+    // First compare the leading utf16 char.
+    const uint16_t lhs = GetLeadingUtf16Char(pair);
+    const uint16_t rhs = *utf16++;
     --utf16_length;
+    if (lhs != rhs) {
+      return lhs > rhs ? 1 : -1;
+    }
 
-    if (c1 != c2) {
-      return c1 > c2 ? 1 : -1;
+    // Then compare the trailing utf16 char. First check if there
+    // are any characters left to consume.
+    const uint16_t lhs2 = GetTrailingUtf16Char(pair);
+    if (lhs2 != 0) {
+      if (utf16_length == 0) {
+        return 1;
+      }
+
+      const uint16_t rhs2 = *utf16++;
+      --utf16_length;
+      if (lhs2 != rhs2) {
+        return lhs2 > rhs2 ? 1 : -1;
+      }
     }
   }
 }
author	Narayan Kamath <narayan@google.com>	2015-01-29 20:06:46 +0000
committer	Narayan Kamath <narayan@google.com>	2015-02-12 11:54:37 +0000
commit	a5afcfc73141e5e378d79a326d02c5c2039fb025 (patch)
tree	424add9558fb816c4f1d2f4edd128f4f2a086d9a /runtime/utf.cc
parent	5a3399deaf448c8434d9ba0916ff799b1b791d95 (diff)