Be more lenient with 4-byte UTF-8 sequences.
Accept 4-byte sequences and convert them into surrogate
pairs instead of expecting two separate 3-byte sequences,
each encoding one half of a surrogate pair.
Note that in addition to supporting 4-byte sequences in
strings from JNI, we also tolerate them in dex files. This
is mainly for consistency, and there's no need to claim any
sort of official support for 4-byte sequences in dex files.
bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
diff --git a/runtime/utf.cc b/runtime/utf.cc
index 7ff296b..39c8d15 100644
--- a/runtime/utf.cc
+++ b/runtime/utf.cc
@@ -38,15 +38,30 @@
// two-byte encoding
continue;
}
- // three-byte encoding
utf8++;
+ if ((ic & 0x10) == 0) {
+ // three-byte encoding
+ continue;
+ }
+
+ // four-byte encoding: needs to be converted into a surrogate
+ // pair.
+ utf8++;
+ len++;
}
return len;
}
void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
while (*utf8_data_in != '\0') {
- *utf16_data_out++ = GetUtf16FromUtf8(&utf8_data_in);
+ const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
+ const uint16_t leading = GetLeadingUtf16Char(ch);
+ const uint16_t trailing = GetTrailingUtf16Char(ch);
+
+ *utf16_data_out++ = leading;
+ if (trailing != 0) {
+ *utf16_data_out++ = trailing;
+ }
}
}
@@ -102,12 +117,29 @@
return 1;
}
- int c1 = GetUtf16FromUtf8(&utf8);
- int c2 = *utf16++;
- --utf16_length;
+ const uint32_t pair = GetUtf16FromUtf8(&utf8);
- if (c1 != c2) {
- return c1 > c2 ? 1 : -1;
+ // First compare the leading utf16 char.
+ const uint16_t lhs = GetLeadingUtf16Char(pair);
+ const uint16_t rhs = *utf16++;
+ --utf16_length;
+ if (lhs != rhs) {
+ return lhs > rhs ? 1 : -1;
+ }
+
+ // Then compare the trailing utf16 char. First check if there
+ // are any characters left to consume.
+ const uint16_t lhs2 = GetTrailingUtf16Char(pair);
+ if (lhs2 != 0) {
+ if (utf16_length == 0) {
+ return 1;
+ }
+
+ const uint16_t rhs2 = *utf16++;
+ --utf16_length;
+ if (lhs2 != rhs2) {
+ return lhs2 > rhs2 ? 1 : -1;
+ }
}
}
}