summary | refs | log | tree | commit | diff
path: root/runtime/utf.cc
diff options
context:
space:
mode:
author Narayan Kamath <narayan@google.com> 2015-01-29 20:06:46 +0000
committer Narayan Kamath <narayan@google.com> 2015-02-12 11:54:37 +0000
commit a5afcfc73141e5e378d79a326d02c5c2039fb025 (patch)
tree 424add9558fb816c4f1d2f4edd128f4f2a086d9a /runtime/utf.cc
parent 5a3399deaf448c8434d9ba0916ff799b1b791d95 (diff)
Be more lenient with 4 byte UTF-8 sequences.
Accept 4 byte sequences and convert them into surrogate pairs instead of expecting 2 separate 3 byte sequences each encoding one half of a surrogate pair.

Note that in addition to supporting 4 byte sequences in strings from JNI, we also tolerate them in dex files. This is mainly for consistency, and there's no need to claim any sort of official support.

bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
Diffstat (limited to 'runtime/utf.cc')
-rw-r--r-- runtime/utf.cc 44
1 files changed, 38 insertions, 6 deletions
diff --git a/runtime/utf.cc b/runtime/utf.cc
index 7ff296bf0c..39c8d153d5 100644
--- a/runtime/utf.cc
+++ b/runtime/utf.cc
@@ -38,15 +38,30 @@ size_t CountModifiedUtf8Chars(const char* utf8) {
// two-byte encoding
continue;
}
- // three-byte encoding
utf8++;
+ if ((ic & 0x10) == 0) {
+ // three-byte encoding
+ continue;
+ }
+
+ // four-byte encoding: needs to be converted into a surrogate
+ // pair.
+ utf8++;
+ len++;
}
return len;
}
void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
while (*utf8_data_in != '\0') {
- *utf16_data_out++ = GetUtf16FromUtf8(&utf8_data_in);
+ const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
+ const uint16_t leading = GetLeadingUtf16Char(ch);
+ const uint16_t trailing = GetTrailingUtf16Char(ch);
+
+ *utf16_data_out++ = leading;
+ if (trailing != 0) {
+ *utf16_data_out++ = trailing;
+ }
}
}
@@ -102,12 +117,29 @@ int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t
return 1;
}
- int c1 = GetUtf16FromUtf8(&utf8);
- int c2 = *utf16++;
+ const uint32_t pair = GetUtf16FromUtf8(&utf8);
+
+ // First compare the leading utf16 char.
+ const uint16_t lhs = GetLeadingUtf16Char(pair);
+ const uint16_t rhs = *utf16++;
--utf16_length;
+ if (lhs != rhs) {
+ return lhs > rhs ? 1 : -1;
+ }
- if (c1 != c2) {
- return c1 > c2 ? 1 : -1;
+ // Then compare the trailing utf16 char. First check if there
+ // are any characters left to consume.
+ const uint16_t lhs2 = GetTrailingUtf16Char(pair);
+ if (lhs2 != 0) {
+ if (utf16_length == 0) {
+ return 1;
+ }
+
+ const uint16_t rhs2 = *utf16++;
+ --utf16_length;
+ if (lhs2 != rhs2) {
+ return lhs2 > rhs2 ? 1 : -1;
+ }
}
}
}