diff options
author | 2015-01-29 20:06:46 +0000 | |
---|---|---|
committer | 2015-02-12 11:54:37 +0000 | |
commit | a5afcfc73141e5e378d79a326d02c5c2039fb025 (patch) | |
tree | 424add9558fb816c4f1d2f4edd128f4f2a086d9a /runtime/utils.cc | |
parent | 5a3399deaf448c8434d9ba0916ff799b1b791d95 (diff) |
Be more lenient with 4-byte UTF-8 sequences.
Accept 4-byte sequences and convert them into surrogate
pairs instead of expecting two separate 3-byte sequences,
each encoding one half of a surrogate pair.
Note that in addition to supporting 4-byte sequences in
strings from JNI, we also tolerate them in dex files. This
is mainly for consistency, and there's no need to claim any
sort of official support.
bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
Diffstat (limited to 'runtime/utils.cc')
-rw-r--r-- | runtime/utils.cc | 101 |
1 file changed, 63 insertions, 38 deletions
diff --git a/runtime/utils.cc b/runtime/utils.cc index af16d7e118..3ec9561f4d 100644 --- a/runtime/utils.cc +++ b/runtime/utils.cc @@ -625,7 +625,7 @@ std::string PrintableString(const char* utf) { const char* p = utf; size_t char_count = CountModifiedUtf8Chars(p); for (size_t i = 0; i < char_count; ++i) { - uint16_t ch = GetUtf16FromUtf8(&p); + uint32_t ch = GetUtf16FromUtf8(&p); if (ch == '\\') { result += "\\\\"; } else if (ch == '\n') { @@ -634,10 +634,20 @@ std::string PrintableString(const char* utf) { result += "\\r"; } else if (ch == '\t') { result += "\\t"; - } else if (NeedsEscaping(ch)) { - StringAppendF(&result, "\\u%04x", ch); } else { - result += ch; + const uint16_t leading = GetLeadingUtf16Char(ch); + + if (NeedsEscaping(leading)) { + StringAppendF(&result, "\\u%04x", leading); + } else { + result += leading; + } + + const uint32_t trailing = GetTrailingUtf16Char(ch); + if (trailing != 0) { + // All high surrogates will need escaping. + StringAppendF(&result, "\\u%04x", trailing); + } } } result += '"'; @@ -650,7 +660,7 @@ std::string MangleForJni(const std::string& s) { size_t char_count = CountModifiedUtf8Chars(s.c_str()); const char* cp = &s[0]; for (size_t i = 0; i < char_count; ++i) { - uint16_t ch = GetUtf16FromUtf8(&cp); + uint32_t ch = GetUtf16FromUtf8(&cp); if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')) { result.push_back(ch); } else if (ch == '.' || ch == '/') { @@ -662,7 +672,13 @@ std::string MangleForJni(const std::string& s) { } else if (ch == '[') { result += "_3"; } else { - StringAppendF(&result, "_0%04x", ch); + const uint16_t leading = GetLeadingUtf16Char(ch); + const uint32_t trailing = GetTrailingUtf16Char(ch); + + StringAppendF(&result, "_0%04x", leading); + if (trailing != 0) { + StringAppendF(&result, "_0%04x", trailing); + } } } return result; @@ -757,41 +773,50 @@ bool IsValidPartOfMemberNameUtf8Slow(const char** pUtf8Ptr) { * document. 
*/ - uint16_t utf16 = GetUtf16FromUtf8(pUtf8Ptr); - - // Perform follow-up tests based on the high 8 bits. - switch (utf16 >> 8) { - case 0x00: - // It's only valid if it's above the ISO-8859-1 high space (0xa0). - return (utf16 > 0x00a0); - case 0xd8: - case 0xd9: - case 0xda: - case 0xdb: - // It's a leading surrogate. Check to see that a trailing - // surrogate follows. - utf16 = GetUtf16FromUtf8(pUtf8Ptr); - return (utf16 >= 0xdc00) && (utf16 <= 0xdfff); - case 0xdc: - case 0xdd: - case 0xde: - case 0xdf: - // It's a trailing surrogate, which is not valid at this point. - return false; - case 0x20: - case 0xff: - // It's in the range that has spaces, controls, and specials. - switch (utf16 & 0xfff8) { - case 0x2000: - case 0x2008: - case 0x2028: - case 0xfff0: - case 0xfff8: + const uint32_t pair = GetUtf16FromUtf8(pUtf8Ptr); + + const uint16_t leading = GetLeadingUtf16Char(pair); + const uint32_t trailing = GetTrailingUtf16Char(pair); + + if (trailing == 0) { + // Perform follow-up tests based on the high 8 bits of the + // lower surrogate. + switch (leading >> 8) { + case 0x00: + // It's only valid if it's above the ISO-8859-1 high space (0xa0). + return (leading > 0x00a0); + case 0xd8: + case 0xd9: + case 0xda: + case 0xdb: + // It looks like a leading surrogate but we didn't find a trailing + // surrogate if we're here. return false; + case 0xdc: + case 0xdd: + case 0xde: + case 0xdf: + // It's a trailing surrogate, which is not valid at this point. + return false; + case 0x20: + case 0xff: + // It's in the range that has spaces, controls, and specials. + switch (leading & 0xfff8) { + case 0x2000: + case 0x2008: + case 0x2028: + case 0xfff0: + case 0xfff8: + return false; + } + break; } - break; + + return true; } - return true; + + // We have a surrogate pair. Check that trailing surrogate is well formed. + return (trailing >= 0xdc00 && trailing <= 0xdfff); } /* Return whether the pointed-at modified-UTF-8 encoded character is |