diff options
author | 2015-01-29 20:06:46 +0000 | |
---|---|---|
committer | 2015-02-12 11:54:37 +0000 | |
commit | a5afcfc73141e5e378d79a326d02c5c2039fb025 (patch) | |
tree | 424add9558fb816c4f1d2f4edd128f4f2a086d9a /runtime/utf.h | |
parent | 5a3399deaf448c8434d9ba0916ff799b1b791d95 (diff) |
Be more lenient with 4-byte UTF-8 sequences.
Accept 4-byte sequences and convert them into surrogate
pairs instead of expecting two separate 3-byte sequences,
each encoding one half of a surrogate pair.
Note that in addition to supporting 4-byte sequences in
strings from JNI, we also tolerate them in dex files. This
is mainly for consistency, and there's no need to claim any
sort of official support.
bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
Diffstat (limited to 'runtime/utf.h')
-rw-r--r-- | runtime/utf.h | 22 |
1 files changed, 19 insertions, 3 deletions
```diff
diff --git a/runtime/utf.h b/runtime/utf.h
index 3ee07fe65d..dd38afa172 100644
--- a/runtime/utf.h
+++ b/runtime/utf.h
@@ -85,12 +85,16 @@ int32_t ComputeUtf16Hash(const uint16_t* chars, size_t char_count);
 size_t ComputeModifiedUtf8Hash(const char* chars);

 /*
- * Retrieve the next UTF-16 character from a UTF-8 string.
+ * Retrieve the next UTF-16 character or surrogate pair from a UTF-8 string.
+ * single byte, 2-byte and 3-byte UTF-8 sequences result in a single UTF-16
+ * character whereas 4-byte UTF-8 sequences result in a surrogate pair. Use
+ * GetLeadingUtf16Char and GetTrailingUtf16Char to process the return value
+ * of this function.
  *
  * Advances "*utf8_data_in" to the start of the next character.
  *
  * WARNING: If a string is corrupted by dropping a '\0' in the middle
- * of a 3-byte sequence, you can end up overrunning the buffer with
+ * of a multi byte sequence, you can end up overrunning the buffer with
  * reads (and possibly with the writes if the length was computed and
  * cached before the damage). For performance reasons, this function
  * assumes that the string being parsed is known to be valid (e.g., by
@@ -98,7 +102,19 @@ size_t ComputeModifiedUtf8Hash(const char* chars);
  * out of dex files or other internal translations, so the only real
  * risk comes from the JNI NewStringUTF call.
  */
-uint16_t GetUtf16FromUtf8(const char** utf8_data_in);
+uint32_t GetUtf16FromUtf8(const char** utf8_data_in);
+
+/**
+ * Gets the leading UTF-16 character from a surrogate pair, or the sole
+ * UTF-16 character from the return value of GetUtf16FromUtf8.
+ */
+ALWAYS_INLINE uint16_t GetLeadingUtf16Char(uint32_t maybe_pair);
+
+/**
+ * Gets the trailing UTF-16 character from a surrogate pair, or 0 otherwise
+ * from the return value of GetUtf16FromUtf8.
+ */
+ALWAYS_INLINE uint16_t GetTrailingUtf16Char(uint32_t maybe_pair);

 } // namespace art

```