Be more lenient with 4 byte UTF-8 sequences.
Accept 4 byte sequences and convert them into surrogate
pairs instead of expecting 2 separate 3 byte sequences
each encoding one half of a surrogate pair.
Note that in addition to supporting 4 byte sequences in
strings from JNI, we also tolerate them in dex files. This
is mainly for consistency, and there's no need to claim any
sort of official support.
bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 06d258d..24d96ba 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -144,6 +144,7 @@
runtime/reference_table_test.cc \
runtime/thread_pool_test.cc \
runtime/transaction_test.cc \
+ runtime/utf_test.cc \
runtime/utils_test.cc \
runtime/verifier/method_verifier_test.cc \
runtime/verifier/reg_type_test.cc \
diff --git a/runtime/check_jni.cc b/runtime/check_jni.cc
index e45d3a3..6ec0949 100644
--- a/runtime/check_jni.cc
+++ b/runtime/check_jni.cc
@@ -1095,6 +1095,8 @@
return true;
}
+ // Checks whether |bytes| is valid modified UTF-8. We also accept 4 byte UTF
+ // sequences in place of encoded surrogate pairs.
static uint8_t CheckUtfBytes(const char* bytes, const char** errorKind) {
while (*bytes != '\0') {
uint8_t utf8 = *(bytes++);
@@ -1114,14 +1116,26 @@
case 0x09:
case 0x0a:
case 0x0b:
- case 0x0f:
- /*
- * Bit pattern 10xx or 1111, which are illegal start bytes.
- * Note: 1111 is valid for normal UTF-8, but not the
- * Modified UTF-8 used here.
- */
+ // Bit patterns 10xx, which are illegal start bytes.
*errorKind = "start";
return utf8;
+ case 0x0f:
+ // Bit pattern 1111, which might be the start of a 4 byte sequence.
+ if ((utf8 & 0x08) == 0) {
+ // Bit pattern 1111 0xxx, which is the start of a 4 byte sequence.
+ // We consume one continuation byte here, and fall through to consume two more.
+ utf8 = *(bytes++);
+ if ((utf8 & 0xc0) != 0x80) {
+ *errorKind = "continuation";
+ return utf8;
+ }
+ } else {
+ *errorKind = "start";
+ return utf8;
+ }
+
+ // Fall through to the cases below to consume two more continuation bytes.
+ FALLTHROUGH_INTENDED;
case 0x0e:
// Bit pattern 1110, so there are two additional bytes.
utf8 = *(bytes++);
@@ -1129,7 +1143,9 @@
*errorKind = "continuation";
return utf8;
}
- FALLTHROUGH_INTENDED; // Fall-through to take care of the final byte.
+
+ // Fall through to consume one more continuation byte.
+ FALLTHROUGH_INTENDED;
case 0x0c:
case 0x0d:
// Bit pattern 110x, so there is one additional byte.
diff --git a/runtime/jni_internal_test.cc b/runtime/jni_internal_test.cc
index 906aa4c..1048214 100644
--- a/runtime/jni_internal_test.cc
+++ b/runtime/jni_internal_test.cc
@@ -1351,7 +1351,36 @@
EXPECT_EQ(5, env_->GetStringLength(s));
EXPECT_EQ(5, env_->GetStringUTFLength(s));
- // TODO: check some non-ASCII strings.
+ // Encoded surrogate pair.
+ s = env_->NewStringUTF("\xed\xa0\x81\xed\xb0\x80");
+ EXPECT_NE(s, nullptr);
+ EXPECT_EQ(2, env_->GetStringLength(s));
+ // Note that this uses 2 x 3 byte UTF sequences, one
+ // for each half of the surrogate pair.
+ EXPECT_EQ(6, env_->GetStringUTFLength(s));
+ const char* chars = env_->GetStringUTFChars(s, nullptr);
+ EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80", chars);
+ env_->ReleaseStringUTFChars(s, chars);
+
+ // 4 byte UTF sequence appended to an encoded surrogate pair.
+ s = env_->NewStringUTF("\xed\xa0\x81\xed\xb0\x80 \xf0\x9f\x8f\xa0");
+ EXPECT_NE(s, nullptr);
+ EXPECT_EQ(5, env_->GetStringLength(s));
+ EXPECT_EQ(13, env_->GetStringUTFLength(s));
+ chars = env_->GetStringUTFChars(s, nullptr);
+ // The 4 byte sequence {0xf0, 0x9f, 0x8f, 0xa0} is converted into a surrogate
+ // pair {0xd83c, 0xdfe0} which is then converted into two three byte
+ // sequences {0xed, 0xa0, 0xbc} and {0xed, 0xbf, 0xa0}, one for each half of
+ // the surrogate pair.
+ EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80 \xed\xa0\xbc\xed\xbf\xa0", chars);
+ env_->ReleaseStringUTFChars(s, chars);
+
+ // A string with 1, 2, 3 and 4 byte UTF sequences with spaces
+ // between them
+ s = env_->NewStringUTF("\x24 \xc2\xa2 \xe2\x82\xac \xf0\x9f\x8f\xa0");
+ EXPECT_NE(s, nullptr);
+ EXPECT_EQ(8, env_->GetStringLength(s));
+ EXPECT_EQ(15, env_->GetStringUTFLength(s));
}
TEST_F(JniInternalTest, NewString) {
diff --git a/runtime/mirror/object_test.cc b/runtime/mirror/object_test.cc
index fb42d28..9b345a6 100644
--- a/runtime/mirror/object_test.cc
+++ b/runtime/mirror/object_test.cc
@@ -67,7 +67,7 @@
ASSERT_TRUE(string->Equals(utf8_in) || (expected_utf16_length == 1 && strlen(utf8_in) == 0));
ASSERT_TRUE(string->Equals(StringPiece(utf8_in)) || (expected_utf16_length == 1 && strlen(utf8_in) == 0));
for (int32_t i = 0; i < expected_utf16_length; i++) {
- EXPECT_EQ(utf16_expected[i], string->CharAt(i));
+ EXPECT_EQ(utf16_expected[i], string->UncheckedCharAt(i));
}
EXPECT_EQ(expected_hash, string->GetHashCode());
}
@@ -424,6 +424,12 @@
AssertString(1, "\xe1\x88\xb4", "\x12\x34", 0x1234);
AssertString(1, "\xef\xbf\xbf", "\xff\xff", 0xffff);
AssertString(3, "h\xe1\x88\xb4i", "\x00\x68\x12\x34\x00\x69", (31 * ((31 * 0x68) + 0x1234)) + 0x69);
+
+ // Test four-byte characters.
+ AssertString(2, "\xf0\x9f\x8f\xa0", "\xd8\x3c\xdf\xe0", (31 * 0xd83c) + 0xdfe0);
+ AssertString(2, "\xf0\x9f\x9a\x80", "\xd8\x3d\xde\x80", (31 * 0xd83d) + 0xde80);
+ AssertString(4, "h\xf0\x9f\x9a\x80i", "\x00\x68\xd8\x3d\xde\x80\x00\x69",
+ (31 * (31 * (31 * 0x68 + 0xd83d) + 0xde80) + 0x69));
}
TEST_F(ObjectTest, StringEqualsUtf8) {
diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h
index 14d7de2..4a95519 100644
--- a/runtime/mirror/string-inl.h
+++ b/runtime/mirror/string-inl.h
@@ -33,6 +33,10 @@
return Class::ComputeClassSize(true, vtable_entries, 0, 1, 0, 1, 2);
}
+inline uint16_t String::UncheckedCharAt(int32_t index) {
+ return GetCharArray()->Get(index + GetOffset());
+}
+
inline CharArray* String::GetCharArray() {
return GetFieldObject<CharArray>(ValueOffset());
}
@@ -54,20 +58,6 @@
return Runtime::Current()->GetInternTable()->InternWeak(this);
}
-inline uint16_t String::CharAt(int32_t index) {
- // TODO: do we need this? Equals is the only caller, and could
- // bounds check itself.
- DCHECK_GE(count_, 0); // ensures the unsigned comparison is safe.
- if (UNLIKELY(static_cast<uint32_t>(index) >= static_cast<uint32_t>(count_))) {
- Thread* self = Thread::Current();
- ThrowLocation throw_location = self->GetCurrentLocationForThrow();
- self->ThrowNewExceptionF(throw_location, "Ljava/lang/StringIndexOutOfBoundsException;",
- "length=%i; index=%i", count_, index);
- return 0;
- }
- return GetCharArray()->Get(index + GetOffset());
-}
-
inline int32_t String::GetHashCode() {
int32_t result = GetField32(OFFSET_OF_OBJECT_MEMBER(String, hash_code_));
if (UNLIKELY(result == 0)) {
diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc
index e199d0e..e7c88c5 100644
--- a/runtime/mirror/string.cc
+++ b/runtime/mirror/string.cc
@@ -147,7 +147,7 @@
// Note: don't short circuit on hash code as we're presumably here as the
// hash code was already equal
for (int32_t i = 0; i < that->GetLength(); ++i) {
- if (this->CharAt(i) != that->CharAt(i)) {
+ if (this->UncheckedCharAt(i) != that->UncheckedCharAt(i)) {
return false;
}
}
@@ -160,7 +160,7 @@
return false;
} else {
for (int32_t i = 0; i < that_length; ++i) {
- if (this->CharAt(i) != that_chars[that_offset + i]) {
+ if (this->UncheckedCharAt(i) != that_chars[that_offset + i]) {
return false;
}
}
@@ -169,22 +169,52 @@
}
bool String::Equals(const char* modified_utf8) {
- for (int32_t i = 0; i < GetLength(); ++i) {
- uint16_t ch = GetUtf16FromUtf8(&modified_utf8);
- if (ch == '\0' || ch != CharAt(i)) {
+ const int32_t length = GetLength();
+ int32_t i = 0;
+ while (i < length) {
+ const uint32_t ch = GetUtf16FromUtf8(&modified_utf8);
+ if (ch == '\0') {
return false;
}
+
+ if (GetLeadingUtf16Char(ch) != UncheckedCharAt(i++)) {
+ return false;
+ }
+
+ const uint16_t trailing = GetTrailingUtf16Char(ch);
+ if (trailing != 0) {
+ if (i == length) {
+ return false;
+ }
+
+ if (UncheckedCharAt(i++) != trailing) {
+ return false;
+ }
+ }
}
return *modified_utf8 == '\0';
}
bool String::Equals(const StringPiece& modified_utf8) {
+ const int32_t length = GetLength();
const char* p = modified_utf8.data();
- for (int32_t i = 0; i < GetLength(); ++i) {
- uint16_t ch = GetUtf16FromUtf8(&p);
- if (ch != CharAt(i)) {
+ for (int32_t i = 0; i < length; ++i) {
+ uint32_t ch = GetUtf16FromUtf8(&p);
+
+ if (GetLeadingUtf16Char(ch) != UncheckedCharAt(i)) {
return false;
}
+
+ const uint16_t trailing = GetTrailingUtf16Char(ch);
+ if (trailing != 0) {
+ if (i == (length - 1)) {
+ return false;
+ }
+
+ if (UncheckedCharAt(++i) != trailing) {
+ return false;
+ }
+ }
}
return true;
}
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index 30b8aa3..6c22b9b 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -69,8 +69,6 @@
int32_t GetUtfLength() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
- uint16_t CharAt(int32_t index) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
String* Intern() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
static String* AllocFromUtf16(Thread* self,
@@ -86,9 +84,14 @@
const char* utf8_data_in)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ // TODO: This is only used in the interpreter to compare against
+ // entries from a dex file's constant pool (ArtField names). Should
+ // we unify this with Equals(const StringPiece&)?
bool Equals(const char* modified_utf8) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
- // TODO: do we need this overload? give it a more intention-revealing name.
+ // TODO: This is only used to compare DexCache.location with
+ // a dex_file's location (which is an std::string). Do we really
+ // need this in mirror::String just for that one usage ?
bool Equals(const StringPiece& modified_utf8)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -127,6 +130,9 @@
static void VisitRoots(RootCallback* callback, void* arg)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ // TODO: Make this private. It's only used on ObjectTest at the moment.
+ uint16_t UncheckedCharAt(int32_t index) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
private:
void SetHashCode(int32_t new_hash_code) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
// Hash code is invariant so use non-transactional mode. Also disable check as we may run inside
diff --git a/runtime/utf-inl.h b/runtime/utf-inl.h
index 1373d17..b2d6765 100644
--- a/runtime/utf-inl.h
+++ b/runtime/utf-inl.h
@@ -21,26 +21,57 @@
namespace art {
-inline uint16_t GetUtf16FromUtf8(const char** utf8_data_in) {
- uint8_t one = *(*utf8_data_in)++;
+inline uint16_t GetTrailingUtf16Char(uint32_t maybe_pair) {
+ return static_cast<uint16_t>(maybe_pair >> 16);
+}
+
+inline uint16_t GetLeadingUtf16Char(uint32_t maybe_pair) {
+ return static_cast<uint16_t>(maybe_pair & 0x0000FFFF);
+}
+
+inline uint32_t GetUtf16FromUtf8(const char** utf8_data_in) {
+ const uint8_t one = *(*utf8_data_in)++;
if ((one & 0x80) == 0) {
// one-byte encoding
return one;
}
- // two- or three-byte encoding
- uint8_t two = *(*utf8_data_in)++;
+
+ const uint8_t two = *(*utf8_data_in)++;
if ((one & 0x20) == 0) {
// two-byte encoding
return ((one & 0x1f) << 6) | (two & 0x3f);
}
- // three-byte encoding
- uint8_t three = *(*utf8_data_in)++;
- return ((one & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
+
+ const uint8_t three = *(*utf8_data_in)++;
+ if ((one & 0x10) == 0) {
+ return ((one & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
+ }
+
+ // Four byte encodings need special handling. We'll have
+ // to convert them into a surrogate pair.
+ const uint8_t four = *(*utf8_data_in)++;
+
+ // Since this is a 4 byte UTF-8 sequence, it will lie between
+ // U+10000 and U+1FFFFF.
+ //
+ // TODO: What do we do about values in (U+10FFFF, U+1FFFFF) ? The
+ // spec says they're invalid but nobody appears to check for them.
+ const uint32_t code_point = ((one & 0x0f) << 18) | ((two & 0x3f) << 12)
+ | ((three & 0x3f) << 6) | (four & 0x3f);
+
+ uint32_t surrogate_pair = 0;
+ // Step two: Write out the high (leading) surrogate to the bottom 16 bits
+ // of the 32 bit type.
+ surrogate_pair |= ((code_point >> 10) + 0xd7c0) & 0xffff;
+ // Step three : Write out the low (trailing) surrogate to the top 16 bits.
+ surrogate_pair |= ((code_point & 0x03ff) + 0xdc00) << 16;
+
+ return surrogate_pair;
}
inline int CompareModifiedUtf8ToModifiedUtf8AsUtf16CodePointValues(const char* utf8_1,
const char* utf8_2) {
- uint16_t c1, c2;
+ uint32_t c1, c2;
do {
c1 = *utf8_1;
c2 = *utf8_2;
@@ -50,50 +81,17 @@
} else if (c2 == 0) {
return 1;
}
- // Assume 1-byte value and handle all cases first.
- utf8_1++;
- utf8_2++;
- if ((c1 & 0x80) == 0) {
- if (c1 == c2) {
- // Matching 1-byte values.
- continue;
- } else {
- // Non-matching values.
- if ((c2 & 0x80) == 0) {
- // 1-byte value, do nothing.
- } else if ((c2 & 0x20) == 0) {
- // 2-byte value.
- c2 = ((c2 & 0x1f) << 6) | (*utf8_2 & 0x3f);
- } else {
- // 3-byte value.
- c2 = ((c2 & 0x0f) << 12) | ((utf8_2[0] & 0x3f) << 6) | (utf8_2[1] & 0x3f);
- }
- return static_cast<int>(c1) - static_cast<int>(c2);
- }
- }
- // Non-matching or multi-byte values.
- if ((c1 & 0x20) == 0) {
- // 2-byte value.
- c1 = ((c1 & 0x1f) << 6) | (*utf8_1 & 0x3f);
- utf8_1++;
- } else {
- // 3-byte value.
- c1 = ((c1 & 0x0f) << 12) | ((utf8_1[0] & 0x3f) << 6) | (utf8_1[1] & 0x3f);
- utf8_1 += 2;
- }
- if ((c2 & 0x80) == 0) {
- // 1-byte value, do nothing.
- } else if ((c2 & 0x20) == 0) {
- // 2-byte value.
- c2 = ((c2 & 0x1f) << 6) | (*utf8_2 & 0x3f);
- utf8_2++;
- } else {
- // 3-byte value.
- c2 = ((c2 & 0x0f) << 12) | ((utf8_2[0] & 0x3f) << 6) | (utf8_2[1] & 0x3f);
- utf8_2 += 2;
- }
+
+ c1 = GetUtf16FromUtf8(&utf8_1);
+ c2 = GetUtf16FromUtf8(&utf8_2);
} while (c1 == c2);
- return static_cast<int>(c1) - static_cast<int>(c2);
+
+ const uint32_t leading_surrogate_diff = GetLeadingUtf16Char(c1) - GetLeadingUtf16Char(c2);
+ if (leading_surrogate_diff != 0) {
+ return static_cast<int>(leading_surrogate_diff);
+ }
+
+ return GetTrailingUtf16Char(c1) - GetTrailingUtf16Char(c2);
}
} // namespace art
diff --git a/runtime/utf.cc b/runtime/utf.cc
index 7ff296b..39c8d15 100644
--- a/runtime/utf.cc
+++ b/runtime/utf.cc
@@ -38,15 +38,30 @@
// two-byte encoding
continue;
}
- // three-byte encoding
utf8++;
+ if ((ic & 0x10) == 0) {
+ // three-byte encoding
+ continue;
+ }
+
+ // four-byte encoding: needs to be converted into a surrogate
+ // pair.
+ utf8++;
+ len++;
}
return len;
}
void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
while (*utf8_data_in != '\0') {
- *utf16_data_out++ = GetUtf16FromUtf8(&utf8_data_in);
+ const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
+ const uint16_t leading = GetLeadingUtf16Char(ch);
+ const uint16_t trailing = GetTrailingUtf16Char(ch);
+
+ *utf16_data_out++ = leading;
+ if (trailing != 0) {
+ *utf16_data_out++ = trailing;
+ }
}
}
@@ -102,12 +117,29 @@
return 1;
}
- int c1 = GetUtf16FromUtf8(&utf8);
- int c2 = *utf16++;
- --utf16_length;
+ const uint32_t pair = GetUtf16FromUtf8(&utf8);
- if (c1 != c2) {
- return c1 > c2 ? 1 : -1;
+ // First compare the leading utf16 char.
+ const uint16_t lhs = GetLeadingUtf16Char(pair);
+ const uint16_t rhs = *utf16++;
+ --utf16_length;
+ if (lhs != rhs) {
+ return lhs > rhs ? 1 : -1;
+ }
+
+ // Then compare the trailing utf16 char. First check if there
+ // are any characters left to consume.
+ const uint16_t lhs2 = GetTrailingUtf16Char(pair);
+ if (lhs2 != 0) {
+ if (utf16_length == 0) {
+ return 1;
+ }
+
+ const uint16_t rhs2 = *utf16++;
+ --utf16_length;
+ if (lhs2 != rhs2) {
+ return lhs2 > rhs2 ? 1 : -1;
+ }
}
}
}
diff --git a/runtime/utf.h b/runtime/utf.h
index 3ee07fe..dd38afa 100644
--- a/runtime/utf.h
+++ b/runtime/utf.h
@@ -85,12 +85,16 @@
size_t ComputeModifiedUtf8Hash(const char* chars);
/*
- * Retrieve the next UTF-16 character from a UTF-8 string.
+ * Retrieve the next UTF-16 character or surrogate pair from a UTF-8 string.
+ * Single byte, 2-byte and 3-byte UTF-8 sequences result in a single UTF-16
+ * character whereas 4-byte UTF-8 sequences result in a surrogate pair. Use
+ * GetLeadingUtf16Char and GetTrailingUtf16Char to process the return value
+ * of this function.
*
* Advances "*utf8_data_in" to the start of the next character.
*
* WARNING: If a string is corrupted by dropping a '\0' in the middle
- * of a 3-byte sequence, you can end up overrunning the buffer with
+ * of a multi byte sequence, you can end up overrunning the buffer with
* reads (and possibly with the writes if the length was computed and
* cached before the damage). For performance reasons, this function
* assumes that the string being parsed is known to be valid (e.g., by
@@ -98,7 +102,19 @@
* out of dex files or other internal translations, so the only real
* risk comes from the JNI NewStringUTF call.
*/
-uint16_t GetUtf16FromUtf8(const char** utf8_data_in);
+uint32_t GetUtf16FromUtf8(const char** utf8_data_in);
+
+/**
+ * Gets the leading UTF-16 character from a surrogate pair, or the sole
+ * UTF-16 character from the return value of GetUtf16FromUtf8.
+ */
+ALWAYS_INLINE uint16_t GetLeadingUtf16Char(uint32_t maybe_pair);
+
+/**
+ * Gets the trailing UTF-16 character of a surrogate pair returned by
+ * GetUtf16FromUtf8, or 0 if the return value is not a surrogate pair.
+ */
+ALWAYS_INLINE uint16_t GetTrailingUtf16Char(uint32_t maybe_pair);
} // namespace art
diff --git a/runtime/utf_test.cc b/runtime/utf_test.cc
new file mode 100644
index 0000000..8048bbd
--- /dev/null
+++ b/runtime/utf_test.cc
@@ -0,0 +1,113 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utf.h"
+
+#include "common_runtime_test.h"
+#include "utf-inl.h"
+
+namespace art {
+
+class UtfTest : public CommonRuntimeTest {};
+
+TEST_F(UtfTest, GetLeadingUtf16Char) {
+ EXPECT_EQ(0xffff, GetLeadingUtf16Char(0xeeeeffff));
+}
+
+TEST_F(UtfTest, GetTrailingUtf16Char) {
+ EXPECT_EQ(0xffff, GetTrailingUtf16Char(0xffffeeee));
+ EXPECT_EQ(0, GetTrailingUtf16Char(0x0000aaaa));
+}
+
+#define EXPECT_ARRAY_POSITION(expected, end, start) \
+ EXPECT_EQ(static_cast<uintptr_t>(expected), \
+ reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start));
+
+// A test string containing one, two, three and four byte UTF-8 sequences.
+static const uint8_t kAllSequences[] = {
+ 0x24,
+ 0xc2, 0xa2,
+ 0xe2, 0x82, 0xac,
+ 0xf0, 0x9f, 0x8f, 0xa0,
+ 0x00
+};
+
+// A test string that contains a UTF-8 encoding of a surrogate pair
+// (code point = U+10400)
+static const uint8_t kSurrogateEncoding[] = {
+ 0xed, 0xa0, 0x81,
+ 0xed, 0xb0, 0x80,
+ 0x00
+};
+
+TEST_F(UtfTest, GetUtf16FromUtf8) {
+ const char* const start = reinterpret_cast<const char*>(kAllSequences);
+ const char* ptr = start;
+ uint32_t pair = 0;
+
+ // Single byte sequence.
+ pair = GetUtf16FromUtf8(&ptr);
+ EXPECT_EQ(0x24, GetLeadingUtf16Char(pair));
+ EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+ EXPECT_ARRAY_POSITION(1, ptr, start);
+
+ // Two byte sequence
+ pair = GetUtf16FromUtf8(&ptr);
+ EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair));
+ EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+ EXPECT_ARRAY_POSITION(3, ptr, start);
+
+ // Three byte sequence
+ pair = GetUtf16FromUtf8(&ptr);
+ EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair));
+ EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+ EXPECT_ARRAY_POSITION(6, ptr, start);
+
+ // Four byte sequence
+ pair = GetUtf16FromUtf8(&ptr);
+ EXPECT_EQ(0xd83c, GetLeadingUtf16Char(pair));
+ EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair));
+ EXPECT_ARRAY_POSITION(10, ptr, start);
+
+ // Null terminator
+ pair = GetUtf16FromUtf8(&ptr);
+ EXPECT_EQ(0, GetLeadingUtf16Char(pair));
+ EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+ EXPECT_ARRAY_POSITION(11, ptr, start);
+}
+
+TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) {
+ const char* const start = reinterpret_cast<const char *>(kSurrogateEncoding);
+ const char* ptr = start;
+ uint32_t pair = 0;
+
+ pair = GetUtf16FromUtf8(&ptr);
+ EXPECT_EQ(0xd801, GetLeadingUtf16Char(pair));
+ EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+ EXPECT_ARRAY_POSITION(3, ptr, start);
+
+ pair = GetUtf16FromUtf8(&ptr);
+ EXPECT_EQ(0xdc00, GetLeadingUtf16Char(pair));
+ EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+ EXPECT_ARRAY_POSITION(6, ptr, start);
+}
+
+TEST_F(UtfTest, CountModifiedUtf8Chars) {
+ EXPECT_EQ(5u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kAllSequences)));
+ EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding)));
+}
+
+} // namespace art
diff --git a/runtime/utils.cc b/runtime/utils.cc
index af16d7e..3ec9561 100644
--- a/runtime/utils.cc
+++ b/runtime/utils.cc
@@ -625,7 +625,7 @@
const char* p = utf;
size_t char_count = CountModifiedUtf8Chars(p);
for (size_t i = 0; i < char_count; ++i) {
- uint16_t ch = GetUtf16FromUtf8(&p);
+ uint32_t ch = GetUtf16FromUtf8(&p);
if (ch == '\\') {
result += "\\\\";
} else if (ch == '\n') {
@@ -634,10 +634,20 @@
result += "\\r";
} else if (ch == '\t') {
result += "\\t";
- } else if (NeedsEscaping(ch)) {
- StringAppendF(&result, "\\u%04x", ch);
} else {
- result += ch;
+ const uint16_t leading = GetLeadingUtf16Char(ch);
+
+ if (NeedsEscaping(leading)) {
+ StringAppendF(&result, "\\u%04x", leading);
+ } else {
+ result += leading;
+ }
+
+ const uint32_t trailing = GetTrailingUtf16Char(ch);
+ if (trailing != 0) {
+ // The trailing (low) surrogate is always written escaped.
+ StringAppendF(&result, "\\u%04x", trailing);
+ }
}
}
result += '"';
@@ -650,7 +660,7 @@
size_t char_count = CountModifiedUtf8Chars(s.c_str());
const char* cp = &s[0];
for (size_t i = 0; i < char_count; ++i) {
- uint16_t ch = GetUtf16FromUtf8(&cp);
+ uint32_t ch = GetUtf16FromUtf8(&cp);
if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')) {
result.push_back(ch);
} else if (ch == '.' || ch == '/') {
@@ -662,7 +672,13 @@
} else if (ch == '[') {
result += "_3";
} else {
- StringAppendF(&result, "_0%04x", ch);
+ const uint16_t leading = GetLeadingUtf16Char(ch);
+ const uint32_t trailing = GetTrailingUtf16Char(ch);
+
+ StringAppendF(&result, "_0%04x", leading);
+ if (trailing != 0) {
+ StringAppendF(&result, "_0%04x", trailing);
+ }
}
}
return result;
@@ -757,41 +773,50 @@
* document.
*/
- uint16_t utf16 = GetUtf16FromUtf8(pUtf8Ptr);
+ const uint32_t pair = GetUtf16FromUtf8(pUtf8Ptr);
- // Perform follow-up tests based on the high 8 bits.
- switch (utf16 >> 8) {
- case 0x00:
- // It's only valid if it's above the ISO-8859-1 high space (0xa0).
- return (utf16 > 0x00a0);
- case 0xd8:
- case 0xd9:
- case 0xda:
- case 0xdb:
- // It's a leading surrogate. Check to see that a trailing
- // surrogate follows.
- utf16 = GetUtf16FromUtf8(pUtf8Ptr);
- return (utf16 >= 0xdc00) && (utf16 <= 0xdfff);
- case 0xdc:
- case 0xdd:
- case 0xde:
- case 0xdf:
- // It's a trailing surrogate, which is not valid at this point.
- return false;
- case 0x20:
- case 0xff:
- // It's in the range that has spaces, controls, and specials.
- switch (utf16 & 0xfff8) {
- case 0x2000:
- case 0x2008:
- case 0x2028:
- case 0xfff0:
- case 0xfff8:
+ const uint16_t leading = GetLeadingUtf16Char(pair);
+ const uint32_t trailing = GetTrailingUtf16Char(pair);
+
+ if (trailing == 0) {
+ // Perform follow-up tests based on the high 8 bits of the
+ // sole UTF-16 code unit (no trailing surrogate was decoded).
+ switch (leading >> 8) {
+ case 0x00:
+ // It's only valid if it's above the ISO-8859-1 high space (0xa0).
+ return (leading > 0x00a0);
+ case 0xd8:
+ case 0xd9:
+ case 0xda:
+ case 0xdb:
+ // It looks like a leading surrogate but we didn't find a trailing
+ // surrogate if we're here.
return false;
+ case 0xdc:
+ case 0xdd:
+ case 0xde:
+ case 0xdf:
+ // It's a trailing surrogate, which is not valid at this point.
+ return false;
+ case 0x20:
+ case 0xff:
+ // It's in the range that has spaces, controls, and specials.
+ switch (leading & 0xfff8) {
+ case 0x2000:
+ case 0x2008:
+ case 0x2028:
+ case 0xfff0:
+ case 0xfff8:
+ return false;
+ }
+ break;
}
- break;
+
+ return true;
}
- return true;
+
+ // We have a surrogate pair. Check that trailing surrogate is well formed.
+ return (trailing >= 0xdc00 && trailing <= 0xdfff);
}
/* Return whether the pointed-at modified-UTF-8 encoded character is