Be more lenient with 4-byte UTF-8 sequences.
Accept 4-byte sequences and convert them into surrogate
pairs instead of expecting two separate 3-byte sequences,
each encoding one half of a surrogate pair.
Note that in addition to supporting 4-byte sequences in
strings from JNI, we also tolerate them in dex files. This
is mainly for consistency, and there's no need to claim any
sort of official support.
bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
diff --git a/runtime/mirror/object_test.cc b/runtime/mirror/object_test.cc
index fb42d28..9b345a6 100644
--- a/runtime/mirror/object_test.cc
+++ b/runtime/mirror/object_test.cc
@@ -67,7 +67,7 @@
ASSERT_TRUE(string->Equals(utf8_in) || (expected_utf16_length == 1 && strlen(utf8_in) == 0));
ASSERT_TRUE(string->Equals(StringPiece(utf8_in)) || (expected_utf16_length == 1 && strlen(utf8_in) == 0));
for (int32_t i = 0; i < expected_utf16_length; i++) {
- EXPECT_EQ(utf16_expected[i], string->CharAt(i));
+ EXPECT_EQ(utf16_expected[i], string->UncheckedCharAt(i));
}
EXPECT_EQ(expected_hash, string->GetHashCode());
}
@@ -424,6 +424,12 @@
AssertString(1, "\xe1\x88\xb4", "\x12\x34", 0x1234);
AssertString(1, "\xef\xbf\xbf", "\xff\xff", 0xffff);
AssertString(3, "h\xe1\x88\xb4i", "\x00\x68\x12\x34\x00\x69", (31 * ((31 * 0x68) + 0x1234)) + 0x69);
+
+ // Test four-byte characters.
+ AssertString(2, "\xf0\x9f\x8f\xa0", "\xd8\x3c\xdf\xe0", (31 * 0xd83c) + 0xdfe0);
+ AssertString(2, "\xf0\x9f\x9a\x80", "\xd8\x3d\xde\x80", (31 * 0xd83d) + 0xde80);
+ AssertString(4, "h\xf0\x9f\x9a\x80i", "\x00\x68\xd8\x3d\xde\x80\x00\x69",
+ (31 * (31 * (31 * 0x68 + 0xd83d) + 0xde80) + 0x69));
}
TEST_F(ObjectTest, StringEqualsUtf8) {
diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h
index 14d7de2..4a95519 100644
--- a/runtime/mirror/string-inl.h
+++ b/runtime/mirror/string-inl.h
@@ -33,6 +33,10 @@
return Class::ComputeClassSize(true, vtable_entries, 0, 1, 0, 1, 2);
}
+inline uint16_t String::UncheckedCharAt(int32_t index) {
+ return GetCharArray()->Get(index + GetOffset());
+}
+
inline CharArray* String::GetCharArray() {
return GetFieldObject<CharArray>(ValueOffset());
}
@@ -54,20 +58,6 @@
return Runtime::Current()->GetInternTable()->InternWeak(this);
}
-inline uint16_t String::CharAt(int32_t index) {
- // TODO: do we need this? Equals is the only caller, and could
- // bounds check itself.
- DCHECK_GE(count_, 0); // ensures the unsigned comparison is safe.
- if (UNLIKELY(static_cast<uint32_t>(index) >= static_cast<uint32_t>(count_))) {
- Thread* self = Thread::Current();
- ThrowLocation throw_location = self->GetCurrentLocationForThrow();
- self->ThrowNewExceptionF(throw_location, "Ljava/lang/StringIndexOutOfBoundsException;",
- "length=%i; index=%i", count_, index);
- return 0;
- }
- return GetCharArray()->Get(index + GetOffset());
-}
-
inline int32_t String::GetHashCode() {
int32_t result = GetField32(OFFSET_OF_OBJECT_MEMBER(String, hash_code_));
if (UNLIKELY(result == 0)) {
diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc
index e199d0e..e7c88c5 100644
--- a/runtime/mirror/string.cc
+++ b/runtime/mirror/string.cc
@@ -147,7 +147,7 @@
// Note: don't short circuit on hash code as we're presumably here as the
// hash code was already equal
for (int32_t i = 0; i < that->GetLength(); ++i) {
- if (this->CharAt(i) != that->CharAt(i)) {
+ if (this->UncheckedCharAt(i) != that->UncheckedCharAt(i)) {
return false;
}
}
@@ -160,7 +160,7 @@
return false;
} else {
for (int32_t i = 0; i < that_length; ++i) {
- if (this->CharAt(i) != that_chars[that_offset + i]) {
+ if (this->UncheckedCharAt(i) != that_chars[that_offset + i]) {
return false;
}
}
@@ -169,22 +169,52 @@
}
bool String::Equals(const char* modified_utf8) {
- for (int32_t i = 0; i < GetLength(); ++i) {
- uint16_t ch = GetUtf16FromUtf8(&modified_utf8);
- if (ch == '\0' || ch != CharAt(i)) {
+ const int32_t length = GetLength();
+ int32_t i = 0;
+ while (i < length) {
+ const uint32_t ch = GetUtf16FromUtf8(&modified_utf8);
+ if (ch == '\0') {
return false;
}
+
+ if (GetLeadingUtf16Char(ch) != UncheckedCharAt(i++)) {
+ return false;
+ }
+
+ const uint16_t trailing = GetTrailingUtf16Char(ch);
+ if (trailing != 0) {
+ if (i == length) {
+ return false;
+ }
+
+ if (UncheckedCharAt(i++) != trailing) {
+ return false;
+ }
+ }
}
return *modified_utf8 == '\0';
}
bool String::Equals(const StringPiece& modified_utf8) {
+ const int32_t length = GetLength();
const char* p = modified_utf8.data();
- for (int32_t i = 0; i < GetLength(); ++i) {
- uint16_t ch = GetUtf16FromUtf8(&p);
- if (ch != CharAt(i)) {
+ for (int32_t i = 0; i < length; ++i) {
+ uint32_t ch = GetUtf16FromUtf8(&p);
+
+ if (GetLeadingUtf16Char(ch) != UncheckedCharAt(i)) {
return false;
}
+
+ const uint16_t trailing = GetTrailingUtf16Char(ch);
+ if (trailing != 0) {
+ if (i == (length - 1)) {
+ return false;
+ }
+
+ if (UncheckedCharAt(++i) != trailing) {
+ return false;
+ }
+ }
}
return true;
}
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index 30b8aa3..6c22b9b 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -69,8 +69,6 @@
int32_t GetUtfLength() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
- uint16_t CharAt(int32_t index) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
String* Intern() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
static String* AllocFromUtf16(Thread* self,
@@ -86,9 +84,14 @@
const char* utf8_data_in)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ // TODO: This is only used in the interpreter to compare against
+ // entries from a dex file's constant pool (ArtField names). Should
+ // we unify this with Equals(const StringPiece&)?
bool Equals(const char* modified_utf8) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
- // TODO: do we need this overload? give it a more intention-revealing name.
+ // TODO: This is only used to compare DexCache.location with
+ // a dex_file's location (which is a std::string). Do we really
+ // need this in mirror::String just for that one usage?
bool Equals(const StringPiece& modified_utf8)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -127,6 +130,9 @@
static void VisitRoots(RootCallback* callback, void* arg)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ // TODO: Make this private. Outside of String itself, it's only used in ObjectTest at the moment.
+ uint16_t UncheckedCharAt(int32_t index) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
private:
void SetHashCode(int32_t new_hash_code) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
// Hash code is invariant so use non-transactional mode. Also disable check as we may run inside