Be more lenient with 4 byte UTF-8 sequences.
Accept 4 byte sequences and convert them into surrogate
pairs instead of expecting 2 separate 3 byte sequences
each encoding one half of a surrogate pair.
Note that in addition to supporting 4 byte sequences in
strings from JNI, we also tolerate them in dex files. This
is mainly for consistency, and there's no need to claim any
sort of official support.
bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
diff --git a/runtime/utf_test.cc b/runtime/utf_test.cc
new file mode 100644
index 0000000..8048bbd
--- /dev/null
+++ b/runtime/utf_test.cc
@@ -0,0 +1,113 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utf.h"
+
+#include "common_runtime_test.h"
+#include "utf-inl.h"
+
+namespace art {
+
+class UtfTest : public CommonRuntimeTest {};  // Plain fixture: adds nothing to CommonRuntimeTest.
+
+TEST_F(UtfTest, GetLeadingUtf16Char) {  // The leading char is expected in the low 16 bits.
+ EXPECT_EQ(0xffff, GetLeadingUtf16Char(0xeeeeffff));  // High half (0xeeee) must be ignored.
+}
+
+TEST_F(UtfTest, GetTrailingUtf16Char) {  // The trailing char is expected in the high 16 bits.
+ EXPECT_EQ(0xffff, GetTrailingUtf16Char(0xffffeeee));  // Low half (0xeeee) must be ignored.
+ EXPECT_EQ(0, GetTrailingUtf16Char(0x0000aaaa));  // Zero high half: no trailing char expected.
+}
+
+#define EXPECT_ARRAY_POSITION(expected, end, start) /* end - start == expected, in bytes */ \
+ EXPECT_EQ(static_cast<uintptr_t>(expected), \
+ reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start))  // No ';': the caller adds it.
+
+// A null-terminated test string with one each of a one, two, three and four byte UTF-8 sequence.
+static const uint8_t kAllSequences[] = {
+ 0x24,  // One byte: U+0024.
+ 0xc2, 0xa2,  // Two bytes: U+00A2.
+ 0xe2, 0x82, 0xac,  // Three bytes: U+20AC.
+ 0xf0, 0x9f, 0x8f, 0xa0,  // Four bytes: U+1F3E0, i.e. the surrogate pair 0xd83c 0xdfe0.
+ 0x00  // Null terminator.
+};
+
+// A null-terminated test string in which each half of the surrogate pair for
+// code point U+10400 is encoded as its own three byte UTF-8 sequence.
+static const uint8_t kSurrogateEncoding[] = {
+ 0xed, 0xa0, 0x81,  // Leading (high) surrogate 0xd801.
+ 0xed, 0xb0, 0x80,  // Trailing (low) surrogate 0xdc00.
+ 0x00  // Null terminator.
+};
+
+TEST_F(UtfTest, GetUtf16FromUtf8) {  // Decodes kAllSequences one sequence at a time.
+ const char* const start = reinterpret_cast<const char*>(kAllSequences);
+ const char* ptr = start;  // Cursor handed to GetUtf16FromUtf8, which is expected to advance it.
+ uint32_t pair = 0;  // Last decoded value; split with GetLeading/GetTrailingUtf16Char.
+
+ // Single byte sequence (0x24): one UTF-16 char, no trailing char, 1 byte consumed.
+ pair = GetUtf16FromUtf8(&ptr);
+ EXPECT_EQ(0x24, GetLeadingUtf16Char(pair));
+ EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+ EXPECT_ARRAY_POSITION(1, ptr, start);
+
+ // Two byte sequence (0xc2 0xa2): one UTF-16 char, cursor now at byte offset 3.
+ pair = GetUtf16FromUtf8(&ptr);
+ EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair));
+ EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+ EXPECT_ARRAY_POSITION(3, ptr, start);
+
+ // Three byte sequence (0xe2 0x82 0xac): one UTF-16 char, cursor now at byte offset 6.
+ pair = GetUtf16FromUtf8(&ptr);
+ EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair));
+ EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+ EXPECT_ARRAY_POSITION(6, ptr, start);
+
+ // Four byte sequence: expected to come back as a surrogate pair (leading + trailing char).
+ pair = GetUtf16FromUtf8(&ptr);
+ EXPECT_EQ(0xd83c, GetLeadingUtf16Char(pair));
+ EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair));
+ EXPECT_ARRAY_POSITION(10, ptr, start);
+
+ // Null terminator: expected to decode to 0 and still be consumed (cursor at offset 11).
+ pair = GetUtf16FromUtf8(&ptr);
+ EXPECT_EQ(0, GetLeadingUtf16Char(pair));
+ EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+ EXPECT_ARRAY_POSITION(11, ptr, start);
+}
+
+TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) {  // Surrogates must not be merged.
+ const char* const start = reinterpret_cast<const char *>(kSurrogateEncoding);
+ const char* ptr = start;  // Cursor handed to GetUtf16FromUtf8, which is expected to advance it.
+ uint32_t pair = 0;  // Last decoded value; split with GetLeading/GetTrailingUtf16Char.
+
+ pair = GetUtf16FromUtf8(&ptr);  // First three byte sequence: the leading surrogate alone.
+ EXPECT_EQ(0xd801, GetLeadingUtf16Char(pair));
+ EXPECT_EQ(0, GetTrailingUtf16Char(pair));  // No trailing char: the halves are not combined.
+ EXPECT_ARRAY_POSITION(3, ptr, start);
+
+ pair = GetUtf16FromUtf8(&ptr);  // Second three byte sequence: the trailing surrogate alone.
+ EXPECT_EQ(0xdc00, GetLeadingUtf16Char(pair));
+ EXPECT_EQ(0, GetTrailingUtf16Char(pair));
+ EXPECT_ARRAY_POSITION(6, ptr, start);
+}
+
+TEST_F(UtfTest, CountModifiedUtf8Chars) {  // Expected counts exclude the null terminator.
+ EXPECT_EQ(5u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kAllSequences)));  // 1 + 1 + 1 + 2: the four byte sequence counts twice.
+ EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding)));  // One per three byte surrogate sequence.
+}
+
+} // namespace art