diff options
author | 2021-07-12 01:25:23 +0000 | |
---|---|---|
committer | 2021-07-12 02:58:46 +0000 | |
commit | 1b9d442dc906d0158300c5178683f417fa59b026 (patch) | |
tree | dd52852cd7a16af7f5480c5f6c9213079a319dc8 | |
parent | 52eebc756c96fbf71ad77b6d30bdff8a67723569 (diff) |
Revert "Do not create 4-byte sequences in `ConvertUtf16ToModifiedUtf8()`"
This reverts commit e0a4f373dc4a738a2f26965a67d31239cbbd4f6a.
Reason for revert: DroidMonitor-triggered revert due to breakage https://android-build.googleplex.com/builds/tests/view?invocationId=I44000009917256156&testResultId=TR23027520914949881, bug b/193363191
Bug: 193363191
Change-Id: I2e7fe6de3f63864b58729d2c76e9ab5c95aa7983
-rw-r--r-- | libdexfile/dex/utf.cc | 43 | ||||
-rw-r--r-- | libdexfile/dex/utf.h | 14 | ||||
-rw-r--r-- | libdexfile/dex/utf_test.cc | 90 | ||||
-rw-r--r-- | runtime/jni/jni_internal.cc | 2 | ||||
-rw-r--r-- | runtime/jni/jni_internal_test.cc | 23 | ||||
-rw-r--r-- | runtime/mirror/string-inl.h | 2 | ||||
-rw-r--r-- | test/181-proxy-non-bmp/expected-stderr.txt | 0 | ||||
-rw-r--r-- | test/181-proxy-non-bmp/expected-stdout.txt | 1 | ||||
-rw-r--r-- | test/181-proxy-non-bmp/info.txt | 6 | ||||
-rw-r--r-- | test/181-proxy-non-bmp/src/Main.java | 23 | ||||
-rw-r--r-- | test/181-proxy-non-bmp/src/pkg𐀀/PackageTest.java | 49 | ||||
-rw-r--r-- | test/906-iterate-heap/iterate_heap.cc | 2 | ||||
-rw-r--r-- | test/913-heaps/heaps.cc | 2 | ||||
-rw-r--r-- | test/ti-agent/ti_utf.h | 37 | ||||
-rwxr-xr-x | test/utils/regen-test-files | 3 |
15 files changed, 159 insertions, 138 deletions
diff --git a/libdexfile/dex/utf.cc b/libdexfile/dex/utf.cc index 76fb49f7e7..bfc704d4a6 100644 --- a/libdexfile/dex/utf.cc +++ b/libdexfile/dex/utf.cc @@ -121,10 +121,8 @@ void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, size_t out_chars, } } -void ConvertUtf16ToModifiedUtf8(char* utf8_out, - size_t byte_count, - const uint16_t* utf16_in, - size_t char_count) { +void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count, + const uint16_t* utf16_in, size_t char_count) { if (LIKELY(byte_count == char_count)) { // Common case where all characters are ASCII. const uint16_t *utf16_end = utf16_in + char_count; @@ -140,6 +138,28 @@ void ConvertUtf16ToModifiedUtf8(char* utf8_out, if (ch > 0 && ch <= 0x7f) { *utf8_out++ = ch; } else { + // Char_count == 0 here implies we've encountered an unpaired + // surrogate and we have no choice but to encode it as 3-byte UTF + // sequence. Note that unpaired surrogates can occur as a part of + // "normal" operation. + if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) { + const uint16_t ch2 = *utf16_in; + + // Check if the other half of the pair is within the expected + // range. If it isn't, we will have to emit both "halves" as + // separate 3 byte sequences. + if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { + utf16_in++; + char_count--; + const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00; + *utf8_out++ = (code_point >> 18) | 0xf0; + *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80; + *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80; + *utf8_out++ = (code_point & 0x3f) | 0x80; + continue; + } + } + if (ch > 0x07ff) { // Three byte encoding. *utf8_out++ = (ch >> 12) | 0xe0; @@ -220,7 +240,7 @@ int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t } } -size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t char_count) { +size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) { size_t result = 0; const uint16_t *end = chars + char_count; while (chars < end) { @@ -233,6 +253,19 @@ size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t char_count) { result += 2; continue; } + if (ch >= 0xd800 && ch < 0xdc00) { + if (chars < end) { + const uint16_t ch2 = *chars; + // If we find a properly paired surrogate, we emit it as a 4 byte + // UTF sequence. If we find an unpaired leading or trailing surrogate, + // we emit it as a 3 byte sequence like would have done earlier. + if (ch2 >= 0xdc00 && ch2 < 0xe000) { + chars++; + result += 4; + continue; + } + } + } result += 3; } return result; diff --git a/libdexfile/dex/utf.h b/libdexfile/dex/utf.h index 55d025df42..e3dc7f9474 100644 --- a/libdexfile/dex/utf.h +++ b/libdexfile/dex/utf.h @@ -43,7 +43,7 @@ size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count); * Returns the number of modified UTF-8 bytes needed to represent the given * UTF-16 string. */ -size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t char_count); +size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count); /* * Convert from Modified UTF-8 to UTF-16. @@ -67,14 +67,12 @@ int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t /* * Convert from UTF-16 to Modified UTF-8. Note that the output is _not_ - * NUL-terminated. You probably need to call CountModifiedUtf8Bytes before - * calling this anyway, so if you want a NUL-terminated string, you know - * where to put the NUL byte. + * NUL-terminated. You probably need to call CountUtf8Bytes before calling + * this anyway, so if you want a NUL-terminated string, you know where to + * put the NUL byte. */ -void ConvertUtf16ToModifiedUtf8(char* utf8_out, - size_t byte_count, - const uint16_t* utf16_in, - size_t char_count); +void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count, + const uint16_t* utf16_in, size_t char_count); /* * The java.lang.String hashCode() algorithm. diff --git a/libdexfile/dex/utf_test.cc b/libdexfile/dex/utf_test.cc index fc24d545e7..919259e4d3 100644 --- a/libdexfile/dex/utf_test.cc +++ b/libdexfile/dex/utf_test.cc @@ -117,7 +117,7 @@ TEST_F(UtfTest, CountModifiedUtf8Chars) { static void AssertConversion(const std::vector<uint16_t>& input, const std::vector<uint8_t>& expected) { - ASSERT_EQ(expected.size(), CountModifiedUtf8Bytes(&input[0], input.size())); + ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size())); std::vector<uint8_t> output(expected.size()); ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(), @@ -126,8 +126,8 @@ static void AssertConversion(const std::vector<uint16_t>& input, } TEST_F(UtfTest, CountAndConvertUtf8Bytes) { - // Surrogate pairs will be converted into two three-byte sequences. - AssertConversion({ 0xd801, 0xdc00 }, { 0xed, 0xa0, 0x81, 0xed, 0xb0, 0x80 }); + // Surrogate pairs will be converted into 4 byte sequences. + AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 }); // Three byte encodings that are below & above the leading surrogate // range respectively. @@ -143,12 +143,12 @@ TEST_F(UtfTest, CountAndConvertUtf8Bytes) { AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f }); AssertConversion({ - 0xd802, 0xdc02, // Surrogate pair - three byte encodings. + 0xd802, 0xdc02, // Surrogate pair. 0xdef0, 0xdcff, // Three byte encodings. 0x0101, 0x0000, // Two byte encodings. 'p' , 'p' // One byte encoding. }, { - 0xed, 0xa0, 0x82, 0xed, 0xb0, 0x82, + 0xf0, 0x90, 0xa0, 0x82, 0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf, 0xc4, 0x81, 0xc0, 0x80, 0x70, 0x70 @@ -229,12 +229,31 @@ size_t CountModifiedUtf8Chars_reference(const char* utf8) { return len; } -static size_t CountModifiedUtf8Bytes_reference(const uint16_t* chars, size_t char_count) { +static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) { size_t result = 0; while (char_count--) { const uint16_t ch = *chars++; if (ch > 0 && ch <= 0x7f) { ++result; + } else if (ch >= 0xd800 && ch <= 0xdbff) { + if (char_count > 0) { + const uint16_t ch2 = *chars; + // If we find a properly paired surrogate, we emit it as a 4 byte + // UTF sequence. If we find an unpaired leading or trailing surrogate, + // we emit it as a 3 byte sequence like would have done earlier. + if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { + chars++; + char_count--; + + result += 4; + } else { + result += 3; + } + } else { + // This implies we found an unpaired trailing surrogate at the end + // of a string. + result += 3; + } } else if (ch > 0x7ff) { result += 3; } else { @@ -251,6 +270,28 @@ static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t* if (ch > 0 && ch <= 0x7f) { *utf8_out++ = ch; } else { + // Char_count == 0 here implies we've encountered an unpaired + // surrogate and we have no choice but to encode it as 3-byte UTF + // sequence. Note that unpaired surrogates can occur as a part of + // "normal" operation. + if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) { + const uint16_t ch2 = *utf16_in; + + // Check if the other half of the pair is within the expected + // range. If it isn't, we will have to emit both "halves" as + // separate 3 byte sequences. + if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { + utf16_in++; + char_count--; + const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00; + *utf8_out++ = (code_point >> 18) | 0xf0; + *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80; + *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80; + *utf8_out++ = (code_point & 0x3f) | 0x80; + continue; + } + } + if (ch > 0x07ff) { // Three byte encoding. *utf8_out++ = (ch >> 12) | 0xe0; @@ -272,40 +313,39 @@ static void codePointToSurrogatePair(uint32_t code_point, uint16_t &first, uint1 second = (code_point & 0x03ff) + 0xdc00; } -static void testConversions(uint16_t *buf, size_t char_count) { +static void testConversions(uint16_t *buf, int char_count) { + char bytes_test[8] = { 0 }, bytes_reference[8] = { 0 }; + uint16_t out_buf_test[4] = { 0 }, out_buf_reference[4] = { 0 }; + int byte_count_test, byte_count_reference; + int char_count_test, char_count_reference; + // Calculate the number of utf-8 bytes for the utf-16 chars. - size_t byte_count_reference = CountModifiedUtf8Bytes_reference(buf, char_count); - size_t byte_count_test = CountModifiedUtf8Bytes(buf, char_count); - ASSERT_EQ(byte_count_reference, byte_count_test); + byte_count_reference = CountUtf8Bytes_reference(buf, char_count); + byte_count_test = CountUtf8Bytes(buf, char_count); + EXPECT_EQ(byte_count_reference, byte_count_test); // Convert the utf-16 string to utf-8 bytes. - char bytes_test[8], bytes_reference[9]; - ASSERT_LT(byte_count_reference, arraysize(bytes_reference)); ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count); - ASSERT_LE(byte_count_test, arraysize(bytes_test)); ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count); - for (size_t i = 0; i < byte_count_test; ++i) { - ASSERT_EQ(bytes_reference[i], bytes_test[i]); + for (int i = 0; i < byte_count_test; ++i) { + EXPECT_EQ(bytes_reference[i], bytes_test[i]); } // Calculate the number of utf-16 chars from the utf-8 bytes. bytes_reference[byte_count_reference] = 0; // Reference function needs null termination. - size_t char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference); - size_t char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test); - ASSERT_EQ(char_count, char_count_reference); - ASSERT_EQ(char_count, char_count_test); + char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference); + char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test); + EXPECT_EQ(char_count, char_count_reference); + EXPECT_EQ(char_count, char_count_test); // Convert the utf-8 bytes back to utf-16 chars. // Does not need copied _reference version of the function because the original // function with the old API is retained for debug/testing code. - uint16_t out_buf_test[4], out_buf_reference[4]; - ASSERT_LE(char_count_reference, arraysize(out_buf_reference)); ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference); - ASSERT_LE(char_count_test, arraysize(out_buf_test)); ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test); - for (size_t i = 0; i < char_count_test; ++i) { - ASSERT_EQ(buf[i], out_buf_reference[i]); - ASSERT_EQ(buf[i], out_buf_test[i]); + for (int i = 0; i < char_count_test; ++i) { + EXPECT_EQ(buf[i], out_buf_reference[i]); + EXPECT_EQ(buf[i], out_buf_test[i]); } } diff --git a/runtime/jni/jni_internal.cc b/runtime/jni/jni_internal.cc index cac6eb8e6a..45fbf003be 100644 --- a/runtime/jni/jni_internal.cc +++ b/runtime/jni/jni_internal.cc @@ -2022,7 +2022,7 @@ class JNI { buf[length] = '\0'; } else { const jchar* chars = s->GetValue(); - size_t bytes = CountModifiedUtf8Bytes(chars + start, length); + size_t bytes = CountUtf8Bytes(chars + start, length); ConvertUtf16ToModifiedUtf8(buf, bytes, chars + start, length); buf[bytes] = '\0'; } diff --git a/runtime/jni/jni_internal_test.cc b/runtime/jni/jni_internal_test.cc index 4c675f4ba2..ed8439729e 100644 --- a/runtime/jni/jni_internal_test.cc +++ b/runtime/jni/jni_internal_test.cc @@ -1510,13 +1510,13 @@ TEST_F(JniInternalTest, NewStringUTF) { EXPECT_NE(s, nullptr); EXPECT_EQ(2, env_->GetStringLength(s)); - // The surrogate pair gets encoded into two 3-byte sequences... - EXPECT_EQ(6, env_->GetStringUTFLength(s)); + // The surrogate pair gets encoded into a 4 byte UTF sequence.. + EXPECT_EQ(4, env_->GetStringUTFLength(s)); const char* chars = env_->GetStringUTFChars(s, nullptr); - EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80", chars); + EXPECT_STREQ("\xf0\x90\x90\x80", chars); env_->ReleaseStringUTFChars(s, chars); - // ... and it is stored as the two surrogates in the utf-16 representation. + // .. but is stored as is in the utf-16 representation. const jchar* jchars = env_->GetStringChars(s, nullptr); EXPECT_EQ(0xd801, jchars[0]); EXPECT_EQ(0xdc00, jchars[1]); @@ -1527,9 +1527,7 @@ TEST_F(JniInternalTest, NewStringUTF) { EXPECT_NE(s, nullptr); // The 4 byte sequence {0xf0, 0x9f, 0x8f, 0xa0} is converted into a surrogate - // pair {0xd83c, 0xdfe0} which is then converted into a two three byte - // sequences {0xed 0xa0, 0xbc} and {0xed, 0xbf, 0xa0}, one for each half of - // the surrogate pair. + // pair {0xd83c, 0xdfe0}. EXPECT_EQ(5, env_->GetStringLength(s)); jchars = env_->GetStringChars(s, nullptr); // The first surrogate pair, encoded as such in the input. @@ -1540,9 +1538,9 @@ TEST_F(JniInternalTest, NewStringUTF) { EXPECT_EQ(0xdfe0, jchars[4]); env_->ReleaseStringChars(s, jchars); - EXPECT_EQ(13, env_->GetStringUTFLength(s)); + EXPECT_EQ(9, env_->GetStringUTFLength(s)); chars = env_->GetStringUTFChars(s, nullptr); - EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80 \xed\xa0\xbc\xed\xbf\xa0", chars); + EXPECT_STREQ("\xf0\x90\x90\x80 \xf0\x9f\x8f\xa0", chars); env_->ReleaseStringUTFChars(s, chars); // A string with 1, 2, 3 and 4 byte UTF sequences with spaces @@ -1550,7 +1548,7 @@ TEST_F(JniInternalTest, NewStringUTF) { s = env_->NewStringUTF("\x24 \xc2\xa2 \xe2\x82\xac \xf0\x9f\x8f\xa0"); EXPECT_NE(s, nullptr); EXPECT_EQ(8, env_->GetStringLength(s)); - EXPECT_EQ(15, env_->GetStringUTFLength(s)); + EXPECT_EQ(13, env_->GetStringUTFLength(s)); } TEST_F(JniInternalTest, NewStringUTF_Validation) { @@ -1859,13 +1857,12 @@ TEST_F(JniInternalTest, GetStringCritical_ReleaseStringCritical) { env_->ReleaseStringCritical(s, chars); if (mirror::kUseStringCompression) { - // is_copy_16 has to be JNI_FALSE because "\xed\xa0\x81\xed\xb0\x80" is incompressible + // is_copy has to be JNI_FALSE because "\xed\xa0\x81\xed\xb0\x80" is incompressible jboolean is_copy_16 = JNI_TRUE; jstring s_16 = env_->NewStringUTF("\xed\xa0\x81\xed\xb0\x80"); chars = env_->GetStringCritical(s_16, &is_copy_16); - EXPECT_EQ(JNI_FALSE, is_copy_16); EXPECT_EQ(2, env_->GetStringLength(s_16)); - EXPECT_EQ(6, env_->GetStringUTFLength(s_16)); + EXPECT_EQ(4, env_->GetStringUTFLength(s_16)); env_->ReleaseStringCritical(s_16, chars); } } diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h index 11eff0f761..dd280364e1 100644 --- a/runtime/mirror/string-inl.h +++ b/runtime/mirror/string-inl.h @@ -88,7 +88,7 @@ inline int32_t String::GetUtfLength() { if (IsCompressed()) { return GetLength(); } else { - return CountModifiedUtf8Bytes(GetValue(), GetLength()); + return CountUtf8Bytes(GetValue(), GetLength()); } } diff --git a/test/181-proxy-non-bmp/expected-stderr.txt b/test/181-proxy-non-bmp/expected-stderr.txt deleted file mode 100644 index e69de29bb2..0000000000 --- a/test/181-proxy-non-bmp/expected-stderr.txt +++ /dev/null diff --git a/test/181-proxy-non-bmp/expected-stdout.txt b/test/181-proxy-non-bmp/expected-stdout.txt deleted file mode 100644 index 24fce9b270..0000000000 --- a/test/181-proxy-non-bmp/expected-stdout.txt +++ /dev/null @@ -1 +0,0 @@ -Invoke public abstract void pkg𐀀.PackageTestInterface.interfaceMethod() diff --git a/test/181-proxy-non-bmp/info.txt b/test/181-proxy-non-bmp/info.txt deleted file mode 100644 index a13a8f9ce9..0000000000 --- a/test/181-proxy-non-bmp/info.txt +++ /dev/null @@ -1,6 +0,0 @@ -Regression test for bad handling of package name containing a character outside -the BMP plane. For a proxy class with a non-public interface in such a package, -this caused the package name comparison to fail because the dex file encoding -had two three-byte sequences while the descriptor was encoded with a four-byte -sequence, leading to IAE when calling a proxy method via the interface. -Bug: 192935764 diff --git a/test/181-proxy-non-bmp/src/Main.java b/test/181-proxy-non-bmp/src/Main.java deleted file mode 100644 index e93859978e..0000000000 --- a/test/181-proxy-non-bmp/src/Main.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (C) 2021 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import pkg𐀀.PackageTest; - -public class Main { - public static void main(String[] args) { - PackageTest.main(); - } -} diff --git a/test/181-proxy-non-bmp/src/pkg𐀀/PackageTest.java b/test/181-proxy-non-bmp/src/pkg𐀀/PackageTest.java deleted file mode 100644 index e65d92f4e1..0000000000 --- a/test/181-proxy-non-bmp/src/pkg𐀀/PackageTest.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (C) 2021 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package pkg𐀀; - -import java.lang.reflect.InvocationHandler; -import java.lang.reflect.Constructor; -import java.lang.reflect.Method; -import java.lang.reflect.Proxy; - -public class PackageTest { - public static void main() { - InvocationHandler handler = new PackageInvocationHandler(); - Class<?> proxyClass = Proxy.getProxyClass( - PackageTestInterface.class.getClassLoader(), PackageTestInterface.class); - try { - Constructor<?> ctor = proxyClass.getConstructor(InvocationHandler.class); - Object proxy = ctor.newInstance(handler); - PackageTestInterface asInterface = (PackageTestInterface) proxy; - asInterface.interfaceMethod(); - } catch (Exception e) { - System.out.println("failed: " + e); - } - } -} - -interface PackageTestInterface { - public void interfaceMethod(); -} - -class PackageInvocationHandler implements InvocationHandler { - public Object invoke(Object proxy, Method method, Object[] args) { - System.out.println("Invoke " + method); - return null; - } -} diff --git a/test/906-iterate-heap/iterate_heap.cc b/test/906-iterate-heap/iterate_heap.cc index cc941d86b1..521f9a6c72 100644 --- a/test/906-iterate-heap/iterate_heap.cc +++ b/test/906-iterate-heap/iterate_heap.cc @@ -198,7 +198,7 @@ extern "C" JNIEXPORT jstring JNICALL Java_art_Test906_iterateThroughHeapString( void* user_data) { FindStringCallbacks* p = reinterpret_cast<FindStringCallbacks*>(user_data); if (*tag_ptr == p->tag_to_find) { - size_t utf_byte_count = ti::CountModifiedUtf8Bytes(value, value_length); + size_t utf_byte_count = ti::CountUtf8Bytes(value, value_length); std::unique_ptr<char[]> mod_utf(new char[utf_byte_count + 1]); memset(mod_utf.get(), 0, utf_byte_count + 1); ti::ConvertUtf16ToModifiedUtf8(mod_utf.get(), utf_byte_count, value, value_length); diff --git a/test/913-heaps/heaps.cc b/test/913-heaps/heaps.cc index 10355c9c1b..28a737de0f 100644 --- a/test/913-heaps/heaps.cc +++ b/test/913-heaps/heaps.cc @@ -586,7 +586,7 @@ extern "C" JNIEXPORT jobjectArray JNICALL Java_art_Test913_followReferencesStrin void* user_data) { FindStringCallbacks* p = reinterpret_cast<FindStringCallbacks*>(user_data); if (*tag_ptr != 0) { - size_t utf_byte_count = ti::CountModifiedUtf8Bytes(value, value_length); + size_t utf_byte_count = ti::CountUtf8Bytes(value, value_length); std::unique_ptr<char[]> mod_utf(new char[utf_byte_count + 1]); memset(mod_utf.get(), 0, utf_byte_count + 1); ti::ConvertUtf16ToModifiedUtf8(mod_utf.get(), utf_byte_count, value, value_length); diff --git a/test/ti-agent/ti_utf.h b/test/ti-agent/ti_utf.h index d646c0725f..341e1066c3 100644 --- a/test/ti-agent/ti_utf.h +++ b/test/ti-agent/ti_utf.h @@ -123,6 +123,28 @@ inline void ConvertUtf16ToModifiedUtf8(char* utf8_out, if (ch > 0 && ch <= 0x7f) { *utf8_out++ = ch; } else { + // Char_count == 0 here implies we've encountered an unpaired + // surrogate and we have no choice but to encode it as 3-byte UTF + // sequence. Note that unpaired surrogates can occur as a part of + // "normal" operation. + if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) { + const uint16_t ch2 = *utf16_in; + + // Check if the other half of the pair is within the expected + // range. If it isn't, we will have to emit both "halves" as + // separate 3 byte sequences. + if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { + utf16_in++; + char_count--; + const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00; + *utf8_out++ = (code_point >> 18) | 0xf0; + *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80; + *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80; + *utf8_out++ = (code_point & 0x3f) | 0x80; + continue; + } + } + if (ch > 0x07ff) { // Three byte encoding. *utf8_out++ = (ch >> 12) | 0xe0; @@ -137,7 +159,7 @@ inline void ConvertUtf16ToModifiedUtf8(char* utf8_out, } } -inline size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t char_count) { +inline size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) { size_t result = 0; const uint16_t *end = chars + char_count; while (chars < end) { @@ -150,6 +172,19 @@ inline size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t char_count) { result += 2; continue; } + if (ch >= 0xd800 && ch < 0xdc00) { + if (chars < end) { + const uint16_t ch2 = *chars; + // If we find a properly paired surrogate, we emit it as a 4 byte + // UTF sequence. If we find an unpaired leading or trailing surrogate, + // we emit it as a 3 byte sequence like would have done earlier. + if (ch2 >= 0xdc00 && ch2 < 0xe000) { + chars++; + result += 4; + continue; + } + } + } result += 3; } return result; diff --git a/test/utils/regen-test-files b/test/utils/regen-test-files index 12d85b834d..f9cb056817 100755 --- a/test/utils/regen-test-files +++ b/test/utils/regen-test-files @@ -555,9 +555,6 @@ class Generator: # Ignore test with a copy of `sun.misc.Unsafe`. if os.path.isfile(os.path.join(run_test_path, "src", "sun", "misc", "Unsafe.java")): return False - # Ignore test with a non-ascii package name `pkg𐀀`. b/193141629 - if os.path.isdir(os.path.join(run_test_path, "src", "pkg𐀀")): - return False # Ignore tests with Hidden API specs. if os.path.isfile(os.path.join(run_test_path, "hiddenapi-flags.csv")): return False |