diff options
author | 2015-10-28 15:06:12 +0300 | |
---|---|---|
committer | 2015-11-24 13:36:30 +0300 | |
commit | 1646d7a22e43a1fb25452ead47a4073e63d7f391 (patch) | |
tree | afa5614a1d262e83b3da57930a5266715e8acd97 | |
parent | 0f0d5f370dca8bbdb97c61ea30e1b0476ee7749a (diff) |
Optimize some commonly used utf8 functions by:
- using counted loops instead of searching for the terminating null. In
the important cases the caller already knows the length, so the API is
changed to pass it in. The old API version is kept as well to avoid
extensive changes to non-critical debug and test code.
- ensuring the common cases are at the start of if/then/else chains.
Usually 99+% of characters are ASCII, even in mixed strings.
- for the "convert" functions, using a specialized loop when both the
utf8 and utf16 lengths are passed in and are equal, because that means
the entire string is ASCII. The compiler might then unroll or even
vectorize this loop.
The functions improved are (tested on a Nexus 5 with a 44-character
ASCII string):
CountModifiedUtf8Chars : 20% faster
ConvertUtf16ToModifiedUtf8: 80% faster
ConvertModifiedUtf8ToUtf16: 200% faster
Also, for completeness, CountUtf8Bytes has been cleaned up a little, but
its speed is unchanged. Unlike CountModifiedUtf8Chars, it was already
passed the length rather than searching for the terminating null.
Change-Id: I1c9b7dea3eda869fc9f5f6b4dd6be8cdd5bc3ac0
-rw-r--r-- | runtime/jni_internal.cc | 5 | ||||
-rw-r--r-- | runtime/mirror/string.cc | 15 | ||||
-rw-r--r-- | runtime/mirror/string.h | 4 | ||||
-rw-r--r-- | runtime/utf.cc | 116 | ||||
-rw-r--r-- | runtime/utf.h | 6 | ||||
-rw-r--r-- | runtime/utf_test.cc | 201 |
6 files changed, 302 insertions, 45 deletions
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc index 415109fb06..5e3fa199e5 100644 --- a/runtime/jni_internal.cc +++ b/runtime/jni_internal.cc @@ -1689,7 +1689,8 @@ class JNI { } else { CHECK_NON_NULL_MEMCPY_ARGUMENT(length, buf); const jchar* chars = s->GetValue(); - ConvertUtf16ToModifiedUtf8(buf, chars + start, length); + size_t bytes = CountUtf8Bytes(chars + start, length); + ConvertUtf16ToModifiedUtf8(buf, bytes, chars + start, length); } } @@ -1772,7 +1773,7 @@ class JNI { char* bytes = new char[byte_count + 1]; CHECK(bytes != nullptr); // bionic aborts anyway. const uint16_t* chars = s->GetValue(); - ConvertUtf16ToModifiedUtf8(bytes, chars, s->GetLength()); + ConvertUtf16ToModifiedUtf8(bytes, byte_count, chars, s->GetLength()); bytes[byte_count] = '\0'; return bytes; } diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc index be869d4e6a..33aca0304c 100644 --- a/runtime/mirror/string.cc +++ b/runtime/mirror/string.cc @@ -109,12 +109,17 @@ String* String::AllocFromUtf16(Thread* self, int32_t utf16_length, const uint16_ String* String::AllocFromModifiedUtf8(Thread* self, const char* utf) { DCHECK(utf != nullptr); - size_t char_count = CountModifiedUtf8Chars(utf); - return AllocFromModifiedUtf8(self, char_count, utf); + size_t byte_count = strlen(utf); + size_t char_count = CountModifiedUtf8Chars(utf, byte_count); + return AllocFromModifiedUtf8(self, char_count, utf, byte_count); +} + +String* String::AllocFromModifiedUtf8(Thread* self, int32_t utf16_length, const char* utf8_data_in) { + return AllocFromModifiedUtf8(self, utf16_length, utf8_data_in, strlen(utf8_data_in)); } String* String::AllocFromModifiedUtf8(Thread* self, int32_t utf16_length, - const char* utf8_data_in) { + const char* utf8_data_in, int32_t utf8_length) { gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator(); SetStringCountVisitor visitor(utf16_length); String* string = Alloc<true>(self, utf16_length, allocator_type, visitor); @@ 
-122,7 +127,7 @@ String* String::AllocFromModifiedUtf8(Thread* self, int32_t utf16_length, return nullptr; } uint16_t* utf16_data_out = string->GetValue(); - ConvertModifiedUtf8ToUtf16(utf16_data_out, utf8_data_in); + ConvertModifiedUtf8ToUtf16(utf16_data_out, utf16_length, utf8_data_in, utf8_length); return string; } @@ -217,7 +222,7 @@ std::string String::ToModifiedUtf8() { const uint16_t* chars = GetValue(); size_t byte_count = GetUtfLength(); std::string result(byte_count, static_cast<char>(0)); - ConvertUtf16ToModifiedUtf8(&result[0], chars, GetLength()); + ConvertUtf16ToModifiedUtf8(&result[0], byte_count, chars, GetLength()); return result; } diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h index 80ebd2cf0f..e2cfb8d5ad 100644 --- a/runtime/mirror/string.h +++ b/runtime/mirror/string.h @@ -116,6 +116,10 @@ class MANAGED String FINAL : public Object { static String* AllocFromModifiedUtf8(Thread* self, const char* utf) SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_); + static String* AllocFromModifiedUtf8(Thread* self, int32_t utf16_length, + const char* utf8_data_in, int32_t utf8_length) + SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_); + static String* AllocFromModifiedUtf8(Thread* self, int32_t utf16_length, const char* utf8_data_in) SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_); diff --git a/runtime/utf.cc b/runtime/utf.cc index 10600e2153..5a116980c9 100644 --- a/runtime/utf.cc +++ b/runtime/utf.cc @@ -23,28 +23,50 @@ namespace art { +// This is used only from debugger and test code. size_t CountModifiedUtf8Chars(const char* utf8) { + return CountModifiedUtf8Chars(utf8, strlen(utf8)); +} + +/* + * This does not validate UTF8 rules (nor did older code). But it gets the right answer + * for valid UTF-8 and that's fine because it's used only to size a buffer for later + * conversion. 
+ * + * Modified UTF-8 consists of a series of bytes up to 21 bit Unicode code points as follows: + * U+0001 - U+007F 0xxxxxxx + * U+0080 - U+07FF 110xxxxx 10xxxxxx + * U+0800 - U+FFFF 1110xxxx 10xxxxxx 10xxxxxx + * U+10000 - U+1FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * U+0000 is encoded using the 2nd form to avoid nulls inside strings (this differs from + * standard UTF-8). + * The four byte encoding converts to two utf16 characters. + */ +size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count) { + DCHECK_LE(byte_count, strlen(utf8)); size_t len = 0; - int ic; - while ((ic = *utf8++) != '\0') { + const char* end = utf8 + byte_count; + for (; utf8 < end; ++utf8) { + int ic = *utf8; len++; - if ((ic & 0x80) == 0) { - // one-byte encoding + if (LIKELY((ic & 0x80) == 0)) { + // One-byte encoding. continue; } - // two- or three-byte encoding + // Two- or three-byte encoding. utf8++; if ((ic & 0x20) == 0) { - // two-byte encoding + // Two-byte encoding. continue; } utf8++; if ((ic & 0x10) == 0) { - // three-byte encoding + // Three-byte encoding. continue; } - // four-byte encoding: needs to be converted into a surrogate + // Four-byte encoding: needs to be converted into a surrogate // pair. utf8++; len++; @@ -52,6 +74,7 @@ size_t CountModifiedUtf8Chars(const char* utf8) { return len; } +// This is used only from debugger and test code. 
void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) { while (*utf8_data_in != '\0') { const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in); @@ -65,13 +88,53 @@ void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_ } } -void ConvertUtf16ToModifiedUtf8(char* utf8_out, const uint16_t* utf16_in, size_t char_count) { +void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, size_t out_chars, + const char* utf8_data_in, size_t in_bytes) { + const char *in_start = utf8_data_in; + const char *in_end = utf8_data_in + in_bytes; + uint16_t *out_p = utf16_data_out; + + if (LIKELY(out_chars == in_bytes)) { + // Common case where all characters are ASCII. + for (const char *p = in_start; p < in_end;) { + // Safe even if char is signed because ASCII characters always have + // the high bit cleared. + *out_p++ = dchecked_integral_cast<uint16_t>(*p++); + } + return; + } + + // String contains non-ASCII characters. + for (const char *p = in_start; p < in_end;) { + const uint32_t ch = GetUtf16FromUtf8(&p); + const uint16_t leading = GetLeadingUtf16Char(ch); + const uint16_t trailing = GetTrailingUtf16Char(ch); + + *out_p++ = leading; + if (trailing != 0) { + *out_p++ = trailing; + } + } +} + +void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count, + const uint16_t* utf16_in, size_t char_count) { + if (LIKELY(byte_count == char_count)) { + // Common case where all characters are ASCII. + const uint16_t *utf16_end = utf16_in + char_count; + for (const uint16_t *p = utf16_in; p < utf16_end;) { + *utf8_out++ = dchecked_integral_cast<char>(*p++); + } + return; + } + + // String contains non-ASCII characters. while (char_count--) { const uint16_t ch = *utf16_in++; if (ch > 0 && ch <= 0x7f) { *utf8_out++ = ch; } else { - // char_count == 0 here implies we've encountered an unpaired + // Char_count == 0 here implies we've encountered an unpaired // surrogate and we have no choice but to encode it as 3-byte UTF // sequence. 
Note that unpaired surrogates can occur as a part of // "normal" operation. @@ -161,34 +224,31 @@ int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) { size_t result = 0; - while (char_count--) { + const uint16_t *end = chars + char_count; + while (chars < end) { const uint16_t ch = *chars++; - if (ch > 0 && ch <= 0x7f) { - ++result; - } else if (ch >= 0xd800 && ch <= 0xdbff) { - if (char_count > 0) { + if (LIKELY(ch != 0 && ch < 0x80)) { + result++; + continue; + } + if (ch < 0x800) { + result += 2; + continue; + } + if (ch >= 0xd800 && ch < 0xdc00) { + if (chars < end) { const uint16_t ch2 = *chars; // If we find a properly paired surrogate, we emit it as a 4 byte // UTF sequence. If we find an unpaired leading or trailing surrogate, // we emit it as a 3 byte sequence like would have done earlier. - if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { + if (ch2 >= 0xdc00 && ch2 < 0xe000) { chars++; - char_count--; - result += 4; - } else { - result += 3; + continue; } - } else { - // This implies we found an unpaired trailing surrogate at the end - // of a string. - result += 3; } - } else if (ch > 0x7ff) { - result += 3; - } else { - result += 2; } + result += 3; } return result; } diff --git a/runtime/utf.h b/runtime/utf.h index 1193d29c7d..03158c492d 100644 --- a/runtime/utf.h +++ b/runtime/utf.h @@ -40,6 +40,7 @@ namespace mirror { * Returns the number of UTF-16 characters in the given modified UTF-8 string. */ size_t CountModifiedUtf8Chars(const char* utf8); +size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count); /* * Returns the number of modified UTF-8 bytes needed to represent the given @@ -51,6 +52,8 @@ size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count); * Convert from Modified UTF-8 to UTF-16. 
*/ void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_out, const char* utf8_in); +void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_out, size_t out_chars, + const char* utf8_in, size_t in_bytes); /* * Compare two modified UTF-8 strings as UTF-16 code point values in a non-locale sensitive manner @@ -71,7 +74,8 @@ int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t * this anyway, so if you want a NUL-terminated string, you know where to * put the NUL byte. */ -void ConvertUtf16ToModifiedUtf8(char* utf8_out, const uint16_t* utf16_in, size_t char_count); +void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count, + const uint16_t* utf16_in, size_t char_count); /* * The java.lang.String hashCode() algorithm. diff --git a/runtime/utf_test.cc b/runtime/utf_test.cc index 94a6ea57e2..64e4eb798b 100644 --- a/runtime/utf_test.cc +++ b/runtime/utf_test.cc @@ -48,7 +48,7 @@ static const uint8_t kAllSequences[] = { }; // A test string that contains a UTF-8 encoding of a surrogate pair -// (code point = U+10400) +// (code point = U+10400). static const uint8_t kSurrogateEncoding[] = { 0xed, 0xa0, 0x81, 0xed, 0xb0, 0x80, @@ -66,13 +66,13 @@ TEST_F(UtfTest, GetUtf16FromUtf8) { EXPECT_EQ(0, GetTrailingUtf16Char(pair)); EXPECT_ARRAY_POSITION(1, ptr, start); - // Two byte sequence + // Two byte sequence. pair = GetUtf16FromUtf8(&ptr); EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair)); EXPECT_EQ(0, GetTrailingUtf16Char(pair)); EXPECT_ARRAY_POSITION(3, ptr, start); - // Three byte sequence + // Three byte sequence. pair = GetUtf16FromUtf8(&ptr); EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair)); EXPECT_EQ(0, GetTrailingUtf16Char(pair)); @@ -84,7 +84,7 @@ TEST_F(UtfTest, GetUtf16FromUtf8) { EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair)); EXPECT_ARRAY_POSITION(10, ptr, start); - // Null terminator + // Null terminator. 
pair = GetUtf16FromUtf8(&ptr); EXPECT_EQ(0, GetLeadingUtf16Char(pair)); EXPECT_EQ(0, GetTrailingUtf16Char(pair)); @@ -117,7 +117,8 @@ static void AssertConversion(const std::vector<uint16_t> input, ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size())); std::vector<uint8_t> output(expected.size()); - ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), &input[0], input.size()); + ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(), + &input[0], input.size()); EXPECT_EQ(expected, output); } @@ -139,10 +140,10 @@ TEST_F(UtfTest, CountAndConvertUtf8Bytes) { AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f }); AssertConversion({ - 0xd802, 0xdc02, // Surrogate pair - 0xdef0, 0xdcff, // Three byte encodings - 0x0101, 0x0000, // Two byte encodings - 'p' , 'p' // One byte encoding + 0xd802, 0xdc02, // Surrogate pair. + 0xdef0, 0xdcff, // Three byte encodings. + 0x0101, 0x0000, // Two byte encodings. + 'p' , 'p' // One byte encoding. }, { 0xf0, 0x90, 0xa0, 0x82, 0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf, @@ -160,4 +161,186 @@ TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) { AssertConversion({ 'h', 0xdc00, 0xdc00, 'e' }, { 'h', 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80, 'e' }); } +// Old versions of functions, here to compare answers with optimized versions. + +size_t CountModifiedUtf8Chars_reference(const char* utf8) { + size_t len = 0; + int ic; + while ((ic = *utf8++) != '\0') { + len++; + if ((ic & 0x80) == 0) { + // one-byte encoding + continue; + } + // two- or three-byte encoding + utf8++; + if ((ic & 0x20) == 0) { + // two-byte encoding + continue; + } + utf8++; + if ((ic & 0x10) == 0) { + // three-byte encoding + continue; + } + + // four-byte encoding: needs to be converted into a surrogate + // pair. 
+ utf8++; + len++; + } + return len; +} + +static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) { + size_t result = 0; + while (char_count--) { + const uint16_t ch = *chars++; + if (ch > 0 && ch <= 0x7f) { + ++result; + } else if (ch >= 0xd800 && ch <= 0xdbff) { + if (char_count > 0) { + const uint16_t ch2 = *chars; + // If we find a properly paired surrogate, we emit it as a 4 byte + // UTF sequence. If we find an unpaired leading or trailing surrogate, + // we emit it as a 3 byte sequence like would have done earlier. + if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { + chars++; + char_count--; + + result += 4; + } else { + result += 3; + } + } else { + // This implies we found an unpaired trailing surrogate at the end + // of a string. + result += 3; + } + } else if (ch > 0x7ff) { + result += 3; + } else { + result += 2; + } + } + return result; +} + +static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t* utf16_in, + size_t char_count) { + while (char_count--) { + const uint16_t ch = *utf16_in++; + if (ch > 0 && ch <= 0x7f) { + *utf8_out++ = ch; + } else { + // Char_count == 0 here implies we've encountered an unpaired + // surrogate and we have no choice but to encode it as 3-byte UTF + // sequence. Note that unpaired surrogates can occur as a part of + // "normal" operation. + if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) { + const uint16_t ch2 = *utf16_in; + + // Check if the other half of the pair is within the expected + // range. If it isn't, we will have to emit both "halves" as + // separate 3 byte sequences. + if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { + utf16_in++; + char_count--; + const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00; + *utf8_out++ = (code_point >> 18) | 0xf0; + *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80; + *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80; + *utf8_out++ = (code_point & 0x3f) | 0x80; + continue; + } + } + + if (ch > 0x07ff) { + // Three byte encoding. 
+ *utf8_out++ = (ch >> 12) | 0xe0; + *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80; + *utf8_out++ = (ch & 0x3f) | 0x80; + } else /*(ch > 0x7f || ch == 0)*/ { + // Two byte encoding. + *utf8_out++ = (ch >> 6) | 0xc0; + *utf8_out++ = (ch & 0x3f) | 0x80; + } + } + } +} + +// Exhaustive test of converting a single code point to UTF-16, then UTF-8, and back again. + +static void codePointToSurrogatePair(uint32_t code_point, uint16_t &first, uint16_t &second) { + first = (code_point >> 10) + 0xd7c0; + second = (code_point & 0x03ff) + 0xdc00; +} + +static void testConversions(uint16_t *buf, int char_count) { + char bytes_test[8], bytes_reference[8]; + uint16_t out_buf_test[4], out_buf_reference[4]; + int byte_count_test, byte_count_reference; + int char_count_test, char_count_reference; + + // Calculate the number of utf-8 bytes for the utf-16 chars. + byte_count_reference = CountUtf8Bytes_reference(buf, char_count); + byte_count_test = CountUtf8Bytes(buf, char_count); + EXPECT_EQ(byte_count_reference, byte_count_test); + + // Convert the utf-16 string to utf-8 bytes. + ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count); + ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count); + for (int i = 0; i < byte_count_test; ++i) { + EXPECT_EQ(bytes_reference[i], bytes_test[i]); + } + + // Calculate the number of utf-16 chars from the utf-8 bytes. + bytes_reference[byte_count_reference] = 0; // Reference function needs null termination. + char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference); + char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test); + EXPECT_EQ(char_count, char_count_reference); + EXPECT_EQ(char_count, char_count_test); + + // Convert the utf-8 bytes back to utf-16 chars. + // Does not need copied _reference version of the function because the original + // function with the old API is retained for debug/testing code. 
+ ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference); + ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test); + for (int i = 0; i < char_count_test; ++i) { + EXPECT_EQ(buf[i], out_buf_reference[i]); + EXPECT_EQ(buf[i], out_buf_test[i]); + } +} + +TEST_F(UtfTest, ExhaustiveBidirectionalCodePointCheck) { + for (int codePoint = 0; codePoint <= 0x10ffff; ++codePoint) { + uint16_t buf[4]; + if (codePoint <= 0xffff) { + if (codePoint >= 0xd800 && codePoint <= 0xdfff) { + // According to the Unicode standard, no character will ever + // be assigned to these code points, and they can not be encoded + // into either utf-16 or utf-8. + continue; + } + buf[0] = 'h'; + buf[1] = codePoint; + buf[2] = 'e'; + testConversions(buf, 2); + testConversions(buf, 3); + testConversions(buf + 1, 1); + testConversions(buf + 1, 2); + } else { + buf[0] = 'h'; + codePointToSurrogatePair(codePoint, buf[1], buf[2]); + buf[3] = 'e'; + testConversions(buf, 2); + testConversions(buf, 3); + testConversions(buf, 4); + testConversions(buf + 1, 1); + testConversions(buf + 1, 2); + testConversions(buf + 1, 3); + } + } +} + } // namespace art |