Add newStringFromUtf8Bytes native implementation.
Porting a part of StringFactory.newStringFromBytes from libcore to
native
for UTF-8 character set. It can improve the UX score of Antutu v8 a
little
bit.
Only watch UX score. We test 10 times and average the total score.
Bechmark results
- before(Java implementation):
10 times avg,: 13133.9
- after (Native implementation):
10 times avg.: 13324.2
Diff.: +1.4%
Bug: 176514597
Test: ./test.py --host
Change-Id: I6b601c09663b21fdacde7f14b0db1ac4f0a94c0f
diff --git a/runtime/native/java_lang_StringFactory.cc b/runtime/native/java_lang_StringFactory.cc
index 178d5da..9086ee9 100644
--- a/runtime/native/java_lang_StringFactory.cc
+++ b/runtime/native/java_lang_StringFactory.cc
@@ -89,10 +89,186 @@
return soa.AddLocalReference<jstring>(result);
}
+static jstring StringFactory_newStringFromUtf8Bytes(JNIEnv* env, jclass, jbyteArray java_data,
+ jint offset, jint byte_count) {
+ // Local Define in here
+ static const jchar kReplacementChar = 0xfffd;
+ static const int kDefaultBufferSize = 256;
+ static const int kTableUtf8Needed[] = {
+ // 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xc0 - 0xcf
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xd0 - 0xdf
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xe0 - 0xef
+ 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff
+ };
+
+ ScopedFastNativeObjectAccess soa(env);
+ if (UNLIKELY(java_data == nullptr)) {
+ ThrowNullPointerException("data == null");
+ return nullptr;
+ }
+
+ StackHandleScope<1> hs(soa.Self());
+ Handle<mirror::ByteArray> byte_array(hs.NewHandle(soa.Decode<mirror::ByteArray>(java_data)));
+ int32_t data_size = byte_array->GetLength();
+ if ((offset | byte_count) < 0 || byte_count > data_size - offset) {
+ soa.Self()->ThrowNewExceptionF("Ljava/lang/StringIndexOutOfBoundsException;",
+ "length=%d; regionStart=%d; regionLength=%d", data_size,
+ offset, byte_count);
+ return nullptr;
+ }
+
+ /*
+ * This code converts a UTF-8 byte sequence to a Java String (UTF-16).
+ * It implements the W3C recommended UTF-8 decoder.
+ * https://www.w3.org/TR/encoding/#utf-8-decoder
+ *
+ * Unicode 3.2 Well-Formed UTF-8 Byte Sequences
+ * Code Points First Second Third Fourth
+ * U+0000..U+007F 00..7F
+ * U+0080..U+07FF C2..DF 80..BF
+ * U+0800..U+0FFF E0 A0..BF 80..BF
+ * U+1000..U+CFFF E1..EC 80..BF 80..BF
+ * U+D000..U+D7FF ED 80..9F 80..BF
+ * U+E000..U+FFFF EE..EF 80..BF 80..BF
+ * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
+ * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
+ * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
+ *
+ * Please refer to Unicode as the authority.
+ * p.126 Table 3-7 in http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
+ *
+ * Handling Malformed Input
+ * The maximal subpart should be replaced by a single U+FFFD. Maximal subpart is
+ * the longest code unit subsequence starting at an unconvertible offset that is either
+ * 1) the initial subsequence of a well-formed code unit sequence, or
+ * 2) a subsequence of length one:
+ * One U+FFFD should be emitted for every sequence of bytes that is an incomplete prefix
+ * of a valid sequence, and with the conversion to restart after the incomplete sequence.
+ *
+ * For example, in byte sequence "41 C0 AF 41 F4 80 80 41", the maximal subparts are
+ * "C0", "AF", and "F4 80 80". "F4 80 80" can be the initial subsequence of "F4 80 80 80",
+ * but "C0" can't be the initial subsequence of any well-formed code unit sequence.
+ * Thus, the output should be "A\ufffd\ufffdA\ufffdA".
+ *
+ * Please refer to section "Best Practices for Using U+FFFD." in
+ * http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
+ */
+
+ // Initial value
+ jchar stack_buffer[kDefaultBufferSize];
+ std::unique_ptr<jchar[]> allocated_buffer;
+ jchar* v;
+ if (byte_count <= kDefaultBufferSize) {
+ v = stack_buffer;
+ } else {
+ allocated_buffer.reset(new jchar[byte_count]);
+ v = allocated_buffer.get();
+ }
+
+ jbyte* d = byte_array->GetData();
+ DCHECK(d != nullptr);
+
+ int idx = offset;
+ int last = offset + byte_count;
+ int s = 0;
+
+ int code_point = 0;
+ int utf8_bytes_seen = 0;
+ int utf8_bytes_needed = 0;
+ int lower_bound = 0x80;
+ int upper_bound = 0xbf;
+ while (idx < last) {
+ int b = d[idx++] & 0xff;
+ if (utf8_bytes_needed == 0) {
+ if ((b & 0x80) == 0) { // ASCII char. 0xxxxxxx
+ v[s++] = (jchar) b;
+ continue;
+ }
+
+ if ((b & 0x40) == 0) { // 10xxxxxx is illegal as first byte
+ v[s++] = kReplacementChar;
+ continue;
+ }
+
+ // 11xxxxxx
+ int tableLookupIndex = b & 0x3f;
+ utf8_bytes_needed = kTableUtf8Needed[tableLookupIndex];
+ if (utf8_bytes_needed == 0) {
+ v[s++] = kReplacementChar;
+ continue;
+ }
+
+ // utf8_bytes_needed
+ // 1: b & 0x1f
+ // 2: b & 0x0f
+ // 3: b & 0x07
+ code_point = b & (0x3f >> utf8_bytes_needed);
+ if (b == 0xe0) {
+ lower_bound = 0xa0;
+ } else if (b == 0xed) {
+ upper_bound = 0x9f;
+ } else if (b == 0xf0) {
+ lower_bound = 0x90;
+ } else if (b == 0xf4) {
+ upper_bound = 0x8f;
+ }
+ } else {
+ if (b < lower_bound || b > upper_bound) {
+ // The bytes seen are ill-formed. Substitute them with U+FFFD
+ v[s++] = kReplacementChar;
+ code_point = 0;
+ utf8_bytes_needed = 0;
+ utf8_bytes_seen = 0;
+ lower_bound = 0x80;
+ upper_bound = 0xbf;
+ /*
+ * According to the Unicode Standard,
+ * "a UTF-8 conversion process is required to never consume well-formed
+ * subsequences as part of its error handling for ill-formed subsequences"
+ * The current byte could be part of well-formed subsequences. Reduce the
+ * index by 1 to parse it in next loop.
+ */
+ idx--;
+ continue;
+ }
+
+ lower_bound = 0x80;
+ upper_bound = 0xbf;
+ code_point = (code_point << 6) | (b & 0x3f);
+ utf8_bytes_seen++;
+ if (utf8_bytes_needed != utf8_bytes_seen) {
+ continue;
+ }
+
+ // Encode chars from U+10000 up as surrogate pairs
+ if (code_point < 0x10000) {
+ v[s++] = (jchar) code_point;
+ } else {
+ v[s++] = (jchar) ((code_point >> 10) + 0xd7c0);
+ v[s++] = (jchar) ((code_point & 0x3ff) + 0xdc00);
+ }
+
+ utf8_bytes_seen = 0;
+ utf8_bytes_needed = 0;
+ code_point = 0;
+ }
+ }
+
+ // The bytes seen are ill-formed. Substitute them by U+FFFD
+ if (utf8_bytes_needed != 0) {
+ v[s++] = kReplacementChar;
+ }
+
+ ObjPtr<mirror::String> result = mirror::String::AllocFromUtf16(soa.Self(), s, v);
+ return soa.AddLocalReference<jstring>(result);
+}
+
static JNINativeMethod gMethods[] = {
FAST_NATIVE_METHOD(StringFactory, newStringFromBytes, "([BIII)Ljava/lang/String;"),
FAST_NATIVE_METHOD(StringFactory, newStringFromChars, "(II[C)Ljava/lang/String;"),
FAST_NATIVE_METHOD(StringFactory, newStringFromString, "(Ljava/lang/String;)Ljava/lang/String;"),
+ FAST_NATIVE_METHOD(StringFactory, newStringFromUtf8Bytes, "([BII)Ljava/lang/String;"),
};
void register_java_lang_StringFactory(JNIEnv* env) {