ART: Improve VisitStringGetCharsNoCheck intrinsic for compressed strings, using SIMD

The previous implementation of VisitStringGetCharsNoCheck
copied one character at a time for compressed strings
(which use 8 bits per character).

Instead, use SIMD instructions to copy 8 chars at once
where possible.
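
For illustration, a minimal standalone sketch of the widening copy the
intrinsic now emits, written with ARM NEON C intrinsics (the function
name and signature below are hypothetical, not part of this change):

  #include <arm_neon.h>
  #include <stddef.h>
  #include <stdint.h>

  // Copy 'n' compressed (8-bit) characters into a 16-bit char buffer.
  static void WidenCopy(const uint8_t* src, uint16_t* dst, size_t n) {
    size_t i = 0;
    // Main loop: load 8 bytes, zero-extend each to 16 bits (like UXTL),
    // then store 8 halfwords, i.e. 8 characters per iteration.
    for (; i + 8 <= n; i += 8) {
      uint8x8_t bytes = vld1_u8(src + i);     // LD1 {v.8b}
      uint16x8_t chars16 = vmovl_u8(bytes);   // UXTL v.8h, v.8b
      vst1q_u16(dst + i, chars16);            // ST1 {v.8h}
    }
    // Remainder loop: copy one character at a time, as the old code
    // did for every character.
    for (; i < n; ++i) {
      dst[i] = src[i];
    }
  }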

On a Pixel 3 phone:

Microbenchmarks for getCharsNoCheck on varying string
lengths show a speedup of up to 80% (big cores) and
70% (little cores) on long strings, and around 30% (big)
and 20% (little) on strings of only 8 characters.

The overhead for strings of < 8 characters is ~3%,
and is immediately amortized for strings of more
than 8 characters.

Dhrystone shows a consistent speedup of around 6% (big)
and 4% (little).

The getCharsNoCheck intrinsic is used by the StringBuilder
append() method, which is used by the String concatenation
operator ('+').

Image size change:
  Before:
    boot-core-libart.oat:  549040
    boot.oat:             3789080
    boot-framework.oat:  13356576
  After:
    boot-core-libart.oat:  549024 (-16B)
    boot.oat:             3789144 (+64B)
    boot-framework.oat:  13356576 (+0B)

Test: test_art_target.sh, test_art_host.sh
Test: 536-checker-intrinsic-optimization

Change-Id: I865e3df6d4725e151ae195a86e02e090dae8dd29
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 1fab712..185d487 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -1960,7 +1960,8 @@
   Register tmp2 = temps.AcquireX();
 
   vixl::aarch64::Label done;
-  vixl::aarch64::Label compressed_string_loop;
+  vixl::aarch64::Label compressed_string_vector_loop;
+  vixl::aarch64::Label compressed_string_remainder;
   __ Sub(num_chr, srcEnd, srcBegin);
   // Early out for valid zero-length retrievals.
   __ Cbz(num_chr, &done);
@@ -2013,16 +2014,39 @@
   __ B(&done);
 
   if (mirror::kUseStringCompression) {
+    // For compressed strings, acquire a SIMD temporary register.
+    FPRegister vtmp1 = temps.AcquireVRegisterOfSize(kQRegSize);
     const size_t c_char_size = DataType::Size(DataType::Type::kInt8);
     DCHECK_EQ(c_char_size, 1u);
     __ Bind(&compressed_string_preloop);
     __ Add(src_ptr, src_ptr, Operand(srcBegin));
-    // Copy loop for compressed src, copying 1 character (8-bit) to (16-bit) at a time.
-    __ Bind(&compressed_string_loop);
+
+    // Save repairing the value of num_chr on the < 8 character path.
+    __ Subs(tmp1, num_chr, 8);
+    __ B(lt, &compressed_string_remainder);
+
+    // Keep the result of the earlier subs, we are going to fetch at least 8 characters.
+    __ Mov(num_chr, tmp1);
+
+    // Main loop for compressed src, copying 8 characters (8-bit) to (16-bit) at a time.
+    // Uses SIMD instructions.
+    __ Bind(&compressed_string_vector_loop);
+    __ Ld1(vtmp1.V8B(), MemOperand(src_ptr, c_char_size * 8, PostIndex));
+    __ Subs(num_chr, num_chr, 8);
+    __ Uxtl(vtmp1.V8H(), vtmp1.V8B());
+    __ St1(vtmp1.V8H(), MemOperand(dst_ptr, char_size * 8, PostIndex));
+    __ B(ge, &compressed_string_vector_loop);
+
+    __ Adds(num_chr, num_chr, 8);
+    __ B(eq, &done);
+
+    // Loop for < 8 character case and remainder handling with a compressed src.
+    // Copies 1 character (8-bit) to (16-bit) at a time.
+    __ Bind(&compressed_string_remainder);
     __ Ldrb(tmp1, MemOperand(src_ptr, c_char_size, PostIndex));
     __ Strh(tmp1, MemOperand(dst_ptr, char_size, PostIndex));
     __ Subs(num_chr, num_chr, Operand(1));
-    __ B(gt, &compressed_string_loop);
+    __ B(gt, &compressed_string_remainder);
   }
 
   __ Bind(&done);