ART: Improve VisitStringGetCharsNoCheck intrinsic for compressed strings, using SIMD

The previous implementation of VisitStringGetCharsNoCheck
copies one character at a time for compressed strings (that
use 8 bits per char).

Instead, use SIMD instructions to copy 8 chars at once
where possible.

On a Pixel 3 phone:

Microbenchmarks for getCharsNoCheck on varying string
lengths show a speedup of up to 80% (big cores) and
70% (little cores) on long strings, and around 30% (big)
and 20% (little) on strings of only 8 characters.

The overhead for strings of < 8 characters is ~3%,
and is immediately amortized for strings of more
than 8 characters.

Dhrystone shows a consistent speedup of around 6% (big)
and 4% (little).

The getCharsNoCheck intrinsic is used by the StringBuilder
append() method, which is used by the String concatenate
operator ('+').

Image size change:
  Before:
    boot-core-libart.oat:  549040
    boot.oat:             3789080
    boot-framework.oat:  13356576
  After:
    boot-core-libart.oat:  549024 (-16B)
    boot.oat:             3789144 (+64B)
    boot-framework.oat:  13356576 (+ 0B)

Test: test_art_target.sh, test_art_host.sh
Test: 536-checker-intrinsic-optimization

Change-Id: I865e3df6d4725e151ae195a86e02e090dae8dd29
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 1fab712..185d487 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -1960,7 +1960,8 @@
   Register tmp2 = temps.AcquireX();
 
   vixl::aarch64::Label done;
-  vixl::aarch64::Label compressed_string_loop;
+  vixl::aarch64::Label compressed_string_vector_loop;
+  vixl::aarch64::Label compressed_string_remainder;
   __ Sub(num_chr, srcEnd, srcBegin);
   // Early out for valid zero-length retrievals.
   __ Cbz(num_chr, &done);
@@ -2013,16 +2014,39 @@
   __ B(&done);
 
   if (mirror::kUseStringCompression) {
+    // For compressed strings, acquire a SIMD temporary register.
+    FPRegister vtmp1 = temps.AcquireVRegisterOfSize(kQRegSize);
     const size_t c_char_size = DataType::Size(DataType::Type::kInt8);
     DCHECK_EQ(c_char_size, 1u);
     __ Bind(&compressed_string_preloop);
     __ Add(src_ptr, src_ptr, Operand(srcBegin));
-    // Copy loop for compressed src, copying 1 character (8-bit) to (16-bit) at a time.
-    __ Bind(&compressed_string_loop);
+
+    // Save repairing the value of num_chr on the < 8 character path.
+    __ Subs(tmp1, num_chr, 8);
+    __ B(lt, &compressed_string_remainder);
+
+    // Keep the result of the earlier subs, we are going to fetch at least 8 characters.
+    __ Mov(num_chr, tmp1);
+
+    // Main loop for compressed src, copying 8 characters (8-bit) to (16-bit) at a time.
+    // Uses SIMD instructions.
+    __ Bind(&compressed_string_vector_loop);
+    __ Ld1(vtmp1.V8B(), MemOperand(src_ptr, c_char_size * 8, PostIndex));
+    __ Subs(num_chr, num_chr, 8);
+    __ Uxtl(vtmp1.V8H(), vtmp1.V8B());
+    __ St1(vtmp1.V8H(), MemOperand(dst_ptr, char_size * 8, PostIndex));
+    __ B(ge, &compressed_string_vector_loop);
+
+    __ Adds(num_chr, num_chr, 8);
+    __ B(eq, &done);
+
+    // Loop for < 8 character case and remainder handling with a compressed src.
+    // Copies 1 character (8-bit) to (16-bit) at a time.
+    __ Bind(&compressed_string_remainder);
     __ Ldrb(tmp1, MemOperand(src_ptr, c_char_size, PostIndex));
     __ Strh(tmp1, MemOperand(dst_ptr, char_size, PostIndex));
     __ Subs(num_chr, num_chr, Operand(1));
-    __ B(gt, &compressed_string_loop);
+    __ B(gt, &compressed_string_remainder);
   }
 
   __ Bind(&done);
diff --git a/test/536-checker-intrinsic-optimization/src/Main.java b/test/536-checker-intrinsic-optimization/src/Main.java
index a2bd42f..630d86d 100644
--- a/test/536-checker-intrinsic-optimization/src/Main.java
+++ b/test/536-checker-intrinsic-optimization/src/Main.java
@@ -17,6 +17,21 @@
 import java.lang.reflect.Method;
 
 public class Main {
+  public static String smallString = generateString(100);
+  public static String mediumString = generateString(300);
+  public static String largeString = generateString(2000);
+
+  public static String generateString(int length) {
+    // Generate a string in the ASCII range that will
+    // use string compression.
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < length; i++) {
+      // Generate repeating alphabet.
+      sb.append(Character.valueOf((char)('a' + (i % 26))));
+    }
+    return sb.toString();
+  }
+
   public static void assertIntEquals(int expected, int result) {
     if (expected != result) {
       throw new Error("Expected: " + expected + ", found: " + result);
@@ -35,6 +50,12 @@
     }
   }
 
+  public static void assertStringEquals(String expected, String result) {
+    if (!expected.equals(result)) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
   public static void assertStringContains(String searchTerm, String result) {
     if (result == null || !result.contains(searchTerm)) {
       throw new Error("Search term: " + searchTerm + ", not found in: " + result);
@@ -59,6 +80,30 @@
     assertCharEquals('c', $opt$noinline$stringCharAt("abc", 2));
     assertCharEquals('7', $opt$noinline$stringCharAt("0123456789", 7));
 
+    // Single character.
+    assertStringEquals("a", stringGetCharsAndBack("a"));
+    // Strings < 8 characters.
+    assertStringEquals("foobar", stringGetCharsAndBack("foobar"));
+    // Strings > 8 characters of various lengths.
+    assertStringEquals(smallString, stringGetCharsAndBack(smallString));
+    assertStringEquals(mediumString, stringGetCharsAndBack(mediumString));
+    assertStringEquals(largeString, stringGetCharsAndBack(largeString));
+
+    // Get only a substring:
+    // Substring < 8 characters.
+    assertStringEquals(smallString.substring(5, 10), stringGetCharsRange(smallString, 5, 10, 0));
+    // Substring > 8 characters.
+    assertStringEquals(smallString.substring(7, 28), stringGetCharsRange(smallString, 7, 28, 0));
+
+    // Get full string with offset in the char array.
+    assertStringEquals(smallString, stringGetCharsAndBackOffset(smallString, 17));
+
+    // Get a substring with an offset in the char array.
+    // Substring < 8 characters.
+    assertStringEquals(smallString.substring(5, 10), stringGetCharsRange(smallString, 5, 10, 17));
+    // Substring > 8 characters.
+    assertStringEquals(smallString.substring(7, 28), stringGetCharsRange(smallString, 7, 28, 17));
+
     try {
       $opt$noinline$stringCharAt("abc", -1);
       throw new Error("Should throw SIOOB.");
@@ -431,4 +476,22 @@
       throw new Error(ex);
     }
   }
+
+  public static String stringGetCharsAndBack(String src) {
+    char[] dst = new char[src.length()];
+    src.getChars(0, src.length(), dst, 0);
+    return new String(dst);
+  }
+
+  public static String stringGetCharsAndBackOffset(String src, int offset) {
+    char[] dst = new char[src.length() + offset];
+    src.getChars(0, src.length(), dst, offset);
+    return new String(dst, offset, src.length());
+  }
+
+  public static String stringGetCharsRange(String src, int srcBegin, int srcEnd, int offset) {
+    char[] dst = new char[srcEnd - srcBegin + offset];
+    src.getChars(srcBegin, srcEnd, dst, offset);
+    return new String(dst, offset, srcEnd - srcBegin);
+  }
 }