ARM64: Improve String.equals() intrinsic for const strings.

And add additional tests to 021-string2.

aosp_angler-userdebug:
  before:
    arm64 boot*.oat: 43324664
    arm64 boot*.oat/string compression: 43411112
  after:
    arm64 boot*.oat: 43300136 (-24528)
    arm64 boot*.oat/string compression: 43345464 (-65648)

The string compression code size difference drops from
86448 to 45328.

Test: m test-art-target on Nexus 6P
Test: m test-art-target on Nexus 6P with string compression enabled.
Bug: 31040547
Change-Id: I99a3777b91b248da2b0ac25abd260f9e5abb2c09
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index bbf826c..b107280 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -23,7 +23,7 @@
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "intrinsics.h"
 #include "mirror/array-inl.h"
-#include "mirror/string.h"
+#include "mirror/string-inl.h"
 #include "thread.h"
 #include "utils/arm64/assembler_arm64.h"
 
@@ -1444,16 +1444,47 @@
   }
 }
 
+// The cut off for unrolling the loop in String.equals() intrinsic for const strings.
+// The normal loop plus the pre-header is 9 instructions without string compression and 12
+// instructions with string compression. We can compare up to 8 bytes in 4 instructions
+// (LDR+LDR+CMP+BNE) and up to 16 bytes in 5 instructions (LDP+LDP+CMP+CCMP+BNE). Allow up
+// to 10 instructions for the unrolled loop.
+constexpr size_t kShortConstStringEqualsCutoffInBytes = 32;
+
+static const char* GetConstString(HInstruction* candidate, uint32_t* utf16_length) {
+  if (candidate->IsLoadString()) {
+    HLoadString* load_string = candidate->AsLoadString();
+    const DexFile& dex_file = load_string->GetDexFile();
+    return dex_file.StringDataAndUtf16LengthByIdx(load_string->GetStringIndex(), utf16_length);
+  }
+  return nullptr;
+}
+
 void IntrinsicLocationsBuilderARM64::VisitStringEquals(HInvoke* invoke) {
   LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                             LocationSummary::kNoCall,
                                                             kIntrinsified);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  // Temporary registers to store lengths of strings and for calculations.
-  locations->AddTemp(Location::RequiresRegister());
-  locations->AddTemp(Location::RequiresRegister());
 
+  // For the generic implementation and for long const strings we need a temporary.
+  // We do not need it for short const strings, up to 8 bytes, see code generation below.
+  uint32_t const_string_length = 0u;
+  const char* const_string = GetConstString(invoke->InputAt(0), &const_string_length);
+  if (const_string == nullptr) {
+    const_string = GetConstString(invoke->InputAt(1), &const_string_length);
+  }
+  bool is_compressed =
+      mirror::kUseStringCompression &&
+      const_string != nullptr &&
+      mirror::String::DexFileStringAllASCII(const_string, const_string_length);
+  if (const_string == nullptr || const_string_length > (is_compressed ? 8u : 4u)) {
+    locations->AddTemp(Location::RequiresRegister());
+  }
+
+  // TODO: If the String.equals() is used only for an immediately following HIf, we can
+  // mark it as emitted-at-use-site and emit branches directly to the appropriate blocks.
+  // Then we shall need an extra temporary register instead of the output register.
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
 }
 
@@ -1467,8 +1498,7 @@
 
   UseScratchRegisterScope scratch_scope(masm);
   Register temp = scratch_scope.AcquireW();
-  Register temp1 = WRegisterFrom(locations->GetTemp(0));
-  Register temp2 = WRegisterFrom(locations->GetTemp(1));
+  Register temp1 = scratch_scope.AcquireW();
 
   vixl::aarch64::Label loop;
   vixl::aarch64::Label end;
@@ -1504,47 +1534,99 @@
     __ B(&return_false, ne);
   }
 
-  // Load `count` fields of this and argument strings.
-  __ Ldr(temp, MemOperand(str.X(), count_offset));
-  __ Ldr(temp1, MemOperand(arg.X(), count_offset));
-  // Check if `count` fields are equal, return false if they're not.
-  // Also compares the compression style, if differs return false.
-  __ Cmp(temp, temp1);
-  __ B(&return_false, ne);
-  // Return true if both strings are empty. Even with string compression `count == 0` means empty.
-  static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
-                "Expecting 0=compressed, 1=uncompressed");
-  __ Cbz(temp, &return_true);
+  // Check if one of the inputs is a const string. Do not special-case both strings
+  // being const, such cases should be handled by constant folding if needed.
+  uint32_t const_string_length = 0u;
+  const char* const_string = GetConstString(invoke->InputAt(0), &const_string_length);
+  if (const_string == nullptr) {
+    const_string = GetConstString(invoke->InputAt(1), &const_string_length);
+    if (const_string != nullptr) {
+      std::swap(str, arg);  // Make sure the const string is in `str`.
+    }
+  }
+  bool is_compressed =
+      mirror::kUseStringCompression &&
+      const_string != nullptr &&
+      mirror::String::DexFileStringAllASCII(const_string, const_string_length);
+
+  if (const_string != nullptr) {
+    // Load `count` field of the argument string and check if it matches the const string.
+    // Also compares the compression style, if differs return false.
+    __ Ldr(temp, MemOperand(arg.X(), count_offset));
+    __ Cmp(temp, Operand(mirror::String::GetFlaggedCount(const_string_length, is_compressed)));
+    __ B(&return_false, ne);
+  } else {
+    // Load `count` fields of this and argument strings.
+    __ Ldr(temp, MemOperand(str.X(), count_offset));
+    __ Ldr(temp1, MemOperand(arg.X(), count_offset));
+    // Check if `count` fields are equal, return false if they're not.
+    // Also compares the compression style, if differs return false.
+    __ Cmp(temp, temp1);
+    __ B(&return_false, ne);
+  }
 
   // Assertions that must hold in order to compare strings 8 bytes at a time.
   DCHECK_ALIGNED(value_offset, 8);
   static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
 
-  if (mirror::kUseStringCompression) {
-    // For string compression, calculate the number of bytes to compare (not chars).
-    // This could in theory exceed INT32_MAX, so treat temp as unsigned.
-    __ Lsr(temp, temp, 1u);             // Extract length.
-    __ And(temp1, temp1, Operand(1));   // Extract compression flag.
-    __ Lsl(temp, temp, temp1);          // Calculate number of bytes to compare.
+  if (const_string != nullptr &&
+      const_string_length < (is_compressed ? kShortConstStringEqualsCutoffInBytes
+                                           : kShortConstStringEqualsCutoffInBytes / 2u)) {
+    // Load and compare the contents. Though we know the contents of the short const string
+    // at compile time, materializing constants may be more code than loading from memory.
+    int32_t offset = value_offset;
+    size_t remaining_bytes =
+        RoundUp(is_compressed ? const_string_length : const_string_length * 2u, 8u);
+    temp = temp.X();
+    temp1 = temp1.X();
+    while (remaining_bytes > 8u) {
+      Register temp2 = XRegisterFrom(locations->GetTemp(0));
+      __ Ldp(temp, temp1, MemOperand(str.X(), offset));
+      __ Ldp(temp2, out, MemOperand(arg.X(), offset));
+      __ Cmp(temp, temp2);
+      __ Ccmp(temp1, out, NoFlag, eq);
+      __ B(&return_false, ne);
+      offset += 2u * sizeof(uint64_t);
+      remaining_bytes -= 2u * sizeof(uint64_t);
+    }
+    if (remaining_bytes != 0u) {
+      __ Ldr(temp, MemOperand(str.X(), offset));
+      __ Ldr(temp1, MemOperand(arg.X(), offset));
+      __ Cmp(temp, temp1);
+      __ B(&return_false, ne);
+    }
+  } else {
+    // Return true if both strings are empty. Even with string compression `count == 0` means empty.
+    static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                  "Expecting 0=compressed, 1=uncompressed");
+    __ Cbz(temp, &return_true);
+
+    if (mirror::kUseStringCompression) {
+      // For string compression, calculate the number of bytes to compare (not chars).
+      // This could in theory exceed INT32_MAX, so treat temp as unsigned.
+      __ And(temp1, temp, Operand(1));    // Extract compression flag.
+      __ Lsr(temp, temp, 1u);             // Extract length.
+      __ Lsl(temp, temp, temp1);          // Calculate number of bytes to compare.
+    }
+
+    // Store offset of string value in preparation for comparison loop
+    __ Mov(temp1, value_offset);
+
+    temp1 = temp1.X();
+    Register temp2 = XRegisterFrom(locations->GetTemp(0));
+    // Loop to compare strings 8 bytes at a time starting at the front of the string.
+    // Ok to do this because strings are zero-padded to kObjectAlignment.
+    __ Bind(&loop);
+    __ Ldr(out, MemOperand(str.X(), temp1));
+    __ Ldr(temp2, MemOperand(arg.X(), temp1));
+    __ Add(temp1, temp1, Operand(sizeof(uint64_t)));
+    __ Cmp(out, temp2);
+    __ B(&return_false, ne);
+    // With string compression, we have compared 8 bytes, otherwise 4 chars.
+    __ Sub(temp, temp, Operand(mirror::kUseStringCompression ? 8 : 4), SetFlags);
+    __ B(&loop, hi);
   }
 
-  // Store offset of string value in preparation for comparison loop
-  __ Mov(temp1, value_offset);
-
-  temp1 = temp1.X();
-  temp2 = temp2.X();
-  // Loop to compare strings 8 bytes at a time starting at the front of the string.
-  // Ok to do this because strings are zero-padded to kObjectAlignment.
-  __ Bind(&loop);
-  __ Ldr(out, MemOperand(str.X(), temp1));
-  __ Ldr(temp2, MemOperand(arg.X(), temp1));
-  __ Add(temp1, temp1, Operand(sizeof(uint64_t)));
-  __ Cmp(out, temp2);
-  __ B(&return_false, ne);
-  // With string compression, we have compared 8 bytes, otherwise 4 chars.
-  __ Sub(temp, temp, Operand(mirror::kUseStringCompression ? 8 : 4), SetFlags);
-  __ B(&loop, hi);
-
   // Return true and exit the function.
   // If loop does not result in returning false, we return true.
   __ Bind(&return_true);