Optimizing String.Equals as an intrinsic (x86_64)

This is the fourth implementation of String.equals. It adds an
x86_64 intrinsic that mirrors the original Java implementation
of String.equals: a null check, an instanceof check, a reference
equality check, and a length check, followed by a loop comparing
the strings four characters at a time.

Interesting Benchmarking Values:

Optimizing Compiler on 64-bit Emulator
        Intrinsic 1-5 Character Strings: 48 ns
        Original 1-5 Character Strings: 56 ns
        Intrinsic 1000+ Character Strings: 4009 ns
        Original 1000+ Character Strings: 4704 ns
        Intrinsic Non-String Argument: 35 ns
        Original Non-String Argument: 42 ns

Bug: 21481923
Change-Id: I17d0d2e24a670a898ab1729669d3990403b9a853
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index b4926c2..9ea68ec 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -854,6 +854,97 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+void IntrinsicLocationsBuilderX86_64::VisitStringEquals(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kNoCall,
+                                                            kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());  // Receiver string.
+  locations->SetInAt(1, Location::RequiresRegister());  // Argument being compared.
+
+  // Reserve the fixed temporaries used by repe_cmpsq: RCX (repeat count) and RDI (argument data).
+  locations->AddTemp(Location::RegisterLocation(RCX));
+  locations->AddTemp(Location::RegisterLocation(RDI));
+
+  // Pin the output to RSI, which repe_cmpsq needs anyway as the pointer to the receiver's data.
+  locations->SetOut(Location::RegisterLocation(RSI), Location::kOutputOverlap);
+}
+
+// Emits inline code for String.equals(Object), leaving 1 (equal) or 0 (not equal) in the output.
+void IntrinsicCodeGeneratorX86_64::VisitStringEquals(HInvoke* invoke) {
+  X86_64Assembler* assembler = GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  CpuRegister str = locations->InAt(0).AsRegister<CpuRegister>();  // Receiver string.
+  CpuRegister arg = locations->InAt(1).AsRegister<CpuRegister>();  // Object being compared.
+  CpuRegister rcx = locations->GetTemp(0).AsRegister<CpuRegister>();  // Class, then length/count.
+  CpuRegister rdi = locations->GetTemp(1).AsRegister<CpuRegister>();  // Argument's data pointer.
+  CpuRegister rsi = locations->Out().AsRegister<CpuRegister>();  // Receiver's data, then result.
+
+  Label end;
+  Label return_true;
+  Label return_false;
+
+  // Get offsets of count, value, and class fields within a string object.
+  const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+  const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
+  const uint32_t class_offset = mirror::Object::ClassOffset().Uint32Value();
+
+  // The receiver's null check must have been done earlier, so `str` is not tested here.
+  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
+
+  // Check if the argument is null, return false if it is.
+  __ testl(arg, arg);
+  __ j(kEqual, &return_false);
+
+  // Instanceof check for the argument by comparing class fields.
+  // All string objects must have the same type since String cannot be subclassed.
+  // Receiver must be a string object, so its class field is equal to all strings' class fields.
+  // If the argument is a string object, its class field must be equal to receiver's class field.
+  __ movl(rcx, Address(str, class_offset));
+  __ cmpl(rcx, Address(arg, class_offset));
+  __ j(kNotEqual, &return_false);
+
+  // Reference equality check, return true if both registers hold the same reference.
+  __ cmpl(str, arg);
+  __ j(kEqual, &return_true);
+
+  // Load length of receiver string.
+  __ movl(rcx, Address(str, count_offset));
+  // Check if lengths are equal, return false if they're not.
+  __ cmpl(rcx, Address(arg, count_offset));
+  __ j(kNotEqual, &return_false);
+  // Return true if both strings are empty (lengths are equal at this point, so testing one suffices).
+  __ testl(rcx, rcx);
+  __ j(kEqual, &return_true);
+
+  // Load starting addresses of string values into RSI/RDI as required for repe_cmpsq instruction.
+  __ leal(rsi, Address(str, value_offset));
+  __ leal(rdi, Address(arg, value_offset));
+
+  // Convert the character count into the number of 8-byte chunks, rounding up: (count + 3) / 4.
+  __ addl(rcx, Immediate(3));
+  __ shrl(rcx, Immediate(2));
+
+  // Rounding up is safe only if the data is 8-byte aligned and zero padded to object alignment.
+  DCHECK_ALIGNED(value_offset, 8);
+  static_assert(IsAligned<8>(kObjectAlignment), "String is not zero padded");
+
+  // Loop to compare strings four characters at a time starting at the beginning of the string.
+  __ repe_cmpsq();
+  // If strings are not equal, zero flag will be cleared.
+  __ j(kNotEqual, &return_false);
+
+  // Return true and exit; reached by fall-through when the loop found no mismatch.
+  __ Bind(&return_true);
+  __ movl(rsi, Immediate(1));
+  __ jmp(&end);
+
+  // Return false and exit the function.
+  __ Bind(&return_false);
+  __ xorl(rsi, rsi);
+  __ Bind(&end);
+}
+
 static void CreateStringIndexOfLocations(HInvoke* invoke,
                                          ArenaAllocator* allocator,
                                          bool start_at_zero) {
@@ -1607,7 +1698,6 @@
 UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent)
 UNIMPLEMENTED_INTRINSIC(IntegerNumberOfLeadingZeros)
 UNIMPLEMENTED_INTRINSIC(LongNumberOfLeadingZeros)
-UNIMPLEMENTED_INTRINSIC(StringEquals)
 
 #undef UNIMPLEMENTED_INTRINSIC