ART: x86 indexOf intrinsics for the optimizing compiler

Add intrinsics implementations for indexOf in the optimizing
compiler. These are mostly ported from Quick. Add instruction
support to assemblers where necessary.

Change-Id: Ife90ed0245532a5c436a26fe84715dc357f353c8
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 1fc5432..b37b6b0 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -16,6 +16,8 @@
 
 #include "intrinsics_x86_64.h"
 
+#include <limits>
+
 #include "arch/x86_64/instruction_set_features_x86_64.h"
 #include "code_generator_x86_64.h"
 #include "entrypoints/quick/quick_entrypoints.h"
@@ -858,6 +860,157 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+static void CreateStringIndexOfLocations(HInvoke* invoke,
+                                         ArenaAllocator* allocator,
+                                         bool start_at_zero) {
+  LocationSummary* locations = new (allocator) LocationSummary(invoke,
+                                                               LocationSummary::kCallOnSlowPath,
+                                                               kIntrinsified);
+  // The data needs to be in RDI for scasw. So request that the string is there, anyways.
+  locations->SetInAt(0, Location::RegisterLocation(RDI));
+  // If we look for a constant char, we'll still have to copy it into RAX. So just request the
+  // allocator to do that, anyways. We can still do the constant check by checking the parameter
+  // of the instruction explicitly.
+  // Note: This works as we don't clobber RAX anywhere.
+  locations->SetInAt(1, Location::RegisterLocation(RAX));
+  if (!start_at_zero) {
+    locations->SetInAt(2, Location::RequiresRegister());          // The starting index.
+  }
+  // As we clobber RDI during execution anyways, also use it as the output.
+  locations->SetOut(Location::SameAsFirstInput());
+
+  // repne scasw uses RCX as the counter.
+  locations->AddTemp(Location::RegisterLocation(RCX));
+  // Need another temporary to be able to compute the result.
+  locations->AddTemp(Location::RequiresRegister());
+}
+
+static void GenerateStringIndexOf(HInvoke* invoke,
+                                  X86_64Assembler* assembler,
+                                  CodeGeneratorX86_64* codegen,
+                                  ArenaAllocator* allocator,
+                                  bool start_at_zero) {
+  LocationSummary* locations = invoke->GetLocations();
+
+  // Note that the null check must have been done earlier.
+  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
+
+  CpuRegister string_obj = locations->InAt(0).AsRegister<CpuRegister>();
+  CpuRegister search_value = locations->InAt(1).AsRegister<CpuRegister>();
+  CpuRegister counter = locations->GetTemp(0).AsRegister<CpuRegister>();
+  CpuRegister string_length = locations->GetTemp(1).AsRegister<CpuRegister>();
+  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
+
+  // Check our assumptions for registers.
+  DCHECK_EQ(string_obj.AsRegister(), RDI);
+  DCHECK_EQ(search_value.AsRegister(), RAX);
+  DCHECK_EQ(counter.AsRegister(), RCX);
+  DCHECK_EQ(out.AsRegister(), RDI);
+
+  // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
+  // or directly dispatch if we have a constant.
+  SlowPathCodeX86_64* slow_path = nullptr;
+  if (invoke->InputAt(1)->IsIntConstant()) {
+    if (static_cast<uint32_t>(invoke->InputAt(1)->AsIntConstant()->GetValue()) >
+    std::numeric_limits<uint16_t>::max()) {
+      // Always needs the slow-path. We could directly dispatch to it, but this case should be
+      // rare, so for simplicity just put the full slow-path down and branch unconditionally.
+      slow_path = new (allocator) IntrinsicSlowPathX86_64(invoke);
+      codegen->AddSlowPath(slow_path);
+      __ jmp(slow_path->GetEntryLabel());
+      __ Bind(slow_path->GetExitLabel());
+      return;
+    }
+  } else {
+    __ cmpl(search_value, Immediate(std::numeric_limits<uint16_t>::max()));
+    slow_path = new (allocator) IntrinsicSlowPathX86_64(invoke);
+    codegen->AddSlowPath(slow_path);
+    __ j(kAbove, slow_path->GetEntryLabel());
+  }
+
+  // From here down, we know that we are looking for a char that fits in 16 bits.
+  // Location of reference to data array within the String object.
+  int32_t value_offset = mirror::String::ValueOffset().Int32Value();
+  // Location of count within the String object.
+  int32_t count_offset = mirror::String::CountOffset().Int32Value();
+
+  // Load string length, i.e., the count field of the string.
+  __ movl(string_length, Address(string_obj, count_offset));
+
+  // Do a length check.
+  // TODO: Support jecxz.
+  Label not_found_label;
+  __ testl(string_length, string_length);
+  __ j(kEqual, &not_found_label);
+
+  if (start_at_zero) {
+    // Number of chars to scan is the same as the string length.
+    __ movl(counter, string_length);
+
+    // Move to the start of the string.
+    __ addq(string_obj, Immediate(value_offset));
+  } else {
+    CpuRegister start_index = locations->InAt(2).AsRegister<CpuRegister>();
+
+    // Do a start_index check.
+    __ cmpl(start_index, string_length);
+    __ j(kGreaterEqual, &not_found_label);
+
+    // Ensure we have a start index >= 0;
+    __ xorl(counter, counter);
+    __ cmpl(start_index, Immediate(0));
+    __ cmov(kGreater, counter, start_index, false);  // 32-bit copy is enough.
+
+    // Move to the start of the string: string_obj + value_offset + 2 * start_index.
+    __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
+
+    // Now update ecx, the work counter: it's gonna be string.length - start_index.
+    __ negq(counter);  // Needs to be 64-bit negation, as the address computation is 64-bit.
+    __ leaq(counter, Address(string_length, counter, ScaleFactor::TIMES_1, 0));
+  }
+
+  // Everything is set up for repne scasw:
+  //   * Comparison address in RDI.
+  //   * Counter in ECX.
+  __ repne_scasw();
+
+  // Did we find a match?
+  __ j(kNotEqual, &not_found_label);
+
+  // Yes, we matched.  Compute the index of the result.
+  __ subl(string_length, counter);
+  __ leal(out, Address(string_length, -1));
+
+  Label done;
+  __ jmp(&done);
+
+  // Failed to match; return -1.
+  __ Bind(&not_found_label);
+  __ movl(out, Immediate(-1));
+
+  // And join up at the end.
+  __ Bind(&done);
+  if (slow_path != nullptr) {
+    __ Bind(slow_path->GetExitLabel());
+  }
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitStringIndexOf(HInvoke* invoke) {
+  CreateStringIndexOfLocations(invoke, arena_, true);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitStringIndexOf(HInvoke* invoke) {
+  GenerateStringIndexOf(invoke, GetAssembler(), codegen_, GetAllocator(), true);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
+  CreateStringIndexOfLocations(invoke, arena_, false);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
+  GenerateStringIndexOf(invoke, GetAssembler(), codegen_, GetAllocator(), false);
+}
+
 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
   LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                             LocationSummary::kCall,
@@ -1434,8 +1587,6 @@
 }
 
 UNIMPLEMENTED_INTRINSIC(StringGetCharsNoCheck)
-UNIMPLEMENTED_INTRINSIC(StringIndexOf)
-UNIMPLEMENTED_INTRINSIC(StringIndexOfAfter)
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
 UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent)