Inline x86 String.indexOf

Take advantage of the presence of a constant search char or start index
to tune the generated code.

Change-Id: I0adcf184fb91b899a95aa4d8ef044a14deb51d88
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
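
For context, a rough C++ model of what the inlined fast path computes once
the search char is known to fit in 16 bits (the patch itself emits x86 LIR;
this function is illustrative only). A constant char above 0xFFFF lets the
compiler reject the intrinsic outright, and a constant start index folds the
range check into an immediate compare:

    #include <algorithm>
    #include <cstdint>

    // Illustrative sketch only: models the search the emitted code performs.
    int32_t IndexOfFastPath(const uint16_t* chars, int32_t count,
                            uint16_t search_char, int32_t start) {
      start = std::max(start, 0);        // negative start clamps to 0
      for (int32_t i = start; i < count; ++i) {
        if (chars[i] == search_char) {
          return i;                      // the match repne scasw locates
        }
      }
      return -1;                         // also covers start >= count
    }
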
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 5fa4596..35d193c 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -1257,17 +1257,13 @@
   } else {
     LoadValueDirectFixed(rl_start, reg_start);
   }
-  int r_tgt = (cu_->instruction_set != kX86) ? LoadHelper(QUICK_ENTRYPOINT_OFFSET(pIndexOf)) : 0;
+  int r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(pIndexOf));
   GenNullCheck(rl_obj.s_reg_low, reg_ptr, info->opt_flags);
   LIR* launch_pad = RawLIR(0, kPseudoIntrinsicRetry, WrapPointer(info));
   intrinsic_launchpads_.Insert(launch_pad);
   OpCmpImmBranch(kCondGt, reg_char, 0xFFFF, launch_pad);
   // NOTE: not a safepoint
-  if (cu_->instruction_set != kX86) {
-    OpReg(kOpBlx, r_tgt);
-  } else {
-    OpThreadMem(kOpBlx, QUICK_ENTRYPOINT_OFFSET(pIndexOf));
-  }
+  OpReg(kOpBlx, r_tgt);
   LIR* resume_tgt = NewLIR0(kPseudoTargetLabel);
   launch_pad->operands[2] = WrapPointer(resume_tgt);
   // Record that we've already inlined & null checked
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index c60c394..6e97c53 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -676,7 +676,7 @@
     bool GenInlinedAbsDouble(CallInfo* info);
     bool GenInlinedFloatCvt(CallInfo* info);
     bool GenInlinedDoubleCvt(CallInfo* info);
-    bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
+    virtual bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
     bool GenInlinedStringCompareTo(CallInfo* info);
     bool GenInlinedCurrentThread(CallInfo* info);
     bool GenInlinedUnsafeGet(CallInfo* info, bool is_long, bool is_volatile);
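
Making the hook virtual is what allows the x86 backend to supply its own
expansion. A minimal sketch of the resulting dispatch, with the two
declarations as in the patch and everything else elided:

    struct CallInfo;  // opaque for this sketch

    class Mir2Lir {
     public:
      // Generic hook; returning false forces a regular call to be generated.
      virtual bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
    };

    class X86Mir2Lir : public Mir2Lir {
     public:
      // x86 override added by this patch: emits the repne scasw fast path.
      bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
    };
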
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 6481589..538ce0d 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -175,6 +175,8 @@
   { kX86Mov32AI, kArrayImm,  IS_STORE | IS_QUIN_OP     | REG_USE01,      { 0,             0, 0xC7, 0, 0, 0, 0, 4 }, "Mov32AI", "[!0r+!1r<<!2d+!3d],!4d" },
   { kX86Mov32TI, kThreadImm, IS_STORE | IS_BINARY_OP,                    { THREAD_PREFIX, 0, 0xC7, 0, 0, 0, 0, 4 }, "Mov32TI", "fs:[!0d],!1d" },
 
+  { kX86Lea32RM, kRegMem, IS_TERTIARY_OP | IS_LOAD | REG_DEF0_USE12, { 0, 0, 0x8D, 0, 0, 0, 0, 0 }, "Lea32RM", "!0r,[!1r+!2d]" },
+
   { kX86Lea32RA, kRegArray, IS_QUIN_OP | REG_DEF0_USE12, { 0, 0, 0x8D, 0, 0, 0, 0, 0 }, "Lea32RA", "!0r,[!1r+!2r<<!3d+!4d]" },
 
   { kX86Cmov32RRC, kRegRegCond, IS_TERTIARY_OP | REG_DEF0_USE01 | USES_CCODES, {0, 0, 0x0F, 0x40, 0, 0, 0, 0}, "Cmovcc32RR", "!2c !0r,!1r" },
@@ -354,6 +356,7 @@
   { kX86Jmp8,  kJmp,  IS_UNARY_OP  | IS_BRANCH | NEEDS_FIXUP,               { 0,             0, 0xEB, 0,    0, 0, 0, 0 }, "Jmp8",  "!0t" },
   { kX86Jmp32, kJmp,  IS_UNARY_OP  | IS_BRANCH | NEEDS_FIXUP,               { 0,             0, 0xE9, 0,    0, 0, 0, 0 }, "Jmp32", "!0t" },
   { kX86JmpR,  kJmp,  IS_UNARY_OP  | IS_BRANCH | REG_USE0,                  { 0,             0, 0xFF, 0,    0, 4, 0, 0 }, "JmpR",  "!0r" },
+  { kX86Jecxz8, kJmp, NO_OPERAND   | IS_BRANCH | NEEDS_FIXUP | REG_USEC,    { 0,             0, 0xE3, 0,    0, 0, 0, 0 }, "Jecxz", "!0t" },
   { kX86CallR, kCall, IS_UNARY_OP  | IS_BRANCH | REG_USE0,                  { 0,             0, 0xE8, 0,    0, 0, 0, 0 }, "CallR", "!0r" },
   { kX86CallM, kCall, IS_BINARY_OP | IS_BRANCH | IS_LOAD | REG_USE0,        { 0,             0, 0xFF, 0,    0, 2, 0, 0 }, "CallM", "[!0r+!1d]" },
   { kX86CallA, kCall, IS_QUAD_OP   | IS_BRANCH | IS_LOAD | REG_USE01,       { 0,             0, 0xFF, 0,    0, 2, 0, 0 }, "CallA", "[!0r+!1r<<!2d+!3d]" },
@@ -364,6 +367,7 @@
   { kX86StartOfMethod, kMacro,  IS_UNARY_OP | SETS_CCODES,             { 0, 0, 0,    0, 0, 0, 0, 0 }, "StartOfMethod", "!0r" },
   { kX86PcRelLoadRA,   kPcRel,  IS_LOAD | IS_QUIN_OP | REG_DEF0_USE12, { 0, 0, 0x8B, 0, 0, 0, 0, 0 }, "PcRelLoadRA",   "!0r,[!1r+!2r<<!3d+!4p]" },
   { kX86PcRelAdr,      kPcRel,  IS_LOAD | IS_BINARY_OP | REG_DEF0,     { 0, 0, 0xB8, 0, 0, 0, 0, 4 }, "PcRelAdr",      "!0r,!1d" },
+  { kX86RepneScasw, kPrefix2Nullary, NO_OPERAND | SETS_CCODES,         { 0x66, 0xF2, 0xAF, 0, 0, 0, 0, 0 }, "RepNE ScasW", "" },
 };
 
 static size_t ComputeSize(const X86EncodingMap* entry, int base, int displacement, bool has_sib) {
@@ -407,6 +411,8 @@
       return lir->operands[0];  // length of nop is sole operand
     case kNullary:
       return 1;  // 1 byte of opcode
+    case kPrefix2Nullary:
+      return 3;  // 1 byte of opcode + 2 prefixes
     case kRegOpcode:  // lir operands - 0: reg
      return ComputeSize(entry, 0, 0, false) - 1;  // subtract 1 for modrm
     case kReg:  // lir operands - 0: reg
@@ -489,7 +495,7 @@
         return 6;  // 2 byte opcode + rel32
       }
     case kJmp:
-      if (lir->opcode == kX86Jmp8) {
+      if (lir->opcode == kX86Jmp8 || lir->opcode == kX86Jecxz8) {
         return 2;  // opcode + rel8
       } else if (lir->opcode == kX86Jmp32) {
         return 5;  // opcode + rel32
@@ -957,6 +963,10 @@
     code_buffer_.push_back((rel >> 8) & 0xFF);
     code_buffer_.push_back((rel >> 16) & 0xFF);
     code_buffer_.push_back((rel >> 24) & 0xFF);
+  } else if (entry->opcode == kX86Jecxz8) {
+    DCHECK(IS_SIMM8(rel));
+    code_buffer_.push_back(0xE3);
+    code_buffer_.push_back(rel & 0xFF);
   } else {
     DCHECK(entry->opcode == kX86JmpR);
     code_buffer_.push_back(entry->skeleton.opcode);
@@ -1148,6 +1158,17 @@
           lir->operands[0] = delta;
           break;
         }
+        case kX86Jecxz8: {
+          LIR *target_lir = lir->target;
+          DCHECK(target_lir != NULL);
+          // pc points past this instruction: opcode + rel8.
+          CodeOffset pc = lir->offset + 2;
+          CodeOffset target = target_lir->offset;
+          int delta = target - pc;
+          lir->operands[0] = delta;
+          DCHECK(IS_SIMM8(delta));
+          break;
+        }
         case kX86Jmp8: {
           LIR *target_lir = lir->target;
           DCHECK(target_lir != NULL);
@@ -1226,6 +1247,14 @@
         DCHECK_EQ(0, entry->skeleton.ax_opcode);
         DCHECK_EQ(0, entry->skeleton.immediate_bytes);
         break;
+      case kPrefix2Nullary:  // 1 byte of opcode + 2 prefixes.
+        DCHECK_NE(0, entry->skeleton.prefix1);
+        DCHECK_NE(0, entry->skeleton.prefix2);
+        EmitPrefixAndOpcode(entry);
+        DCHECK_EQ(0, entry->skeleton.modrm_opcode);
+        DCHECK_EQ(0, entry->skeleton.ax_opcode);
+        DCHECK_EQ(0, entry->skeleton.immediate_bytes);
+        break;
       case kRegOpcode:  // lir operands - 0: reg
         EmitOpRegOpcode(entry, lir->operands[0]);
         break;
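
A worked example of the two encodings these table entries and fixups describe
(toy emitter, not ART code): repne scasw is exactly the two prefixes plus the
opcode that kPrefix2Nullary sizes at 3 bytes, and jecxz is the 1-byte opcode
0xE3 plus a rel8 measured from the end of the 2-byte instruction:

    #include <cstdint>
    #include <vector>

    void EmitRepneScasw(std::vector<uint8_t>& buf) {
      buf.push_back(0x66);  // operand-size prefix: scan 16-bit words
      buf.push_back(0xF2);  // REPNE prefix
      buf.push_back(0xAF);  // SCAS opcode; 3 bytes total
    }

    void EmitJecxz8(std::vector<uint8_t>& buf, int32_t pc, int32_t target) {
      int32_t delta = target - (pc + 2);  // pc + 2 = end of this instruction
      buf.push_back(0xE3);                // JECXZ opcode
      buf.push_back(static_cast<uint8_t>(delta & 0xFF));  // must be SIMM8
    }
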
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 6100a1d..421d51e 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -359,6 +359,15 @@
     void GenConstWide(RegLocation rl_dest, int64_t value);
 
     /*
+     * @brief Generate inline code for the fast path of String.indexOf.
+     * @param info Call parameters
+     * @param zero_based 'true' if the search starts at index 0 (the indexOf(I) form).
+     * @returns 'true' if the call was inlined, 'false' if a regular call needs to be
+     * generated.
+     */
+    bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
+
+    /*
      * @brief Return the correct x86 opcode for the Dex operation
      * @param op Dex opcode for the operation
      * @param loc Register location of the operand
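
A hypothetical caller-side sketch of the contract documented above (the
fallback helper name is illustrative, not from this patch):

    // If the intrinsic declines (e.g. a constant search char > 0xFFFF),
    // fall back to generating the ordinary call to String.indexOf.
    if (!GenInlinedIndexOf(info, /* zero_based */ true)) {
      GenRegularIndexOfCall(info);  // hypothetical fallback
    }
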
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index fa9a944..ad5b154 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -185,6 +185,14 @@
   if (flags & REG_USEB) {
     SetupRegMask(&lir->u.m.use_mask, rBX);
   }
+
+  // Fix up repne scasw, whose operands the generic flags can't describe: uses rAX, rCX, rDI; sets rDI.
+  if (lir->opcode == kX86RepneScasw) {
+    SetupRegMask(&lir->u.m.use_mask, rAX);
+    SetupRegMask(&lir->u.m.use_mask, rCX);
+    SetupRegMask(&lir->u.m.use_mask, rDI);
+    SetupRegMask(&lir->u.m.def_mask, rDI);
+  }
 }
 
 /* For dumping instructions */
@@ -936,4 +944,174 @@
   Mir2Lir::InstallLiteralPools();
 }
 
+// Offsets within java.lang.String (STRING_DATA_OFFSET is within the backing char[] array).
+#define STRING_VALUE_OFFSET 8
+#define STRING_COUNT_OFFSET 12
+#define STRING_OFFSET_OFFSET 20
+#define STRING_DATA_OFFSET 12
+
+/*
+ * Fast String.indexOf(I) & (II).  Inlines the simple case of a search char
+ * <= 0xffff; otherwise bails out to the standard library code.
+ */
+bool X86Mir2Lir::GenInlinedIndexOf(CallInfo* info, bool zero_based) {
+  ClobberCallerSave();
+  LockCallTemps();  // Using fixed registers
+
+  // EAX: 16 bit character being searched.
+  // ECX: count: number of words to be searched.
+  // EDI: String being searched.
+  // EDX: temporary during execution.
+  // EBX: temporary during execution.
+
+  RegLocation rl_obj = info->args[0];
+  RegLocation rl_char = info->args[1];
+  RegLocation rl_start = info->args[2];
+
+  uint32_t char_value =
+    rl_char.is_const ? mir_graph_->ConstantValue(rl_char.orig_sreg) : 0;
+
+  if (char_value > 0xFFFF) {
+    // We have to punt to the real String.indexOf.
+    return false;
+  }
+
+  // Okay, we are committed to inlining this.
+  RegLocation rl_return = GetReturn(false);
+  RegLocation rl_dest = InlineTarget(info);
+
+  // Is the string non-NULL?
+  LoadValueDirectFixed(rl_obj, rDX);
+  GenNullCheck(rl_obj.s_reg_low, rDX, info->opt_flags);
+
+  // Record that we have inlined & null checked the object.
+  info->opt_flags |= (MIR_INLINED | MIR_IGNORE_NULL_CHECK);
+
+  // Does the character fit in 16 bits?
+  LIR* launch_pad = nullptr;
+  if (rl_char.is_const) {
+    // We need the value in EAX.
+    LoadConstantNoClobber(rAX, char_value);
+  } else {
+    // Character is not a constant; compare at runtime.
+    LoadValueDirectFixed(rl_char, rAX);
+    launch_pad = RawLIR(0, kPseudoIntrinsicRetry, WrapPointer(info));
+    intrinsic_launchpads_.Insert(launch_pad);
+    OpCmpImmBranch(kCondGt, rAX, 0xFFFF, launch_pad);
+  }
+
+  // From here down, we know that we are looking for a char that fits in 16 bits.
+
+  // Character is in EAX.
+  // Object pointer is in EDX.
+
+  // We need to preserve EDI, but have no spare registers, so push it on the stack.
+  // We have to remember that all stack addresses after this are offset by sizeof(EDI).
+  NewLIR1(kX86Push32R, rDI);
+
+  // Compute, into rCX, the number of words to be searched.
+  LoadWordDisp(rDX, STRING_COUNT_OFFSET, rCX);
+  LIR *length_compare = nullptr;
+  int start_value = 0;
+  if (zero_based) {
+    // We have to handle an empty string.  Use the special JECXZ instruction.
+    length_compare = NewLIR0(kX86Jecxz8);
+  } else {
+    // We have to offset by the start index.
+    if (rl_start.is_const) {
+      start_value = mir_graph_->ConstantValue(rl_start.orig_sreg);
+      start_value = std::max(start_value, 0);
+
+      // Is the start >= count?
+      length_compare = OpCmpImmBranch(kCondLe, rCX, start_value, nullptr);
+
+      if (start_value != 0) {
+        OpRegImm(kOpSub, rCX, start_value);
+      }
+    } else {
+      // Runtime start index.
+      rl_start = UpdateLoc(rl_start);
+      if (rl_start.location == kLocPhysReg) {
+        length_compare = OpCmpBranch(kCondLe, rCX, rl_start.low_reg, nullptr);
+        OpRegReg(kOpSub, rCX, rl_start.low_reg);
+      } else {
+        // Compare to memory to avoid a register load.  Handle pushed EDI.
+        int displacement = SRegOffset(rl_start.s_reg_low) + sizeof(uint32_t);
+        OpRegMem(kOpCmp, rCX, rX86_SP, displacement);  // Compare count (ECX) to start.
+        length_compare = NewLIR2(kX86Jcc8, 0, kX86CondLe);
+        OpRegMem(kOpSub, rCX, rX86_SP, displacement);
+      }
+    }
+  }
+  DCHECK(length_compare != nullptr);
+
+  // ECX now contains the count in words to be searched.
+
+  // Load the address of the string into EBX.
+  // The string starts at VALUE(String) + 2 * OFFSET(String) + STRING_DATA_OFFSET.
+  LoadWordDisp(rDX, STRING_VALUE_OFFSET, rDI);
+  LoadWordDisp(rDX, STRING_OFFSET_OFFSET, rBX);
+  OpLea(rBX, rDI, rBX, 1, STRING_DATA_OFFSET);
+
+  // Now compute into EDI where the search will start.
+  if (zero_based || rl_start.is_const) {
+    if (start_value == 0) {
+      OpRegCopy(rDI, rBX);
+    } else {
+      NewLIR3(kX86Lea32RM, rDI, rBX, 2 * start_value);
+    }
+  } else {
+    if (rl_start.location == kLocPhysReg) {
+      if (rl_start.low_reg == rDI) {
+        // We have a slight problem here.  We are already using EDI!
+        // Grab the value from the stack.
+        LoadWordDisp(rX86_SP, 0, rDX);
+        OpLea(rDI, rBX, rDX, 1, 0);
+      } else {
+        OpLea(rDI, rBX, rl_start.low_reg, 1, 0);
+      }
+    } else {
+      OpRegCopy(rDI, rBX);
+      // Load the start index from stack, remembering that we pushed EDI.
+      int displacement = SRegOffset(rl_start.s_reg_low) + sizeof(uint32_t);
+      LoadWordDisp(rX86_SP, displacement, rDX);
+      OpLea(rDI, rBX, rDX, 1, 0);
+    }
+  }
+
+  // EDI now contains the start of the string to be searched.
+  // We are all prepared to do the search for the character.
+  NewLIR0(kX86RepneScasw);
+
+  // Did we find a match?
+  LIR* failed_branch = OpCondBranch(kCondNe, nullptr);
+
+  // Yes, we matched.  Compute the index of the result.
+  // index = ((curr_ptr - orig_ptr) / 2) - 1.
+  OpRegReg(kOpSub, rDI, rBX);
+  OpRegImm(kOpAsr, rDI, 1);
+  NewLIR3(kX86Lea32RM, rl_return.low_reg, rDI, -1);
+  LIR *all_done = NewLIR1(kX86Jmp8, 0);
+
+  // Failed to match; return -1.
+  LIR *not_found = NewLIR0(kPseudoTargetLabel);
+  length_compare->target = not_found;
+  failed_branch->target = not_found;
+  LoadConstantNoClobber(rl_return.low_reg, -1);
+
+  // And join up at the end.
+  all_done->target = NewLIR0(kPseudoTargetLabel);
+  // Restore EDI from the stack.
+  NewLIR1(kX86Pop32R, rDI);
+
+  // Out of line code returns here.
+  if (launch_pad != nullptr) {
+    LIR *return_point = NewLIR0(kPseudoTargetLabel);
+    launch_pad->operands[2] = WrapPointer(return_point);
+  }
+
+  StoreValue(rl_dest, rl_return);
+  return true;
+}
+
 }  // namespace art
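
Putting the pieces together, a C++ model of the address arithmetic and index
recovery the emitted sequence performs (illustrative only; the raw offsets
are the STRING_* constants above):

    #include <cstdint>

    int32_t ModelIndexOf(const uint8_t* string_obj, uint16_t search_char,
                         int32_t start) {
      const uint8_t* value =
          *reinterpret_cast<const uint8_t* const*>(string_obj + 8);          // VALUE
      int32_t count = *reinterpret_cast<const int32_t*>(string_obj + 12);    // COUNT
      int32_t offset = *reinterpret_cast<const int32_t*>(string_obj + 20);   // OFFSET
      // Start of chars = VALUE(String) + 2 * OFFSET(String) + STRING_DATA_OFFSET.
      const uint16_t* base =
          reinterpret_cast<const uint16_t*>(value + 2 * offset + 12);
      if (start < 0) start = 0;
      if (count <= start) return -1;        // the length_compare branch
      const uint16_t* curr = base + start;  // EDI before repne scasw
      int32_t remaining = count - start;    // ECX before repne scasw
      while (remaining-- > 0) {
        if (*curr++ == search_char) {
          // index = ((curr_ptr - orig_ptr) / 2) - 1; scasw advanced past the match.
          return static_cast<int32_t>(curr - base) - 1;
        }
      }
      return -1;
    }
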
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index 480d5f5..4064bd6 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -277,6 +277,7 @@
   kX86Mov32MR, kX86Mov32AR, kX86Mov32TR,
   kX86Mov32RR, kX86Mov32RM, kX86Mov32RA, kX86Mov32RT,
   kX86Mov32RI, kX86Mov32MI, kX86Mov32AI, kX86Mov32TI,
+  kX86Lea32RM,
   kX86Lea32RA,
   // RRC - Register Register ConditionCode - cond_opcode reg1, reg2
   //             - lir operands - 0: reg1, 1: reg2, 2: CC
@@ -384,6 +385,7 @@
   kX86Jcc8, kX86Jcc32,  // jCC rel8/32; lir operands - 0: rel, 1: CC, target assigned
   kX86Jmp8, kX86Jmp32,  // jmp rel8/32; lir operands - 0: rel, target assigned
   kX86JmpR,             // jmp reg; lir operands - 0: reg
+  kX86Jecxz8,           // jecxz rel8; jump relative if ECX is zero.
   kX86CallR,            // call reg; lir operands - 0: reg
   kX86CallM,            // call [base + disp]; lir operands - 0: base, 1: disp
   kX86CallA,            // call [base + index * scale + disp]
@@ -396,6 +398,7 @@
   kX86PcRelLoadRA,      // mov reg, [base + index * scale + PC relative displacement]
                         // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: table
   kX86PcRelAdr,         // mov reg, PC relative displacement; lir operands - 0: reg, 1: table
+  kX86RepneScasw,       // repne scasw
   kX86Last
 };
 
@@ -404,6 +407,7 @@
   kData,                                   // Special case for raw data.
   kNop,                                    // Special case for variable length nop.
   kNullary,                                // Opcode that takes no arguments.
+  kPrefix2Nullary,                         // Opcode that takes no arguments, but 2 prefixes.
   kRegOpcode,                              // Shorter form of R instruction kind (opcode+rd)
   kReg, kMem, kArray,                      // R, M and A instruction kinds.
   kMemReg, kArrayReg, kThreadReg,          // MR, AR and TR instruction kinds.