Implementation of integer intrinsics on x86_64

Rationale:
Provides efficient x86_64 implementations of Integer/Long.compare,
highestOneBit, lowestOneBit, and signum, and simplifies constant-input
handling in the existing bitCount and numberOfLeading/TrailingZeros
intrinsics via Load32BitValue. The underlying bit twiddling is sketched
below the test list.
Already tested in:
  564-checker-bitcount
  565-checker-rotate
  566-checker-signum
  567-checker-compare
  568-checker-onebit  (extended to deal with run-time zero)
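
For reference, the scalar computations the generated x86_64 sequences
follow (three-way compare, x & -x for lowestOneBit, 1 << bsr for
highestOneBit, and the shift/negate trick for signum) are sketched in
plain C++ below. The sketch only illustrates the bit twiddling described
in the code comments; the helper names and the __builtin_clzll call are
assumptions for the illustration, not part of the ART sources.

  // Plain C++ sketch (not part of this change) of the scalar computations
  // the generated x86_64 code follows.
  #include <cassert>
  #include <cstdint>

  // Integer/Long.compare: three-way compare yielding -1, 0 or 1.
  int CompareLong(int64_t a, int64_t b) {
    if (a == b) return 0;
    return a < b ? -1 : 1;
  }

  // Integer/Long.lowestOneBit: isolate the lowest set bit with x & -x.
  uint64_t LowestOneBitLong(uint64_t x) {
    return x & (0 - x);  // unsigned (two's-complement) negation
  }

  // Integer/Long.highestOneBit: 1 << bsr(x), with an explicit check for a
  // zero input (BSR leaves its result undefined when the input is zero).
  uint64_t HighestOneBitLong(uint64_t x) {
    if (x == 0) return 0;
    int msb = 63 - __builtin_clzll(x);  // index of the most significant set bit
    return uint64_t{1} << msb;
  }

  // Integer/Long.signum: x >> 63 | -x >>> 63, i.e. the arithmetic shift of
  // the sign bit OR'd with the logical shift of the negated value's sign bit.
  int64_t SignumLong(int64_t x) {
    int64_t sign = x >> 63;                                   // -1 if x < 0, else 0
    uint64_t nonzero = (0 - static_cast<uint64_t>(x)) >> 63;  // sign bit of -x: 1 when x > 0 (and for INT64_MIN)
    return sign | static_cast<int64_t>(nonzero);
  }

  int main() {
    assert(CompareLong(3, 7) == -1 && CompareLong(7, 7) == 0 && CompareLong(9, 7) == 1);
    assert(LowestOneBitLong(40) == 8 && LowestOneBitLong(0) == 0);
    assert(HighestOneBitLong(40) == 32 && HighestOneBitLong(0) == 0);
    assert(SignumLong(-42) == -1 && SignumLong(0) == 0 && SignumLong(42) == 1);
    return 0;
  }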

Change-Id: Ib48c76eee751e7925056d7f26797e9a9b5ae60dd
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 6ccc5d1..51fa514 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -2302,7 +2302,7 @@
 }
 
 void IntrinsicCodeGeneratorX86_64::VisitIntegerReverse(HInvoke* invoke) {
-  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen_->GetAssembler());
+  X86_64Assembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
 
   CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
@@ -2346,7 +2346,7 @@
 }
 
 void IntrinsicCodeGeneratorX86_64::VisitLongReverse(HInvoke* invoke) {
-  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen_->GetAssembler());
+  X86_64Assembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
 
   CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
@@ -2382,7 +2382,10 @@
   locations->SetOut(Location::RequiresRegister());
 }
 
-static void GenBitCount(X86_64Assembler* assembler, HInvoke* invoke, bool is_long) {
+static void GenBitCount(X86_64Assembler* assembler,
+                        CodeGeneratorX86_64* codegen,
+                        HInvoke* invoke,
+                        bool is_long) {
   LocationSummary* locations = invoke->GetLocations();
   Location src = locations->InAt(0);
   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
@@ -2393,11 +2396,7 @@
     value = is_long
         ? POPCOUNT(static_cast<uint64_t>(value))
         : POPCOUNT(static_cast<uint32_t>(value));
-    if (value == 0) {
-      __ xorl(out, out);
-    } else {
-      __ movl(out, Immediate(value));
-    }
+    codegen->Load32BitValue(out, value);
     return;
   }
 
@@ -2421,7 +2420,7 @@
 }
 
 void IntrinsicCodeGeneratorX86_64::VisitIntegerBitCount(HInvoke* invoke) {
-  GenBitCount(GetAssembler(), invoke, /* is_long */ false);
+  GenBitCount(GetAssembler(), codegen_, invoke, /* is_long */ false);
 }
 
 void IntrinsicLocationsBuilderX86_64::VisitLongBitCount(HInvoke* invoke) {
@@ -2429,7 +2428,190 @@
 }
 
 void IntrinsicCodeGeneratorX86_64::VisitLongBitCount(HInvoke* invoke) {
-  GenBitCount(GetAssembler(), invoke, /* is_long */ true);
+  GenBitCount(GetAssembler(), codegen_, invoke, /* is_long */ true);
+}
+
+static void CreateCompareLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister());
+}
+
+static void GenCompare(X86_64Assembler* assembler, HInvoke* invoke, bool is_long) {
+  LocationSummary* locations = invoke->GetLocations();
+  CpuRegister src1 = locations->InAt(0).AsRegister<CpuRegister>();
+  CpuRegister src2 = locations->InAt(1).AsRegister<CpuRegister>();
+  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
+
+  NearLabel is_lt, done;
+
+  __ xorl(out, out);
+
+  if (is_long) {
+    __ cmpq(src1, src2);
+  } else {
+    __ cmpl(src1, src2);
+  }
+  __ j(kEqual, &done);
+  __ j(kLess, &is_lt);
+
+  __ movl(out, Immediate(1));
+  __ jmp(&done);
+
+  __ Bind(&is_lt);
+  __ movl(out, Immediate(-1));
+
+  __ Bind(&done);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitIntegerCompare(HInvoke* invoke) {
+  CreateCompareLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitIntegerCompare(HInvoke* invoke) {
+  GenCompare(GetAssembler(), invoke, /* is_long */ false);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitLongCompare(HInvoke* invoke) {
+  CreateCompareLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitLongCompare(HInvoke* invoke) {
+  GenCompare(GetAssembler(), invoke, /* is_long */ true);
+}
+
+static void CreateOneBitLocations(ArenaAllocator* arena, HInvoke* invoke, bool is_high) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::Any());
+  locations->SetOut(Location::RequiresRegister());
+  locations->AddTemp(is_high ? Location::RegisterLocation(RCX)  // needs CL
+                             : Location::RequiresRegister());  // any will do
+}
+
+static void GenOneBit(X86_64Assembler* assembler,
+                      CodeGeneratorX86_64* codegen,
+                      HInvoke* invoke,
+                      bool is_high, bool is_long) {
+  LocationSummary* locations = invoke->GetLocations();
+  Location src = locations->InAt(0);
+  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
+
+  if (invoke->InputAt(0)->IsConstant()) {
+    // Evaluate this at compile time.
+    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
+    if (value == 0) {
+      __ xorl(out, out);  // Clears upper bits too.
+      return;
+    }
+    // Nonzero value.
+    if (is_high) {
+      value = is_long ? 63 - CLZ(static_cast<uint64_t>(value))
+                      : 31 - CLZ(static_cast<uint32_t>(value));
+    } else {
+      value = is_long ? CTZ(static_cast<uint64_t>(value))
+                      : CTZ(static_cast<uint32_t>(value));
+    }
+    if (is_long) {
+      codegen->Load64BitValue(out, 1L << value);
+    } else {
+      codegen->Load32BitValue(out, 1 << value);
+    }
+    return;
+  }
+
+  // Handle the non-constant cases.
+  CpuRegister tmp = locations->GetTemp(0).AsRegister<CpuRegister>();
+  if (is_high) {
+    // Use architectural support: basically 1 << bsr.
+    if (src.IsRegister()) {
+      if (is_long) {
+        __ bsrq(tmp, src.AsRegister<CpuRegister>());
+      } else {
+        __ bsrl(tmp, src.AsRegister<CpuRegister>());
+      }
+    } else if (is_long) {
+      DCHECK(src.IsDoubleStackSlot());
+      __ bsrq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
+    } else {
+      DCHECK(src.IsStackSlot());
+      __ bsrl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
+    }
+    // BSR sets ZF if the input was zero.
+    NearLabel is_zero, done;
+    __ j(kEqual, &is_zero);
+    __ movl(out, Immediate(1));  // Clears upper bits too.
+    if (is_long) {
+      __ shlq(out, tmp);
+    } else {
+      __ shll(out, tmp);
+    }
+    __ jmp(&done);
+    __ Bind(&is_zero);
+    __ xorl(out, out);  // Clears upper bits too.
+    __ Bind(&done);
+  } else  {
+    // Copy input into temporary.
+    if (src.IsRegister()) {
+      if (is_long) {
+        __ movq(tmp, src.AsRegister<CpuRegister>());
+      } else {
+        __ movl(tmp, src.AsRegister<CpuRegister>());
+      }
+    } else if (is_long) {
+      DCHECK(src.IsDoubleStackSlot());
+      __ movq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
+    } else {
+      DCHECK(src.IsStackSlot());
+      __ movl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
+    }
+    // Do the bit twiddling: basically tmp & -tmp;
+    if (is_long) {
+      __ movq(out, tmp);
+      __ negq(tmp);
+      __ andq(out, tmp);
+    } else {
+      __ movl(out, tmp);
+      __ negl(tmp);
+      __ andl(out, tmp);
+    }
+  }
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) {
+  CreateOneBitLocations(arena_, invoke, /* is_high */ true);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) {
+  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ true, /* is_long */ false);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitLongHighestOneBit(HInvoke* invoke) {
+  CreateOneBitLocations(arena_, invoke, /* is_high */ true);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitLongHighestOneBit(HInvoke* invoke) {
+  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ true, /* is_long */ true);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) {
+  CreateOneBitLocations(arena_, invoke, /* is_high */ false);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) {
+  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ false, /* is_long */ false);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitLongLowestOneBit(HInvoke* invoke) {
+  CreateOneBitLocations(arena_, invoke, /* is_high */ false);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitLongLowestOneBit(HInvoke* invoke) {
+  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ false, /* is_long */ true);
 }
 
 static void CreateLeadingZeroLocations(ArenaAllocator* arena, HInvoke* invoke) {
@@ -2440,7 +2622,9 @@
   locations->SetOut(Location::RequiresRegister());
 }
 
-static void GenLeadingZeros(X86_64Assembler* assembler, HInvoke* invoke, bool is_long) {
+static void GenLeadingZeros(X86_64Assembler* assembler,
+                            CodeGeneratorX86_64* codegen,
+                            HInvoke* invoke, bool is_long) {
   LocationSummary* locations = invoke->GetLocations();
   Location src = locations->InAt(0);
   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
@@ -2454,11 +2638,7 @@
     } else {
       value = is_long ? CLZ(static_cast<uint64_t>(value)) : CLZ(static_cast<uint32_t>(value));
     }
-    if (value == 0) {
-      __ xorl(out, out);
-    } else {
-      __ movl(out, Immediate(value));
-    }
+    codegen->Load32BitValue(out, value);
     return;
   }
 
@@ -2497,8 +2677,7 @@
 }
 
 void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
-  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen_->GetAssembler());
-  GenLeadingZeros(assembler, invoke, /* is_long */ false);
+  GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long */ false);
 }
 
 void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
@@ -2506,8 +2685,7 @@
 }
 
 void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
-  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen_->GetAssembler());
-  GenLeadingZeros(assembler, invoke, /* is_long */ true);
+  GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true);
 }
 
 static void CreateTrailingZeroLocations(ArenaAllocator* arena, HInvoke* invoke) {
@@ -2518,7 +2696,9 @@
   locations->SetOut(Location::RequiresRegister());
 }
 
-static void GenTrailingZeros(X86_64Assembler* assembler, HInvoke* invoke, bool is_long) {
+static void GenTrailingZeros(X86_64Assembler* assembler,
+                             CodeGeneratorX86_64* codegen,
+                             HInvoke* invoke, bool is_long) {
   LocationSummary* locations = invoke->GetLocations();
   Location src = locations->InAt(0);
   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
@@ -2532,11 +2712,7 @@
     } else {
       value = is_long ? CTZ(static_cast<uint64_t>(value)) : CTZ(static_cast<uint32_t>(value));
     }
-    if (value == 0) {
-      __ xorl(out, out);
-    } else {
-      __ movl(out, Immediate(value));
-    }
+    codegen->Load32BitValue(out, value);
     return;
   }
 
@@ -2570,8 +2746,7 @@
 }
 
 void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
-  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen_->GetAssembler());
-  GenTrailingZeros(assembler, invoke, /* is_long */ false);
+  GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ false);
 }
 
 void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
@@ -2579,8 +2754,75 @@
 }
 
 void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
-  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen_->GetAssembler());
-  GenTrailingZeros(assembler, invoke, /* is_long */ true);
+  GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true);
+}
+
+static void CreateSignLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::Any());
+  locations->SetOut(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());  // Need a writeable register.
+}
+
+static void GenSign(X86_64Assembler* assembler,
+                    CodeGeneratorX86_64* codegen,
+                    HInvoke* invoke, bool is_long) {
+  LocationSummary* locations = invoke->GetLocations();
+  Location src = locations->InAt(0);
+  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
+
+  if (invoke->InputAt(0)->IsConstant()) {
+    // Evaluate this at compile time.
+    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
+    codegen->Load32BitValue(out, value == 0 ? 0 : (value > 0 ? 1 : -1));
+    return;
+  }
+
+  // Copy input into temporary.
+  CpuRegister tmp = locations->GetTemp(0).AsRegister<CpuRegister>();
+  if (src.IsRegister()) {
+    if (is_long) {
+      __ movq(tmp, src.AsRegister<CpuRegister>());
+    } else {
+      __ movl(tmp, src.AsRegister<CpuRegister>());
+    }
+  } else if (is_long) {
+    DCHECK(src.IsDoubleStackSlot());
+    __ movq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
+  } else {
+    DCHECK(src.IsStackSlot());
+    __ movl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
+  }
+
+  // Do the bit twiddling: basically tmp >> 63/31 | -tmp >>> 63/31 for long/int.
+  if (is_long) {
+    __ movq(out, tmp);
+    __ sarq(out, Immediate(63));
+    __ negq(tmp);
+    __ shrq(tmp, Immediate(63));
+    __ orq(out, tmp);
+  } else {
+    __ movl(out, tmp);
+    __ sarl(out, Immediate(31));
+    __ negl(tmp);
+    __ shrl(tmp, Immediate(31));
+    __ orl(out, tmp);
+  }
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitIntegerSignum(HInvoke* invoke) {
+  CreateSignLocations(arena_, invoke);
+}
+void IntrinsicCodeGeneratorX86_64::VisitIntegerSignum(HInvoke* invoke) {
+  GenSign(GetAssembler(), codegen_, invoke, /* is_long */ false);
+}
+void IntrinsicLocationsBuilderX86_64::VisitLongSignum(HInvoke* invoke) {
+  CreateSignLocations(arena_, invoke);
+}
+void IntrinsicCodeGeneratorX86_64::VisitLongSignum(HInvoke* invoke) {
+  GenSign(GetAssembler(), codegen_, invoke, /* is_long */ true);
 }
 
 // Unimplemented intrinsics.
@@ -2598,15 +2840,6 @@
 UNIMPLEMENTED_INTRINSIC(FloatIsNaN)
 UNIMPLEMENTED_INTRINSIC(DoubleIsNaN)
 
-UNIMPLEMENTED_INTRINSIC(IntegerCompare)
-UNIMPLEMENTED_INTRINSIC(LongCompare)
-UNIMPLEMENTED_INTRINSIC(IntegerHighestOneBit)
-UNIMPLEMENTED_INTRINSIC(LongHighestOneBit)
-UNIMPLEMENTED_INTRINSIC(IntegerLowestOneBit)
-UNIMPLEMENTED_INTRINSIC(LongLowestOneBit)
-UNIMPLEMENTED_INTRINSIC(IntegerSignum)
-UNIMPLEMENTED_INTRINSIC(LongSignum)
-
 // Rotate operations are handled as HRor instructions.
 UNIMPLEMENTED_INTRINSIC(IntegerRotateLeft)
 UNIMPLEMENTED_INTRINSIC(IntegerRotateRight)