Implement CountLeadingZeros for x86

Generate Long and Integer numberOfLeadingZeros intrinsics for x86 and
x86_64. Use the 'bsr' instruction to find the most significant set bit,
then correct the result to a leading-zero count.
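
For non-zero input the correction is simply 31 - bsr(x), which is the
same as bsr(x) XOR 31; a zero input is special-cased because BSR leaves
the destination undefined. A minimal C++ sketch of the 32-bit logic (an
illustration only, not the generated code; bsr() here is a software
stand-in for the hardware instruction):

    #include <cstdint>

    // Stand-in for x86 BSR: index of the highest set bit. Only valid
    // for non-zero inputs, matching the guarded use in the intrinsic.
    static int bsr(uint32_t x) {
      int index = 0;
      while (x >>= 1) {
        ++index;
      }
      return index;
    }

    static int clz32(uint32_t x) {
      if (x == 0) {
        return 32;          // BSR sets ZF here; the intrinsic branches and loads 32.
      }
      return bsr(x) ^ 31;   // XOR with 31 turns the bit index into a zero count.
    }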

Add some more tests with constant values to exercise constant folding,
and a runtime test with 0 as the input.

Change-Id: I920b21bb00069bccf5f921f8f87a77e334114926
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 4471d71..b7126b2 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -20,6 +20,7 @@
 
 #include "arch/x86/instruction_set_features_x86.h"
 #include "art_method.h"
+#include "base/bit_utils.h"
 #include "code_generator_x86.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "intrinsics.h"
@@ -1835,6 +1836,115 @@
   SwapBits(reg_high, temp, 4, 0x0f0f0f0f, assembler);
 }
 
+static void CreateLeadingZeroLocations(ArenaAllocator* arena, HInvoke* invoke, bool is_long) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  if (is_long) {
+    locations->SetInAt(0, Location::RequiresRegister());
+  } else {
+    locations->SetInAt(0, Location::Any());
+  }
+  locations->SetOut(Location::RequiresRegister());
+}
+
+static void GenLeadingZeros(X86Assembler* assembler, HInvoke* invoke, bool is_long) {
+  LocationSummary* locations = invoke->GetLocations();
+  Location src = locations->InAt(0);
+  Register out = locations->Out().AsRegister<Register>();
+
+  if (invoke->InputAt(0)->IsConstant()) {
+    // Evaluate this at compile time.
+    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
+    if (value == 0) {
+      value = is_long ? 64 : 32;
+    } else {
+      value = is_long ? CLZ(static_cast<uint64_t>(value)) : CLZ(static_cast<uint32_t>(value));
+    }
+    if (value == 0) {
+      __ xorl(out, out);
+    } else {
+      __ movl(out, Immediate(value));
+    }
+    return;
+  }
+
+  // Handle the non-constant cases.
+  if (!is_long) {
+    if (src.IsRegister()) {
+      __ bsrl(out, src.AsRegister<Register>());
+    } else {
+      DCHECK(src.IsStackSlot());
+      __ bsrl(out, Address(ESP, src.GetStackIndex()));
+    }
+
+    // BSR sets ZF if the input was zero; the output register is undefined in that case.
+    Label all_zeroes, done;
+    __ j(kEqual, &all_zeroes);
+
+    // Correct the result from BSR to get the final CLZ result.
+    __ xorl(out, Immediate(31));
+    __ jmp(&done);
+
+    // Fix the zero case with the expected result.
+    __ Bind(&all_zeroes);
+    __ movl(out, Immediate(32));
+
+    __ Bind(&done);
+    return;
+  }
+
+  // The 64-bit case has to handle both halves of the register pair.
+  DCHECK(src.IsRegisterPair());
+  Register src_lo = src.AsRegisterPairLow<Register>();
+  Register src_hi = src.AsRegisterPairHigh<Register>();
+  Label handle_low, done, all_zeroes;
+
+  // Is the high word zero?
+  __ testl(src_hi, src_hi);
+  __ j(kEqual, &handle_low);
+
+  // High word is not zero. We know that the BSR result is defined in this case.
+  __ bsrl(out, src_hi);
+
+  // Correct the result from BSR to get the final CLZ result.
+  __ xorl(out, Immediate(31));
+  __ jmp(&done);
+
+  // High word was zero.  We have to compute the low word count and add 32.
+  __ Bind(&handle_low);
+  __ bsrl(out, src_lo);
+  __ j(kEqual, &all_zeroes);
+
+  // We had a valid result. XOR with 63 corrects the BSR index and adds 32 for the zero high word.
+  __ xorl(out, Immediate(63));
+  __ jmp(&done);
+
+  // All zero case.
+  __ Bind(&all_zeroes);
+  __ movl(out, Immediate(64));
+
+  __ Bind(&done);
+}
+
+void IntrinsicLocationsBuilderX86::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
+  CreateLeadingZeroLocations(arena_, invoke, /* is_long */ false);
+}
+
+void IntrinsicCodeGeneratorX86::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
+  X86Assembler* assembler = down_cast<X86Assembler*>(codegen_->GetAssembler());
+  GenLeadingZeros(assembler, invoke, /* is_long */ false);
+}
+
+void IntrinsicLocationsBuilderX86::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
+  CreateLeadingZeroLocations(arena_, invoke, /* is_long */ true);
+}
+
+void IntrinsicCodeGeneratorX86::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
+  X86Assembler* assembler = down_cast<X86Assembler*>(codegen_->GetAssembler());
+  GenLeadingZeros(assembler, invoke, /* is_long */ true);
+}
+
 // Unimplemented intrinsics.
 
 #define UNIMPLEMENTED_INTRINSIC(Name)                                                   \
@@ -1847,8 +1957,6 @@
 UNIMPLEMENTED_INTRINSIC(StringGetCharsNoCheck)
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
 UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent)
-UNIMPLEMENTED_INTRINSIC(IntegerNumberOfLeadingZeros)
-UNIMPLEMENTED_INTRINSIC(LongNumberOfLeadingZeros)
 
 #undef UNIMPLEMENTED_INTRINSIC
 
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 9ea68ec..15fbac1 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -20,6 +20,7 @@
 
 #include "arch/x86_64/instruction_set_features_x86_64.h"
 #include "art_method-inl.h"
+#include "base/bit_utils.h"
 #include "code_generator_x86_64.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "intrinsics.h"
@@ -1685,6 +1686,84 @@
   SwapBits64(reg, temp1, temp2, 4, INT64_C(0x0f0f0f0f0f0f0f0f), assembler);
 }
 
+static void CreateLeadingZeroLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::Any());
+  locations->SetOut(Location::RequiresRegister());
+}
+
+static void GenLeadingZeros(X86_64Assembler* assembler, HInvoke* invoke, bool is_long) {
+  LocationSummary* locations = invoke->GetLocations();
+  Location src = locations->InAt(0);
+  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
+
+  int zero_value_result = is_long ? 64 : 32;
+  if (invoke->InputAt(0)->IsConstant()) {
+    // Evaluate this at compile time.
+    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
+    if (value == 0) {
+      value = zero_value_result;
+    } else {
+      value = is_long ? CLZ(static_cast<uint64_t>(value)) : CLZ(static_cast<uint32_t>(value));
+    }
+    if (value == 0) {
+      __ xorl(out, out);
+    } else {
+      __ movl(out, Immediate(value));
+    }
+    return;
+  }
+
+  // Handle the non-constant cases.
+  if (src.IsRegister()) {
+    if (is_long) {
+      __ bsrq(out, src.AsRegister<CpuRegister>());
+    } else {
+      __ bsrl(out, src.AsRegister<CpuRegister>());
+    }
+  } else if (is_long) {
+    DCHECK(src.IsDoubleStackSlot());
+    __ bsrq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
+  } else {
+    DCHECK(src.IsStackSlot());
+    __ bsrl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
+  }
+
+  // BSR sets ZF if the input was zero; the output register is undefined in that case.
+  Label is_zero, done;
+  __ j(kEqual, &is_zero);
+
+  // Correct the result from BSR to get the CLZ result.
+  __ xorl(out, Immediate(zero_value_result - 1));
+  __ jmp(&done);
+
+  // Fix the zero case with the expected result.
+  __ Bind(&is_zero);
+  __ movl(out, Immediate(zero_value_result));
+
+  __ Bind(&done);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
+  CreateLeadingZeroLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
+  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen_->GetAssembler());
+  GenLeadingZeros(assembler, invoke, /* is_long */ false);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
+  CreateLeadingZeroLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
+  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen_->GetAssembler());
+  GenLeadingZeros(assembler, invoke, /* is_long */ true);
+}
+
 // Unimplemented intrinsics.
 
 #define UNIMPLEMENTED_INTRINSIC(Name)                                                   \
@@ -1696,8 +1775,6 @@
 UNIMPLEMENTED_INTRINSIC(StringGetCharsNoCheck)
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
 UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent)
-UNIMPLEMENTED_INTRINSIC(IntegerNumberOfLeadingZeros)
-UNIMPLEMENTED_INTRINSIC(LongNumberOfLeadingZeros)
 
 #undef UNIMPLEMENTED_INTRINSIC
 
diff --git a/test/082-inline-execute/src/Main.java b/test/082-inline-execute/src/Main.java
index 77c1a99..bd606a6 100644
--- a/test/082-inline-execute/src/Main.java
+++ b/test/082-inline-execute/src/Main.java
@@ -1043,8 +1043,20 @@
     return (r1 / i1) + (r2 / i2) + i3 + i4 + i5 + i6 + i7 + i8;
   }
 
+  public static boolean doThrow = false;
+
+  public static int $noinline$return_int_zero() {
+    if (doThrow) {
+      throw new Error();
+    }
+    return 0;
+  }
+
   public static void test_Integer_numberOfLeadingZeros() {
     Assert.assertEquals(Integer.numberOfLeadingZeros(0), Integer.SIZE);
+    Assert.assertEquals(Integer.numberOfLeadingZeros(1), Integer.SIZE - 1);
+    Assert.assertEquals(Integer.numberOfLeadingZeros(1 << (Integer.SIZE-1)), 0);
+    Assert.assertEquals(Integer.numberOfLeadingZeros($noinline$return_int_zero()), Integer.SIZE);
     for (int i = 0; i < Integer.SIZE; i++) {
         Assert.assertEquals(Integer.numberOfLeadingZeros(1 << i), Integer.SIZE - 1 - i);
         Assert.assertEquals(Integer.numberOfLeadingZeros((1 << i) | 1), Integer.SIZE - 1 - i);
@@ -1052,8 +1064,19 @@
     }
   }
 
+  public static long $noinline$return_long_zero() {
+    if (doThrow) {
+      throw new Error();
+    }
+    return 0;
+  }
+
   public static void test_Long_numberOfLeadingZeros() {
     Assert.assertEquals(Long.numberOfLeadingZeros(0L), Long.SIZE);
+    Assert.assertEquals(Long.numberOfLeadingZeros(1L), Long.SIZE - 1);
+    Assert.assertEquals(Long.numberOfLeadingZeros(1L << ((Long.SIZE/2)-1)), Long.SIZE/2);
+    Assert.assertEquals(Long.numberOfLeadingZeros(1L << (Long.SIZE-1)), 0);
+    Assert.assertEquals(Long.numberOfLeadingZeros($noinline$return_long_zero()), Long.SIZE);
     for (int i = 0; i < Long.SIZE; i++) {
         Assert.assertEquals(Long.numberOfLeadingZeros(1L << i), Long.SIZE - 1 - i);
         Assert.assertEquals(Long.numberOfLeadingZeros((1L << i) | 1L), Long.SIZE - 1 - i);