Implemented BitCount as an intrinsic. With unit test.

Rationale:
Recognizing this important operation as an intrinsic has
various advantages:
(1) having the no-side-effects/no-throw allows for
    much more GVN/LICM/BCE.
(2) Some architectures, like x86_64, provide direct
    support for this operation.

Performance improvements on X86_64:
CheckersEvalBench (32-bit bitboard): 27,210KNS -> 36,798KNS  =  + 35%
ReversiEvalBench  (64-bit bitboard): 52,562KNS -> 89,086KNS  =  + 69%

Change-Id: I65d549b0469b7909b12c6611cdc34a8640a5751f
diff --git a/compiler/dex/quick/dex_file_method_inliner.cc b/compiler/dex/quick/dex_file_method_inliner.cc
index 4617668..3766093 100644
--- a/compiler/dex/quick/dex_file_method_inliner.cc
+++ b/compiler/dex/quick/dex_file_method_inliner.cc
@@ -39,6 +39,7 @@
     true,   // kIntrinsicFloatCvt
     true,   // kIntrinsicReverseBits
     true,   // kIntrinsicReverseBytes
+    true,   // kIntrinsicBitCount
     true,   // kIntrinsicNumberOfLeadingZeros
     true,   // kIntrinsicNumberOfTrailingZeros
     true,   // kIntrinsicRotateRight
@@ -99,6 +100,7 @@
 static_assert(kIntrinsicIsStatic[kIntrinsicFloatCvt], "FloatCvt must be static");
 static_assert(kIntrinsicIsStatic[kIntrinsicReverseBits], "ReverseBits must be static");
 static_assert(kIntrinsicIsStatic[kIntrinsicReverseBytes], "ReverseBytes must be static");
+static_assert(kIntrinsicIsStatic[kIntrinsicBitCount], "BitCount must be static");
 static_assert(kIntrinsicIsStatic[kIntrinsicNumberOfLeadingZeros],
               "NumberOfLeadingZeros must be static");
 static_assert(kIntrinsicIsStatic[kIntrinsicNumberOfTrailingZeros],
@@ -293,6 +295,7 @@
     "putObjectVolatile",     // kNameCachePutObjectVolatile
     "putOrderedObject",      // kNameCachePutOrderedObject
     "arraycopy",             // kNameCacheArrayCopy
+    "bitCount",              // kNameCacheBitCount
     "numberOfLeadingZeros",  // kNameCacheNumberOfLeadingZeros
     "numberOfTrailingZeros",  // kNameCacheNumberOfTrailingZeros
     "rotateRight",           // kNameCacheRotateRight
@@ -447,6 +450,8 @@
     INTRINSIC(JavaLangInteger, Reverse, I_I, kIntrinsicReverseBits, k32),
     INTRINSIC(JavaLangLong, Reverse, J_J, kIntrinsicReverseBits, k64),
 
+    INTRINSIC(JavaLangInteger, BitCount, I_I, kIntrinsicBitCount, k32),
+    INTRINSIC(JavaLangLong, BitCount, J_I, kIntrinsicBitCount, k64),
     INTRINSIC(JavaLangInteger, NumberOfLeadingZeros, I_I, kIntrinsicNumberOfLeadingZeros, k32),
     INTRINSIC(JavaLangLong, NumberOfLeadingZeros, J_I, kIntrinsicNumberOfLeadingZeros, k64),
     INTRINSIC(JavaLangInteger, NumberOfTrailingZeros, I_I, kIntrinsicNumberOfTrailingZeros, k32),
@@ -745,6 +750,7 @@
                                           intrinsic.d.data & kIntrinsicFlagIsOrdered);
     case kIntrinsicSystemArrayCopyCharArray:
       return backend->GenInlinedArrayCopyCharArray(info);
+    case kIntrinsicBitCount:
     case kIntrinsicNumberOfLeadingZeros:
     case kIntrinsicNumberOfTrailingZeros:
     case kIntrinsicRotateRight:
diff --git a/compiler/dex/quick/dex_file_method_inliner.h b/compiler/dex/quick/dex_file_method_inliner.h
index ac70577..2803623 100644
--- a/compiler/dex/quick/dex_file_method_inliner.h
+++ b/compiler/dex/quick/dex_file_method_inliner.h
@@ -224,6 +224,7 @@
       kNameCachePutObjectVolatile,
       kNameCachePutOrderedObject,
       kNameCacheArrayCopy,
+      kNameCacheBitCount,
       kNameCacheNumberOfLeadingZeros,
       kNameCacheNumberOfTrailingZeros,
       kNameCacheRotateRight,
diff --git a/compiler/optimizing/intrinsics.cc b/compiler/optimizing/intrinsics.cc
index c6da9a3..5caf077 100644
--- a/compiler/optimizing/intrinsics.cc
+++ b/compiler/optimizing/intrinsics.cc
@@ -176,6 +176,16 @@
       }
 
     // Misc data processing.
+    case kIntrinsicBitCount:
+      switch (GetType(method.d.data, true)) {
+        case Primitive::kPrimInt:
+          return Intrinsics::kIntegerBitCount;
+        case Primitive::kPrimLong:
+          return Intrinsics::kLongBitCount;
+        default:
+          LOG(FATAL) << "Unknown/unsupported op size " << method.d.data;
+          UNREACHABLE();
+      }
     case kIntrinsicNumberOfLeadingZeros:
       switch (GetType(method.d.data, true)) {
         case Primitive::kPrimInt:
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index b1fbf28..e72f927 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -1577,10 +1577,12 @@
 void IntrinsicCodeGeneratorARM::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) {    \
 }
 
+UNIMPLEMENTED_INTRINSIC(IntegerBitCount)
 UNIMPLEMENTED_INTRINSIC(IntegerReverse)
 UNIMPLEMENTED_INTRINSIC(IntegerReverseBytes)
 UNIMPLEMENTED_INTRINSIC(IntegerRotateLeft)
 UNIMPLEMENTED_INTRINSIC(IntegerRotateRight)
+UNIMPLEMENTED_INTRINSIC(LongBitCount)
 UNIMPLEMENTED_INTRINSIC(LongReverse)
 UNIMPLEMENTED_INTRINSIC(LongReverseBytes)
 UNIMPLEMENTED_INTRINSIC(LongRotateLeft)
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 81cab86..c5688a3 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -1447,8 +1447,10 @@
 void IntrinsicCodeGeneratorARM64::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) {    \
 }
 
+UNIMPLEMENTED_INTRINSIC(IntegerBitCount)
 UNIMPLEMENTED_INTRINSIC(IntegerRotateLeft)
 UNIMPLEMENTED_INTRINSIC(IntegerRotateRight)
+UNIMPLEMENTED_INTRINSIC(LongBitCount)
 UNIMPLEMENTED_INTRINSIC(LongRotateLeft)
 UNIMPLEMENTED_INTRINSIC(LongRotateRight)
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
diff --git a/compiler/optimizing/intrinsics_list.h b/compiler/optimizing/intrinsics_list.h
index 2e87546..ea38034 100644
--- a/compiler/optimizing/intrinsics_list.h
+++ b/compiler/optimizing/intrinsics_list.h
@@ -28,12 +28,14 @@
   V(FloatIntBitsToFloat, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \
   V(IntegerReverse, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \
   V(IntegerReverseBytes, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \
+  V(IntegerBitCount, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \
   V(IntegerNumberOfLeadingZeros, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \
   V(IntegerNumberOfTrailingZeros, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \
   V(IntegerRotateRight, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \
   V(IntegerRotateLeft, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \
   V(LongReverse, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \
   V(LongReverseBytes, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \
+  V(LongBitCount, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \
   V(LongNumberOfLeadingZeros, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \
   V(LongNumberOfTrailingZeros, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \
   V(LongRotateRight, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index bc126a2..81112b1 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -935,6 +935,9 @@
 void IntrinsicCodeGeneratorMIPS::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) {    \
 }
 
+UNIMPLEMENTED_INTRINSIC(IntegerBitCount)
+UNIMPLEMENTED_INTRINSIC(LongBitCount)
+
 UNIMPLEMENTED_INTRINSIC(MathAbsDouble)
 UNIMPLEMENTED_INTRINSIC(MathAbsFloat)
 UNIMPLEMENTED_INTRINSIC(MathAbsInt)
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index 8b45ea7..ac969e3 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -1724,6 +1724,9 @@
 void IntrinsicCodeGeneratorMIPS64::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) {    \
 }
 
+UNIMPLEMENTED_INTRINSIC(IntegerBitCount)
+UNIMPLEMENTED_INTRINSIC(LongBitCount)
+
 UNIMPLEMENTED_INTRINSIC(MathRoundDouble)
 UNIMPLEMENTED_INTRINSIC(MathRoundFloat)
 
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 677f2e9..4715bdc 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -2518,8 +2518,10 @@
 
 UNIMPLEMENTED_INTRINSIC(MathRoundDouble)
 UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent)
+UNIMPLEMENTED_INTRINSIC(IntegerBitCount)
 UNIMPLEMENTED_INTRINSIC(IntegerRotateLeft)
 UNIMPLEMENTED_INTRINSIC(IntegerRotateRight)
+UNIMPLEMENTED_INTRINSIC(LongBitCount)
 UNIMPLEMENTED_INTRINSIC(LongRotateRight)
 UNIMPLEMENTED_INTRINSIC(LongRotateLeft)
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopy)
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 690cf3d..23a628f 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -2368,6 +2368,70 @@
   SwapBits64(reg, temp1, temp2, 4, INT64_C(0x0f0f0f0f0f0f0f0f), assembler);
 }
 
+static void CreateBitCountLocations(
+    ArenaAllocator* arena, CodeGeneratorX86_64* codegen, HInvoke* invoke) {
+  if (!codegen->GetInstructionSetFeatures().HasPopCnt()) {
+    // Do nothing if there is no popcnt support. This results in generating
+    // a call for the intrinsic rather than direct code.
+    return;
+  }
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::Any());
+  locations->SetOut(Location::RequiresRegister());
+}
+
+static void GenBitCount(X86_64Assembler* assembler, HInvoke* invoke, bool is_long) {
+  LocationSummary* locations = invoke->GetLocations();
+  Location src = locations->InAt(0);
+  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
+
+  if (invoke->InputAt(0)->IsConstant()) {
+    // Evaluate this at compile time.
+    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
+    value = is_long
+        ? POPCOUNT(static_cast<uint64_t>(value))
+        : POPCOUNT(static_cast<uint32_t>(value));
+    if (value == 0) {
+      __ xorl(out, out);
+    } else {
+      __ movl(out, Immediate(value));
+    }
+    return;
+  }
+
+  if (src.IsRegister()) {
+    if (is_long) {
+      __ popcntq(out, src.AsRegister<CpuRegister>());
+    } else {
+      __ popcntl(out, src.AsRegister<CpuRegister>());
+    }
+  } else if (is_long) {
+    DCHECK(src.IsDoubleStackSlot());
+    __ popcntq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
+  } else {
+    DCHECK(src.IsStackSlot());
+    __ popcntl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
+  }
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitIntegerBitCount(HInvoke* invoke) {
+  CreateBitCountLocations(arena_, codegen_, invoke);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitIntegerBitCount(HInvoke* invoke) {
+  GenBitCount(GetAssembler(), invoke, /* is_long */ false);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitLongBitCount(HInvoke* invoke) {
+  CreateBitCountLocations(arena_, codegen_, invoke);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitLongBitCount(HInvoke* invoke) {
+  GenBitCount(GetAssembler(), invoke, /* is_long */ true);
+}
+
 static void CreateLeadingZeroLocations(ArenaAllocator* arena, HInvoke* invoke) {
   LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                            LocationSummary::kNoCall,
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index db07267..10f5a00 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -2247,6 +2247,42 @@
   EmitOperand(dst.LowBits(), src);
 }
 
+void X86_64Assembler::popcntl(CpuRegister dst, CpuRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xB8);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
+}
+
+void X86_64Assembler::popcntl(CpuRegister dst, const Address& src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xB8);
+  EmitOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::popcntq(CpuRegister dst, CpuRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF3);
+  EmitRex64(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xB8);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
+}
+
+void X86_64Assembler::popcntq(CpuRegister dst, const Address& src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF3);
+  EmitRex64(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xB8);
+  EmitOperand(dst.LowBits(), src);
+}
+
 void X86_64Assembler::repne_scasw() {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 01d28e3..6f0847e 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -647,6 +647,11 @@
   void bsrq(CpuRegister dst, CpuRegister src);
   void bsrq(CpuRegister dst, const Address& src);
 
+  void popcntl(CpuRegister dst, CpuRegister src);
+  void popcntl(CpuRegister dst, const Address& src);
+  void popcntq(CpuRegister dst, CpuRegister src);
+  void popcntq(CpuRegister dst, const Address& src);
+
   void rorl(CpuRegister reg, const Immediate& imm);
   void rorl(CpuRegister operand, CpuRegister shifter);
   void roll(CpuRegister reg, const Immediate& imm);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 00bb5ca..8a87fca 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1333,6 +1333,44 @@
   DriverStr(expected, "bsrq_address");
 }
 
+TEST_F(AssemblerX86_64Test, Popcntl) {
+  DriverStr(Repeatrr(&x86_64::X86_64Assembler::popcntl, "popcntl %{reg2}, %{reg1}"), "popcntl");
+}
+
+TEST_F(AssemblerX86_64Test, PopcntlAddress) {
+  GetAssembler()->popcntl(x86_64::CpuRegister(x86_64::R10), x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
+  GetAssembler()->popcntl(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
+      x86_64::CpuRegister(x86_64::R10), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
+  GetAssembler()->popcntl(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
+  const char* expected =
+    "popcntl 0xc(%RDI,%RBX,4), %R10d\n"
+    "popcntl 0xc(%R10,%RBX,4), %edi\n"
+    "popcntl 0xc(%RDI,%R9,4), %edi\n";
+
+  DriverStr(expected, "popcntl_address");
+}
+
+TEST_F(AssemblerX86_64Test, Popcntq) {
+  DriverStr(RepeatRR(&x86_64::X86_64Assembler::popcntq, "popcntq %{reg2}, %{reg1}"), "popcntq");
+}
+
+TEST_F(AssemblerX86_64Test, PopcntqAddress) {
+  GetAssembler()->popcntq(x86_64::CpuRegister(x86_64::R10), x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
+  GetAssembler()->popcntq(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
+      x86_64::CpuRegister(x86_64::R10), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
+  GetAssembler()->popcntq(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
+  const char* expected =
+    "popcntq 0xc(%RDI,%RBX,4), %R10\n"
+    "popcntq 0xc(%R10,%RBX,4), %RDI\n"
+    "popcntq 0xc(%RDI,%R9,4), %RDI\n";
+
+  DriverStr(expected, "popcntq_address");
+}
+
 /////////////////
 // Near labels //
 /////////////////