diff options
author | 2016-01-15 14:35:12 -0800 | |
---|---|---|
committer | 2016-01-20 20:14:00 -0800 | |
commit | 3f67e692860d281858485d48a4f1f81b907f1444 (patch) | |
tree | a14d3bdc1416dc3db74983d34a408a8b48cbce9c | |
parent | 6aadaef35ea52506db61e463910c2520b702ca5e (diff) |
Implemented BitCount as an intrinsic. With unit test.
Rationale:
Recognizing this important operation as an intrinsic has
various advantages:
(1) having the no-side-effects/no-throw allows for
much more GVN/LICM/BCE.
(2) Some architectures, like x86_64, provide direct
support for this operation.
Performance improvements on X86_64:
CheckersEvalBench (32-bit bitboard): 27,210KNS -> 36,798KNS = + 35%
ReversiEvalBench (64-bit bitboard): 52,562KNS -> 89,086KNS = + 69%
Change-Id: I65d549b0469b7909b12c6611cdc34a8640a5751f
23 files changed, 343 insertions, 28 deletions
diff --git a/compiler/dex/quick/dex_file_method_inliner.cc b/compiler/dex/quick/dex_file_method_inliner.cc index 4617668ee8..3766093fa8 100644 --- a/compiler/dex/quick/dex_file_method_inliner.cc +++ b/compiler/dex/quick/dex_file_method_inliner.cc @@ -39,6 +39,7 @@ static constexpr bool kIntrinsicIsStatic[] = { true, // kIntrinsicFloatCvt true, // kIntrinsicReverseBits true, // kIntrinsicReverseBytes + true, // kIntrinsicBitCount true, // kIntrinsicNumberOfLeadingZeros true, // kIntrinsicNumberOfTrailingZeros true, // kIntrinsicRotateRight @@ -99,6 +100,7 @@ static_assert(kIntrinsicIsStatic[kIntrinsicDoubleCvt], "DoubleCvt must be static static_assert(kIntrinsicIsStatic[kIntrinsicFloatCvt], "FloatCvt must be static"); static_assert(kIntrinsicIsStatic[kIntrinsicReverseBits], "ReverseBits must be static"); static_assert(kIntrinsicIsStatic[kIntrinsicReverseBytes], "ReverseBytes must be static"); +static_assert(kIntrinsicIsStatic[kIntrinsicBitCount], "BitCount must be static"); static_assert(kIntrinsicIsStatic[kIntrinsicNumberOfLeadingZeros], "NumberOfLeadingZeros must be static"); static_assert(kIntrinsicIsStatic[kIntrinsicNumberOfTrailingZeros], @@ -293,6 +295,7 @@ const char* const DexFileMethodInliner::kNameCacheNames[] = { "putObjectVolatile", // kNameCachePutObjectVolatile "putOrderedObject", // kNameCachePutOrderedObject "arraycopy", // kNameCacheArrayCopy + "bitCount", // kNameCacheBitCount "numberOfLeadingZeros", // kNameCacheNumberOfLeadingZeros "numberOfTrailingZeros", // kNameCacheNumberOfTrailingZeros "rotateRight", // kNameCacheRotateRight @@ -447,6 +450,8 @@ const DexFileMethodInliner::IntrinsicDef DexFileMethodInliner::kIntrinsicMethods INTRINSIC(JavaLangInteger, Reverse, I_I, kIntrinsicReverseBits, k32), INTRINSIC(JavaLangLong, Reverse, J_J, kIntrinsicReverseBits, k64), + INTRINSIC(JavaLangInteger, BitCount, I_I, kIntrinsicBitCount, k32), + INTRINSIC(JavaLangLong, BitCount, J_I, kIntrinsicBitCount, k64), INTRINSIC(JavaLangInteger, NumberOfLeadingZeros, I_I, kIntrinsicNumberOfLeadingZeros, k32), INTRINSIC(JavaLangLong, NumberOfLeadingZeros, J_I, kIntrinsicNumberOfLeadingZeros, k64), INTRINSIC(JavaLangInteger, NumberOfTrailingZeros, I_I, kIntrinsicNumberOfTrailingZeros, k32), @@ -745,6 +750,7 @@ bool DexFileMethodInliner::GenIntrinsic(Mir2Lir* backend, CallInfo* info) { intrinsic.d.data & kIntrinsicFlagIsOrdered); case kIntrinsicSystemArrayCopyCharArray: return backend->GenInlinedArrayCopyCharArray(info); + case kIntrinsicBitCount: case kIntrinsicNumberOfLeadingZeros: case kIntrinsicNumberOfTrailingZeros: case kIntrinsicRotateRight: diff --git a/compiler/dex/quick/dex_file_method_inliner.h b/compiler/dex/quick/dex_file_method_inliner.h index ac70577b48..28036237d7 100644 --- a/compiler/dex/quick/dex_file_method_inliner.h +++ b/compiler/dex/quick/dex_file_method_inliner.h @@ -224,6 +224,7 @@ class DexFileMethodInliner { kNameCachePutObjectVolatile, kNameCachePutOrderedObject, kNameCacheArrayCopy, + kNameCacheBitCount, kNameCacheNumberOfLeadingZeros, kNameCacheNumberOfTrailingZeros, kNameCacheRotateRight, diff --git a/compiler/optimizing/intrinsics.cc b/compiler/optimizing/intrinsics.cc index c6da9a3f5e..5caf077858 100644 --- a/compiler/optimizing/intrinsics.cc +++ b/compiler/optimizing/intrinsics.cc @@ -176,6 +176,16 @@ static Intrinsics GetIntrinsic(InlineMethod method) { } // Misc data processing. + case kIntrinsicBitCount: + switch (GetType(method.d.data, true)) { + case Primitive::kPrimInt: + return Intrinsics::kIntegerBitCount; + case Primitive::kPrimLong: + return Intrinsics::kLongBitCount; + default: + LOG(FATAL) << "Unknown/unsupported op size " << method.d.data; + UNREACHABLE(); + } case kIntrinsicNumberOfLeadingZeros: switch (GetType(method.d.data, true)) { case Primitive::kPrimInt: diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc index b1fbf28204..e72f927e44 100644 --- a/compiler/optimizing/intrinsics_arm.cc +++ b/compiler/optimizing/intrinsics_arm.cc @@ -1577,10 +1577,12 @@ void IntrinsicLocationsBuilderARM::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSE void IntrinsicCodeGeneratorARM::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) { \ } +UNIMPLEMENTED_INTRINSIC(IntegerBitCount) UNIMPLEMENTED_INTRINSIC(IntegerReverse) UNIMPLEMENTED_INTRINSIC(IntegerReverseBytes) UNIMPLEMENTED_INTRINSIC(IntegerRotateLeft) UNIMPLEMENTED_INTRINSIC(IntegerRotateRight) +UNIMPLEMENTED_INTRINSIC(LongBitCount) UNIMPLEMENTED_INTRINSIC(LongReverse) UNIMPLEMENTED_INTRINSIC(LongReverseBytes) UNIMPLEMENTED_INTRINSIC(LongRotateLeft) diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 81cab86c83..c5688a3ea2 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -1447,8 +1447,10 @@ void IntrinsicLocationsBuilderARM64::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNU void IntrinsicCodeGeneratorARM64::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) { \ } +UNIMPLEMENTED_INTRINSIC(IntegerBitCount) UNIMPLEMENTED_INTRINSIC(IntegerRotateLeft) UNIMPLEMENTED_INTRINSIC(IntegerRotateRight) +UNIMPLEMENTED_INTRINSIC(LongBitCount) UNIMPLEMENTED_INTRINSIC(LongRotateLeft) UNIMPLEMENTED_INTRINSIC(LongRotateRight) UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar) diff --git a/compiler/optimizing/intrinsics_list.h b/compiler/optimizing/intrinsics_list.h index 2e87546282..ea380347da 100644 --- a/compiler/optimizing/intrinsics_list.h +++ b/compiler/optimizing/intrinsics_list.h @@ -28,12 +28,14 @@ V(FloatIntBitsToFloat, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \ V(IntegerReverse, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \ V(IntegerReverseBytes, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \ + V(IntegerBitCount, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \ V(IntegerNumberOfLeadingZeros, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \ V(IntegerNumberOfTrailingZeros, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \ V(IntegerRotateRight, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \ V(IntegerRotateLeft, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \ V(LongReverse, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \ V(LongReverseBytes, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \ + V(LongBitCount, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \ V(LongNumberOfLeadingZeros, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \ V(LongNumberOfTrailingZeros, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \ V(LongRotateRight, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow) \ diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc index bc126a2716..81112b1a34 100644 --- a/compiler/optimizing/intrinsics_mips.cc +++ b/compiler/optimizing/intrinsics_mips.cc @@ -935,6 +935,9 @@ void IntrinsicLocationsBuilderMIPS::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUS void IntrinsicCodeGeneratorMIPS::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) { \ } +UNIMPLEMENTED_INTRINSIC(IntegerBitCount) +UNIMPLEMENTED_INTRINSIC(LongBitCount) + UNIMPLEMENTED_INTRINSIC(MathAbsDouble) UNIMPLEMENTED_INTRINSIC(MathAbsFloat) UNIMPLEMENTED_INTRINSIC(MathAbsInt) diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc index 8b45ea7c4f..ac969e39fa 100644 --- a/compiler/optimizing/intrinsics_mips64.cc +++ b/compiler/optimizing/intrinsics_mips64.cc @@ -1724,6 +1724,9 @@ void IntrinsicLocationsBuilderMIPS64::Visit ## Name(HInvoke* invoke ATTRIBUTE_UN void IntrinsicCodeGeneratorMIPS64::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) { \ } +UNIMPLEMENTED_INTRINSIC(IntegerBitCount) +UNIMPLEMENTED_INTRINSIC(LongBitCount) + UNIMPLEMENTED_INTRINSIC(MathRoundDouble) UNIMPLEMENTED_INTRINSIC(MathRoundFloat) diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc index 677f2e9c81..4715bdca2e 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -2518,8 +2518,10 @@ void IntrinsicCodeGeneratorX86::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) UNIMPLEMENTED_INTRINSIC(MathRoundDouble) UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent) +UNIMPLEMENTED_INTRINSIC(IntegerBitCount) UNIMPLEMENTED_INTRINSIC(IntegerRotateLeft) UNIMPLEMENTED_INTRINSIC(IntegerRotateRight) +UNIMPLEMENTED_INTRINSIC(LongBitCount) UNIMPLEMENTED_INTRINSIC(LongRotateRight) UNIMPLEMENTED_INTRINSIC(LongRotateLeft) UNIMPLEMENTED_INTRINSIC(SystemArrayCopy) diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index 690cf3d413..23a628f243 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -2368,6 +2368,70 @@ void IntrinsicCodeGeneratorX86_64::VisitLongReverse(HInvoke* invoke) { SwapBits64(reg, temp1, temp2, 4, INT64_C(0x0f0f0f0f0f0f0f0f), assembler); } +static void CreateBitCountLocations( + ArenaAllocator* arena, CodeGeneratorX86_64* codegen, HInvoke* invoke) { + if (!codegen->GetInstructionSetFeatures().HasPopCnt()) { + // Do nothing if there is no popcnt support. This results in generating + // a call for the intrinsic rather than direct code. + return; + } + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::Any()); + locations->SetOut(Location::RequiresRegister()); +} + +static void GenBitCount(X86_64Assembler* assembler, HInvoke* invoke, bool is_long) { + LocationSummary* locations = invoke->GetLocations(); + Location src = locations->InAt(0); + CpuRegister out = locations->Out().AsRegister<CpuRegister>(); + + if (invoke->InputAt(0)->IsConstant()) { + // Evaluate this at compile time. + int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant()); + value = is_long + ? POPCOUNT(static_cast<uint64_t>(value)) + : POPCOUNT(static_cast<uint32_t>(value)); + if (value == 0) { + __ xorl(out, out); + } else { + __ movl(out, Immediate(value)); + } + return; + } + + if (src.IsRegister()) { + if (is_long) { + __ popcntq(out, src.AsRegister<CpuRegister>()); + } else { + __ popcntl(out, src.AsRegister<CpuRegister>()); + } + } else if (is_long) { + DCHECK(src.IsDoubleStackSlot()); + __ popcntq(out, Address(CpuRegister(RSP), src.GetStackIndex())); + } else { + DCHECK(src.IsStackSlot()); + __ popcntl(out, Address(CpuRegister(RSP), src.GetStackIndex())); + } +} + +void IntrinsicLocationsBuilderX86_64::VisitIntegerBitCount(HInvoke* invoke) { + CreateBitCountLocations(arena_, codegen_, invoke); +} + +void IntrinsicCodeGeneratorX86_64::VisitIntegerBitCount(HInvoke* invoke) { + GenBitCount(GetAssembler(), invoke, /* is_long */ false); +} + +void IntrinsicLocationsBuilderX86_64::VisitLongBitCount(HInvoke* invoke) { + CreateBitCountLocations(arena_, codegen_, invoke); +} + +void IntrinsicCodeGeneratorX86_64::VisitLongBitCount(HInvoke* invoke) { + GenBitCount(GetAssembler(), invoke, /* is_long */ true); +} + static void CreateLeadingZeroLocations(ArenaAllocator* arena, HInvoke* invoke) { LocationSummary* locations = new (arena) LocationSummary(invoke, LocationSummary::kNoCall, diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc index db072678ef..10f5a005e1 100644 --- a/compiler/utils/x86_64/assembler_x86_64.cc +++ b/compiler/utils/x86_64/assembler_x86_64.cc @@ -2247,6 +2247,42 @@ void X86_64Assembler::bsrq(CpuRegister dst, const Address& src) { EmitOperand(dst.LowBits(), src); } +void X86_64Assembler::popcntl(CpuRegister dst, CpuRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0xF3); + EmitOptionalRex32(dst, src); + EmitUint8(0x0F); + EmitUint8(0xB8); + EmitRegisterOperand(dst.LowBits(), src.LowBits()); +} + +void X86_64Assembler::popcntl(CpuRegister dst, const Address& src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0xF3); + EmitOptionalRex32(dst, src); + EmitUint8(0x0F); + EmitUint8(0xB8); + EmitOperand(dst.LowBits(), src); +} + +void X86_64Assembler::popcntq(CpuRegister dst, CpuRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0xF3); + EmitRex64(dst, src); + EmitUint8(0x0F); + EmitUint8(0xB8); + EmitRegisterOperand(dst.LowBits(), src.LowBits()); +} + +void X86_64Assembler::popcntq(CpuRegister dst, const Address& src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0xF3); + EmitRex64(dst, src); + EmitUint8(0x0F); + EmitUint8(0xB8); + EmitOperand(dst.LowBits(), src); +} + void X86_64Assembler::repne_scasw() { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitUint8(0x66); diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h index 01d28e305d..6f0847eb61 100644 --- a/compiler/utils/x86_64/assembler_x86_64.h +++ b/compiler/utils/x86_64/assembler_x86_64.h @@ -647,6 +647,11 @@ class X86_64Assembler FINAL : public Assembler { void bsrq(CpuRegister dst, CpuRegister src); void bsrq(CpuRegister dst, const Address& src); + void popcntl(CpuRegister dst, CpuRegister src); + void popcntl(CpuRegister dst, const Address& src); + void popcntq(CpuRegister dst, CpuRegister src); + void popcntq(CpuRegister dst, const Address& src); + void rorl(CpuRegister reg, const Immediate& imm); void rorl(CpuRegister operand, CpuRegister shifter); void roll(CpuRegister reg, const Immediate& imm); diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc index 00bb5ca36b..8a87fca96a 100644 --- a/compiler/utils/x86_64/assembler_x86_64_test.cc +++ b/compiler/utils/x86_64/assembler_x86_64_test.cc @@ -1333,6 +1333,44 @@ TEST_F(AssemblerX86_64Test, BsrqAddress) { DriverStr(expected, "bsrq_address"); } +TEST_F(AssemblerX86_64Test, Popcntl) { + DriverStr(Repeatrr(&x86_64::X86_64Assembler::popcntl, "popcntl %{reg2}, %{reg1}"), "popcntl"); +} + +TEST_F(AssemblerX86_64Test, PopcntlAddress) { + GetAssembler()->popcntl(x86_64::CpuRegister(x86_64::R10), x86_64::Address( + x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12)); + GetAssembler()->popcntl(x86_64::CpuRegister(x86_64::RDI), x86_64::Address( + x86_64::CpuRegister(x86_64::R10), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12)); + GetAssembler()->popcntl(x86_64::CpuRegister(x86_64::RDI), x86_64::Address( + x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12)); + const char* expected = + "popcntl 0xc(%RDI,%RBX,4), %R10d\n" + "popcntl 0xc(%R10,%RBX,4), %edi\n" + "popcntl 0xc(%RDI,%R9,4), %edi\n"; + + DriverStr(expected, "popcntl_address"); +} + +TEST_F(AssemblerX86_64Test, Popcntq) { + DriverStr(RepeatRR(&x86_64::X86_64Assembler::popcntq, "popcntq %{reg2}, %{reg1}"), "popcntq"); +} + +TEST_F(AssemblerX86_64Test, PopcntqAddress) { + GetAssembler()->popcntq(x86_64::CpuRegister(x86_64::R10), x86_64::Address( + x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12)); + GetAssembler()->popcntq(x86_64::CpuRegister(x86_64::RDI), x86_64::Address( + x86_64::CpuRegister(x86_64::R10), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12)); + GetAssembler()->popcntq(x86_64::CpuRegister(x86_64::RDI), x86_64::Address( + x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12)); + const char* expected = + "popcntq 0xc(%RDI,%RBX,4), %R10\n" + "popcntq 0xc(%R10,%RBX,4), %RDI\n" + "popcntq 0xc(%RDI,%R9,4), %RDI\n"; + + DriverStr(expected, "popcntq_address"); +} + ///////////////// // Near labels // ///////////////// diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc index d4bef0fe7b..1f74c93045 100644 --- a/disassembler/disassembler_x86.cc +++ b/disassembler/disassembler_x86.cc @@ -938,6 +938,11 @@ DISASSEMBLER_ENTRY(cmp, has_modrm = true; load = true; break; + case 0xB8: + opcode1 = "popcnt"; + has_modrm = true; + load = true; + break; case 0xBE: opcode1 = "movsxb"; has_modrm = true; diff --git a/runtime/arch/x86/instruction_set_features_x86.cc b/runtime/arch/x86/instruction_set_features_x86.cc index 42f5df467d..da01ee467a 100644 --- a/runtime/arch/x86/instruction_set_features_x86.cc +++ b/runtime/arch/x86/instruction_set_features_x86.cc @@ -50,6 +50,10 @@ static constexpr const char* x86_variants_prefer_locked_add_sync[] = { "silvermont", }; +static constexpr const char* x86_variants_with_popcnt[] = { + "silvermont", +}; + const X86InstructionSetFeatures* X86InstructionSetFeatures::FromVariant( const std::string& variant, std::string* error_msg ATTRIBUTE_UNUSED, bool x86_64) { @@ -69,6 +73,11 @@ const X86InstructionSetFeatures* X86InstructionSetFeatures::FromVariant( arraysize(x86_variants_prefer_locked_add_sync), variant); + bool has_POPCNT = FindVariantInArray(x86_variants_with_popcnt, + arraysize(x86_variants_with_popcnt), + variant); + + // Verify that variant is known. bool known_variant = FindVariantInArray(x86_known_variants, arraysize(x86_known_variants), variant); if (!known_variant && variant != "default") { @@ -77,10 +86,10 @@ const X86InstructionSetFeatures* X86InstructionSetFeatures::FromVariant( if (x86_64) { return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add); + has_AVX2, prefers_locked_add, has_POPCNT); } else { return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add); + has_AVX2, prefers_locked_add, has_POPCNT); } } @@ -93,12 +102,15 @@ const X86InstructionSetFeatures* X86InstructionSetFeatures::FromBitmap(uint32_t bool has_AVX = (bitmap & kAvxBitfield) != 0; bool has_AVX2 = (bitmap & kAvxBitfield) != 0; bool prefers_locked_add = (bitmap & kPrefersLockedAdd) != 0; + bool has_POPCNT = (bitmap & kPopCntBitfield) != 0; if (x86_64) { return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, - has_AVX, has_AVX2, prefers_locked_add); + has_AVX, has_AVX2, prefers_locked_add, + has_POPCNT); } else { return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, - has_AVX, has_AVX2, prefers_locked_add); + has_AVX, has_AVX2, prefers_locked_add, + has_POPCNT); } } @@ -138,12 +150,15 @@ const X86InstructionSetFeatures* X86InstructionSetFeatures::FromCppDefines(bool // No #define for memory synchronization preference. const bool prefers_locked_add = false; + // No #define for popcnt. + const bool has_POPCNT = false; + if (x86_64) { return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add); + has_AVX2, prefers_locked_add, has_POPCNT); } else { return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add); + has_AVX2, prefers_locked_add, has_POPCNT); } } @@ -158,6 +173,7 @@ const X86InstructionSetFeatures* X86InstructionSetFeatures::FromCpuInfo(bool x86 bool has_AVX2 = false; // No cpuinfo for memory synchronization preference. const bool prefers_locked_add = false; + bool has_POPCNT = false; std::ifstream in("/proc/cpuinfo"); if (!in.fail()) { @@ -183,6 +199,9 @@ const X86InstructionSetFeatures* X86InstructionSetFeatures::FromCpuInfo(bool x86 if (line.find("avx2") != std::string::npos) { has_AVX2 = true; } + if (line.find("popcnt") != std::string::npos) { + has_POPCNT = true; + } } else if (line.find("processor") != std::string::npos && line.find(": 1") != std::string::npos) { smp = true; @@ -195,10 +214,10 @@ const X86InstructionSetFeatures* X86InstructionSetFeatures::FromCpuInfo(bool x86 } if (x86_64) { return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add); + has_AVX2, prefers_locked_add, has_POPCNT); } else { return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add); + has_AVX2, prefers_locked_add, has_POPCNT); } } @@ -223,7 +242,8 @@ bool X86InstructionSetFeatures::Equals(const InstructionSetFeatures* other) cons (has_SSE4_2_ == other_as_x86->has_SSE4_2_) && (has_AVX_ == other_as_x86->has_AVX_) && (has_AVX2_ == other_as_x86->has_AVX2_) && - (prefers_locked_add_ == other_as_x86->prefers_locked_add_); + (prefers_locked_add_ == other_as_x86->prefers_locked_add_) && + (has_POPCNT_ == other_as_x86->has_POPCNT_); } uint32_t X86InstructionSetFeatures::AsBitmap() const { @@ -233,7 +253,8 @@ uint32_t X86InstructionSetFeatures::AsBitmap() const { (has_SSE4_2_ ? kSse4_2Bitfield : 0) | (has_AVX_ ? kAvxBitfield : 0) | (has_AVX2_ ? kAvx2Bitfield : 0) | - (prefers_locked_add_ ? kPrefersLockedAdd : 0); + (prefers_locked_add_ ? kPrefersLockedAdd : 0) | + (has_POPCNT_ ? kPopCntBitfield : 0); } std::string X86InstructionSetFeatures::GetFeatureString() const { @@ -273,6 +294,11 @@ std::string X86InstructionSetFeatures::GetFeatureString() const { } else { result += ",-lock_add"; } + if (has_POPCNT_) { + result += ",popcnt"; + } else { + result += ",-popcnt"; + } return result; } @@ -285,6 +311,7 @@ const InstructionSetFeatures* X86InstructionSetFeatures::AddFeaturesFromSplitStr bool has_AVX = has_AVX_; bool has_AVX2 = has_AVX2_; bool prefers_locked_add = prefers_locked_add_; + bool has_POPCNT = has_POPCNT_; for (auto i = features.begin(); i != features.end(); i++) { std::string feature = Trim(*i); if (feature == "ssse3") { @@ -311,6 +338,10 @@ const InstructionSetFeatures* X86InstructionSetFeatures::AddFeaturesFromSplitStr prefers_locked_add = true; } else if (feature == "-lock_add") { prefers_locked_add = false; + } else if (feature == "popcnt") { + has_POPCNT = true; + } else if (feature == "-popcnt") { + has_POPCNT = false; } else { *error_msg = StringPrintf("Unknown instruction set feature: '%s'", feature.c_str()); return nullptr; @@ -318,10 +349,10 @@ const InstructionSetFeatures* X86InstructionSetFeatures::AddFeaturesFromSplitStr } if (x86_64) { return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add); + has_AVX2, prefers_locked_add, has_POPCNT); } else { return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add); + has_AVX2, prefers_locked_add, has_POPCNT); } } diff --git a/runtime/arch/x86/instruction_set_features_x86.h b/runtime/arch/x86/instruction_set_features_x86.h index 2b845f8dcc..1819654bef 100644 --- a/runtime/arch/x86/instruction_set_features_x86.h +++ b/runtime/arch/x86/instruction_set_features_x86.h @@ -62,6 +62,8 @@ class X86InstructionSetFeatures : public InstructionSetFeatures { bool PrefersLockedAddSynchronization() const { return prefers_locked_add_; } + bool HasPopCnt() const { return has_POPCNT_; } + protected: // Parse a string of the form "ssse3" adding these to a new InstructionSetFeatures. virtual const InstructionSetFeatures* @@ -75,10 +77,17 @@ class X86InstructionSetFeatures : public InstructionSetFeatures { bool x86_64, std::string* error_msg) const; X86InstructionSetFeatures(bool smp, bool has_SSSE3, bool has_SSE4_1, bool has_SSE4_2, - bool has_AVX, bool has_AVX2, bool prefers_locked_add) - : InstructionSetFeatures(smp), has_SSSE3_(has_SSSE3), has_SSE4_1_(has_SSE4_1), - has_SSE4_2_(has_SSE4_2), has_AVX_(has_AVX), has_AVX2_(has_AVX2), - prefers_locked_add_(prefers_locked_add) { + bool has_AVX, bool has_AVX2, + bool prefers_locked_add, + bool has_POPCNT) + : InstructionSetFeatures(smp), + has_SSSE3_(has_SSSE3), + has_SSE4_1_(has_SSE4_1), + has_SSE4_2_(has_SSE4_2), + has_AVX_(has_AVX), + has_AVX2_(has_AVX2), + prefers_locked_add_(prefers_locked_add), + has_POPCNT_(has_POPCNT) { } private: @@ -91,6 +100,7 @@ class X86InstructionSetFeatures : public InstructionSetFeatures { kAvxBitfield = 16, kAvx2Bitfield = 32, kPrefersLockedAdd = 64, + kPopCntBitfield = 128, }; const bool has_SSSE3_; // x86 128bit SIMD - Supplemental SSE. @@ -99,6 +109,7 @@ class X86InstructionSetFeatures : public InstructionSetFeatures { const bool has_AVX_; // x86 256bit SIMD AVX. const bool has_AVX2_; // x86 256bit SIMD AVX 2.0. const bool prefers_locked_add_; // x86 use locked add for memory synchronization. + const bool has_POPCNT_; // x86 population count DISALLOW_COPY_AND_ASSIGN(X86InstructionSetFeatures); }; diff --git a/runtime/arch/x86/instruction_set_features_x86_test.cc b/runtime/arch/x86/instruction_set_features_x86_test.cc index e8d01e6c14..a062c12892 100644 --- a/runtime/arch/x86/instruction_set_features_x86_test.cc +++ b/runtime/arch/x86/instruction_set_features_x86_test.cc @@ -27,7 +27,7 @@ TEST(X86InstructionSetFeaturesTest, X86FeaturesFromDefaultVariant) { ASSERT_TRUE(x86_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_features->GetInstructionSet(), kX86); EXPECT_TRUE(x86_features->Equals(x86_features.get())); - EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add", + EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt", x86_features->GetFeatureString().c_str()); EXPECT_EQ(x86_features->AsBitmap(), 1U); } @@ -40,7 +40,7 @@ TEST(X86InstructionSetFeaturesTest, X86FeaturesFromAtomVariant) { ASSERT_TRUE(x86_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_features->GetInstructionSet(), kX86); EXPECT_TRUE(x86_features->Equals(x86_features.get())); - EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,lock_add", + EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,lock_add,-popcnt", x86_features->GetFeatureString().c_str()); EXPECT_EQ(x86_features->AsBitmap(), 67U); @@ -50,7 +50,7 @@ TEST(X86InstructionSetFeaturesTest, X86FeaturesFromAtomVariant) { ASSERT_TRUE(x86_default_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_default_features->GetInstructionSet(), kX86); EXPECT_TRUE(x86_default_features->Equals(x86_default_features.get())); - EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add", + EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt", x86_default_features->GetFeatureString().c_str()); EXPECT_EQ(x86_default_features->AsBitmap(), 1U); @@ -60,7 +60,7 @@ TEST(X86InstructionSetFeaturesTest, X86FeaturesFromAtomVariant) { ASSERT_TRUE(x86_64_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_64_features->GetInstructionSet(), kX86_64); EXPECT_TRUE(x86_64_features->Equals(x86_64_features.get())); - EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,lock_add", + EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,lock_add,-popcnt", x86_64_features->GetFeatureString().c_str()); EXPECT_EQ(x86_64_features->AsBitmap(), 67U); @@ -77,9 +77,9 @@ TEST(X86InstructionSetFeaturesTest, X86FeaturesFromSilvermontVariant) { ASSERT_TRUE(x86_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_features->GetInstructionSet(), kX86); EXPECT_TRUE(x86_features->Equals(x86_features.get())); - EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,lock_add", + EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,lock_add,popcnt", x86_features->GetFeatureString().c_str()); - EXPECT_EQ(x86_features->AsBitmap(), 79U); + EXPECT_EQ(x86_features->AsBitmap(), 207U); // Build features for a 32-bit x86 default processor. std::unique_ptr<const InstructionSetFeatures> x86_default_features( @@ -87,7 +87,7 @@ TEST(X86InstructionSetFeaturesTest, X86FeaturesFromSilvermontVariant) { ASSERT_TRUE(x86_default_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_default_features->GetInstructionSet(), kX86); EXPECT_TRUE(x86_default_features->Equals(x86_default_features.get())); - EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add", + EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt", x86_default_features->GetFeatureString().c_str()); EXPECT_EQ(x86_default_features->AsBitmap(), 1U); @@ -97,9 +97,9 @@ TEST(X86InstructionSetFeaturesTest, X86FeaturesFromSilvermontVariant) { ASSERT_TRUE(x86_64_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_64_features->GetInstructionSet(), kX86_64); EXPECT_TRUE(x86_64_features->Equals(x86_64_features.get())); - EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,lock_add", + EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,lock_add,popcnt", x86_64_features->GetFeatureString().c_str()); - EXPECT_EQ(x86_64_features->AsBitmap(), 79U); + EXPECT_EQ(x86_64_features->AsBitmap(), 207U); EXPECT_FALSE(x86_64_features->Equals(x86_features.get())); EXPECT_FALSE(x86_64_features->Equals(x86_default_features.get())); diff --git a/runtime/arch/x86_64/instruction_set_features_x86_64.h b/runtime/arch/x86_64/instruction_set_features_x86_64.h index b8000d0001..aba72348f8 100644 --- a/runtime/arch/x86_64/instruction_set_features_x86_64.h +++ b/runtime/arch/x86_64/instruction_set_features_x86_64.h @@ -74,9 +74,10 @@ class X86_64InstructionSetFeatures FINAL : public X86InstructionSetFeatures { private: X86_64InstructionSetFeatures(bool smp, bool has_SSSE3, bool has_SSE4_1, bool has_SSE4_2, - bool has_AVX, bool has_AVX2, bool prefers_locked_add) + bool has_AVX, bool has_AVX2, bool prefers_locked_add, + bool has_POPCNT) : X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add) { + has_AVX2, prefers_locked_add, has_POPCNT) { } friend class X86InstructionSetFeatures; diff --git a/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc b/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc index 4562c64bc9..78aeacf214 100644 --- a/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc +++ b/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc @@ -27,7 +27,7 @@ TEST(X86_64InstructionSetFeaturesTest, X86Features) { ASSERT_TRUE(x86_64_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_64_features->GetInstructionSet(), kX86_64); EXPECT_TRUE(x86_64_features->Equals(x86_64_features.get())); - EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add", + EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt", x86_64_features->GetFeatureString().c_str()); EXPECT_EQ(x86_64_features->AsBitmap(), 1U); } diff --git a/runtime/quick/inline_method_analyser.h b/runtime/quick/inline_method_analyser.h index 6cea90219e..ca456c2861 100644 --- a/runtime/quick/inline_method_analyser.h +++ b/runtime/quick/inline_method_analyser.h @@ -39,6 +39,7 @@ enum InlineMethodOpcode : uint16_t { kIntrinsicFloatCvt, kIntrinsicReverseBits, kIntrinsicReverseBytes, + kIntrinsicBitCount, kIntrinsicNumberOfLeadingZeros, kIntrinsicNumberOfTrailingZeros, kIntrinsicRotateRight, diff --git a/test/564-checker-bitcount/expected.txt b/test/564-checker-bitcount/expected.txt new file mode 100644 index 0000000000..b0aad4deb5 --- /dev/null +++ b/test/564-checker-bitcount/expected.txt @@ -0,0 +1 @@ +passed diff --git a/test/564-checker-bitcount/info.txt b/test/564-checker-bitcount/info.txt new file mode 100644 index 0000000000..57db66b8ca --- /dev/null +++ b/test/564-checker-bitcount/info.txt @@ -0,0 +1 @@ +Unit test for 32-bit and 64-bit bit count operation. diff --git a/test/564-checker-bitcount/src/Main.java b/test/564-checker-bitcount/src/Main.java new file mode 100644 index 0000000000..b250145ef3 --- /dev/null +++ b/test/564-checker-bitcount/src/Main.java @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class Main { + + // TODO: make this work when b/26700769 is done. + // + // CHECK-START-X86_64: int Main.bits32(int) disassembly (after) + // CHECK-DAG: popcnt + // + // CHECK-START-X86_64: int Main.bits32(int) disassembly (after) + // CHECK-NOT: call + private static int bits32(int x) { + return Integer.bitCount(x); + } + + // TODO: make this work when b/26700769 is done. + // + // CHECK-START-X86_64: int Main.bits64(long) disassembly (after) + // CHECK-DAG: popcnt + // + // CHECK-START-X86_64: int Main.bits64(long) disassembly (after) + // CHECK-NOT: call + private static int bits64(long x) { + return Long.bitCount(x); + } + + public static void main(String args[]) { + expectEquals32(bits32(0x00000000), 0); + expectEquals32(bits32(0x00000001), 1); + expectEquals32(bits32(0x10000000), 1); + expectEquals32(bits32(0x10000001), 2); + expectEquals32(bits32(0x00000003), 2); + expectEquals32(bits32(0x70000000), 3); + expectEquals32(bits32(0x000F0000), 4); + expectEquals32(bits32(0x00001111), 4); + expectEquals32(bits32(0x11110000), 4); + expectEquals32(bits32(0x11111111), 8); + expectEquals32(bits32(0x12345678), 13); + expectEquals32(bits32(0x9ABCDEF0), 19); + expectEquals32(bits32(0xFFFFFFFF), 32); + + for (int i = 0; i < 32; i++) { + expectEquals32(bits32(1 << i), 1); + } + + expectEquals64(bits64(0x0000000000000000L), 0); + expectEquals64(bits64(0x0000000000000001L), 1); + expectEquals64(bits64(0x1000000000000000L), 1); + expectEquals64(bits64(0x1000000000000001L), 2); + expectEquals64(bits64(0x0000000000000003L), 2); + expectEquals64(bits64(0x7000000000000000L), 3); + expectEquals64(bits64(0x000F000000000000L), 4); + expectEquals64(bits64(0x0000000011111111L), 8); + expectEquals64(bits64(0x1111111100000000L), 8); + expectEquals64(bits64(0x1111111111111111L), 16); + expectEquals64(bits64(0x123456789ABCDEF1L), 33); + expectEquals64(bits64(0xFFFFFFFFFFFFFFFFL), 64); + + for (int i = 0; i < 64; i++) { + expectEquals64(bits64(1L << i), 1); + } + + System.out.println("passed"); + } + + private static void expectEquals32(int expected, int result) { + if (expected != result) { + throw new Error("Expected: " + expected + ", found: " + result); + } + } + private static void expectEquals64(long expected, long result) { + if (expected != result) { + throw new Error("Expected: " + expected + ", found: " + result); + } + } +} |