author xueliang.zhong <xueliang.zhong@linaro.org> 2016-07-05 15:28:19 +0100
committer xueliang.zhong <xueliang.zhong@linaro.org> 2016-07-14 16:32:50 +0100
commit f1073c81a88bd545a45639865c38c43c83f89419 (patch)
tree e25adf623c8a56552400ae6ad3f52c85c509c3a4 /compiler/optimizing
parent 2e7acaffda05db1df6e0631468f10726e898a20a (diff)
Integer.bitCount and Long.bitCount intrinsics for ARM
Change-Id: I4ed3e779415be026c7d090b61a3e356b37c418e5
Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/intrinsics_arm.cc  47
1 file changed, 45 insertions(+), 2 deletions(-)
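Note (not part of the patch): the generated code moves the operand into a NEON D register, computes per-byte population counts with vcnt.8, and then folds the byte counts into a single value with a chain of vpaddl pairwise additions, with one extra vpaddl step for the long variant. The standalone C++ sketch below emulates that lane reduction on a uint64_t so the arithmetic can be checked on the host; the helper names per_byte_popcount and pairwise_add_long and the sample value are purely illustrative and do not appear in ART.

#include <cstdint>
#include <iostream>

// Emulates vcnt.8: replace each byte of 'd' with its population count.
uint64_t per_byte_popcount(uint64_t d) {
  uint64_t out = 0;
  for (int byte = 0; byte < 8; ++byte) {
    uint64_t b = (d >> (byte * 8)) & 0xff;
    uint64_t count = 0;
    while (b != 0) {
      count += b & 1;
      b >>= 1;
    }
    out |= count << (byte * 8);
  }
  return out;
}

// Emulates vpaddl: add adjacent lanes of 'lane_bits' width into lanes twice as wide.
uint64_t pairwise_add_long(uint64_t d, int lane_bits) {
  uint64_t out = 0;
  uint64_t mask = (1ULL << lane_bits) - 1;
  for (int i = 0; i < 64 / lane_bits; i += 2) {
    uint64_t lo = (d >> (i * lane_bits)) & mask;
    uint64_t hi = (d >> ((i + 1) * lane_bits)) & mask;
    out |= (lo + hi) << ((i / 2) * (2 * lane_bits));
  }
  return out;
}

int main() {
  uint64_t value = 0x00ff00ff0000000fULL;  // Long.bitCount(value) == 20.
  uint64_t d = per_byte_popcount(value);   // vcnt.8  d0, d0
  d = pairwise_add_long(d, 8);             // vpaddl.u8  d0, d0
  d = pairwise_add_long(d, 16);            // vpaddl.u16 d0, d0
  d = pairwise_add_long(d, 32);            // vpaddl.u32 d0, d0 (long case only)
  std::cout << d << "\n";                  // Prints 20.
  return 0;
}

For the integer case the last pairwise step is skipped, and because the 32-bit input is duplicated into both halves of the D register by 'vmov d0, r0, r0', the low S register already holds the correct count.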
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 579fb9d3bb..d25f439b06 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -1979,6 +1979,51 @@ void IntrinsicCodeGeneratorARM::VisitShortReverseBytes(HInvoke* invoke) {
__ revsh(out, in);
}
+static void GenBitCount(HInvoke* instr, Primitive::Type type, ArmAssembler* assembler) {
+ DCHECK(Primitive::IsIntOrLongType(type)) << type;
+ DCHECK_EQ(instr->GetType(), Primitive::kPrimInt);
+ DCHECK_EQ(Primitive::PrimitiveKind(instr->InputAt(0)->GetType()), type);
+
+ bool is_long = type == Primitive::kPrimLong;
+ LocationSummary* locations = instr->GetLocations();
+ Location in = locations->InAt(0);
+ Register src_0 = is_long ? in.AsRegisterPairLow<Register>() : in.AsRegister<Register>();
+ Register src_1 = is_long ? in.AsRegisterPairHigh<Register>() : src_0;
+ SRegister tmp_s = locations->GetTemp(0).AsFpuRegisterPairLow<SRegister>();
+ DRegister tmp_d = FromLowSToD(tmp_s);
+ Register out_r = locations->Out().AsRegister<Register>();
+
+ // Move data from core register(s) to temp D-reg for bit count calculation, then move back.
+ // According to the Cortex-A57 and Cortex-A72 optimization guides, transferring data from a core
+ // reg to the upper or lower half of a VFP D-reg incurs extra latency compared with transferring
+ // to the full D-reg. That's why for the integer bit count we use 'vmov d0, r0, r0' instead of
+ // 'vmov d0[0], r0'.
+ __ vmovdrr(tmp_d, src_1, src_0); // Temp DReg |--src_1|--src_0|
+ __ vcntd(tmp_d, tmp_d); // Temp DReg |c|c|c|c|c|c|c|c|
+ __ vpaddld(tmp_d, tmp_d, 8, /* is_unsigned */ true); // Temp DReg |--c|--c|--c|--c|
+ __ vpaddld(tmp_d, tmp_d, 16, /* is_unsigned */ true); // Temp DReg |------c|------c|
+ if (is_long) {
+ __ vpaddld(tmp_d, tmp_d, 32, /* is_unsigned */ true); // Temp DReg |--------------c|
+ }
+ __ vmovrs(out_r, tmp_s);
+}
+
+void IntrinsicLocationsBuilderARM::VisitIntegerBitCount(HInvoke* invoke) {
+ CreateIntToIntLocations(arena_, invoke);
+ invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+}
+
+void IntrinsicCodeGeneratorARM::VisitIntegerBitCount(HInvoke* invoke) {
+ GenBitCount(invoke, Primitive::kPrimInt, GetAssembler());
+}
+
+void IntrinsicLocationsBuilderARM::VisitLongBitCount(HInvoke* invoke) {
+ VisitIntegerBitCount(invoke);
+}
+
+void IntrinsicCodeGeneratorARM::VisitLongBitCount(HInvoke* invoke) {
+ GenBitCount(invoke, Primitive::kPrimLong, GetAssembler());
+}
+
void IntrinsicLocationsBuilderARM::VisitStringGetCharsNoCheck(HInvoke* invoke) {
LocationSummary* locations = new (arena_) LocationSummary(invoke,
LocationSummary::kNoCall,
@@ -2119,8 +2164,6 @@ void IntrinsicCodeGeneratorARM::VisitDoubleIsInfinite(HInvoke* invoke) {
__ Lsr(out, out, 5);
}
-UNIMPLEMENTED_INTRINSIC(ARM, IntegerBitCount)
-UNIMPLEMENTED_INTRINSIC(ARM, LongBitCount)
UNIMPLEMENTED_INTRINSIC(ARM, MathMinDoubleDouble)
UNIMPLEMENTED_INTRINSIC(ARM, MathMinFloatFloat)
UNIMPLEMENTED_INTRINSIC(ARM, MathMaxDoubleDouble)