diff options
author | 2019-10-15 15:36:51 +0100 | |
---|---|---|
committer | 2019-10-23 12:04:00 +0000 | |
commit | 2cc0c0f4b6a76dfb1ad205cfd79efe7efe2904d6 (patch) | |
tree | 5ff5f5f7c5c25f441a36506a84988fa95d2dbd46 /compiler/optimizing/intrinsics_arm64.cc | |
parent | b8c884e5f22390386b202459ab55ef3046631e42 (diff) |
ARM64: toHalf() intrinsic for ARMv8
This CL implements an intrinsic for toHalf() method with
ARMv8.2 FP16 instructions.
This intrinsic implementation achieves bit-level compatibility with the
original Java implementation android.util.Half.toHalf().
The time required to execute the below code on Pixel3:
- Java implementation android.util.Half.toHalf():
- big cluster only: 2136ms
- little cluster only: 6442ms
- arm64 intrinsic implementation:
- big cluster only: 1347ms (~37% faster)
- little cluster only: 4937ms (~23% faster)
// Times FP16.toHalf() over every float bit pattern in [0x33800032, 0x477fff00)
// and prints the elapsed wall-clock time in milliseconds. The converted values
// are summed and returned so the conversions cannot be optimized away.
int benchmarkToHalf() {
    // First input: the float bits of the literal 5.9605E-8, i.e. roughly the
    // smallest positive subnormal value representable in FP16.
    final int firstBits = 0x33800032;
    // Exclusive end of the range: the float bits of 65535.0.
    final int endBits = 0x477fff00;

    int checksum = 0;
    final long startMillis = System.currentTimeMillis();
    for (int bits = firstBits; bits != endBits; ++bits) {
        final short half = FP16.toHalf(Float.intBitsToFloat(bits));
        checksum += half;
    }
    final long endMillis = System.currentTimeMillis();

    System.out.println("Time of FP16.toHalf (ms): " + (endMillis - startMillis));
    return checksum;
}
Test: 580-fp16
Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac
Test: test-art-host, test-art-target
Change-Id: I69b152682390e5ffa5b3fdca60b496261191655d
Diffstat (limited to 'compiler/optimizing/intrinsics_arm64.cc')
-rw-r--r-- | compiler/optimizing/intrinsics_arm64.cc | 24 |
1 files changed, 24 insertions, 0 deletions
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 185d487dff..6a666c9eef 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -3216,6 +3216,30 @@ void IntrinsicCodeGeneratorARM64::VisitFP16ToFloat(HInvoke* invoke) { __ Fcvt(out, half); } +void IntrinsicLocationsBuilderARM64::VisitFP16ToHalf(HInvoke* invoke) { + if (!codegen_->GetInstructionSetFeatures().HasFP16()) { + return; + } + + LocationSummary* locations = new (allocator_) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); +} + +void IntrinsicCodeGeneratorARM64::VisitFP16ToHalf(HInvoke* invoke) { + DCHECK(codegen_->GetInstructionSetFeatures().HasFP16()); + MacroAssembler* masm = GetVIXLAssembler(); + UseScratchRegisterScope scratch_scope(masm); + FPRegister in = SRegisterFrom(invoke->GetLocations()->InAt(0)); + FPRegister half = scratch_scope.AcquireH(); + Register out = WRegisterFrom(invoke->GetLocations()->Out()); + __ Fcvt(half, in); + __ Fmov(out, half); + __ Sxth(out, out); // sign extend due to returning a short type. +} + UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent) UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf); |