ARM64: toHalf() intrinsic for ARMv8

This CL implements an intrinsic for the toHalf() method with ARMv8.2 FP16 instructions. This intrinsic implementation achieves bit-level compatibility with the original Java implementation android.util.Half.toHalf(). The time required to execute the below code on Pixel3: - Java implementation android.util.Half.toHalf(): - big cluster only: 2136ms - little cluster only: 6442ms - arm64 intrinsic implementation: - big cluster only: 1347ms (~37% faster) - little cluster only: 4937ms (~23% faster) int benchmarkToHalf() { int result = 0; // 5.9605E-8 is the smallest positive subnormal number that can be // represented by FP16. This is 0x33800032 in float bits. int raw_input = 0x33800032; long before = 0; long after = 0; before = System.currentTimeMillis(); do { float input = Float.intBitsToFloat(raw_input); short output = FP16.toHalf(input); result += output; } while (++raw_input != 0x477fff00); // 65535 is the max possible integer that can be represented by FP16. // This is 0x477fff00 in float bits. after = System.currentTimeMillis(); System.out.println("Time of FP16.toHalf (ms): " + (after - before)); return result; } Test: 580-fp16 Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac Test: test-art-host, test-art-target Change-Id: I69b152682390e5ffa5b3fdca60b496261191655d
author: Usama Arif <usama.arif@linaro.org> 2019-10-15 15:36:51 +0100
committer: Vladimir Marko <vmarko@google.com> 2019-10-23 12:04:00 +0000
commit: 2cc0c0f4b6a76dfb1ad205cfd79efe7efe2904d6 (patch)
tree: 5ff5f5f7c5c25f441a36506a84988fa95d2dbd46 /compiler/optimizing/intrinsics_arm64.cc
parent: b8c884e5f22390386b202459ab55ef3046631e42 (diff)
1 files changed, 24 insertions, 0 deletions
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 185d487dff..6a666c9eef 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -3216,6 +3216,30 @@ void IntrinsicCodeGeneratorARM64::VisitFP16ToFloat(HInvoke* invoke) {
   __ Fcvt(out, half);
 }
 
+void IntrinsicLocationsBuilderARM64::VisitFP16ToHalf(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {  // No FP16: create no locations.
+    return;
+  }
+
+  LocationSummary* locations = new (allocator_) LocationSummary(invoke,
+                                                                LocationSummary::kNoCall,
+                                                                kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());  // Input 0: the float to convert.
+  locations->SetOut(Location::RequiresRegister());  // Result is read back as a W register.
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16ToHalf(HInvoke* invoke) {
+  DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
+  MacroAssembler* masm = GetVIXLAssembler();
+  UseScratchRegisterScope scratch_scope(masm);
+  FPRegister in = SRegisterFrom(invoke->GetLocations()->InAt(0));
+  FPRegister half = scratch_scope.AcquireH();  // Scratch H register for the converted value.
+  Register out = WRegisterFrom(invoke->GetLocations()->Out());
+  __ Fcvt(half, in);  // Convert the S-register input into the H-register scratch.
+  __ Fmov(out, half);  // Copy the result from the FP scratch into the W output register.
+  __ Sxth(out, out);  // Sign-extend the halfword because the method returns a short.
+}
+
 UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
 
 UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf);
author	Usama Arif <usama.arif@linaro.org>	2019-10-15 15:36:51 +0100
committer	Vladimir Marko <vmarko@google.com>	2019-10-23 12:04:00 +0000
commit	2cc0c0f4b6a76dfb1ad205cfd79efe7efe2904d6 (patch)
tree	5ff5f5f7c5c25f441a36506a84988fa95d2dbd46 /compiler/optimizing/intrinsics_arm64.cc
parent	b8c884e5f22390386b202459ab55ef3046631e42 (diff)