ARM64: toFloat() intrinsics with ARMv8 FP16. This CL intrinsifies the toFloat() method with ARMv8.2 FP16 instructions. This CL depends on the android framework and libcore changes that move the FP16 implementations into libcore. In a local micro-benchmark on Pixel 3, this intrinsic is 50% faster than the original android.util.Half.toFloat() Java implementation. In real-life cases, the FP16 toFloat() intrinsic can help accelerate ColorLong ARGB decoding in the Android framework. This intrinsic implementation achieves bit-level compatibility with the original Java implementation android.util.Half.toFloat(). Test: 580-fp16 Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac Test: test-art-host, test-art-target Change-Id: I059c69747067b84f2c532465e32a1dcd3c25269f

commit: 9ce340f829f836560278ecd078fbefcf19c9d629 [log] [tgz]
author: xueliang.zhong <xueliang.zhong@linaro.org> Tue Jan 22 17:46:09 2019 +0000
committer: Hans Boehm <hboehm@google.com> Thu Aug 15 20:35:11 2019 +0000
tree: 3327d6d7dd3c0ff86861cdab67791ee7c8dbe3b8
parent: d4fc62c66328c0944348a314e3770b4f2b8006ce [diff] [blame]
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index da1874e..1fab712 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc

@@ -3169,6 +3169,29 @@
   GenerateCodeForCalculationCRC32ValueOfBytes(masm, crc, ptr, length, out);
 }
 
+void IntrinsicLocationsBuilderARM64::VisitFP16ToFloat(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+    return;
+  }
+
+  LocationSummary* locations = new (allocator_) LocationSummary(invoke,
+                                                                LocationSummary::kNoCall,
+                                                                kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresFpuRegister());
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16ToFloat(HInvoke* invoke) {
+  DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
+  MacroAssembler* masm = GetVIXLAssembler();
+  UseScratchRegisterScope scratch_scope(masm);
+  Register bits = InputRegisterAt(invoke, 0);
+  FPRegister out = SRegisterFrom(invoke->GetLocations()->Out());
+  FPRegister half = scratch_scope.AcquireH();
+  __ Fmov(half, bits);  // ARMv8.2
+  __ Fcvt(out, half);
+}
+
 UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
 
 UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf);
commit	9ce340f829f836560278ecd078fbefcf19c9d629	[log] [tgz]
author	xueliang.zhong <xueliang.zhong@linaro.org>	Tue Jan 22 17:46:09 2019 +0000
committer	Hans Boehm <hboehm@google.com>	Thu Aug 15 20:35:11 2019 +0000
tree	3327d6d7dd3c0ff86861cdab67791ee7c8dbe3b8
parent	d4fc62c66328c0944348a314e3770b4f2b8006ce [diff] [blame]