ARM64: toFloat() intrinsics with ARMv8 FP16.
This CL intrinsifies toFloat() method with ARMv8.2 FP16 instructions.
This CL depends on the android framework and libcore changes:
moving FP16 implementations into libcore.
Tested with local micro benchmark on Pixel 3, compared to original
android.util.Half.toFloat() Java implementation, this intrinsic is
50% faster.
In real-life case, the FP16 toFloat() intrinsic can help
accelerate ColorLong ARGB decoding in Android framework.
This intrinsic implementation archieves bit-level compatibility with the
original Java implementation android.util.Half.toFloat().
Test: 580-fp16
Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac
Test: test-art-host, test-art-target
Change-Id: I059c69747067b84f2c532465e32a1dcd3c25269f
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index da1874e..1fab712 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -3169,6 +3169,29 @@
GenerateCodeForCalculationCRC32ValueOfBytes(masm, crc, ptr, length, out);
}
+void IntrinsicLocationsBuilderARM64::VisitFP16ToFloat(HInvoke* invoke) {
+ if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+ return;
+ }
+
+ LocationSummary* locations = new (allocator_) LocationSummary(invoke,
+ LocationSummary::kNoCall,
+ kIntrinsified);
+ locations->SetInAt(0, Location::RequiresRegister());
+ locations->SetOut(Location::RequiresFpuRegister());
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16ToFloat(HInvoke* invoke) {
+ DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
+ MacroAssembler* masm = GetVIXLAssembler();
+ UseScratchRegisterScope scratch_scope(masm);
+ Register bits = InputRegisterAt(invoke, 0);
+ FPRegister out = SRegisterFrom(invoke->GetLocations()->Out());
+ FPRegister half = scratch_scope.AcquireH();
+ __ Fmov(half, bits); // ARMv8.2
+ __ Fcvt(out, half);
+}
+
UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf);
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index a497cca..65f3888 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -3070,6 +3070,7 @@
UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32Update)
UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32UpdateBytes)
UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32UpdateByteBuffer)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToFloat)
UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOf);
UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 9e5316d..f71d281 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -2707,6 +2707,7 @@
UNIMPLEMENTED_INTRINSIC(MIPS, CRC32Update)
UNIMPLEMENTED_INTRINSIC(MIPS, CRC32UpdateBytes)
UNIMPLEMENTED_INTRINSIC(MIPS, CRC32UpdateByteBuffer)
+UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToFloat)
UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOf);
UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index d8ec892..7b87b03 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -2357,6 +2357,7 @@
UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32Update)
UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32UpdateBytes)
UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32UpdateByteBuffer)
+UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToFloat)
UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOf);
UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index f8d3bf5..5a622ca 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -3081,6 +3081,7 @@
UNIMPLEMENTED_INTRINSIC(X86, CRC32Update)
UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateBytes)
UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateByteBuffer)
+UNIMPLEMENTED_INTRINSIC(X86, FP16ToFloat)
UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOf);
UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index f2a6c53..cbf6606 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -2748,6 +2748,7 @@
UNIMPLEMENTED_INTRINSIC(X86_64, CRC32Update)
UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateBytes)
UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateByteBuffer)
+UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToFloat)
UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf);
UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter);