ARM64: FP16.compare() intrinsic for ARMv8
This CL implements an intrinsic for the compare() method using
ARMv8.2 FP16 instructions.
Performance improvements measured with the timeCompareFP16 FP16Intrinsic
micro benchmark on a Pixel 4 (lower is better):
- Java implementation libcore.util.FP16.compare:
- big cluster only: 742
- little cluster only: 2286
- arm64 compare intrinsic implementation:
- big cluster only: 492 (~34% faster)
- little cluster only: 1535 (~33% faster)
The benchmark can be found in the following patch:
https://android-review.linaro.org/c/linaro/art-testing/+/21039
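
For reference, the ordering the intrinsic must produce (as described in the
code comments below: NaN compares equal to NaN and greater than any other
value, and +0 is greater than -0) can be sketched over the raw binary16
encodings as follows. This is an illustrative C++ sketch only, not the libcore
or ART code, and the helper names are hypothetical:

  #include <cstdint>

  // Hypothetical helper: a binary16 NaN has an all-ones exponent (0x7c00)
  // and a non-zero fraction.
  static bool IsNaN16(uint16_t h) {
    return (h & 0x7c00) == 0x7c00 && (h & 0x03ff) != 0;
  }

  // Sketch of the ordering: NaN == NaN, NaN > everything else, +0 > -0.
  int CompareFP16(uint16_t x, uint16_t y) {
    if (IsNaN16(x)) return IsNaN16(y) ? 0 : 1;
    if (IsNaN16(y)) return -1;
    // Map the sign-magnitude encoding to an unsigned key that increases
    // monotonically with the value, placing -0 (0x8000) just below +0 (0x0000).
    auto key = [](uint16_t h) -> uint32_t {
      return (h & 0x8000) ? 0xffffu - h : 0x8000u + h;
    };
    uint32_t kx = key(x);
    uint32_t ky = key(y);
    return (kx > ky) - (kx < ky);  // 1, 0 or -1.
  }

For example, CompareFP16(0x0000, 0x8000), i.e. compare(+0, -0), returns 1,
matching the +0/-0 special case handled in the intrinsic below.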
Authors: Usama Arif, Edward Pickup, Joel Goddard
Test: 580-checker-fp16
Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac
Change-Id: Idbe9f56f964f044e6d725bd696459fb04d2ac76c
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 800e5bd..14b16bb 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -3819,6 +3819,69 @@
GenerateFP16Compare(invoke, codegen_, masm, ls);
}
+void IntrinsicLocationsBuilderARM64::VisitFP16Compare(HInvoke* invoke) {
+ if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+ return;
+ }
+
+ CreateIntIntToIntLocations(allocator_, invoke);
+ invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+ invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16Compare(HInvoke* invoke) {
+ MacroAssembler* masm = GetVIXLAssembler();
+ auto compareOp = [masm](const Register out,
+ const VRegister& in0,
+ const VRegister& in1) {
+ vixl::aarch64::Label end;
+ vixl::aarch64::Label equal;
+ vixl::aarch64::Label normal;
+
+ // The normal cases for this method are:
+ // - in0 > in1 => out = 1
+ // - in0 < in1 => out = -1
+ // - in0 == in1 => out = 0
+ // +/-Infinity are ordered by default so are handled by the normal case.
+ // There are two special cases that Fcmp alone cannot distinguish:
+ // - in0 and in1 are +0 and -0 => +0 > -0, so compare the encodings instead of the values
+ // - in0 or in1 is NaN => check in0 and in1 for NaN separately
+ __ Fcmp(in0, in1);
+ __ B(eq, &equal); // in0 == in1, or the +0/-0 case.
+ __ B(vc, &normal); // in0 and in1 are ordered (not NaN).
+
+ // Either of the inputs is NaN.
+ // NaN is equal to itself and greater than any other number so:
+ // - if only in0 is NaN => return 1
+ // - if only in1 is NaN => return -1
+ // - if both in0 and in1 are NaN => return 0
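+ // For example: compare(NaN, 1.0) == 1, compare(1.0, NaN) == -1,
+ // compare(NaN, NaN) == 0.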
+ __ Fcmp(in0, 0.0);
+ __ Mov(out, -1);
+ __ B(vc, &end); // in0 is not NaN => in1 is the NaN => out = -1.
+ __ Fcmp(in1, 0.0);
+ __ Cset(out, vc); // in1 is not NaN => in0 is the NaN => out = 1; both NaN => out = 0.
+ __ B(&end);
+
+ // in0 == in1, or one input is +0 and the other is -0.
+ __ Bind(&equal);
+ // Compare the encodings of in0 and in1 as single-precision denormals.
+ // The operand order is reversed because the encoding of -0 (0x8000) compares
+ // greater than the encoding of +0 (0x0000) as S registers, while compare()
+ // must treat +0 as greater than -0.
+ // Fmov(HRegister, WRegister) zero-extends into the full vector register, so
+ // bits[127:16] of the inputs are 0 and do not affect the S-register Fcmp below.
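+ // For example, with in0 = +0 (0x0000) and in1 = -0 (0x8000), the Fcmp below
+ // compares the denormal 0x00008000 against 0x00000000 and sets gt, so the
+ // code at the normal label produces out = 1 for compare(+0, -0).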
+ __ Fcmp(in1.S(), in0.S());
+
+ __ Bind(&normal);
+ // The flags tested here come either from the original Fcmp(in0, in1) or, via
+ // the equal path, from the encoding comparison above.
+ __ Cset(out, gt); // if in0 > in1 => out = 1, otherwise out = 0.
+ __ Csinv(out, out, wzr, pl); // if in0 >= in1 => out = out, otherwise out = -1.
+
+ __ Bind(&end);
+ };
+
+ GenerateFP16Compare(invoke, codegen_, masm, compareOp);
+}
+
static void GenerateDivideUnsigned(HInvoke* invoke, CodeGeneratorARM64* codegen) {
LocationSummary* locations = invoke->GetLocations();
MacroAssembler* masm = codegen->GetVIXLAssembler();