ARM64: FP16.compare() intrinsic for ARMv8

This CL implements an intrinsic for the FP16.compare() method using
ARMv8.2 FP16 instructions.
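
For reference, the ordering the generated code has to reproduce is:
ordinary values compare by numeric value, -0 is ordered below +0 (via an
encoding comparison), and NaN is greater than everything and equal to
another NaN. The following is a rough Java sketch of that ordering, not
the libcore source; the class and helper names are illustrative, and it
assumes FP16 values are raw IEEE 754 binary16 bit patterns held in a
short:

  // Sketch only (not the libcore implementation).
  final class Fp16CompareSketch {
    static boolean isNaN(short h) {
      // Exponent all ones with a non-zero fraction.
      return (h & 0x7c00) == 0x7c00 && (h & 0x03ff) != 0;
    }

    static int compare(short x, short y) {
      boolean xIsNaN = isNaN(x);
      boolean yIsNaN = isNaN(y);
      // NaN is greater than any non-NaN value; two NaNs compare as equal.
      if (xIsNaN || yIsNaN) {
        return xIsNaN ? (yIsNaN ? 0 : 1) : -1;
      }
      // Map the sign-magnitude encoding onto a totally ordered int so that
      // negative values sort below positive ones and -0 sorts below +0.
      int xBits = x & 0xffff;
      int yBits = y & 0xffff;
      int xKey = (xBits & 0x8000) != 0 ? 0x7fff - (xBits & 0x7fff) : 0x8000 + xBits;
      int yKey = (yBits & 0x8000) != 0 ? 0x7fff - (yBits & 0x7fff) : 0x8000 + yBits;
      return Integer.compare(xKey, yKey);
    }
  }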

Performance improvements measured with the timeCompareFP16
FP16Intrinsic micro-benchmark on a Pixel 4 (lower is better):
- Java implementation libcore.util.FP16.compare:
    - big cluster only: 742
    - little cluster only: 2286
- arm64 compare() intrinsic implementation:
    - big cluster only: 492 (~34% faster)
    - little cluster only: 1535 (~33% faster)
The benchmark can be found in the following patch:
https://android-review.linaro.org/c/linaro/art-testing/+/21039

Authors: Usama Arif, Edward Pickup, Joel Goddard

Test: 580-checker-fp16
Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac

Change-Id: Idbe9f56f964f044e6d725bd696459fb04d2ac76c
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 800e5bd..14b16bb 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -3819,6 +3819,69 @@
   GenerateFP16Compare(invoke, codegen_, masm, ls);
 }
 
+void IntrinsicLocationsBuilderARM64::VisitFP16Compare(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+    return;
+  }
+
+  CreateIntIntToIntLocations(allocator_, invoke);
+  invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+  invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16Compare(HInvoke* invoke) {
+  MacroAssembler* masm = GetVIXLAssembler();
+  auto compareOp = [masm](const Register out,
+                          const VRegister& in0,
+                          const VRegister& in1) {
+    vixl::aarch64::Label end;
+    vixl::aarch64::Label equal;
+    vixl::aarch64::Label normal;
+
+    // The normal cases for this method are:
+    // - in0 > in1 => out = 1
+    // - in0 < in1 => out = -1
+    // - in0 == in1 => out = 0
+    // +/-Infinity are ordered by default so are handled by the normal case.
+    // There are two special cases that Fcmp alone cannot distinguish:
+    // - in0 and in1 are +0 and -0 => +0 > -0, so compare encodings instead of values
+    // - in0 or in1 is NaN => manually compare with in0 and in1 separately
+    __ Fcmp(in0, in1);
+    __ B(eq, &equal);  // in0==in1 or +0 -0 case.
+    __ B(vc, &normal);  // in0 and in1 are ordered (not NaN).
+
+    // Either of the inputs is NaN.
+    // NaN is equal to itself and greater than any other number so:
+    // - if only in0 is NaN => return 1
+    // - if only in1 is NaN => return -1
+    // - if both in0 and in1 are NaN => return 0
+    __ Fcmp(in0, 0.0);
+    __ Mov(out, -1);
+    __ B(vc, &end);  // in0 is not NaN, so in1 must be => out = -1.
+    __ Fcmp(in1, 0.0);
+    __ Cset(out, vc);  // in0 is NaN: out = 1 if in1 is not NaN, otherwise both are NaN => out = 0.
+    __ B(&end);
+
+    // in0 == in1 or if one of the inputs is +0 and the other is -0.
+    __ Bind(&equal);
+    // Compare the encodings of in0 and in1, viewed as (denormal) single precision values.
+    // The operand order is reversed because the encoding of -0 compares greater than that of +0.
+    // The instruction Fmov(Hregister, Wregister) zero extends the Hregister.
+    // Therefore the value of bits[127:16] will not matter when doing the
+    // below Fcmp as they are set to 0.
+    __ Fcmp(in1.S(), in0.S());
+
+    __ Bind(&normal);
+    __ Cset(out, gt);  // out = 1 if in0 > in1, otherwise out = 0. The flags come either
+                       // from the original Fcmp(in0, in1) or from the encoding Fcmp above.
+    __ Csinv(out, out, wzr, pl);  // Keep out if in0 >= in1, otherwise out = -1.
+
+    __ Bind(&end);
+  };
+
+  GenerateFP16Compare(invoke, codegen_, masm, compareOp);
+}
+
 static void GenerateDivideUnsigned(HInvoke* invoke, CodeGeneratorARM64* codegen) {
   LocationSummary* locations = invoke->GetLocations();
   MacroAssembler* masm = codegen->GetVIXLAssembler();