From 2cc0c0f4b6a76dfb1ad205cfd79efe7efe2904d6 Mon Sep 17 00:00:00 2001
From: Usama Arif <usama.arif@linaro.org>
Date: Tue, 15 Oct 2019 15:36:51 +0100
Subject: ARM64: toHalf() intrinsic for ARMv8

This CL implements an intrinsic for toHalf() method with
ARMv8.2 FP16 instructions.

This intrinsic implementation achieves bit-level compatibility with the
original Java implementation android.util.Half.toHalf().

The time required to execute the below code on Pixel3:
- Java implementation android.util.Half.toHalf():
    - big cluster only: 2136ms
    - little cluster only: 6442ms
- arm64 intrinsic implementation:
    - big cluster only: 1347ms (~37% faster)
    - little cluster only: 4937ms (~23% faster)

// Benchmark: calls FP16.toHalf() on every float bit pattern from 0x33800032
// up to, but not including, 0x477fff00, then prints the elapsed wall-clock
// time in milliseconds.
int benchmarkToHalf() {
    // Running sum of the converted values; this is the method's return value.
    int result = 0;
    // 5.9605E-8 is approximately the smallest positive subnormal number that
    // can be represented by FP16. This is 0x33800032 in float bits.
    int raw_input = 0x33800032;
    long before = 0;
    long after = 0;
    before = System.currentTimeMillis();
    do {
        float input = Float.intBitsToFloat(raw_input);
        short output = FP16.toHalf(input);
        result += output;
    } while (++raw_input != 0x477fff00);  // Stops before converting 0x477fff00.
    // 0x477fff00 is 65535.0f in float bits.
    // NOTE(review): the original note called 65535 the max integer FP16 can
    // represent, but the largest finite FP16 value is 65504 -- confirm the
    // intended upper bound.
    after = System.currentTimeMillis();
    System.out.println("Time of FP16.toHalf (ms): " + (after - before));
    return result;
}

Test: 580-fp16
Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac
Test: test-art-host, test-art-target

Change-Id: I69b152682390e5ffa5b3fdca60b496261191655d
---
 compiler/optimizing/intrinsics_arm64.cc    | 24 ++++++++++++++++++++++++
 compiler/optimizing/intrinsics_arm_vixl.cc |  1 +
 compiler/optimizing/intrinsics_mips.cc     |  1 +
 compiler/optimizing/intrinsics_mips64.cc   |  1 +
 compiler/optimizing/intrinsics_x86.cc      |  1 +
 compiler/optimizing/intrinsics_x86_64.cc   |  1 +
 6 files changed, 29 insertions(+)

(limited to 'compiler/optimizing')

diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 185d487dff..6a666c9eef 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -3216,6 +3216,30 @@ void IntrinsicCodeGeneratorARM64::VisitFP16ToFloat(HInvoke* invoke) {
   __ Fcvt(out, half);
 }
 
+void IntrinsicLocationsBuilderARM64::VisitFP16ToHalf(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+    return;  // No FP16 feature: no LocationSummary is created for this invoke.
+  }
+
+  LocationSummary* locations = new (allocator_) LocationSummary(invoke,
+                                                                LocationSummary::kNoCall,
+                                                                kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());  // Input 0: FPU register.
+  locations->SetOut(Location::RequiresRegister());  // Result: general-purpose register.
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16ToHalf(HInvoke* invoke) {
+  DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());  // Same gate as the builder.
+  MacroAssembler* masm = GetVIXLAssembler();
+  UseScratchRegisterScope scratch_scope(masm);
+  FPRegister in = SRegisterFrom(invoke->GetLocations()->InAt(0));  // Input viewed as S reg.
+  FPRegister half = scratch_scope.AcquireH();  // Scratch H register for the FP16 value.
+  Register out = WRegisterFrom(invoke->GetLocations()->Out());
+  __ Fcvt(half, in);  // Convert the S-register input to half precision.
+  __ Fmov(out, half);  // Copy the half-precision bits into the W output register.
+  __ Sxth(out, out);  // Sign-extend the low 16 bits: the Java return type is short.
+}
+
 UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
 
 UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf);
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 65f388837d..74e861fa8e 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -3071,6 +3071,7 @@ UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToFloat)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToHalf)
 
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index f71d281d5a..b18bbdde2d 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -2708,6 +2708,7 @@ UNIMPLEMENTED_INTRINSIC(MIPS, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(MIPS, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(MIPS, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToFloat)
+UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToHalf)
 
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index 7b87b03b50..e4627db33f 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -2358,6 +2358,7 @@ UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToFloat)
+UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToHalf)
 
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 5a622ca6d1..95aa4c0eaa 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -3082,6 +3082,7 @@ UNIMPLEMENTED_INTRINSIC(X86, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(X86, FP16ToFloat)
+UNIMPLEMENTED_INTRINSIC(X86, FP16ToHalf)
 
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index cbf66069fe..8dbc0d3062 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -2749,6 +2749,7 @@ UNIMPLEMENTED_INTRINSIC(X86_64, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToFloat)
+UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToHalf)
 
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter);
-- 
cgit v1.2.3-59-g8ed1b