diff options
author | 2019-10-30 16:23:26 +0000 | |
---|---|---|
committer | 2019-11-12 20:58:17 +0000 | |
commit | 681692b6291008caaddf4971eab7ea9f9b25d9ca (patch) | |
tree | cad26b5b6016cc27baa480e5f4b83faac3317ef0 | |
parent | 665aac46784684dfb85fe999f6a566ed0cf173ef (diff) |
ARM64: FP16.rint() intrinsic for ARMv8
This CL implements an intrinsic for the rint() method with
ARMv8.2 FP16 instructions.
This intrinsic implementation achieves bit-level compatibility with the
original Java implementation android.util.Half.rint().
The time required in milliseconds to execute the below code on Pixel3:
- Java implementation android.util.Half.rint():
- big cluster only: 19828
- little cluster only: 61457
- arm64 Intrinsic implementation:
- big cluster only: 14186 (~28% faster)
- little cluster only: 54405 (~11% faster)
Analysis of this function with simpleperf showed that approximately only
60-65% of the time is spent in libcore.util.FP16.rint. So the percentage
improvement using intrinsics is likely to be more than the numbers stated
above.
Another reason that the performance improvement with the intrinsic is lower
than expected is that the Java implementation for values between -1 and
1 (abs < 0x3c00) only requires a few instructions and should almost give
a similar performance to the intrinsic in this case. In the benchmark function
below, 46.8% of the values tested are between -1 and 1.
public static short benchmarkrint(){
short ret = 0;
long before = 0;
long after = 0;
before = System.currentTimeMillis();
for(int i = 0; i < 50000; i++){
for (short h = Short.MIN_VALUE; h < Short.MAX_VALUE; h++) {
ret += FP16.rint(h);
}
}
after = System.currentTimeMillis();
System.out.println("Time of FP16.rint (ms): " + (after - before));
System.out.println(ret);
return ret;
}
Test: 580-fp16
Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac
Change-Id: I075c3e85a36fd9bce14deee437c5b961bd667b5d
-rw-r--r-- | compiler/optimizing/intrinsics_arm64.cc | 16 | ||||
-rw-r--r-- | compiler/optimizing/intrinsics_arm_vixl.cc | 1 | ||||
-rw-r--r-- | compiler/optimizing/intrinsics_mips.cc | 1 | ||||
-rw-r--r-- | compiler/optimizing/intrinsics_mips64.cc | 1 | ||||
-rw-r--r-- | compiler/optimizing/intrinsics_x86.cc | 1 | ||||
-rw-r--r-- | compiler/optimizing/intrinsics_x86_64.cc | 1 | ||||
-rw-r--r-- | runtime/hidden_api.h | 1 | ||||
-rw-r--r-- | runtime/image.cc | 2 | ||||
-rw-r--r-- | runtime/interpreter/interpreter_intrinsics.cc | 1 | ||||
-rw-r--r-- | runtime/intrinsics_list.h | 1 | ||||
-rw-r--r-- | test/580-fp16/src-art/Main.java | 29 |
11 files changed, 54 insertions, 1 deletions
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 085959631d..9ef2e69737 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -3288,6 +3288,22 @@ void IntrinsicCodeGeneratorARM64::VisitFP16Ceil(HInvoke* invoke) { GenerateFP16Round(invoke, codegen_, masm, roundOp); } +void IntrinsicLocationsBuilderARM64::VisitFP16Rint(HInvoke* invoke) { + if (!codegen_->GetInstructionSetFeatures().HasFP16()) { + return; + } + + CreateIntToIntLocations(allocator_, invoke); +} + +void IntrinsicCodeGeneratorARM64::VisitFP16Rint(HInvoke* invoke) { + MacroAssembler* masm = GetVIXLAssembler(); + auto roundOp = [masm](const FPRegister& out, const FPRegister& in) { + __ Frintn(out, in); // Round to nearest, with ties to even + }; + GenerateFP16Round(invoke, codegen_, masm, roundOp); +} + UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent) UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf); diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc index 77dcbfb3d4..1dfebddf1e 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.cc +++ b/compiler/optimizing/intrinsics_arm_vixl.cc @@ -3074,6 +3074,7 @@ UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToFloat) UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToHalf) UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Floor) UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Ceil) +UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Rint) UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOf); UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOfAfter); diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc index fc06691bc9..ea9c591a20 100644 --- a/compiler/optimizing/intrinsics_mips.cc +++ b/compiler/optimizing/intrinsics_mips.cc @@ -2711,6 +2711,7 @@ UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToFloat) UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToHalf) UNIMPLEMENTED_INTRINSIC(MIPS, FP16Floor) UNIMPLEMENTED_INTRINSIC(MIPS, FP16Ceil) 
+UNIMPLEMENTED_INTRINSIC(MIPS, FP16Rint) UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOf); UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOfAfter); diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc index 8a6e94ca99..fd939026a8 100644 --- a/compiler/optimizing/intrinsics_mips64.cc +++ b/compiler/optimizing/intrinsics_mips64.cc @@ -2361,6 +2361,7 @@ UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToFloat) UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToHalf) UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Floor) UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Ceil) +UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Rint) UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOf); UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOfAfter); diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc index e10214bc4f..9d3cecbbed 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -3085,6 +3085,7 @@ UNIMPLEMENTED_INTRINSIC(X86, FP16ToFloat) UNIMPLEMENTED_INTRINSIC(X86, FP16ToHalf) UNIMPLEMENTED_INTRINSIC(X86, FP16Floor) UNIMPLEMENTED_INTRINSIC(X86, FP16Ceil) +UNIMPLEMENTED_INTRINSIC(X86, FP16Rint) UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOf); UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOfAfter); diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index d8ccd9b7e5..1111a59955 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -2752,6 +2752,7 @@ UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToFloat) UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToHalf) UNIMPLEMENTED_INTRINSIC(X86_64, FP16Floor) UNIMPLEMENTED_INTRINSIC(X86_64, FP16Ceil) +UNIMPLEMENTED_INTRINSIC(X86_64, FP16Rint) UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf); UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter); diff --git a/runtime/hidden_api.h b/runtime/hidden_api.h index 152a72579f..0f6eab08cf 100644 --- a/runtime/hidden_api.h +++ 
b/runtime/hidden_api.h @@ -361,6 +361,7 @@ ALWAYS_INLINE inline uint32_t GetRuntimeFlags(ArtMethod* method) case Intrinsics::kFP16Floor: case Intrinsics::kFP16ToFloat: case Intrinsics::kFP16ToHalf: + case Intrinsics::kFP16Rint: return kAccCorePlatformApi; default: // Remaining intrinsics are public API. We DCHECK that in SetIntrinsic(). diff --git a/runtime/image.cc b/runtime/image.cc index 171547b1e2..2566f80cc3 100644 --- a/runtime/image.cc +++ b/runtime/image.cc @@ -29,7 +29,7 @@ namespace art { const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' }; -const uint8_t ImageHeader::kImageVersion[] = { '0', '8', '2', '\0' }; // FP16Ceil intrinsic +const uint8_t ImageHeader::kImageVersion[] = { '0', '8', '3', '\0' }; // FP16Rint intrinsic ImageHeader::ImageHeader(uint32_t image_reservation_size, uint32_t component_count, diff --git a/runtime/interpreter/interpreter_intrinsics.cc b/runtime/interpreter/interpreter_intrinsics.cc index f3ef257271..5d23e350d8 100644 --- a/runtime/interpreter/interpreter_intrinsics.cc +++ b/runtime/interpreter/interpreter_intrinsics.cc @@ -577,6 +577,7 @@ bool MterpHandleIntrinsic(ShadowFrame* shadow_frame, UNIMPLEMENTED_CASE(FP16ToHalf /* (F)S */) UNIMPLEMENTED_CASE(FP16Floor /* (S)S */) UNIMPLEMENTED_CASE(FP16Ceil /* (S)S */) + UNIMPLEMENTED_CASE(FP16Rint /* (S)S */) INTRINSIC_CASE(VarHandleFullFence) INTRINSIC_CASE(VarHandleAcquireFence) INTRINSIC_CASE(VarHandleReleaseFence) diff --git a/runtime/intrinsics_list.h b/runtime/intrinsics_list.h index ee91066743..2bd738c075 100644 --- a/runtime/intrinsics_list.h +++ b/runtime/intrinsics_list.h @@ -167,6 +167,7 @@ V(MemoryPokeShortNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeShortNative", "(JS)V") \ V(FP16Ceil, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "ceil", "(S)S") \ V(FP16Floor, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "floor", "(S)S") \ + 
V(FP16Rint, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "rint", "(S)S") \ V(FP16ToFloat, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toFloat", "(S)F") \ V(FP16ToHalf, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toHalf", "(F)S") \ V(StringCharAt, kVirtual, kNeedsEnvironmentOrCache, kReadSideEffects, kCanThrow, "Ljava/lang/String;", "charAt", "(I)C") \ diff --git a/test/580-fp16/src-art/Main.java b/test/580-fp16/src-art/Main.java index de9deda8de..2dbec8be4b 100644 --- a/test/580-fp16/src-art/Main.java +++ b/test/580-fp16/src-art/Main.java @@ -162,11 +162,40 @@ public class Main { assertEquals((short) 0xff00, FP16.floor((short) 0xfd00)); } + public static void testRint() { + assertEquals(FP16.POSITIVE_INFINITY, FP16.rint(FP16.POSITIVE_INFINITY)); + assertEquals(FP16.NEGATIVE_INFINITY, FP16.rint(FP16.NEGATIVE_INFINITY)); + assertEquals(FP16.POSITIVE_ZERO, FP16.rint(FP16.POSITIVE_ZERO)); + assertEquals(FP16.NEGATIVE_ZERO, FP16.rint(FP16.NEGATIVE_ZERO)); + assertEquals(FP16.NaN, FP16.rint(FP16.NaN)); + assertEquals(FP16.LOWEST_VALUE, FP16.rint(FP16.LOWEST_VALUE)); + assertEquals(FP16.POSITIVE_ZERO, FP16.rint(FP16.MIN_VALUE)); + assertEquals(FP16.POSITIVE_ZERO, FP16.rint((short) 0x200)); + assertEquals(FP16.POSITIVE_ZERO, FP16.rint((short) 0x3ff)); + assertEquals(FP16.POSITIVE_ZERO, FP16.rint(FP16.toHalf(0.2f))); + assertEquals(FP16.NEGATIVE_ZERO, FP16.rint(FP16.toHalf(-0.2f))); + assertEquals(1.0f, FP16.toFloat(FP16.rint(FP16.toHalf(0.7f)))); + assertEquals(-1.0f, FP16.toFloat(FP16.rint(FP16.toHalf(-0.7f)))); + assertEquals(0.0f, FP16.toFloat(FP16.rint(FP16.toHalf(0.5f)))); + assertEquals(-0.0f, FP16.toFloat(FP16.rint(FP16.toHalf(-0.5f)))); + assertEquals(125.0f, FP16.toFloat(FP16.rint(FP16.toHalf(124.7f)))); + assertEquals(-125.0f, FP16.toFloat(FP16.rint(FP16.toHalf(-124.7f)))); + assertEquals(124.0f, FP16.toFloat(FP16.rint(FP16.toHalf(124.2f)))); + 
assertEquals(-124.0f, FP16.toFloat(FP16.rint(FP16.toHalf(-124.2f)))); + // floor for NaN values + assertEquals((short) 0x7e01, FP16.floor((short) 0x7c01)); + assertEquals((short) 0x7f00, FP16.floor((short) 0x7d00)); + assertEquals((short) 0xfe01, FP16.floor((short) 0xfc01)); + assertEquals((short) 0xff00, FP16.floor((short) 0xfd00)); + + } + public static void main(String args[]) { testHalfToFloatToHalfConversions(); testToHalf(); testToFloat(); testFloor(); testCeil(); + testRint(); } } |