ARM64: FP16.ceil() intrinsic for ARMv8

This CL implements an intrinsic for the ceil() method with ARMv8.2 FP16 instructions. This intrinsic implementation achieves bit-level compatibility with the original Java implementation android.util.Half.ceil(). The time required in milliseconds to execute the below code on Pixel3: - Java implementation android.util.Half.ceil(): - big cluster only: 19447 - little cluster only: 62638 - arm64 intrinsic implementation: - big cluster only: 14260 (~27% faster) - little cluster only: 54387 (~13% faster) Analysis of this function with simpleperf showed that only approximately 60-65% of the time is spent in libcore.util.FP16.ceil. So the percentage improvement using intrinsics is likely to be more than the numbers stated above. Another reason that the performance improvement with the intrinsic is lower than expected is that the Java implementation for values between -1 and 1 (abs < 0x3c00) only requires a few instructions and should give almost the same performance as the intrinsic in this case. In the benchmark function below, 46.8% of the values tested are between -1 and 1. public static short benchmarkCeil(){ short ret = 0; long before = 0; long after = 0; before = System.currentTimeMillis(); for(int i = 0; i < 50000; i++){ for (short h = Short.MIN_VALUE; h < Short.MAX_VALUE; h++) { ret += FP16.ceil(h); } } after = System.currentTimeMillis(); System.out.println("Time of FP16.ceil (ms): " + (after - before)); System.out.println(ret); return ret; } Test: 580-fp16 Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac Change-Id: I5474c1d0d7c08ec77a6f82c4fb67f555253bfa67
author: Usama Arif <usama.arif@linaro.org> 2019-10-29 11:13:18 +0000
committer: Hans Boehm <hboehm@google.com> 2019-11-12 20:58:17 +0000
commit: 665aac46784684dfb85fe999f6a566ed0cf173ef (patch)
tree: 343d6956068b1c21e0fc4af018e1322d7685411b
parent: b9f02c2f8624bbf0746939e3b2735a1537a567b6 (diff)
11 files changed, 51 insertions, 1 deletions
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 228255aca6..085959631d 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -3272,6 +3272,22 @@ void IntrinsicCodeGeneratorARM64::VisitFP16Floor(HInvoke* invoke) {
   GenerateFP16Round(invoke, codegen_, masm, roundOp);
 }
 
+void IntrinsicLocationsBuilderARM64::VisitFP16Ceil(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+    return;  // No FP16 support reported: bail out before creating any locations.
+  }
+
+  CreateIntToIntLocations(allocator_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16Ceil(HInvoke* invoke) {
+  MacroAssembler* masm = GetVIXLAssembler();
+  auto roundOp = [masm](const FPRegister& out, const FPRegister& in) {
+    __ Frintp(out, in);  // Round towards Plus infinity (the ceil rounding step)
+  };
+  GenerateFP16Round(invoke, codegen_, masm, roundOp);  // Same helper as VisitFP16Floor.
+}
+
 UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
 
 UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf);
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 8217980ad0..77dcbfb3d4 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -3073,6 +3073,7 @@ UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToHalf)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Floor)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Ceil)
 
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 0bab2a0b17..fc06691bc9 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -2710,6 +2710,7 @@ UNIMPLEMENTED_INTRINSIC(MIPS, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToHalf)
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16Floor)
+UNIMPLEMENTED_INTRINSIC(MIPS, FP16Ceil)
 
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index 6ed1133c90..8a6e94ca99 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -2360,6 +2360,7 @@ UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToHalf)
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Floor)
+UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Ceil)
 
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 7bc9b63d0f..e10214bc4f 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -3084,6 +3084,7 @@ UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(X86, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(X86, FP16ToHalf)
 UNIMPLEMENTED_INTRINSIC(X86, FP16Floor)
+UNIMPLEMENTED_INTRINSIC(X86, FP16Ceil)
 
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index e11208c818..d8ccd9b7e5 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -2751,6 +2751,7 @@ UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToHalf)
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Floor)
+UNIMPLEMENTED_INTRINSIC(X86_64, FP16Ceil)
 
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter);
diff --git a/runtime/hidden_api.h b/runtime/hidden_api.h
index ec24c42135..152a72579f 100644
--- a/runtime/hidden_api.h
+++ b/runtime/hidden_api.h
@@ -357,6 +357,7 @@ ALWAYS_INLINE inline uint32_t GetRuntimeFlags(ArtMethod* method)
       case Intrinsics::kVarHandleWeakCompareAndSetRelease:
         return 0u;
       case Intrinsics::kUnsafeGetLong:
+      case Intrinsics::kFP16Ceil:
       case Intrinsics::kFP16Floor:
       case Intrinsics::kFP16ToFloat:
       case Intrinsics::kFP16ToHalf:
diff --git a/runtime/image.cc b/runtime/image.cc
index 2b4099f258..171547b1e2 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -29,7 +29,7 @@
 namespace art {
 
 const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
-const uint8_t ImageHeader::kImageVersion[] = { '0', '8', '1', '\0' };  // FP16Floor intrinsic
+const uint8_t ImageHeader::kImageVersion[] = { '0', '8', '2', '\0' };  // FP16Ceil intrinsic
 
 ImageHeader::ImageHeader(uint32_t image_reservation_size,
                          uint32_t component_count,
diff --git a/runtime/interpreter/interpreter_intrinsics.cc b/runtime/interpreter/interpreter_intrinsics.cc
index 4ddf9bb465..f3ef257271 100644
--- a/runtime/interpreter/interpreter_intrinsics.cc
+++ b/runtime/interpreter/interpreter_intrinsics.cc
@@ -576,6 +576,7 @@ bool MterpHandleIntrinsic(ShadowFrame* shadow_frame,
     UNIMPLEMENTED_CASE(FP16ToFloat /* (S)F */)
     UNIMPLEMENTED_CASE(FP16ToHalf /* (F)S */)
     UNIMPLEMENTED_CASE(FP16Floor /* (S)S */)
+    UNIMPLEMENTED_CASE(FP16Ceil /* (S)S */)
     INTRINSIC_CASE(VarHandleFullFence)
     INTRINSIC_CASE(VarHandleAcquireFence)
     INTRINSIC_CASE(VarHandleReleaseFence)
diff --git a/runtime/intrinsics_list.h b/runtime/intrinsics_list.h
index 045c808356..ee91066743 100644
--- a/runtime/intrinsics_list.h
+++ b/runtime/intrinsics_list.h
@@ -165,6 +165,7 @@
   V(MemoryPokeIntNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeIntNative", "(JI)V") \
   V(MemoryPokeLongNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeLongNative", "(JJ)V") \
   V(MemoryPokeShortNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeShortNative", "(JS)V") \
+  V(FP16Ceil, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "ceil", "(S)S") \
   V(FP16Floor, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "floor", "(S)S") \
   V(FP16ToFloat, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toFloat", "(S)F") \
   V(FP16ToHalf, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toHalf", "(F)S") \
diff --git a/test/580-fp16/src-art/Main.java b/test/580-fp16/src-art/Main.java
index 815c9f52e8..de9deda8de 100644
--- a/test/580-fp16/src-art/Main.java
+++ b/test/580-fp16/src-art/Main.java
@@ -137,10 +137,36 @@ public class Main {
         assertEquals((short) 0xff00, FP16.floor((short) 0xfd00));
     }
 
+    public static void testCeil() {
+        // These tests have been taken from the cts HalfTest
+        assertEquals(FP16.POSITIVE_INFINITY, FP16.ceil(FP16.POSITIVE_INFINITY));
+        assertEquals(FP16.NEGATIVE_INFINITY, FP16.ceil(FP16.NEGATIVE_INFINITY));
+        assertEquals(FP16.POSITIVE_ZERO, FP16.ceil(FP16.POSITIVE_ZERO));
+        assertEquals(FP16.NEGATIVE_ZERO, FP16.ceil(FP16.NEGATIVE_ZERO));
+        assertEquals(FP16.NaN, FP16.ceil(FP16.NaN));
+        assertEquals(FP16.LOWEST_VALUE, FP16.ceil(FP16.LOWEST_VALUE));
+        assertEquals(1.0f, FP16.toFloat(FP16.ceil(FP16.MIN_NORMAL)));
+        assertEquals(1.0f, FP16.toFloat(FP16.ceil((short) 0x3ff)));
+        assertEquals(1.0f, FP16.toFloat(FP16.ceil(FP16.toHalf(0.2f))));
+        assertEquals(FP16.NEGATIVE_ZERO, FP16.ceil(FP16.toHalf(-0.2f)));
+        assertEquals(1.0f, FP16.toFloat(FP16.ceil(FP16.toHalf(0.7f))));
+        assertEquals(FP16.NEGATIVE_ZERO, FP16.ceil(FP16.toHalf(-0.7f)));
+        assertEquals(125.0f, FP16.toFloat(FP16.ceil(FP16.toHalf(124.7f))));
+        assertEquals(-124.0f, FP16.toFloat(FP16.ceil(FP16.toHalf(-124.7f))));
+        assertEquals(125.0f, FP16.toFloat(FP16.ceil(FP16.toHalf(124.2f))));
+        assertEquals(-124.0f, FP16.toFloat(FP16.ceil(FP16.toHalf(-124.2f))));
+        // ceil for NaN values: each expected result is the input with bit 0x0200 set.
+        assertEquals((short) 0x7e01, FP16.ceil((short) 0x7c01));
+        assertEquals((short) 0x7f00, FP16.ceil((short) 0x7d00));
+        assertEquals((short) 0xfe01, FP16.ceil((short) 0xfc01));
+        assertEquals((short) 0xff00, FP16.ceil((short) 0xfd00));
+    }
+
     public static void main(String args[]) {
         testHalfToFloatToHalfConversions();
         testToHalf();
         testToFloat();
         testFloor();
+        testCeil();  // Runs the FP16.ceil checks defined in testCeil().
     }
 }
author	Usama Arif <usama.arif@linaro.org>	2019-10-29 11:13:18 +0000
committer	Hans Boehm <hboehm@google.com>	2019-11-12 20:58:17 +0000
commit	665aac46784684dfb85fe999f6a566ed0cf173ef (patch)
tree	343d6956068b1c21e0fc4af018e1322d7685411b
parent	b9f02c2f8624bbf0746939e3b2735a1537a567b6 (diff)