ARM64: FP16.rint() intrinsic for ARMv8
This CL implements an intrinsic for rint() method with
ARMv8.2 FP16 instructions.
This intrinsic implementation achieves bit-level compatibility with the
original Java implementation android.util.Half.rint().
The time required in milliseconds to execute the below code on Pixel3:
- Java implementation android.util.Half.rint():
- big cluster only: 19828
- little cluster only: 61457
- arm64 Intrinsic implementation:
- big cluster only: 14186 (~28% faster)
- little cluster only: 54405 (~11% faster)
Analysis of this function with simpleperf showed that approximately only
60-65% of the time is spent in libcore.util.FP16.rint. So the percentage
improvement using intrinsics is likely to be more than the numbers stated
above.
Another reason that the performance improvement with intrinsic is lower
than expected is because the Java implementation for values between -1 and
1 (abs < 0x3c00) only requires a few instructions and should almost give
a similar performance to the intrinsic in this case. In the benchmark function
below, 46.8% of the values tested are between -1 and 1.
public static short benchmarkrint(){
short ret = 0;
long before = 0;
long after = 0;
before = System.currentTimeMillis();
for(int i = 0; i < 50000; i++){
for (short h = Short.MIN_VALUE; h < Short.MAX_VALUE; h++) {
ret += FP16.rint(h);
}
}
after = System.currentTimeMillis();
System.out.println("Time of FP16.rint (ms): " + (after - before));
System.out.println(ret);
return ret;
}
Test: 580-fp16
Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac
Change-Id: I075c3e85a36fd9bce14deee437c5b961bd667b5d
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 0859596..9ef2e69 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -3288,6 +3288,22 @@
GenerateFP16Round(invoke, codegen_, masm, roundOp);
}
+void IntrinsicLocationsBuilderARM64::VisitFP16Rint(HInvoke* invoke) {
+ if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+ return;
+ }
+
+ CreateIntToIntLocations(allocator_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16Rint(HInvoke* invoke) {
+ MacroAssembler* masm = GetVIXLAssembler();
+ auto roundOp = [masm](const FPRegister& out, const FPRegister& in) {
+ __ Frintn(out, in); // Round to nearest, with ties to even
+ };
+ GenerateFP16Round(invoke, codegen_, masm, roundOp);
+}
+
UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf);
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 77dcbfb..1dfebdd 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -3074,6 +3074,7 @@
UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToHalf)
UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Floor)
UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Ceil)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Rint)
UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOf);
UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index fc06691..ea9c591 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -2711,6 +2711,7 @@
UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToHalf)
UNIMPLEMENTED_INTRINSIC(MIPS, FP16Floor)
UNIMPLEMENTED_INTRINSIC(MIPS, FP16Ceil)
+UNIMPLEMENTED_INTRINSIC(MIPS, FP16Rint)
UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOf);
UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index 8a6e94c..fd93902 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -2361,6 +2361,7 @@
UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToHalf)
UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Floor)
UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Ceil)
+UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Rint)
UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOf);
UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index e10214b..9d3cecb 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -3085,6 +3085,7 @@
UNIMPLEMENTED_INTRINSIC(X86, FP16ToHalf)
UNIMPLEMENTED_INTRINSIC(X86, FP16Floor)
UNIMPLEMENTED_INTRINSIC(X86, FP16Ceil)
+UNIMPLEMENTED_INTRINSIC(X86, FP16Rint)
UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOf);
UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index d8ccd9b..1111a59 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -2752,6 +2752,7 @@
UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToHalf)
UNIMPLEMENTED_INTRINSIC(X86_64, FP16Floor)
UNIMPLEMENTED_INTRINSIC(X86_64, FP16Ceil)
+UNIMPLEMENTED_INTRINSIC(X86_64, FP16Rint)
UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf);
UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter);
diff --git a/runtime/hidden_api.h b/runtime/hidden_api.h
index 152a725..0f6eab0 100644
--- a/runtime/hidden_api.h
+++ b/runtime/hidden_api.h
@@ -361,6 +361,7 @@
case Intrinsics::kFP16Floor:
case Intrinsics::kFP16ToFloat:
case Intrinsics::kFP16ToHalf:
+ case Intrinsics::kFP16Rint:
return kAccCorePlatformApi;
default:
// Remaining intrinsics are public API. We DCHECK that in SetIntrinsic().
diff --git a/runtime/image.cc b/runtime/image.cc
index 171547b..2566f80 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -29,7 +29,7 @@
namespace art {
const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
-const uint8_t ImageHeader::kImageVersion[] = { '0', '8', '2', '\0' }; // FP16Ceil intrinsic
+const uint8_t ImageHeader::kImageVersion[] = { '0', '8', '3', '\0' }; // FP16Rint intrinsic
ImageHeader::ImageHeader(uint32_t image_reservation_size,
uint32_t component_count,
diff --git a/runtime/interpreter/interpreter_intrinsics.cc b/runtime/interpreter/interpreter_intrinsics.cc
index f3ef257..5d23e35 100644
--- a/runtime/interpreter/interpreter_intrinsics.cc
+++ b/runtime/interpreter/interpreter_intrinsics.cc
@@ -577,6 +577,7 @@
UNIMPLEMENTED_CASE(FP16ToHalf /* (F)S */)
UNIMPLEMENTED_CASE(FP16Floor /* (S)S */)
UNIMPLEMENTED_CASE(FP16Ceil /* (S)S */)
+ UNIMPLEMENTED_CASE(FP16Rint /* (S)S */)
INTRINSIC_CASE(VarHandleFullFence)
INTRINSIC_CASE(VarHandleAcquireFence)
INTRINSIC_CASE(VarHandleReleaseFence)
diff --git a/runtime/intrinsics_list.h b/runtime/intrinsics_list.h
index ee91066..2bd738c 100644
--- a/runtime/intrinsics_list.h
+++ b/runtime/intrinsics_list.h
@@ -167,6 +167,7 @@
V(MemoryPokeShortNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeShortNative", "(JS)V") \
V(FP16Ceil, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "ceil", "(S)S") \
V(FP16Floor, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "floor", "(S)S") \
+ V(FP16Rint, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "rint", "(S)S") \
V(FP16ToFloat, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toFloat", "(S)F") \
V(FP16ToHalf, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toHalf", "(F)S") \
V(StringCharAt, kVirtual, kNeedsEnvironmentOrCache, kReadSideEffects, kCanThrow, "Ljava/lang/String;", "charAt", "(I)C") \
diff --git a/test/580-fp16/src-art/Main.java b/test/580-fp16/src-art/Main.java
index de9deda..2dbec8b 100644
--- a/test/580-fp16/src-art/Main.java
+++ b/test/580-fp16/src-art/Main.java
@@ -162,11 +162,40 @@
assertEquals((short) 0xff00, FP16.floor((short) 0xfd00));
}
+ public static void testRint() {
+ assertEquals(FP16.POSITIVE_INFINITY, FP16.rint(FP16.POSITIVE_INFINITY));
+ assertEquals(FP16.NEGATIVE_INFINITY, FP16.rint(FP16.NEGATIVE_INFINITY));
+ assertEquals(FP16.POSITIVE_ZERO, FP16.rint(FP16.POSITIVE_ZERO));
+ assertEquals(FP16.NEGATIVE_ZERO, FP16.rint(FP16.NEGATIVE_ZERO));
+ assertEquals(FP16.NaN, FP16.rint(FP16.NaN));
+ assertEquals(FP16.LOWEST_VALUE, FP16.rint(FP16.LOWEST_VALUE));
+ assertEquals(FP16.POSITIVE_ZERO, FP16.rint(FP16.MIN_VALUE));
+ assertEquals(FP16.POSITIVE_ZERO, FP16.rint((short) 0x200));
+ assertEquals(FP16.POSITIVE_ZERO, FP16.rint((short) 0x3ff));
+ assertEquals(FP16.POSITIVE_ZERO, FP16.rint(FP16.toHalf(0.2f)));
+ assertEquals(FP16.NEGATIVE_ZERO, FP16.rint(FP16.toHalf(-0.2f)));
+ assertEquals(1.0f, FP16.toFloat(FP16.rint(FP16.toHalf(0.7f))));
+ assertEquals(-1.0f, FP16.toFloat(FP16.rint(FP16.toHalf(-0.7f))));
+ assertEquals(0.0f, FP16.toFloat(FP16.rint(FP16.toHalf(0.5f))));
+ assertEquals(-0.0f, FP16.toFloat(FP16.rint(FP16.toHalf(-0.5f))));
+ assertEquals(125.0f, FP16.toFloat(FP16.rint(FP16.toHalf(124.7f))));
+ assertEquals(-125.0f, FP16.toFloat(FP16.rint(FP16.toHalf(-124.7f))));
+ assertEquals(124.0f, FP16.toFloat(FP16.rint(FP16.toHalf(124.2f))));
+ assertEquals(-124.0f, FP16.toFloat(FP16.rint(FP16.toHalf(-124.2f))));
+ // rint for NaN values: result must be the input NaN with the quiet bit (0x200) set
+ assertEquals((short) 0x7e01, FP16.rint((short) 0x7c01));
+ assertEquals((short) 0x7f00, FP16.rint((short) 0x7d00));
+ assertEquals((short) 0xfe01, FP16.rint((short) 0xfc01));
+ assertEquals((short) 0xff00, FP16.rint((short) 0xfd00));
+
+ }
+
public static void main(String args[]) {
testHalfToFloatToHalfConversions();
testToHalf();
testToFloat();
testFloor();
testCeil();
+ testRint();
}
}