Add Math.fma (double & float versions) intrinsics for arm64

The intrinsic implementation is ~500 times faster than the Java
implementation, which uses BigDecimal.
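
For context, Math.fma(a, b, c) computes a * b + c as a single fused
operation with one rounding step, which is what the single FMADD
instruction emitted by the intrinsic provides. A small illustration
of that contract (example values only, not taken from this change):

    double a = 1.0 + 0x1.0p-52;
    double b = 1.0 - 0x1.0p-52;
    double c = -1.0;
    // Separate multiply and add rounds the product to 1.0 first, so
    // the tiny term is lost and the result is 0.0.
    System.out.println(a * b + c);
    // The fused form keeps the exact product until the single final
    // rounding and returns -0x1.0p-104 (about -4.93e-32).
    System.out.println(Math.fma(a, b, c));
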
Bug: 199373643
Test: ./art/test/testrunner/testrunner.py --target \
--optimizing --64 -t 082-inline-execute
Change-Id: I50eae88b332ba9338b0a59fecad7d2158a97ffbb
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index a1ba873..92776f1 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -2259,6 +2259,22 @@
locations->SetOut(calling_convention.GetReturnLocation(invoke->GetType()));
}
+static void CreateFPFPFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
+ DCHECK_EQ(invoke->GetNumberOfArguments(), 3U);
+ DCHECK(DataType::IsFloatingPointType(invoke->InputAt(0)->GetType()));
+ DCHECK(DataType::IsFloatingPointType(invoke->InputAt(1)->GetType()));
+ DCHECK(DataType::IsFloatingPointType(invoke->InputAt(2)->GetType()));
+ DCHECK(DataType::IsFloatingPointType(invoke->GetType()));
+
+ LocationSummary* const locations =
+ new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
+
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
+ locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+}
+
static void GenFPToFPCall(HInvoke* invoke,
CodeGeneratorARM64* codegen,
QuickEntrypointEnum entry) {
@@ -4179,6 +4195,33 @@
__ Smulh(out, x, y);
}
+static void GenerateMathFma(HInvoke* invoke, CodeGeneratorARM64* codegen) {
+ MacroAssembler* masm = codegen->GetVIXLAssembler();
+
+ VRegister n = helpers::InputFPRegisterAt(invoke, 0);
+ VRegister m = helpers::InputFPRegisterAt(invoke, 1);
+ VRegister a = helpers::InputFPRegisterAt(invoke, 2);
+ VRegister out = helpers::OutputFPRegister(invoke);
+
+ __ Fmadd(out, n, m, a);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathFmaDouble(HInvoke* invoke) {
+ CreateFPFPFPToFPCallLocations(allocator_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathFmaDouble(HInvoke* invoke) {
+ GenerateMathFma(invoke, codegen_);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathFmaFloat(HInvoke* invoke) {
+ CreateFPFPFPToFPCallLocations(allocator_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathFmaFloat(HInvoke* invoke) {
+ GenerateMathFma(invoke, codegen_);
+}
+
class VarHandleSlowPathARM64 : public IntrinsicSlowPathARM64 {
public:
VarHandleSlowPathARM64(HInvoke* invoke, std::memory_order order)
@@ -5597,9 +5640,6 @@
UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderToString);
// 1.8.
-UNIMPLEMENTED_INTRINSIC(ARM64, MathFmaDouble)
-UNIMPLEMENTED_INTRINSIC(ARM64, MathFmaFloat)
-
UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndAddInt)
UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndAddLong)
UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndSetInt)
diff --git a/runtime/image.cc b/runtime/image.cc
index e62380a..b39ffe5 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -30,7 +30,7 @@
const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
// Last change: Math.fma(double, double, double) intrinsic.
-const uint8_t ImageHeader::kImageVersion[] = { '1', '0', '5', '\0' };
+const uint8_t ImageHeader::kImageVersion[] = { '1', '0', '6', '\0' };
ImageHeader::ImageHeader(uint32_t image_reservation_size,
uint32_t component_count,