Support Math.fma Intrinsic for x86 and x86_64
The intrinsic implementation is 2287x faster than the base implementation:
Time for base: 188.25s
Time for patch: 0.0822s
Test: ./run-test --host --64 --optimizing 082-inline-execute
Test: ./run-test --host --64 --optimizing --instruction-set-features sse4.1 082-inline-execute
Test: ./run-test --host --64 --optimizing --instruction-set-features sse4.1,avx2 082-inline-execute
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
Change-Id: I68b96a35f41f3bb23d7e26d87fee1dd2a5ebf6a6
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 5e55a1b..6015a6d 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -728,6 +728,16 @@
locations->SetOut(Location::FpuRegisterLocation(XMM0));
}
+// Builds locations for a 3-FP-arg, FP-result intrinsic (Math.fma). All
+// operands live in XMM registers and the result reuses the first input, as
++// required by the non-destructive-destination-free VFMADD213 encoding.
+// kNoCall: no runtime call is made, so no calling convention is needed here.
+static void CreateFPFPFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
+  DCHECK_EQ(invoke->GetNumberOfArguments(), 3U);
+  LocationSummary* locations =
+      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  locations->SetInAt(1, Location::RequiresFpuRegister());
+  locations->SetInAt(2, Location::RequiresFpuRegister());
+  locations->SetOut(Location::SameAsFirstInput());
+}
+
void IntrinsicLocationsBuilderX86::VisitMathAtan2(HInvoke* invoke) {
CreateFPFPToFPCallLocations(allocator_, invoke);
}
@@ -4733,6 +4743,44 @@
GenerateVarHandleGetAndBitwiseOp(invoke, codegen_);
}
+static void GenerateMathFma(HInvoke* invoke, CodeGeneratorX86* codegen) {
+ DCHECK(DataType::IsFloatingPointType(invoke->GetType()));
+ LocationSummary* locations = invoke->GetLocations();
+ DCHECK(locations->InAt(0).Equals(locations->Out()));
+ X86Assembler* assembler = codegen->GetAssembler();
+ XmmRegister left = locations->InAt(0).AsFpuRegister<XmmRegister>();
+ XmmRegister right = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister accumulator = locations->InAt(2).AsFpuRegister<XmmRegister>();
+ if (invoke->GetType() == DataType::Type::kFloat32) {
+ __ vfmadd213ss(left, right, accumulator);
+ } else {
+ DCHECK_EQ(invoke->GetType(), DataType::Type::kFloat64);
+ __ vfmadd213sd(left, right, accumulator);
+ }
+}
+
+void IntrinsicCodeGeneratorX86::VisitMathFmaDouble(HInvoke* invoke) {
+ DCHECK(codegen_->GetInstructionSetFeatures().HasAVX2());
+ GenerateMathFma(invoke, codegen_);
+}
+
+void IntrinsicLocationsBuilderX86::VisitMathFmaDouble(HInvoke* invoke) {
+ if (codegen_->GetInstructionSetFeatures().HasAVX2()) {
+ CreateFPFPFPToFPCallLocations(allocator_, invoke);
+ }
+}
+
+void IntrinsicCodeGeneratorX86::VisitMathFmaFloat(HInvoke* invoke) {
+ DCHECK(codegen_->GetInstructionSetFeatures().HasAVX2());
+ GenerateMathFma(invoke, codegen_);
+}
+
+void IntrinsicLocationsBuilderX86::VisitMathFmaFloat(HInvoke* invoke) {
+ if (codegen_->GetInstructionSetFeatures().HasAVX2()) {
+ CreateFPFPFPToFPCallLocations(allocator_, invoke);
+ }
+}
+
UNIMPLEMENTED_INTRINSIC(X86, MathRoundDouble)
UNIMPLEMENTED_INTRINSIC(X86, FloatIsInfinite)
UNIMPLEMENTED_INTRINSIC(X86, DoubleIsInfinite)
@@ -4775,6 +4824,7 @@
UNIMPLEMENTED_INTRINSIC(X86, StringBuilderToString);
// 1.8.
+
UNIMPLEMENTED_INTRINSIC(X86, UnsafeGetAndAddInt)
UNIMPLEMENTED_INTRINSIC(X86, UnsafeGetAndAddLong)
UNIMPLEMENTED_INTRINSIC(X86, UnsafeGetAndSetInt)