x86/x86-64: Clean up intrinsic codegen for SSE4.1.

When SSE4.1 is not available, do not create intrinsic locations at all,
so the normal call codegen handles these invokes. This removes the
out-of-line intrinsic call fallback from the intrinsic code generators.

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Change-Id: Ic26e65974231bbcb44ad696100e1fa4469165e41
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 6d7462e..2dad2f1 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -63,11 +63,6 @@
   return res->Intrinsified();
 }
 
-static void MoveArguments(HInvoke* invoke, CodeGeneratorX86* codegen) {
-  InvokeDexCallingConventionVisitorX86 calling_convention_visitor;
-  IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
-}
-
 using IntrinsicSlowPathX86 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86>;
 
 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
@@ -347,52 +342,23 @@
   GetAssembler()->sqrtsd(out, in);
 }
 
-static void InvokeOutOfLineIntrinsic(CodeGeneratorX86* codegen, HInvoke* invoke) {
-  MoveArguments(invoke, codegen);
-
-  DCHECK(invoke->IsInvokeStaticOrDirect());
-  codegen->GenerateStaticOrDirectCall(invoke->AsInvokeStaticOrDirect(),
-                                      Location::RegisterLocation(EAX));
-
-  // Copy the result back to the expected output.
-  Location out = invoke->GetLocations()->Out();
-  if (out.IsValid()) {
-    DCHECK(out.IsRegister());
-    codegen->MoveFromReturnRegister(out, invoke->GetType());
-  }
-}
-
 static void CreateSSE41FPToFPLocations(ArenaAllocator* allocator,
                                        HInvoke* invoke,
                                        CodeGeneratorX86* codegen) {
   // Do we have instruction support?
-  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
-    CreateFPToFPLocations(allocator, invoke);
+  if (!codegen->GetInstructionSetFeatures().HasSSE4_1()) {
     return;
   }
 
-  // We have to fall back to a call to the intrinsic.
-  LocationSummary* locations =
-      new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly);
-  InvokeRuntimeCallingConvention calling_convention;
-  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
-  locations->SetOut(Location::FpuRegisterLocation(XMM0));
-  // Needs to be EAX for the invoke.
-  locations->AddTemp(Location::RegisterLocation(EAX));
+  CreateFPToFPLocations(allocator, invoke);
 }
 
-static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86* codegen,
-                                   HInvoke* invoke,
-                                   X86Assembler* assembler,
-                                   int round_mode) {
+static void GenSSE41FPToFPIntrinsic(HInvoke* invoke, X86Assembler* assembler, int round_mode) {
   LocationSummary* locations = invoke->GetLocations();
-  if (locations->WillCall()) {
-    InvokeOutOfLineIntrinsic(codegen, invoke);
-  } else {
-    XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
-    XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
-    __ roundsd(out, in, Immediate(round_mode));
-  }
+  DCHECK(!locations->WillCall());
+  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
+  __ roundsd(out, in, Immediate(round_mode));
 }
 
 void IntrinsicLocationsBuilderX86::VisitMathCeil(HInvoke* invoke) {
@@ -400,7 +366,7 @@
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathCeil(HInvoke* invoke) {
-  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
+  GenSSE41FPToFPIntrinsic(invoke, GetAssembler(), 2);
 }
 
 void IntrinsicLocationsBuilderX86::VisitMathFloor(HInvoke* invoke) {
@@ -408,7 +374,7 @@
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathFloor(HInvoke* invoke) {
-  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
+  GenSSE41FPToFPIntrinsic(invoke, GetAssembler(), 1);
 }
 
 void IntrinsicLocationsBuilderX86::VisitMathRint(HInvoke* invoke) {
@@ -416,44 +382,33 @@
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathRint(HInvoke* invoke) {
-  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
+  GenSSE41FPToFPIntrinsic(invoke, GetAssembler(), 0);
 }
 
 void IntrinsicLocationsBuilderX86::VisitMathRoundFloat(HInvoke* invoke) {
   // Do we have instruction support?
-  if (codegen_->GetInstructionSetFeatures().HasSSE4_1()) {
-    HInvokeStaticOrDirect* static_or_direct = invoke->AsInvokeStaticOrDirect();
-    DCHECK(static_or_direct != nullptr);
-    LocationSummary* locations =
-        new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
-    locations->SetInAt(0, Location::RequiresFpuRegister());
-    if (static_or_direct->HasSpecialInput() &&
-        invoke->InputAt(
-            static_or_direct->GetSpecialInputIndex())->IsX86ComputeBaseMethodAddress()) {
-      locations->SetInAt(1, Location::RequiresRegister());
-    }
-    locations->SetOut(Location::RequiresRegister());
-    locations->AddTemp(Location::RequiresFpuRegister());
-    locations->AddTemp(Location::RequiresFpuRegister());
+  if (!codegen_->GetInstructionSetFeatures().HasSSE4_1()) {
     return;
   }
 
-  // We have to fall back to a call to the intrinsic.
+  HInvokeStaticOrDirect* static_or_direct = invoke->AsInvokeStaticOrDirect();
+  DCHECK(static_or_direct != nullptr);
   LocationSummary* locations =
-      new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnMainOnly);
-  InvokeRuntimeCallingConvention calling_convention;
-  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
-  locations->SetOut(Location::RegisterLocation(EAX));
-  // Needs to be EAX for the invoke.
-  locations->AddTemp(Location::RegisterLocation(EAX));
+      new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  if (static_or_direct->HasSpecialInput() &&
+      invoke->InputAt(
+          static_or_direct->GetSpecialInputIndex())->IsX86ComputeBaseMethodAddress()) {
+    locations->SetInAt(1, Location::RequiresRegister());
+  }
+  locations->SetOut(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresFpuRegister());
+  locations->AddTemp(Location::RequiresFpuRegister());
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathRoundFloat(HInvoke* invoke) {
   LocationSummary* locations = invoke->GetLocations();
-  if (locations->WillCall()) {  // TODO: can we reach this?
-    InvokeOutOfLineIntrinsic(codegen_, invoke);
-    return;
-  }
+  DCHECK(!locations->WillCall());
 
   XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
   XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 0f6b006..7a0f131 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -61,11 +61,6 @@
   return res->Intrinsified();
 }
 
-static void MoveArguments(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
-  InvokeDexCallingConventionVisitorX86_64 calling_convention_visitor;
-  IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
-}
-
 using IntrinsicSlowPathX86_64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86_64>;
 
 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
@@ -255,52 +250,23 @@
   GetAssembler()->sqrtsd(out, in);
 }
 
-static void InvokeOutOfLineIntrinsic(CodeGeneratorX86_64* codegen, HInvoke* invoke) {
-  MoveArguments(invoke, codegen);
-
-  DCHECK(invoke->IsInvokeStaticOrDirect());
-  codegen->GenerateStaticOrDirectCall(
-      invoke->AsInvokeStaticOrDirect(), Location::RegisterLocation(RDI));
-
-  // Copy the result back to the expected output.
-  Location out = invoke->GetLocations()->Out();
-  if (out.IsValid()) {
-    DCHECK(out.IsRegister());
-    codegen->MoveFromReturnRegister(out, invoke->GetType());
-  }
-}
-
 static void CreateSSE41FPToFPLocations(ArenaAllocator* allocator,
                                        HInvoke* invoke,
                                        CodeGeneratorX86_64* codegen) {
   // Do we have instruction support?
-  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
-    CreateFPToFPLocations(allocator, invoke);
+  if (!codegen->GetInstructionSetFeatures().HasSSE4_1()) {
     return;
   }
 
-  // We have to fall back to a call to the intrinsic.
-  LocationSummary* locations =
-      new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly);
-  InvokeRuntimeCallingConvention calling_convention;
-  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
-  locations->SetOut(Location::FpuRegisterLocation(XMM0));
-  // Needs to be RDI for the invoke.
-  locations->AddTemp(Location::RegisterLocation(RDI));
+  CreateFPToFPLocations(allocator, invoke);
 }
 
-static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86_64* codegen,
-                                   HInvoke* invoke,
-                                   X86_64Assembler* assembler,
-                                   int round_mode) {
+static void GenSSE41FPToFPIntrinsic(HInvoke* invoke, X86_64Assembler* assembler, int round_mode) {
   LocationSummary* locations = invoke->GetLocations();
-  if (locations->WillCall()) {
-    InvokeOutOfLineIntrinsic(codegen, invoke);
-  } else {
-    XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
-    XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
-    __ roundsd(out, in, Immediate(round_mode));
-  }
+  DCHECK(!locations->WillCall());
+  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
+  __ roundsd(out, in, Immediate(round_mode));
 }
 
 void IntrinsicLocationsBuilderX86_64::VisitMathCeil(HInvoke* invoke) {
@@ -308,7 +274,7 @@
 }
 
 void IntrinsicCodeGeneratorX86_64::VisitMathCeil(HInvoke* invoke) {
-  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
+  GenSSE41FPToFPIntrinsic(invoke, GetAssembler(), 2);
 }
 
 void IntrinsicLocationsBuilderX86_64::VisitMathFloor(HInvoke* invoke) {
@@ -316,7 +282,7 @@
 }
 
 void IntrinsicCodeGeneratorX86_64::VisitMathFloor(HInvoke* invoke) {
-  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
+  GenSSE41FPToFPIntrinsic(invoke, GetAssembler(), 1);
 }
 
 void IntrinsicLocationsBuilderX86_64::VisitMathRint(HInvoke* invoke) {
@@ -324,31 +290,23 @@
 }
 
 void IntrinsicCodeGeneratorX86_64::VisitMathRint(HInvoke* invoke) {
-  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
+  GenSSE41FPToFPIntrinsic(invoke, GetAssembler(), 0);
 }
 
 static void CreateSSE41FPToIntLocations(ArenaAllocator* allocator,
                                         HInvoke* invoke,
                                         CodeGeneratorX86_64* codegen) {
   // Do we have instruction support?
-  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
-    LocationSummary* locations =
-        new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
-    locations->SetInAt(0, Location::RequiresFpuRegister());
-    locations->SetOut(Location::RequiresRegister());
-    locations->AddTemp(Location::RequiresFpuRegister());
-    locations->AddTemp(Location::RequiresFpuRegister());
+  if (!codegen->GetInstructionSetFeatures().HasSSE4_1()) {
     return;
   }
 
-  // We have to fall back to a call to the intrinsic.
   LocationSummary* locations =
-      new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly);
-  InvokeRuntimeCallingConvention calling_convention;
-  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
-  locations->SetOut(Location::RegisterLocation(RAX));
-  // Needs to be RDI for the invoke.
-  locations->AddTemp(Location::RegisterLocation(RDI));
+      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  locations->SetOut(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresFpuRegister());
+  locations->AddTemp(Location::RequiresFpuRegister());
 }
 
 void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) {
@@ -357,10 +315,7 @@
 
 void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) {
   LocationSummary* locations = invoke->GetLocations();
-  if (locations->WillCall()) {
-    InvokeOutOfLineIntrinsic(codegen_, invoke);
-    return;
-  }
+  DCHECK(!locations->WillCall());
 
   XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
@@ -401,10 +356,7 @@
 
 void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) {
   LocationSummary* locations = invoke->GetLocations();
-  if (locations->WillCall()) {
-    InvokeOutOfLineIntrinsic(codegen_, invoke);
-    return;
-  }
+  DCHECK(!locations->WillCall());
 
   XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
   CpuRegister out = locations->Out().AsRegister<CpuRegister>();