[optimizing compiler] Implement inline x86 FP '%'

Replace the calls to fmod/fmodf by inline code as is done in the Quick
compiler.

Remove the quick fmod/fmodf runtime entries, as they are no longer in
use.

64 bit code generator Move() routine needed to be enhanced to handle
constants, as Location::Any() allows them to be generated.

Change-Id: I6b6a42f6faeed4b0b3c940453e487daf5b25d184
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 5b09fc1..57f01e8 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -40,6 +40,8 @@
 static constexpr XmmRegister kRuntimeParameterFpuRegisters[] = { };
 static constexpr size_t kRuntimeParameterFpuRegistersLength = 0;
 
+static constexpr int kC2ConditionMask = 0x400;
+
 // Marker for places that can be updated once we don't follow the quick ABI.
 static constexpr bool kFollowsQuickABI = true;
 
@@ -2076,6 +2078,81 @@
   }
 }
 
+void InstructionCodeGeneratorX86::PushOntoFPStack(Location source, uint32_t temp_offset,
+                                                  uint32_t stack_adjustment, bool is_float) {
+  if (source.IsStackSlot()) {
+    DCHECK(is_float);
+    __ flds(Address(ESP, source.GetStackIndex() + stack_adjustment));
+  } else if (source.IsDoubleStackSlot()) {
+    DCHECK(!is_float);
+    __ fldl(Address(ESP, source.GetStackIndex() + stack_adjustment));
+  } else {
+    // Write the value to the temporary location on the stack and load to FP stack.
+    if (is_float) {
+      Location stack_temp = Location::StackSlot(temp_offset);
+      codegen_->Move32(stack_temp, source);
+      __ flds(Address(ESP, temp_offset));
+    } else {
+      Location stack_temp = Location::DoubleStackSlot(temp_offset);
+      codegen_->Move64(stack_temp, source);
+      __ fldl(Address(ESP, temp_offset));
+    }
+  }
+}
+
+void InstructionCodeGeneratorX86::GenerateRemFP(HRem *rem) {
+  Primitive::Type type = rem->GetResultType();
+  bool is_float = type == Primitive::kPrimFloat;
+  size_t elem_size = Primitive::ComponentSize(type);
+  LocationSummary* locations = rem->GetLocations();
+  Location first = locations->InAt(0);
+  Location second = locations->InAt(1);
+  Location out = locations->Out();
+
+  // Create stack space for 2 elements.
+  // TODO: enhance register allocator to ask for stack temporaries.
+  __ subl(ESP, Immediate(2 * elem_size));
+
+  // Load the values to the FP stack in reverse order, using temporaries if needed.
+  PushOntoFPStack(second, elem_size, 2 * elem_size, is_float);
+  PushOntoFPStack(first, 0, 2 * elem_size, is_float);
+
+  // Loop doing FPREM until we stabilize.
+  Label retry;
+  __ Bind(&retry);
+  __ fprem();
+
+  // Move FP status to AX.
+  __ fstsw();
+
+  // And see if the argument reduction is complete. This is signaled by the
+  // C2 FPU flag bit set to 0.
+  __ andl(EAX, Immediate(kC2ConditionMask));
+  __ j(kNotEqual, &retry);
+
+  // We have settled on the final value. Retrieve it into an XMM register.
+  // Store FP top of stack to real stack.
+  if (is_float) {
+    __ fsts(Address(ESP, 0));
+  } else {
+    __ fstl(Address(ESP, 0));
+  }
+
+  // Pop the 2 items from the FP stack.
+  __ fucompp();
+
+  // Load the value from the stack into an XMM register.
+  DCHECK(out.IsFpuRegister()) << out;
+  if (is_float) {
+    __ movss(out.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
+  } else {
+    __ movsd(out.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
+  }
+
+  // And remove the temporary stack space we allocated.
+  __ addl(ESP, Immediate(2 * elem_size));
+}
+
 void InstructionCodeGeneratorX86::GenerateDivRemIntegral(HBinaryOperation* instruction) {
   DCHECK(instruction->IsDiv() || instruction->IsRem());
 
@@ -2209,10 +2286,8 @@
 
 void LocationsBuilderX86::VisitRem(HRem* rem) {
   Primitive::Type type = rem->GetResultType();
-  LocationSummary::CallKind call_kind = type == Primitive::kPrimInt
-      ? LocationSummary::kNoCall
-      : LocationSummary::kCall;
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(rem, call_kind);
+  LocationSummary* locations =
+    new (GetGraph()->GetArena()) LocationSummary(rem, LocationSummary::kNoCall);
 
   switch (type) {
     case Primitive::kPrimInt: {
@@ -2231,24 +2306,12 @@
       locations->SetOut(Location::RegisterPairLocation(EAX, EDX));
       break;
     }
+    case Primitive::kPrimDouble:
     case Primitive::kPrimFloat: {
-      InvokeRuntimeCallingConvention calling_convention;
-      // x86 floating-point parameters are passed through core registers (EAX, ECX).
-      locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-      locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
-      // The runtime helper puts the result in XMM0.
-      locations->SetOut(Location::FpuRegisterLocation(XMM0));
-      break;
-    }
-    case Primitive::kPrimDouble: {
-      InvokeRuntimeCallingConvention calling_convention;
-      // x86 floating-point parameters are passed through core registers (EAX_ECX, EDX_EBX).
-      locations->SetInAt(0, Location::RegisterPairLocation(
-          calling_convention.GetRegisterAt(0), calling_convention.GetRegisterAt(1)));
-      locations->SetInAt(1, Location::RegisterPairLocation(
-          calling_convention.GetRegisterAt(2), calling_convention.GetRegisterAt(3)));
-      // The runtime helper puts the result in XMM0.
-      locations->SetOut(Location::FpuRegisterLocation(XMM0));
+      locations->SetInAt(0, Location::Any());
+      locations->SetInAt(1, Location::Any());
+      locations->SetOut(Location::RequiresFpuRegister());
+      locations->AddTemp(Location::RegisterLocation(EAX));
       break;
     }
 
@@ -2265,14 +2328,9 @@
       GenerateDivRemIntegral(rem);
       break;
     }
-    case Primitive::kPrimFloat: {
-      __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pFmodf)));
-      codegen_->RecordPcInfo(rem, rem->GetDexPc());
-      break;
-    }
+    case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
-      __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pFmod)));
-      codegen_->RecordPcInfo(rem, rem->GetDexPc());
+      GenerateRemFP(rem);
       break;
     }
     default: