[optimizing compiler] Implement inline x86 FP '%'
Replace the calls to fmod/fmodf by inline code as is done in the Quick
compiler.
Remove the quick fmod/fmodf runtime entries, as they are no longer in
use.
The 64-bit code generator's Move() routine needed to be enhanced to handle
constants, as Location::Any() allows them to be generated.
Change-Id: I6b6a42f6faeed4b0b3c940453e487daf5b25d184
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 5b09fc1..57f01e8 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -40,6 +40,8 @@
static constexpr XmmRegister kRuntimeParameterFpuRegisters[] = { };
static constexpr size_t kRuntimeParameterFpuRegistersLength = 0;
+static constexpr int kC2ConditionMask = 0x400;
+
// Marker for places that can be updated once we don't follow the quick ABI.
static constexpr bool kFollowsQuickABI = true;
@@ -2076,6 +2078,81 @@
}
}
+// Emits code that loads `source` onto the FP stack, as a float when `is_float` is true and
+// as a double otherwise.
+// `temp_offset` is the ESP-relative offset of the scratch slot used when `source` is not
+// already a stack slot. `stack_adjustment` is added to the stack index of a stack-slot
+// `source`; GenerateRemFP passes the number of bytes it has subtracted from ESP.
+void InstructionCodeGeneratorX86::PushOntoFPStack(Location source, uint32_t temp_offset,
+ uint32_t stack_adjustment, bool is_float) {
+ if (source.IsStackSlot()) {
+ // Already in memory: load it directly from its (adjusted) stack slot.
+ DCHECK(is_float);
+ __ flds(Address(ESP, source.GetStackIndex() + stack_adjustment));
+ } else if (source.IsDoubleStackSlot()) {
+ // Same as above for a 64-bit slot.
+ DCHECK(!is_float);
+ __ fldl(Address(ESP, source.GetStackIndex() + stack_adjustment));
+ } else {
+ // Write the value to the temporary location on the stack and load to FP stack.
+ if (is_float) {
+ Location stack_temp = Location::StackSlot(temp_offset);
+ codegen_->Move32(stack_temp, source);
+ __ flds(Address(ESP, temp_offset));
+ } else {
+ Location stack_temp = Location::DoubleStackSlot(temp_offset);
+ codegen_->Move64(stack_temp, source);
+ __ fldl(Address(ESP, temp_offset));
+ }
+ }
+}
+
+// Emits inline code computing the floating-point remainder of `rem`'s two inputs with FPREM
+// and leaves the result in the output XMM register.
+// The status-word read below goes through AX; LocationsBuilderX86::VisitRem reserves EAX as
+// a temp for the float/double case for that reason.
+void InstructionCodeGeneratorX86::GenerateRemFP(HRem *rem) {
+ Primitive::Type type = rem->GetResultType();
+ bool is_float = type == Primitive::kPrimFloat;
+ size_t elem_size = Primitive::ComponentSize(type);
+ LocationSummary* locations = rem->GetLocations();
+ Location first = locations->InAt(0);
+ Location second = locations->InAt(1);
+ Location out = locations->Out();
+
+ // Create stack space for 2 elements.
+ // Layout: [ESP + 0] is the scratch slot for `first` (reused for the result below),
+ // [ESP + elem_size] is the scratch slot for `second`.
+ // TODO: enhance register allocator to ask for stack temporaries.
+ __ subl(ESP, Immediate(2 * elem_size));
+
+ // Load the values to the FP stack in reverse order, using temporaries if needed.
+ // FPREM operates on ST(0) and ST(1), so `second` is pushed first to leave `first` on top.
+ // Inputs that already live in stack slots are addressed with the ESP adjustment above.
+ PushOntoFPStack(second, elem_size, 2 * elem_size, is_float);
+ PushOntoFPStack(first, 0, 2 * elem_size, is_float);
+
+ // Loop doing FPREM until we stabilize.
+ Label retry;
+ __ Bind(&retry);
+ __ fprem();
+
+ // Move FP status to AX.
+ __ fstsw();
+
+ // And see if the argument reduction is complete. This is signaled by the
+ // C2 FPU flag bit set to 0.
+ // kC2ConditionMask isolates that bit; a non-zero result means another FPREM pass is needed.
+ __ andl(EAX, Immediate(kC2ConditionMask));
+ __ j(kNotEqual, &retry);
+
+ // We have settled on the final value. Retrieve it into an XMM register.
+ // Store FP top of stack to real stack.
+ if (is_float) {
+ __ fsts(Address(ESP, 0));
+ } else {
+ __ fstl(Address(ESP, 0));
+ }
+
+ // Pop the 2 items from the FP stack.
+ // (The comparison performed by this instruction is not consumed by anything below.)
+ __ fucompp();
+
+ // Load the value from the stack into an XMM register.
+ DCHECK(out.IsFpuRegister()) << out;
+ if (is_float) {
+ __ movss(out.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
+ } else {
+ __ movsd(out.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
+ }
+
+ // And remove the temporary stack space we allocated.
+ __ addl(ESP, Immediate(2 * elem_size));
+}
+
void InstructionCodeGeneratorX86::GenerateDivRemIntegral(HBinaryOperation* instruction) {
DCHECK(instruction->IsDiv() || instruction->IsRem());
@@ -2209,10 +2286,8 @@
void LocationsBuilderX86::VisitRem(HRem* rem) {
Primitive::Type type = rem->GetResultType();
- LocationSummary::CallKind call_kind = type == Primitive::kPrimInt
- ? LocationSummary::kNoCall
- : LocationSummary::kCall;
- LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(rem, call_kind);
+ LocationSummary* locations =
+ new (GetGraph()->GetArena()) LocationSummary(rem, LocationSummary::kNoCall);
switch (type) {
case Primitive::kPrimInt: {
@@ -2231,24 +2306,12 @@
locations->SetOut(Location::RegisterPairLocation(EAX, EDX));
break;
}
+ case Primitive::kPrimDouble:
case Primitive::kPrimFloat: {
- InvokeRuntimeCallingConvention calling_convention;
- // x86 floating-point parameters are passed through core registers (EAX, ECX).
- locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
- locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
- // The runtime helper puts the result in XMM0.
- locations->SetOut(Location::FpuRegisterLocation(XMM0));
- break;
- }
- case Primitive::kPrimDouble: {
- InvokeRuntimeCallingConvention calling_convention;
- // x86 floating-point parameters are passed through core registers (EAX_ECX, EDX_EBX).
- locations->SetInAt(0, Location::RegisterPairLocation(
- calling_convention.GetRegisterAt(0), calling_convention.GetRegisterAt(1)));
- locations->SetInAt(1, Location::RegisterPairLocation(
- calling_convention.GetRegisterAt(2), calling_convention.GetRegisterAt(3)));
- // The runtime helper puts the result in XMM0.
- locations->SetOut(Location::FpuRegisterLocation(XMM0));
+ locations->SetInAt(0, Location::Any());
+ locations->SetInAt(1, Location::Any());
+ locations->SetOut(Location::RequiresFpuRegister());
+ locations->AddTemp(Location::RegisterLocation(EAX));
break;
}
@@ -2265,14 +2328,9 @@
GenerateDivRemIntegral(rem);
break;
}
- case Primitive::kPrimFloat: {
- __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pFmodf)));
- codegen_->RecordPcInfo(rem, rem->GetDexPc());
- break;
- }
+ case Primitive::kPrimFloat:
case Primitive::kPrimDouble: {
- __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pFmod)));
- codegen_->RecordPcInfo(rem, rem->GetDexPc());
+ GenerateRemFP(rem);
break;
}
default:
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index b77a1aa..a9086f8 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -137,6 +137,7 @@
void GenerateClassInitializationCheck(SlowPathCodeX86* slow_path, Register class_reg);
void HandleBitwiseOperation(HBinaryOperation* instruction);
void GenerateDivRemIntegral(HBinaryOperation* instruction);
+ void GenerateRemFP(HRem *rem);
void HandleShift(HBinaryOperation* instruction);
void GenerateShlLong(const Location& loc, Register shifter);
void GenerateShrLong(const Location& loc, Register shifter);
@@ -144,6 +145,8 @@
void GenerateMemoryBarrier(MemBarrierKind kind);
void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info);
void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
+ void PushOntoFPStack(Location source, uint32_t temp_offset,
+ uint32_t stack_adjustment, bool is_float);
void GenerateImplicitNullCheck(HNullCheck* instruction);
void GenerateExplicitNullCheck(HNullCheck* instruction);
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 196e0cf..dd6861f 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -47,6 +47,8 @@
static constexpr size_t kRuntimeParameterFpuRegistersLength =
arraysize(kRuntimeParameterFpuRegisters);
+static constexpr int kC2ConditionMask = 0x400;
+
class InvokeRuntimeCallingConvention : public CallingConvention<Register, FloatRegister> {
public:
InvokeRuntimeCallingConvention()
@@ -583,8 +585,18 @@
} else if (source.IsFpuRegister()) {
__ movss(Address(CpuRegister(RSP), destination.GetStackIndex()),
source.AsFpuRegister<XmmRegister>());
+ } else if (source.IsConstant()) {
+ HConstant* constant = source.GetConstant();
+ int32_t value;
+ if (constant->IsFloatConstant()) {
+ value = bit_cast<float, int32_t>(constant->AsFloatConstant()->GetValue());
+ } else {
+ DCHECK(constant->IsIntConstant());
+ value = constant->AsIntConstant()->GetValue();
+ }
+ __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), Immediate(value));
} else {
- DCHECK(source.IsStackSlot());
+ DCHECK(source.IsStackSlot()) << source;
__ movl(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
__ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
}
@@ -596,6 +608,17 @@
} else if (source.IsFpuRegister()) {
__ movsd(Address(CpuRegister(RSP), destination.GetStackIndex()),
source.AsFpuRegister<XmmRegister>());
+ } else if (source.IsConstant()) {
+ // A 64-bit constant (double or long) is materialized as its raw bit pattern in TMP and
+ // then stored to the destination stack slot.
+ HConstant* constant = source.GetConstant();
+ // Do not initialize `value` through AsLongConstant() here: the constant may be a double
+ // constant, and it must only be read through the accessor matching its kind.
+ int64_t value;
+ if (constant->IsDoubleConstant()) {
+ value = bit_cast<double, int64_t>(constant->AsDoubleConstant()->GetValue());
+ } else {
+ DCHECK(constant->IsLongConstant());
+ value = constant->AsLongConstant()->GetValue();
+ }
+ __ movq(CpuRegister(TMP), Immediate(value));
+ __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
} else {
DCHECK(source.IsDoubleStackSlot());
__ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
@@ -2000,6 +2023,81 @@
}
}
+// Emits code that loads `source` onto the FP stack, as a float when `is_float` is true and
+// as a double otherwise.
+// `temp_offset` is the RSP-relative offset of the scratch slot used when `source` is not
+// already a stack slot. `stack_adjustment` is added to the stack index of a stack-slot
+// `source`; GenerateRemFP passes the number of bytes it has subtracted from RSP.
+void InstructionCodeGeneratorX86_64::PushOntoFPStack(Location source, uint32_t temp_offset,
+ uint32_t stack_adjustment, bool is_float) {
+ if (source.IsStackSlot()) {
+ // Already in memory: load it directly from its (adjusted) stack slot.
+ DCHECK(is_float);
+ __ flds(Address(CpuRegister(RSP), source.GetStackIndex() + stack_adjustment));
+ } else if (source.IsDoubleStackSlot()) {
+ // Same as above for a 64-bit slot.
+ DCHECK(!is_float);
+ __ fldl(Address(CpuRegister(RSP), source.GetStackIndex() + stack_adjustment));
+ } else {
+ // Write the value to the temporary location on the stack and load to FP stack.
+ // `source` may be a constant here, since the inputs are allocated with Location::Any().
+ if (is_float) {
+ Location stack_temp = Location::StackSlot(temp_offset);
+ codegen_->Move(stack_temp, source);
+ __ flds(Address(CpuRegister(RSP), temp_offset));
+ } else {
+ Location stack_temp = Location::DoubleStackSlot(temp_offset);
+ codegen_->Move(stack_temp, source);
+ __ fldl(Address(CpuRegister(RSP), temp_offset));
+ }
+ }
+}
+
+// Emits inline code computing the floating-point remainder of `rem`'s two inputs with FPREM
+// and leaves the result in the output XMM register.
+// The status-word read below goes through AX; LocationsBuilderX86_64::VisitRem reserves RAX
+// as a temp for the float/double case for that reason.
+void InstructionCodeGeneratorX86_64::GenerateRemFP(HRem *rem) {
+ Primitive::Type type = rem->GetResultType();
+ bool is_float = type == Primitive::kPrimFloat;
+ size_t elem_size = Primitive::ComponentSize(type);
+ LocationSummary* locations = rem->GetLocations();
+ Location first = locations->InAt(0);
+ Location second = locations->InAt(1);
+ Location out = locations->Out();
+
+ // Create stack space for 2 elements.
+ // Layout: [RSP + 0] is the scratch slot for `first` (reused for the result below),
+ // [RSP + elem_size] is the scratch slot for `second`.
+ // TODO: enhance register allocator to ask for stack temporaries.
+ __ subq(CpuRegister(RSP), Immediate(2 * elem_size));
+
+ // Load the values to the FP stack in reverse order, using temporaries if needed.
+ // FPREM operates on ST(0) and ST(1), so `second` is pushed first to leave `first` on top.
+ // Inputs that already live in stack slots are addressed with the RSP adjustment above.
+ PushOntoFPStack(second, elem_size, 2 * elem_size, is_float);
+ PushOntoFPStack(first, 0, 2 * elem_size, is_float);
+
+ // Loop doing FPREM until we stabilize.
+ Label retry;
+ __ Bind(&retry);
+ __ fprem();
+
+ // Move FP status to AX.
+ __ fstsw();
+
+ // And see if the argument reduction is complete. This is signaled by the
+ // C2 FPU flag bit set to 0.
+ // kC2ConditionMask isolates that bit; a non-zero result means another FPREM pass is needed.
+ __ andl(CpuRegister(RAX), Immediate(kC2ConditionMask));
+ __ j(kNotEqual, &retry);
+
+ // We have settled on the final value. Retrieve it into an XMM register.
+ // Store FP top of stack to real stack.
+ if (is_float) {
+ __ fsts(Address(CpuRegister(RSP), 0));
+ } else {
+ __ fstl(Address(CpuRegister(RSP), 0));
+ }
+
+ // Pop the 2 items from the FP stack.
+ // (The comparison performed by this instruction is not consumed by anything below.)
+ __ fucompp();
+
+ // Load the value from the stack into an XMM register.
+ DCHECK(out.IsFpuRegister()) << out;
+ if (is_float) {
+ __ movss(out.AsFpuRegister<XmmRegister>(), Address(CpuRegister(RSP), 0));
+ } else {
+ __ movsd(out.AsFpuRegister<XmmRegister>(), Address(CpuRegister(RSP), 0));
+ }
+
+ // And remove the temporary stack space we allocated.
+ __ addq(CpuRegister(RSP), Immediate(2 * elem_size));
+}
+
void InstructionCodeGeneratorX86_64::GenerateDivRemIntegral(HBinaryOperation* instruction) {
DCHECK(instruction->IsDiv() || instruction->IsRem());
Primitive::Type type = instruction->GetResultType();
@@ -2099,11 +2197,8 @@
void LocationsBuilderX86_64::VisitRem(HRem* rem) {
Primitive::Type type = rem->GetResultType();
- LocationSummary::CallKind call_kind =
- (type == Primitive::kPrimInt) || (type == Primitive::kPrimLong)
- ? LocationSummary::kNoCall
- : LocationSummary::kCall;
- LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(rem, call_kind);
+ LocationSummary* locations =
+ new (GetGraph()->GetArena()) LocationSummary(rem, LocationSummary::kNoCall);
switch (type) {
case Primitive::kPrimInt:
@@ -2117,11 +2212,10 @@
case Primitive::kPrimFloat:
case Primitive::kPrimDouble: {
- InvokeRuntimeCallingConvention calling_convention;
- locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
- locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1)));
- // The runtime helper puts the result in XMM0.
- locations->SetOut(Location::FpuRegisterLocation(XMM0));
+ locations->SetInAt(0, Location::Any());
+ locations->SetInAt(1, Location::Any());
+ locations->SetOut(Location::RequiresFpuRegister());
+ locations->AddTemp(Location::RegisterLocation(RAX));
break;
}
@@ -2138,14 +2232,9 @@
GenerateDivRemIntegral(rem);
break;
}
- case Primitive::kPrimFloat: {
- __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pFmodf), true));
- codegen_->RecordPcInfo(rem, rem->GetDexPc());
- break;
- }
+ case Primitive::kPrimFloat:
case Primitive::kPrimDouble: {
- __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pFmod), true));
- codegen_->RecordPcInfo(rem, rem->GetDexPc());
+ GenerateRemFP(rem);
break;
}
default:
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index befe994..ead771a 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -155,6 +155,7 @@
void GenerateSuspendCheck(HSuspendCheck* instruction, HBasicBlock* successor);
void GenerateClassInitializationCheck(SlowPathCodeX86_64* slow_path, CpuRegister class_reg);
void HandleBitwiseOperation(HBinaryOperation* operation);
+ void GenerateRemFP(HRem *rem);
void GenerateDivRemIntegral(HBinaryOperation* instruction);
void HandleShift(HBinaryOperation* operation);
void GenerateMemoryBarrier(MemBarrierKind kind);
@@ -162,6 +163,8 @@
void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
void GenerateImplicitNullCheck(HNullCheck* instruction);
void GenerateExplicitNullCheck(HNullCheck* instruction);
+ void PushOntoFPStack(Location source, uint32_t temp_offset,
+ uint32_t stack_adjustment, bool is_float);
X86_64Assembler* const assembler_;
CodeGeneratorX86_64* const codegen_;