ART: Implement rem_double/rem_float for x86/x86-64
This adds an inlined version of the rem_double/rem_float bytecodes
for the x86/x86-64 platforms, implemented with the x87 fprem
instruction. The patch also removes the now-unnecessary fmod and
fmodf stubs from the runtime.
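
For reference, the emitted sequence for the double case is roughly the
following x87 loop (a sketch only: src1_off, src2_off and dest_off stand
in for the virtual-register stack offsets computed at compile time, the
float case uses flds/fsts instead, and x86-64 addresses off %rsp):

        fldl    src2_off(%esp)  // push divisor; ends up in ST(1)
        fldl    src1_off(%esp)  // push dividend into ST(0)
    .Lretry:
        fprem                   // partial remainder of ST(0) by ST(1)
        fstsw   %ax             // copy the FPU status word to AX
        andl    $0x400, %eax    // C2 (bit 10) set => reduction incomplete
        jne     .Lretry
        fstl    dest_off(%esp)  // store remainder to the destination VR
        fucompp                 // compare and pop twice, emptying the stack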
Change-Id: I2311aa2adf08d6614527e0da070e3b6ce2343a20
Signed-off-by: Alexei Zavjalov <alexei.zavjalov@intel.com>
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index c7e289d..3f54798 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -407,10 +407,17 @@
{ kX86PslldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 6, 0, 1, false }, "PslldRI", "!0r,!1d" },
{ kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1, false }, "PsllqRI", "!0r,!1d" },
- { kX86Fild32M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDB, 0x00, 0, 0, 0, 0, false }, "Fild32M", "[!0r,!1d]" },
- { kX86Fild64M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDF, 0x00, 0, 5, 0, 0, false }, "Fild64M", "[!0r,!1d]" },
- { kX86Fstp32M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xD9, 0x00, 0, 3, 0, 0, false }, "FstpsM", "[!0r,!1d]" },
- { kX86Fstp64M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDD, 0x00, 0, 3, 0, 0, false }, "FstpdM", "[!0r,!1d]" },
+ { kX86Fild32M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDB, 0x00, 0, 0, 0, 0, false }, "Fild32M", "[!0r,!1d]" },
+ { kX86Fild64M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDF, 0x00, 0, 5, 0, 0, false }, "Fild64M", "[!0r,!1d]" },
+ { kX86Fld32M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xD9, 0x00, 0, 0, 0, 0, false }, "Fld32M", "[!0r,!1d]" },
+ { kX86Fld64M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDD, 0x00, 0, 0, 0, 0, false }, "Fld64M", "[!0r,!1d]" },
+ { kX86Fstp32M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xD9, 0x00, 0, 3, 0, 0, false }, "Fstps32M", "[!0r,!1d]" },
+ { kX86Fstp64M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDD, 0x00, 0, 3, 0, 0, false }, "Fstpd64M", "[!0r,!1d]" },
+ { kX86Fst32M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0, { 0x0, 0, 0xD9, 0x00, 0, 2, 0, 0, false }, "Fsts32M", "[!0r,!1d]" },
+ { kX86Fst64M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0, { 0x0, 0, 0xDD, 0x00, 0, 2, 0, 0, false }, "Fstd64M", "[!0r,!1d]" },
+ { kX86Fprem, kNullary, NO_OPERAND | USE_FP_STACK, { 0xD9, 0, 0xF8, 0, 0, 0, 0, 0, false }, "Fprem64", "" },
+ { kX86Fucompp, kNullary, NO_OPERAND | USE_FP_STACK, { 0xDA, 0, 0xE9, 0, 0, 0, 0, 0, false }, "Fucompp", "" },
+ { kX86Fstsw16R, kNullary, NO_OPERAND, { 0x9B, 0xDF, 0xE0, 0, 0, 0, 0, 0, false }, "Fstsw16R", "ax" },
EXT_0F_ENCODING_MAP(Mova128, 0x66, 0x6F, REG_DEF0),
{ kX86Mova128MR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "Mova128MR", "[!0r+!1d],!2r" },
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 3540843..d874aaa 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -148,6 +148,7 @@
RegLocation rl_src2);
void GenArithOpFloat(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
RegLocation rl_src2);
+ void GenRemFP(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2, bool is_double);
void GenCmpFP(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
RegLocation rl_src2);
void GenConversion(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src);
diff --git a/compiler/dex/quick/x86/fp_x86.cc b/compiler/dex/quick/x86/fp_x86.cc
index 61623d0..a2b6fe1 100644
--- a/compiler/dex/quick/x86/fp_x86.cc
+++ b/compiler/dex/quick/x86/fp_x86.cc
@@ -48,16 +48,7 @@
break;
case Instruction::REM_FLOAT_2ADDR:
case Instruction::REM_FLOAT:
- FlushAllRegs(); // Send everything to home location
- if (cu_->target64) {
- CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(8, pFmodf), rl_src1, rl_src2,
- false);
- } else {
- CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmodf), rl_src1, rl_src2,
- false);
- }
- rl_result = GetReturn(kFPReg);
- StoreValue(rl_dest, rl_result);
+ GenRemFP(rl_dest, rl_src1, rl_src2, false /* is_double */);
return;
case Instruction::NEG_FLOAT:
GenNegFloat(rl_dest, rl_src1);
@@ -110,16 +101,7 @@
break;
case Instruction::REM_DOUBLE_2ADDR:
case Instruction::REM_DOUBLE:
- FlushAllRegs(); // Send everything to home location
- if (cu_->target64) {
- CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(8, pFmod), rl_src1, rl_src2,
- false);
- } else {
- CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmod), rl_src1, rl_src2,
- false);
- }
- rl_result = GetReturnWide(kFPReg);
- StoreValueWide(rl_dest, rl_result);
+ GenRemFP(rl_dest, rl_src1, rl_src2, true /* is_double */);
return;
case Instruction::NEG_DOUBLE:
GenNegDouble(rl_dest, rl_src1);
@@ -356,6 +338,110 @@
}
}
+void X86Mir2Lir::GenRemFP(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2, bool is_double) {
+ // Compute offsets to the source and destination VRs on the stack.
+ int src1_v_reg_offset = SRegOffset(rl_src1.s_reg_low);
+ int src2_v_reg_offset = SRegOffset(rl_src2.s_reg_low);
+ int dest_v_reg_offset = SRegOffset(rl_dest.s_reg_low);
+
+ // Update the in-register state of sources.
+ rl_src1 = is_double ? UpdateLocWide(rl_src1) : UpdateLoc(rl_src1);
+ rl_src2 = is_double ? UpdateLocWide(rl_src2) : UpdateLoc(rl_src2);
+
+ // All memory accesses below reference dalvik regs.
+ ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
+
+ // If a source is in a physical register, write it back to its location on the stack.
+ if (rl_src1.location == kLocPhysReg) {
+ RegisterInfo* reg_info = GetRegInfo(rl_src1.reg);
+
+ if (reg_info != nullptr && reg_info->IsTemp()) {
+ // Calling FlushSpecificReg because it will only write back VR if it is dirty.
+ FlushSpecificReg(reg_info);
+ // ResetDef to prevent NullifyRange from removing stores.
+ ResetDef(rl_src1.reg);
+ } else {
+ // It must have been register promoted if it is not a temp but is still in a physical
+ // register. Since the x87 loads below need it in memory, we place it there now.
+ StoreBaseDisp(TargetReg(kSp), src1_v_reg_offset, rl_src1.reg, is_double ? k64 : k32);
+ }
+ }
+
+ if (rl_src2.location == kLocPhysReg) {
+ RegisterInfo* reg_info = GetRegInfo(rl_src2.reg);
+ if (reg_info != nullptr && reg_info->IsTemp()) {
+ FlushSpecificReg(reg_info);
+ ResetDef(rl_src2.reg);
+ } else {
+ StoreBaseDisp(TargetReg(kSp), src2_v_reg_offset, rl_src2.reg, is_double ? k64 : k32);
+ }
+ }
+
+ int fld_opcode = is_double ? kX86Fld64M : kX86Fld32M;
+
+ // Push the source VRs onto the x87 stack: src2 first, so the dividend ends up in ST(0) and the divisor in ST(1).
+ LIR *fld_2 = NewLIR2NoDest(fld_opcode, TargetReg(kSp).GetReg(),
+ src2_v_reg_offset + LOWORD_OFFSET);
+ AnnotateDalvikRegAccess(fld_2, (src2_v_reg_offset + LOWORD_OFFSET) >> 2,
+ true /* is_load */, is_double /* is64bit */);
+
+ LIR *fld_1 = NewLIR2NoDest(fld_opcode, TargetReg(kSp).GetReg(),
+ src1_v_reg_offset + LOWORD_OFFSET);
+ AnnotateDalvikRegAccess(fld_1, (src1_v_reg_offset + LOWORD_OFFSET) >> 2,
+ true /* is_load */, is_double /* is64bit */);
+
+ FlushReg(rs_rAX);
+ Clobber(rs_rAX);
+ LockTemp(rs_rAX);
+
+ LIR* retry = NewLIR0(kPseudoTargetLabel);
+
+ // Compute the partial remainder of ST(0) divided by ST(1) and leave it in ST(0).
+ NewLIR0(kX86Fprem);
+
+ // Move FPU status word to AX.
+ NewLIR0(kX86Fstsw16R);
+
+ // Check whether the reduction is complete: C2 (bit 10) of the status word is set while it is not.
+ OpRegImm(kOpAnd, rs_rAX, 0x400);
+
+ // If not, loop back and continue reducing the remainder.
+ LIR* branch = NewLIR2(kX86Jcc8, 0, kX86CondNe);
+ branch->target = retry;
+
+ FreeTemp(rs_rAX);
+
+ // Now store the result in the destination VR's stack location.
+ int displacement = dest_v_reg_offset + LOWORD_OFFSET;
+ int opcode = is_double ? kX86Fst64M : kX86Fst32M;
+ LIR *fst = NewLIR2NoDest(opcode, TargetReg(kSp).GetReg(), displacement);
+ AnnotateDalvikRegAccess(fst, displacement >> 2, false /* is_load */, is_double /* is64bit */);
+
+ // Pop both operands off the x87 stack (fucompp compares and pops twice).
+ NewLIR0(kX86Fucompp);
+
+ /*
+ * The result is in a physical register if it was in a temp or was register
+ * promoted. For that reason it is enough to check if it is in physical
+ * register. If it is, then we must do all of the bookkeeping necessary to
+ * invalidate temp (if needed) and load in promoted register (if needed).
+ * If the result's location is in memory, then we do not need to do anything
+ * more since the fst above has already placed the correct value in memory.
+ */
+ RegLocation rl_result = is_double ? UpdateLocWideTyped(rl_dest, kFPReg) :
+ UpdateLocTyped(rl_dest, kFPReg);
+ if (rl_result.location == kLocPhysReg) {
+ rl_result = EvalLoc(rl_dest, kFPReg, true);
+ if (is_double) {
+ LoadBaseDisp(TargetReg(kSp), dest_v_reg_offset, rl_result.reg, k64);
+ StoreFinalValueWide(rl_dest, rl_result);
+ } else {
+ Load32Disp(TargetReg(kSp), dest_v_reg_offset, rl_result.reg);
+ StoreFinalValue(rl_dest, rl_result);
+ }
+ }
+}
+
void X86Mir2Lir::GenCmpFP(Instruction::Code code, RegLocation rl_dest,
RegLocation rl_src1, RegLocation rl_src2) {
bool single = (code == Instruction::CMPL_FLOAT) || (code == Instruction::CMPG_FLOAT);
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index f1b5811..28b9dca 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -572,8 +572,15 @@
kX86PsllqRI, // left shift of floating point registers 64 bits x 2
kX86Fild32M, // push 32-bit integer on x87 stack
kX86Fild64M, // push 64-bit integer on x87 stack
+ kX86Fld32M, // push float on x87 stack
+ kX86Fld64M, // push double on x87 stack
kX86Fstp32M, // pop top x87 fp stack and do 32-bit store
kX86Fstp64M, // pop top x87 fp stack and do 64-bit store
+ kX86Fst32M, // 32-bit store from top of x87 stack, no pop
+ kX86Fst64M, // 64-bit store from top of x87 stack, no pop
+ kX86Fprem, // partial remainder from dividing two floating-point values
+ kX86Fucompp, // compare floating point values and pop x87 fp stack twice
+ kX86Fstsw16R, // store the FPU status word in AX
Binary0fOpCode(kX86Mova128), // move 128 bits aligned
kX86Mova128MR, kX86Mova128AR, // store 128 bit aligned from xmm1 to m128
Binary0fOpCode(kX86Movups), // load unaligned packed single FP values from xmm2/m128 to xmm1
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index c30dca1..a85e250 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -69,8 +69,6 @@
extern "C" void art_quick_unlock_object(void*);
// Math entrypoints.
-extern "C" double art_quick_fmod(double, double);
-extern "C" float art_quick_fmodf(float, float);
extern "C" int64_t art_quick_d2l(double);
extern "C" int64_t art_quick_f2l(float);
extern "C" int64_t art_quick_ldiv(int64_t, int64_t);
@@ -175,9 +173,9 @@
// points->pCmpgFloat = NULL; // Not needed on x86.
// points->pCmplDouble = NULL; // Not needed on x86.
// points->pCmplFloat = NULL; // Not needed on x86.
- qpoints->pFmod = art_quick_fmod;
+ // qpoints->pFmod = NULL; // Not needed on x86.
// qpoints->pL2d = NULL; // Not needed on x86.
- qpoints->pFmodf = art_quick_fmodf;
+ // qpoints->pFmodf = NULL; // Not needed on x86.
// qpoints->pL2f = NULL; // Not needed on x86.
// points->pD2iz = NULL; // Not needed on x86.
// points->pF2iz = NULL; // Not needed on x86.
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 28e4dd6..ecd8ce6 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -734,35 +734,6 @@
NO_ARG_DOWNCALL art_quick_test_suspend, artTestSuspendFromCode, ret
-DEFINE_FUNCTION art_quick_fmod
- subl LITERAL(12), %esp // alignment padding
- CFI_ADJUST_CFA_OFFSET(12)
- PUSH ebx // pass arg4 b.hi
- PUSH edx // pass arg3 b.lo
- PUSH ecx // pass arg2 a.hi
- PUSH eax // pass arg1 a.lo
- SETUP_GOT_NOSAVE // clobbers EBX
- call PLT_SYMBOL(fmod) // (jdouble a, jdouble b)
- fstpl (%esp) // pop return value off fp stack
- movsd (%esp), %xmm0 // place into %xmm0
- addl LITERAL(28), %esp // pop arguments
- CFI_ADJUST_CFA_OFFSET(-28)
- ret
-END_FUNCTION art_quick_fmod
-
-DEFINE_FUNCTION art_quick_fmodf
- PUSH eax // alignment padding
- PUSH ecx // pass arg2 b
- PUSH eax // pass arg1 a
- SETUP_GOT_NOSAVE // clobbers EBX
- call PLT_SYMBOL(fmodf) // (jfloat a, jfloat b)
- fstps (%esp) // pop return value off fp stack
- movss (%esp), %xmm0 // place into %xmm0
- addl LITERAL(12), %esp // pop arguments
- CFI_ADJUST_CFA_OFFSET(-12)
- ret
-END_FUNCTION art_quick_fmodf
-
DEFINE_FUNCTION art_quick_d2l
PUSH eax // alignment padding
PUSH ecx // pass arg2 a.hi
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index 2612417..92aabee 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -174,9 +174,9 @@
// points->pCmpgFloat = NULL; // Not needed on x86.
// points->pCmplDouble = NULL; // Not needed on x86.
// points->pCmplFloat = NULL; // Not needed on x86.
- qpoints->pFmod = fmod;
+ // qpoints->pFmod = NULL; // Not needed on x86.
// qpoints->pL2d = NULL; // Not needed on x86.
- qpoints->pFmodf = fmodf;
+ // qpoints->pFmodf = NULL; // Not needed on x86.
// qpoints->pL2f = NULL; // Not needed on x86.
// points->pD2iz = NULL; // Not needed on x86.
// points->pF2iz = NULL; // Not needed on x86.