ART: Implement rem_double/rem_float for x86/x86-64
This adds an inlined version of the rem_double/rem_float bytecodes
for the x86/x86-64 platforms, implemented with the x87 fprem
instruction. The patch also removes the now-unnecessary fmod and
fmodf stubs from the runtime.
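
For reference, the emitted sequence for the double case is roughly the
following x87 loop (a sketch only: src1_off, src2_off and dest_off stand
in for the virtual-register stack offsets computed at compile time, the
float case uses flds/fsts instead, and x86-64 addresses off %rsp):

        fldl    src2_off(%esp)  // push divisor; ends up in ST(1)
        fldl    src1_off(%esp)  // push dividend into ST(0)
    .Lretry:
        fprem                   // partial remainder of ST(0) by ST(1)
        fstsw   %ax             // copy the FPU status word to AX
        andl    $0x400, %eax    // C2 (bit 10) set => reduction incomplete
        jne     .Lretry
        fstl    dest_off(%esp)  // store remainder to the destination VR
        fucompp                 // compare and pop twice, emptying the stack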
Change-Id: I2311aa2adf08d6614527e0da070e3b6ce2343a20
Signed-off-by: Alexei Zavjalov <alexei.zavjalov@intel.com>
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index c7e289d..3f54798 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -407,10 +407,17 @@
{ kX86PslldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 6, 0, 1, false }, "PslldRI", "!0r,!1d" },
{ kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1, false }, "PsllqRI", "!0r,!1d" },
- { kX86Fild32M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDB, 0x00, 0, 0, 0, 0, false }, "Fild32M", "[!0r,!1d]" },
- { kX86Fild64M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDF, 0x00, 0, 5, 0, 0, false }, "Fild64M", "[!0r,!1d]" },
- { kX86Fstp32M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xD9, 0x00, 0, 3, 0, 0, false }, "FstpsM", "[!0r,!1d]" },
- { kX86Fstp64M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDD, 0x00, 0, 3, 0, 0, false }, "FstpdM", "[!0r,!1d]" },
+ { kX86Fild32M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDB, 0x00, 0, 0, 0, 0, false }, "Fild32M", "[!0r,!1d]" },
+ { kX86Fild64M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDF, 0x00, 0, 5, 0, 0, false }, "Fild64M", "[!0r,!1d]" },
+ { kX86Fld32M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xD9, 0x00, 0, 0, 0, 0, false }, "Fld32M", "[!0r,!1d]" },
+ { kX86Fld64M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDD, 0x00, 0, 0, 0, 0, false }, "Fld64M", "[!0r,!1d]" },
+ { kX86Fstp32M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xD9, 0x00, 0, 3, 0, 0, false }, "Fstps32M", "[!0r,!1d]" },
+ { kX86Fstp64M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDD, 0x00, 0, 3, 0, 0, false }, "Fstpd64M", "[!0r,!1d]" },
+ { kX86Fst32M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0, { 0x0, 0, 0xD9, 0x00, 0, 2, 0, 0, false }, "Fsts32M", "[!0r,!1d]" },
+ { kX86Fst64M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0, { 0x0, 0, 0xDD, 0x00, 0, 2, 0, 0, false }, "Fstd64M", "[!0r,!1d]" },
+ { kX86Fprem, kNullary, NO_OPERAND | USE_FP_STACK, { 0xD9, 0, 0xF8, 0, 0, 0, 0, 0, false }, "Fprem64", "" },
+ { kX86Fucompp, kNullary, NO_OPERAND | USE_FP_STACK, { 0xDA, 0, 0xE9, 0, 0, 0, 0, 0, false }, "Fucompp", "" },
+ { kX86Fstsw16R, kNullary, NO_OPERAND, { 0x9B, 0xDF, 0xE0, 0, 0, 0, 0, 0, false }, "Fstsw16R", "ax" },
EXT_0F_ENCODING_MAP(Mova128, 0x66, 0x6F, REG_DEF0),
{ kX86Mova128MR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "Mova128MR", "[!0r+!1d],!2r" },
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 3540843..d874aaa 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -148,6 +148,7 @@
RegLocation rl_src2);
void GenArithOpFloat(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
RegLocation rl_src2);
+ void GenRemFP(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2, bool is_double);
void GenCmpFP(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
RegLocation rl_src2);
void GenConversion(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src);
diff --git a/compiler/dex/quick/x86/fp_x86.cc b/compiler/dex/quick/x86/fp_x86.cc
index 61623d0..a2b6fe1 100644
--- a/compiler/dex/quick/x86/fp_x86.cc
+++ b/compiler/dex/quick/x86/fp_x86.cc
@@ -48,16 +48,7 @@
break;
case Instruction::REM_FLOAT_2ADDR:
case Instruction::REM_FLOAT:
- FlushAllRegs(); // Send everything to home location
- if (cu_->target64) {
- CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(8, pFmodf), rl_src1, rl_src2,
- false);
- } else {
- CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmodf), rl_src1, rl_src2,
- false);
- }
- rl_result = GetReturn(kFPReg);
- StoreValue(rl_dest, rl_result);
+ GenRemFP(rl_dest, rl_src1, rl_src2, false /* is_double */);
return;
case Instruction::NEG_FLOAT:
GenNegFloat(rl_dest, rl_src1);
@@ -110,16 +101,7 @@
break;
case Instruction::REM_DOUBLE_2ADDR:
case Instruction::REM_DOUBLE:
- FlushAllRegs(); // Send everything to home location
- if (cu_->target64) {
- CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(8, pFmod), rl_src1, rl_src2,
- false);
- } else {
- CallRuntimeHelperRegLocationRegLocation(QUICK_ENTRYPOINT_OFFSET(4, pFmod), rl_src1, rl_src2,
- false);
- }
- rl_result = GetReturnWide(kFPReg);
- StoreValueWide(rl_dest, rl_result);
+ GenRemFP(rl_dest, rl_src1, rl_src2, true /* is_double */);
return;
case Instruction::NEG_DOUBLE:
GenNegDouble(rl_dest, rl_src1);
@@ -356,6 +338,110 @@
}
}
+void X86Mir2Lir::GenRemFP(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2, bool is_double) {
+ // Compute offsets to the source and destination VRs on the stack.
+ int src1_v_reg_offset = SRegOffset(rl_src1.s_reg_low);
+ int src2_v_reg_offset = SRegOffset(rl_src2.s_reg_low);
+ int dest_v_reg_offset = SRegOffset(rl_dest.s_reg_low);
+
+ // Update the in-register state of sources.
+ rl_src1 = is_double ? UpdateLocWide(rl_src1) : UpdateLoc(rl_src1);
+ rl_src2 = is_double ? UpdateLocWide(rl_src2) : UpdateLoc(rl_src2);
+
+ // All memory accesses below reference dalvik regs.
+ ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
+
+ // If a source is in a physical register, write it back to its location on the stack.
+ if (rl_src1.location == kLocPhysReg) {
+ RegisterInfo* reg_info = GetRegInfo(rl_src1.reg);
+
+ if (reg_info != nullptr && reg_info->IsTemp()) {
+ // Calling FlushSpecificReg because it will only write back VR if it is dirty.
+ FlushSpecificReg(reg_info);
+ // ResetDef to prevent NullifyRange from removing stores.
+ ResetDef(rl_src1.reg);
+ } else {
+ // It must have been register promoted if it is not a temp but is still in a physical
+ // register. Since the x87 loads below need it in memory, we place it there now.
+ StoreBaseDisp(TargetReg(kSp), src1_v_reg_offset, rl_src1.reg, is_double ? k64 : k32);
+ }
+ }
+
+ if (rl_src2.location == kLocPhysReg) {
+ RegisterInfo* reg_info = GetRegInfo(rl_src2.reg);
+ if (reg_info != nullptr && reg_info->IsTemp()) {
+ FlushSpecificReg(reg_info);
+ ResetDef(rl_src2.reg);
+ } else {
+ StoreBaseDisp(TargetReg(kSp), src2_v_reg_offset, rl_src2.reg, is_double ? k64 : k32);
+ }
+ }
+
+ int fld_opcode = is_double ? kX86Fld64M : kX86Fld32M;
+
+ // Push the source VRs onto the x87 stack: src2 first, so the dividend ends up in ST(0) and the divisor in ST(1).
+ LIR *fld_2 = NewLIR2NoDest(fld_opcode, TargetReg(kSp).GetReg(),
+ src2_v_reg_offset + LOWORD_OFFSET);
+ AnnotateDalvikRegAccess(fld_2, (src2_v_reg_offset + LOWORD_OFFSET) >> 2,
+ true /* is_load */, is_double /* is64bit */);
+
+ LIR *fld_1 = NewLIR2NoDest(fld_opcode, TargetReg(kSp).GetReg(),
+ src1_v_reg_offset + LOWORD_OFFSET);
+ AnnotateDalvikRegAccess(fld_1, (src1_v_reg_offset + LOWORD_OFFSET) >> 2,
+ true /* is_load */, is_double /* is64bit */);
+
+ FlushReg(rs_rAX);
+ Clobber(rs_rAX);
+ LockTemp(rs_rAX);
+
+ LIR* retry = NewLIR0(kPseudoTargetLabel);
+
+ // Compute the partial remainder of ST(0) divided by ST(1) and leave it in ST(0).
+ NewLIR0(kX86Fprem);
+
+ // Move FPU status word to AX.
+ NewLIR0(kX86Fstsw16R);
+
+ // Check whether the reduction is complete: C2 (bit 10) of the status word is set while it is not.
+ OpRegImm(kOpAnd, rs_rAX, 0x400);
+
+ // If not, loop back and continue reducing the remainder.
+ LIR* branch = NewLIR2(kX86Jcc8, 0, kX86CondNe);
+ branch->target = retry;
+
+ FreeTemp(rs_rAX);
+
+ // Now store the result in the destination VR's stack location.
+ int displacement = dest_v_reg_offset + LOWORD_OFFSET;
+ int opcode = is_double ? kX86Fst64M : kX86Fst32M;
+ LIR *fst = NewLIR2NoDest(opcode, TargetReg(kSp).GetReg(), displacement);
+ AnnotateDalvikRegAccess(fst, displacement >> 2, false /* is_load */, is_double /* is64bit */);
+
+ // Pop both operands off the x87 stack (fucompp compares and pops twice).
+ NewLIR0(kX86Fucompp);
+
+ /*
+ * The result is in a physical register if it was in a temp or was register
+ * promoted. For that reason it is enough to check if it is in physical
+ * register. If it is, then we must do all of the bookkeeping necessary to
+ * invalidate temp (if needed) and load in promoted register (if needed).
+ * If the result's location is in memory, then we do not need to do anything
+ * more since the fst above has already placed the correct value in memory.
+ */
+ RegLocation rl_result = is_double ? UpdateLocWideTyped(rl_dest, kFPReg) :
+ UpdateLocTyped(rl_dest, kFPReg);
+ if (rl_result.location == kLocPhysReg) {
+ rl_result = EvalLoc(rl_dest, kFPReg, true);
+ if (is_double) {
+ LoadBaseDisp(TargetReg(kSp), dest_v_reg_offset, rl_result.reg, k64);
+ StoreFinalValueWide(rl_dest, rl_result);
+ } else {
+ Load32Disp(TargetReg(kSp), dest_v_reg_offset, rl_result.reg);
+ StoreFinalValue(rl_dest, rl_result);
+ }
+ }
+}
+
void X86Mir2Lir::GenCmpFP(Instruction::Code code, RegLocation rl_dest,
RegLocation rl_src1, RegLocation rl_src2) {
bool single = (code == Instruction::CMPL_FLOAT) || (code == Instruction::CMPG_FLOAT);
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index f1b5811..28b9dca 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -572,8 +572,15 @@
kX86PsllqRI, // left shift of floating point registers 64 bits x 2
kX86Fild32M, // push 32-bit integer on x87 stack
kX86Fild64M, // push 64-bit integer on x87 stack
+ kX86Fld32M, // push float on x87 stack
+ kX86Fld64M, // push double on x87 stack
kX86Fstp32M, // pop top x87 fp stack and do 32-bit store
kX86Fstp64M, // pop top x87 fp stack and do 64-bit store
+ kX86Fst32M, // 32-bit store from top of x87 stack, no pop
+ kX86Fst64M, // 64-bit store from top of x87 stack, no pop
+ kX86Fprem, // partial remainder from dividing two floating-point values
+ kX86Fucompp, // compare floating point values and pop x87 fp stack twice
+ kX86Fstsw16R, // store the FPU status word in AX
Binary0fOpCode(kX86Mova128), // move 128 bits aligned
kX86Mova128MR, kX86Mova128AR, // store 128 bit aligned from xmm1 to m128
Binary0fOpCode(kX86Movups), // load unaligned packed single FP values from xmm2/m128 to xmm1
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index c30dca1..a85e250 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -69,8 +69,6 @@
extern "C" void art_quick_unlock_object(void*);
// Math entrypoints.
-extern "C" double art_quick_fmod(double, double);
-extern "C" float art_quick_fmodf(float, float);
extern "C" int64_t art_quick_d2l(double);
extern "C" int64_t art_quick_f2l(float);
extern "C" int64_t art_quick_ldiv(int64_t, int64_t);
@@ -175,9 +173,9 @@
// points->pCmpgFloat = NULL; // Not needed on x86.
// points->pCmplDouble = NULL; // Not needed on x86.
// points->pCmplFloat = NULL; // Not needed on x86.
- qpoints->pFmod = art_quick_fmod;
+ // qpoints->pFmod = NULL; // Not needed on x86.
// qpoints->pL2d = NULL; // Not needed on x86.
- qpoints->pFmodf = art_quick_fmodf;
+ // qpoints->pFmodf = NULL; // Not needed on x86.
// qpoints->pL2f = NULL; // Not needed on x86.
// points->pD2iz = NULL; // Not needed on x86.
// points->pF2iz = NULL; // Not needed on x86.
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 28e4dd6..ecd8ce6 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -734,35 +734,6 @@
NO_ARG_DOWNCALL art_quick_test_suspend, artTestSuspendFromCode, ret
-DEFINE_FUNCTION art_quick_fmod
- subl LITERAL(12), %esp // alignment padding
- CFI_ADJUST_CFA_OFFSET(12)
- PUSH ebx // pass arg4 b.hi
- PUSH edx // pass arg3 b.lo
- PUSH ecx // pass arg2 a.hi
- PUSH eax // pass arg1 a.lo
- SETUP_GOT_NOSAVE // clobbers EBX
- call PLT_SYMBOL(fmod) // (jdouble a, jdouble b)
- fstpl (%esp) // pop return value off fp stack
- movsd (%esp), %xmm0 // place into %xmm0
- addl LITERAL(28), %esp // pop arguments
- CFI_ADJUST_CFA_OFFSET(-28)
- ret
-END_FUNCTION art_quick_fmod
-
-DEFINE_FUNCTION art_quick_fmodf
- PUSH eax // alignment padding
- PUSH ecx // pass arg2 b
- PUSH eax // pass arg1 a
- SETUP_GOT_NOSAVE // clobbers EBX
- call PLT_SYMBOL(fmodf) // (jfloat a, jfloat b)
- fstps (%esp) // pop return value off fp stack
- movss (%esp), %xmm0 // place into %xmm0
- addl LITERAL(12), %esp // pop arguments
- CFI_ADJUST_CFA_OFFSET(-12)
- ret
-END_FUNCTION art_quick_fmodf
-
DEFINE_FUNCTION art_quick_d2l
PUSH eax // alignment padding
PUSH ecx // pass arg2 a.hi
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index 2612417..92aabee 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -174,9 +174,9 @@
// points->pCmpgFloat = NULL; // Not needed on x86.
// points->pCmplDouble = NULL; // Not needed on x86.
// points->pCmplFloat = NULL; // Not needed on x86.
- qpoints->pFmod = fmod;
+ // qpoints->pFmod = NULL; // Not needed on x86.
// qpoints->pL2d = NULL; // Not needed on x86.
- qpoints->pFmodf = fmodf;
+ // qpoints->pFmodf = NULL; // Not needed on x86.
// qpoints->pL2f = NULL; // Not needed on x86.
// points->pD2iz = NULL; // Not needed on x86.
// points->pF2iz = NULL; // Not needed on x86.