Add an optimization for removing redundant suspend tests in ART

This CL:
(1) eliminates redundant suspend checks (dominated by another check),

(2) removes the special treatment of the R4 register, which got
reset on every native call, possibly yielding long execution
sequences without any suspend checks, and

(3) fixes the absence of suspend checks in leaf methods.

(2) and (3) increase the frequency of suspend checks, which improves
the performance of GC and the accuracy of profile data.  To
compensate for the increased number of checks, we implemented an
optimization that leverages dominance information to remove
redundant suspend checks on back edges.  Based on the results of
running the Caffeine benchmark on Nexus 7, the patch performs
roughly 30% more useful suspend checks, spreading them much more
evenly along the execution trace, while incurring less than 1%
overhead.  For flexibility consideration, this CL defines two flags
to control the enabling of optimizations.  The original
implementation is the default.

Change-Id: I31e81a5b3c53030444dbe0434157274c9ab8640f
Signed-off-by: Wei Jin <wejin@google.com>
diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc
index 5340d83..bd9c8b4 100644
--- a/compiler/dex/quick/arm/target_arm.cc
+++ b/compiler/dex/quick/arm/target_arm.cc
@@ -25,9 +25,15 @@
 
 namespace art {
 
+#ifdef ARM_R4_SUSPEND_FLAG
 static constexpr RegStorage core_regs_arr[] =
     {rs_r0, rs_r1, rs_r2, rs_r3, rs_rARM_SUSPEND, rs_r5, rs_r6, rs_r7, rs_r8, rs_rARM_SELF,
      rs_r10, rs_r11, rs_r12, rs_rARM_SP, rs_rARM_LR, rs_rARM_PC};
+#else
+static constexpr RegStorage core_regs_arr[] =
+    {rs_r0, rs_r1, rs_r2, rs_r3, rs_r4, rs_r5, rs_r6, rs_r7, rs_r8, rs_rARM_SELF,
+     rs_r10, rs_r11, rs_r12, rs_rARM_SP, rs_rARM_LR, rs_rARM_PC};
+#endif
 static constexpr RegStorage sp_regs_arr[] =
     {rs_fr0, rs_fr1, rs_fr2, rs_fr3, rs_fr4, rs_fr5, rs_fr6, rs_fr7, rs_fr8, rs_fr9, rs_fr10,
      rs_fr11, rs_fr12, rs_fr13, rs_fr14, rs_fr15, rs_fr16, rs_fr17, rs_fr18, rs_fr19, rs_fr20,
@@ -36,9 +42,15 @@
 static constexpr RegStorage dp_regs_arr[] =
     {rs_dr0, rs_dr1, rs_dr2, rs_dr3, rs_dr4, rs_dr5, rs_dr6, rs_dr7, rs_dr8, rs_dr9, rs_dr10,
      rs_dr11, rs_dr12, rs_dr13, rs_dr14, rs_dr15};
+#ifdef ARM_R4_SUSPEND_FLAG
 static constexpr RegStorage reserved_regs_arr[] =
     {rs_rARM_SUSPEND, rs_rARM_SELF, rs_rARM_SP, rs_rARM_LR, rs_rARM_PC};
 static constexpr RegStorage core_temps_arr[] = {rs_r0, rs_r1, rs_r2, rs_r3, rs_r12};
+#else
+static constexpr RegStorage reserved_regs_arr[] =
+    {rs_rARM_SELF, rs_rARM_SP, rs_rARM_LR, rs_rARM_PC};
+static constexpr RegStorage core_temps_arr[] = {rs_r0, rs_r1, rs_r2, rs_r3, rs_r4, rs_r12};
+#endif
 static constexpr RegStorage sp_temps_arr[] =
     {rs_fr0, rs_fr1, rs_fr2, rs_fr3, rs_fr4, rs_fr5, rs_fr6, rs_fr7, rs_fr8, rs_fr9, rs_fr10,
      rs_fr11, rs_fr12, rs_fr13, rs_fr14, rs_fr15};
@@ -79,7 +91,11 @@
   RegStorage res_reg = RegStorage::InvalidReg();
   switch (reg) {
     case kSelf: res_reg = rs_rARM_SELF; break;
+#ifdef ARM_R4_SUSPEND_FLAG
     case kSuspend: res_reg =  rs_rARM_SUSPEND; break;
+#else
+    case kSuspend: res_reg = RegStorage::InvalidReg(); break;
+#endif
     case kLr: res_reg =  rs_rARM_LR; break;
     case kPc: res_reg =  rs_rARM_PC; break;
     case kSp: res_reg =  rs_rARM_SP; break;
@@ -578,11 +594,13 @@
     }
   }
 
+#ifdef ARM_R4_SUSPEND_FLAG
   // TODO: re-enable this when we can safely save r4 over the suspension code path.
   bool no_suspend = NO_SUSPEND;  // || !Runtime::Current()->ExplicitSuspendChecks();
   if (no_suspend) {
     GetRegInfo(rs_rARM_SUSPEND)->MarkFree();
   }
+#endif
 
   // Don't start allocating temps at r0/s0/d0 or you may clobber return regs in early-exit methods.
   // TODO: adjust when we roll to hard float calling convention.