Suspend check reworking (ready for review)

I hate burning a register, but the cost of suspend checks was just too high
in our current environment.  There are things we can do in future releases
to avoid the register burn, but for now the tradeoff is worthwhile.

The general strategy is to reserve r4 as a suspend check counter.
Rather than poll the thread's suspendPending counter, we simply decrement
the counter register.  When it hits zero, we do the real check.  For now
I'm only using the counter scheme on backward branches - we always poll
on returns (which are already heavyweight enough that the extra cost
isn't especially noticeable).
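
Roughly, the sequence generated for a counted check looks like this (a
sketch only - label names and the symbolic offsets are illustrative; the
out-of-line piece is what handleSuspendLaunchpads emits):

        subs    rSUSPEND, rSUSPEND, #1      @ decrement the check counter
        beq     LS_n                        @ only taken when it hits zero
    LR_n:
        ...                                 @ resume: the branch/return itself

    LS_n:                                   @ out-of-line suspend launchpad
        ldr     lr, [rSELF, #pTestSuspendFromCode]
        ldr     rSUSPEND, [rSELF, #suspendCount]  @ reload the counter register
        blx     lr                          @ call the test-suspend helper
        b       LR_n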

I've also added an optimization hint to the MIR in case we have enough
time to test and enable the existing loop analysis code that omits the
suspend check on smallish counted loops.
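
Consuming the hint is already handled in genSuspendTest/genSuspendPoll
below; setting it would be roughly the following (hypothetical helper
name - the loop analysis code itself isn't part of this change):

    /*
     * Hypothetical sketch: once the loop analysis proves a loop is a
     * small counted loop, flag its backward branch so the suspend
     * check is skipped for that MIR.
     */
    static void markCountedLoopBranch(MIR* backwardBranch)
    {
        backwardBranch->optimizationFlags |= MIR_IGNORE_SUSPEND_CHECK;
    }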

Change-Id: I82d8bad5882a4cf2ccff590942e2d1520d58969d
diff --git a/src/compiler/CompilerIR.h b/src/compiler/CompilerIR.h
index b697292..0965c14 100644
--- a/src/compiler/CompilerIR.h
+++ b/src/compiler/CompilerIR.h
@@ -87,6 +87,7 @@
     kMIRInlined,                        // Invoke is inlined (ie dead)
     kMIRInlinedPred,                    // Invoke is inlined via prediction
     kMIRCallee,                         // Instruction is inlined from callee
+    kMIRIgnoreSuspendCheck,
 } MIROptimizationFlagPositons;
 
 #define MIR_IGNORE_NULL_CHECK           (1 << kMIRIgnoreNullCheck)
@@ -96,6 +97,7 @@
 #define MIR_INLINED                     (1 << kMIRInlined)
 #define MIR_INLINED_PRED                (1 << kMIRInlinedPred)
 #define MIR_CALLEE                      (1 << kMIRCallee)
+#define MIR_IGNORE_SUSPEND_CHECK        (1 << kMIRIgnoreSuspendCheck)
 
 typedef struct CallsiteInfo {
     const char* classDescriptor;
@@ -239,6 +241,7 @@
     GrowableList dfsOrder;
     GrowableList domPostOrderTraversal;
     GrowableList throwLaunchpads;
+    GrowableList suspendLaunchpads;
     ArenaBitVector* tryBlockAddr;
     ArenaBitVector** defBlockMatrix;    // numDalvikRegister x numBlocks
     ArenaBitVector* tempBlockV;
diff --git a/src/compiler/Frontend.cc b/src/compiler/Frontend.cc
index fdcce9c..6a01e36 100644
--- a/src/compiler/Frontend.cc
+++ b/src/compiler/Frontend.cc
@@ -725,6 +725,9 @@
     /* Intialize the throwLaunchpads list */
     oatInitGrowableList(&cUnit.throwLaunchpads, 4);
 
+    /* Intialize the suspendLaunchpads list */
+    oatInitGrowableList(&cUnit.suspendLaunchpads, 4);
+
     /* Allocate the bit-vector to track the beginning of basic blocks */
     ArenaBitVector *tryBlockAddr = oatAllocBitVector(cUnit.insnsSize,
                                                      true /* expandable */);
diff --git a/src/compiler/codegen/arm/ArchUtility.cc b/src/compiler/codegen/arm/ArchUtility.cc
index 1d6bb41..45e1b19 100644
--- a/src/compiler/codegen/arm/ArchUtility.cc
+++ b/src/compiler/codegen/arm/ArchUtility.cc
@@ -354,6 +354,9 @@
         case kArmPseudoThrowTarget:
             LOG(INFO) << "LT" << (intptr_t)lir << ":";
             break;
+        case kArmPseudoSuspendTarget:
+            LOG(INFO) << "LS" << (intptr_t)lir << ":";
+            break;
         case kArmPseudoCaseLabel:
             LOG(INFO) << "LC" << (intptr_t)lir << ": Case target 0x" <<
                 std::hex << lir->operands[0] << "|" << std::dec <<
diff --git a/src/compiler/codegen/arm/ArmLIR.h b/src/compiler/codegen/arm/ArmLIR.h
index 07e2e97..e436eea 100644
--- a/src/compiler/codegen/arm/ArmLIR.h
+++ b/src/compiler/codegen/arm/ArmLIR.h
@@ -28,7 +28,7 @@
  *        pointer in r0 as a hidden arg0. Otherwise used as codegen scratch
  *        registers.
  * r0-r1: As in C/C++ r0 is 32-bit return register and r0/r1 is 64-bit
- * r4   : Callee save (promotion target)
+ * r4   : (rSUSPEND) is reserved (suspend check assist)
  * r5   : Callee save (promotion target)
  * r6   : Callee save (promotion target)
  * r7   : Callee save (promotion target)
@@ -243,7 +243,7 @@
 
 /*
  * Annotate special-purpose core registers:
- *   - VM: r4PC, r5FP, and r6SELF
+ *   - VM: r6SELF
  *   - ARM architecture: r13sp, r14lr, and r15pc
  *
  * rPC, rFP, and rSELF are for architecture-independent code to use.
@@ -253,7 +253,7 @@
     r1     = 1,
     r2     = 2,
     r3     = 3,
-    r4     = 4,
+    rSUSPEND = 4,
     r5     = 5,
     r6     = 6,
     r7     = 7,
@@ -366,6 +366,7 @@
  * Assemble.c.
  */
 typedef enum ArmOpcode {
+    kArmPseudoSuspendTarget = -15,
     kArmPseudoThrowTarget = -14,
     kArmPseudoCaseLabel = -13,
     kArmPseudoMethodEntry = -12,
diff --git a/src/compiler/codegen/arm/MethodCodegenDriver.cc b/src/compiler/codegen/arm/MethodCodegenDriver.cc
index ce65803..41053a2 100644
--- a/src/compiler/codegen/arm/MethodCodegenDriver.cc
+++ b/src/compiler/codegen/arm/MethodCodegenDriver.cc
@@ -1113,10 +1113,12 @@
 
         case OP_RETURN:
         case OP_RETURN_OBJECT:
+            genSuspendPoll(cUnit, mir);
             storeValue(cUnit, retLoc, rlSrc[0]);
             break;
 
         case OP_RETURN_WIDE:
+            genSuspendPoll(cUnit, mir);
             rlDest = retLocWide;
             rlDest.fp = rlSrc[0].fp;
             storeValueWide(cUnit, rlDest, rlSrc[0]);
@@ -1277,11 +1279,8 @@
         case OP_GOTO:
         case OP_GOTO_16:
         case OP_GOTO_32:
-            // TUNING: add MIR flag to disable when unnecessary
-            bool backwardBranch;
-            backwardBranch = (bb->taken->startOffset <= mir->offset);
-            if (backwardBranch) {
-                genSuspendPoll(cUnit, mir);
+            if (bb->taken->startOffset <= mir->offset) {
+                genSuspendTest(cUnit, mir);
             }
             genUnconditionalBranch(cUnit, &labelList[bb->taken->id]);
             break;
@@ -1315,7 +1314,7 @@
             ArmConditionCode cond;
             backwardBranch = (bb->taken->startOffset <= mir->offset);
             if (backwardBranch) {
-                genSuspendPoll(cUnit, mir);
+                genSuspendTest(cUnit, mir);
             }
             rlSrc[0] = loadValue(cUnit, rlSrc[0], kCoreReg);
             rlSrc[1] = loadValue(cUnit, rlSrc[1], kCoreReg);
@@ -1358,7 +1357,7 @@
             ArmConditionCode cond;
             backwardBranch = (bb->taken->startOffset <= mir->offset);
             if (backwardBranch) {
-                genSuspendPoll(cUnit, mir);
+                genSuspendTest(cUnit, mir);
             }
             rlSrc[0] = loadValue(cUnit, rlSrc[0], kCoreReg);
             opRegImm(cUnit, kOpCmp, rlSrc[0].lowReg, 0);
@@ -1999,6 +1998,27 @@
     }
 }
 
+static void handleSuspendLaunchpads(CompilationUnit *cUnit)
+{
+    ArmLIR** suspendLabel =
+        (ArmLIR **) cUnit->suspendLaunchpads.elemList;
+    int numElems = cUnit->suspendLaunchpads.numUsed;
+
+    for (int i = 0; i < numElems; i++) {
+        /* TUNING: move suspend count load into helper */
+        ArmLIR* lab = suspendLabel[i];
+        ArmLIR* resumeLab = (ArmLIR*)lab->operands[0];
+        cUnit->currentDalvikOffset = lab->operands[1];
+        oatAppendLIR(cUnit, (LIR *)lab);
+        loadWordDisp(cUnit, rSELF,
+                     OFFSETOF_MEMBER(Thread, pTestSuspendFromCode), rLR);
+        loadWordDisp(cUnit, rSELF,
+            art::Thread::SuspendCountOffset().Int32Value(), rSUSPEND);
+        opReg(cUnit, kOpBlx, rLR);
+        genUnconditionalBranch(cUnit, resumeLab);
+    }
+}
+
 static void handleThrowLaunchpads(CompilationUnit *cUnit)
 {
     ArmLIR** throwLabel =
@@ -2084,9 +2104,11 @@
 
     oatDataFlowAnalysisDispatcher(cUnit, methodBlockCodeGen,
                                   kPreOrderDFSTraversal, false /* Iterative */);
-    removeRedundantBranches(cUnit);
+    handleSuspendLaunchpads(cUnit);
 
     handleThrowLaunchpads(cUnit);
+
+    removeRedundantBranches(cUnit);
 }
 
 /* Common initialization routine for an architecture family */
diff --git a/src/compiler/codegen/arm/Thumb2/Factory.cc b/src/compiler/codegen/arm/Thumb2/Factory.cc
index 254802d..9321753 100644
--- a/src/compiler/codegen/arm/Thumb2/Factory.cc
+++ b/src/compiler/codegen/arm/Thumb2/Factory.cc
@@ -22,9 +22,9 @@
  *
  */
 
-static int coreRegs[] = {r0, r1, r2, r3, r4, r5, r6, r7, rSELF, r8, r10, r11,
-                         r12, rSP, rLR, rPC};
-static int reservedRegs[] = {rSELF, rSP, rLR, rPC};
+static int coreRegs[] = {r0, r1, r2, r3, rSUSPEND, r5, r6, r7, rSELF, r8, r10,
+                         r11, r12, rSP, rLR, rPC};
+static int reservedRegs[] = {rSUSPEND, rSELF, rSP, rLR, rPC};
 static int fpRegs[] = {fr0, fr1, fr2, fr3, fr4, fr5, fr6, fr7,
                        fr8, fr9, fr10, fr11, fr12, fr13, fr14, fr15,
                        fr16, fr17, fr18, fr19, fr20, fr21, fr22, fr23,
diff --git a/src/compiler/codegen/arm/Thumb2/Gen.cc b/src/compiler/codegen/arm/Thumb2/Gen.cc
index 2404ca7..76d8b45 100644
--- a/src/compiler/codegen/arm/Thumb2/Gen.cc
+++ b/src/compiler/codegen/arm/Thumb2/Gen.cc
@@ -1683,9 +1683,31 @@
     return false;
 }
 
+/* Check if we need to check for pending suspend request */
+static void genSuspendTest(CompilationUnit* cUnit, MIR* mir)
+{
+    if (mir->optimizationFlags & MIR_IGNORE_SUSPEND_CHECK) {
+        return;
+    }
+    newLIR2(cUnit, kThumbSubRI8, rSUSPEND, 1);
+    ArmLIR* branch = opCondBranch(cUnit, kArmCondEq);
+    ArmLIR* retLab = newLIR0(cUnit, kArmPseudoTargetLabel);
+    retLab->defMask = ENCODE_ALL;
+    ArmLIR* target = (ArmLIR*)oatNew(sizeof(ArmLIR), true);
+    target->generic.dalvikOffset = cUnit->currentDalvikOffset;
+    target->opcode = kArmPseudoSuspendTarget;
+    target->operands[0] = (intptr_t)retLab;
+    target->operands[1] = mir->offset;
+    branch->generic.target = (LIR*)target;
+    oatInsertGrowableList(&cUnit->suspendLaunchpads, (intptr_t)target);
+}
+
 /* Check for pending suspend request.  */
 static void genSuspendPoll(CompilationUnit* cUnit, MIR* mir)
 {
+    if (mir->optimizationFlags & MIR_IGNORE_SUSPEND_CHECK) {
+        return;
+    }
     oatLockCallTemps(cUnit);   // Explicit register usage
     int rSuspendCount = r1;
     ArmLIR* ld;