Loop detection, improved reg allocation

Detect loops and compute their nesting depth, then use that depth
to weight register uses; the weighted use counts determine which
registers to promote.
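
A minimal standalone sketch of the weighting idea (illustrative
types only, not the compiler's MIR/SSA structures): each SSA use
contributes 2^nestingDepth, capped, so uses inside deeply nested
loops dominate the counts that later feed register promotion.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct Insn {
        std::vector<int> uses;      // SSA names read by this instruction
    };

    struct Block {
        uint16_t nestingDepth = 0;  // 0 = not in any loop
        std::vector<Insn> insns;
    };

    // Weight each use by 2^depth (capped at 2^16); useCounts must be
    // sized to the number of SSA names.
    void countUses(const std::vector<Block>& blocks,
                   std::vector<uint32_t>& useCounts) {
        for (const Block& bb : blocks) {
            uint32_t weight = std::min<uint32_t>(16u, bb.nestingDepth);
            for (const Insn& insn : bb.insns) {
                for (int sReg : insn.uses) {
                    useCounts[sReg] += (1u << weight);
                }
            }
        }
    }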

Also:

   o Fixed a typo that prevented squashing of useless fp reg copies.

   o Rescheduled array bounds checks to hide the latency of the
     array-length (limit) load.

   o Added basic-block optimization pass to remove duplicate range
     checks (see the sketch after this list).

   o Fixed bug that prevented recognition of redundant null
     checks following iput-wide and aput-wide.
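
The duplicate range-check removal can be pictured as follows: the
first checked access to a given (array, index) SSA pair covers any
later identical access in the straight-line run, so those can skip
their bounds check.  A rough sketch with illustrative types (the
real pass walks MIRs through the extended basic block and sets
MIR_IGNORE_RANGE_CHECK):

    #include <set>
    #include <utility>
    #include <vector>

    struct ArrayAccess {
        int arraySreg;                   // SSA name of the array reference
        int indexSreg;                   // SSA name of the index
        bool ignoreRangeCheck = false;   // stand-in for MIR_IGNORE_RANGE_CHECK
    };

    void squashDupRangeChecks(std::vector<ArrayAccess>& run) {
        std::set<std::pair<int, int>> alreadyChecked;
        for (ArrayAccess& access : run) {
            std::pair<int, int> key(access.arraySreg, access.indexSreg);
            if (alreadyChecked.count(key)) {
                access.ignoreRangeCheck = true;  // dominated by an earlier check
            } else {
                alreadyChecked.insert(key);
            }
        }
    }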

Change-Id: Icfbae39e89b1d14b8703ad6bbb0b29c0635fed1e
diff --git a/src/compiler/CompilerIR.h b/src/compiler/CompilerIR.h
index 5750913..54d9951 100644
--- a/src/compiler/CompilerIR.h
+++ b/src/compiler/CompilerIR.h
@@ -179,6 +179,7 @@
     kMIRCallee,                         // Instruction is inlined from callee
     kMIRIgnoreSuspendCheck,
     kMIRDup,
+    kMIRMark,                           // Temporary node mark
 };
 
 #define MIR_IGNORE_NULL_CHECK           (1 << kMIRIgnoreNullCheck)
@@ -190,6 +191,7 @@
 #define MIR_CALLEE                      (1 << kMIRCallee)
 #define MIR_IGNORE_SUSPEND_CHECK        (1 << kMIRIgnoreSuspendCheck)
 #define MIR_DUP                         (1 << kMIRDup)
+#define MIR_MARK                        (1 << kMIRMark)
 
 struct CallsiteInfo {
     const char* classDescriptor;
@@ -234,7 +236,8 @@
     bool hidden;
     bool catchEntry;
     bool fallThroughTarget;             // Reached via fallthrough
-    unsigned int startOffset;
+    uint16_t startOffset;
+    uint16_t nestingDepth;
     const Method* containingMethod;     // For blocks from the callee
     BBType blockType;
     bool needFallThroughBranch;         // For blocks ended due to length limit
@@ -335,6 +338,12 @@
     int* phiAliasMap;                   // length == numSSAReg
     MIR* phiList;
 
+    /* Use counts of ssa names */
+    GrowableList useCounts;
+
+    /* Optimization support */
+    GrowableList loopHeaders;
+
     /* Map SSA names to location */
     RegLocation* regLocation;
     int sequenceNumber;
diff --git a/src/compiler/Dataflow.cc b/src/compiler/Dataflow.cc
index 581c463..7b05a66 100644
--- a/src/compiler/Dataflow.cc
+++ b/src/compiler/Dataflow.cc
@@ -259,7 +259,7 @@
     DF_UA | DF_UB | DF_UC | DF_NULL_CHK_1 | DF_RANGE_CHK_2 | DF_IS_SETTER | DF_CORE_B | DF_CORE_C,
 
     // 4C APUT_WIDE vAA, vBB, vCC
-    DF_UA_WIDE | DF_UB | DF_UC | DF_NULL_CHK_1 | DF_RANGE_CHK_2 | DF_IS_SETTER | DF_CORE_B | DF_CORE_C,
+    DF_UA_WIDE | DF_UB | DF_UC | DF_NULL_CHK_2 | DF_RANGE_CHK_2 | DF_IS_SETTER | DF_CORE_B | DF_CORE_C,
 
     // 4D APUT_OBJECT vAA, vBB, vCC
     DF_UA | DF_UB | DF_UC | DF_NULL_CHK_1 | DF_RANGE_CHK_2 | DF_IS_SETTER | DF_CORE_B | DF_CORE_C,
@@ -301,7 +301,7 @@
     DF_UA | DF_UB | DF_NULL_CHK_1 | DF_IS_SETTER | DF_CORE_B,
 
     // 5A IPUT_WIDE vA, vB, field@CCCC
-    DF_UA_WIDE | DF_UB | DF_NULL_CHK_1 | DF_IS_SETTER | DF_CORE_B,
+    DF_UA_WIDE | DF_UB | DF_NULL_CHK_2 | DF_IS_SETTER | DF_CORE_B,
 
     // 5B IPUT_OBJECT vA, vB, field@CCCC
     DF_UA | DF_UB | DF_NULL_CHK_1 | DF_IS_SETTER | DF_CORE_B,
@@ -730,7 +730,7 @@
     DF_DA_WIDE | DF_UB | DF_NULL_CHK_0 | DF_CORE_B,
 
     // E9 IPUT_WIDE_VOLATILE
-    DF_UA_WIDE | DF_UB | DF_NULL_CHK_1 | DF_CORE_B,
+    DF_UA_WIDE | DF_UB | DF_NULL_CHK_2 | DF_CORE_B,
 
     // EA SGET_WIDE_VOLATILE
     DF_DA_WIDE,
@@ -769,7 +769,7 @@
     DF_UA | DF_UB | DF_NULL_CHK_1 | DF_IS_SETTER,
 
     // F6 IPUT_WIDE_QUICK
-    DF_UA_WIDE | DF_UB | DF_NULL_CHK_1 |DF_IS_SETTER,
+    DF_UA_WIDE | DF_UB | DF_NULL_CHK_2 | DF_IS_SETTER,
 
     // F7 IPUT_OBJECT_QUICK
     DF_UA | DF_UB | DF_NULL_CHK_1 | DF_IS_SETTER,
@@ -1631,7 +1631,8 @@
 }
 
 /* Advance to next strictly dominated MIR node in an extended basic block */
-MIR* advanceMIR(CompilationUnit* cUnit, BasicBlock** pBb, MIR* mir, ArenaBitVector* bv) {
+MIR* advanceMIR(CompilationUnit* cUnit, BasicBlock** pBb, MIR* mir, ArenaBitVector* bv,
+                bool clearMark) {
     BasicBlock* bb = *pBb;
     if (mir != NULL) {
         mir = mir->next;
@@ -1648,9 +1649,98 @@
             }
         }
     }
+    if (mir && clearMark) {
+        mir->optimizationFlags &= ~MIR_MARK;
+    }
     return mir;
 }
 
+/*
+ * For the rest of the extended run, replace uses of rOld with rNew.
+ */
+void replaceOperands(CompilationUnit* cUnit, BasicBlock** pBp, MIR* mir,
+                     int rOld, int rNew)
+{
+    while (true) {
+       mir = advanceMIR(cUnit, pBp, mir, NULL, false);
+       if (mir == NULL) {
+           break;
+       }
+       if (mir->ssaRep == NULL) {
+           continue;
+       }
+       if (mir->optimizationFlags & MIR_MARK) {
+           LOG(INFO) << "recursive replace";
+           mir->dalvikInsn.opcode = (Instruction::Code)kMirOpCopy;
+           mir->ssaRep->uses[0] = rNew;
+           mir->ssaRep->numUses = 1;
+           replaceOperands(cUnit, pBp, mir, mir->ssaRep->defs[0], rNew);
+           mir->optimizationFlags &= ~MIR_MARK;
+       } else {
+           for (int i = 0; i < mir->ssaRep->numUses; i++) {
+               if (mir->ssaRep->uses[i] == rOld) {
+                   mir->optimizationFlags |= MIR_IGNORE_NULL_CHECK;
+                   mir->ssaRep->uses[i] = rNew;
+               }
+           }
+       }
+    }
+}
+
+void squashDupRangeChecks(CompilationUnit* cUnit, BasicBlock** pBp, MIR* mir,
+                          int arraySreg, int indexSreg)
+{
+    while (true) {
+       mir = advanceMIR(cUnit, pBp, mir, NULL, false);
+       if (!mir) {
+           break;
+       }
+       if ((mir->ssaRep == NULL) ||
+           (mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK)) {
+           continue;
+       }
+       int checkArray = INVALID_SREG;
+       int checkIndex = INVALID_SREG;
+       switch(mir->dalvikInsn.opcode) {
+            case Instruction::AGET:
+            case Instruction::AGET_OBJECT:
+            case Instruction::AGET_BOOLEAN:
+            case Instruction::AGET_BYTE:
+            case Instruction::AGET_CHAR:
+            case Instruction::AGET_SHORT:
+            case Instruction::AGET_WIDE:
+                checkArray = mir->ssaRep->uses[0];
+                checkIndex = mir->ssaRep->uses[1];
+                break;
+            case Instruction::APUT:
+            case Instruction::APUT_OBJECT:
+            case Instruction::APUT_SHORT:
+            case Instruction::APUT_CHAR:
+            case Instruction::APUT_BYTE:
+            case Instruction::APUT_BOOLEAN:
+                checkArray = mir->ssaRep->uses[1];
+                checkIndex = mir->ssaRep->uses[2];
+                break;
+            case Instruction::APUT_WIDE:
+                checkArray = mir->ssaRep->uses[2];
+                checkIndex = mir->ssaRep->uses[3];
+                break;
+            default:
+                break;
+       }
+       if (checkArray == INVALID_SREG) {
+           continue;
+       }
+       if ((arraySreg == checkArray) && (indexSreg == checkIndex)) {
+           if (cUnit->printMe) {
+               LOG(INFO) << "Squashing range check @ 0x" << std::hex
+                         << mir->offset;
+           }
+           mir->optimizationFlags |= MIR_IGNORE_RANGE_CHECK;
+       }
+    }
+}
+
 /* Allocate a compiler temp, return Sreg.  Reuse existing if no conflict */
 int allocCompilerTempSreg(CompilationUnit* cUnit, ArenaBitVector* bv)
 {
@@ -1704,7 +1794,39 @@
 
     for (MIR* mir = bb->firstMIRInsn; mir; mir = mir->next) {
         // Look for interesting opcodes, skip otherwise
-        switch(mir->dalvikInsn.opcode) {
+        Instruction::Code opcode = mir->dalvikInsn.opcode;
+        switch(opcode) {
+            case Instruction::AGET:
+            case Instruction::AGET_OBJECT:
+            case Instruction::AGET_BOOLEAN:
+            case Instruction::AGET_BYTE:
+            case Instruction::AGET_CHAR:
+            case Instruction::AGET_SHORT:
+            case Instruction::AGET_WIDE:
+                if (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK)) {
+                    int arrSreg = mir->ssaRep->uses[0];
+                    int idxSreg = mir->ssaRep->uses[1];
+                    BasicBlock* tbb = bb;
+                    squashDupRangeChecks(cUnit, &tbb, mir, arrSreg, idxSreg);
+                }
+                break;
+            case Instruction::APUT:
+            case Instruction::APUT_OBJECT:
+            case Instruction::APUT_SHORT:
+            case Instruction::APUT_CHAR:
+            case Instruction::APUT_BYTE:
+            case Instruction::APUT_BOOLEAN:
+            case Instruction::APUT_WIDE:
+                if (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK)) {
+                    int start = (opcode == Instruction::APUT_WIDE) ? 2 : 1;
+                    int arrSreg = mir->ssaRep->uses[start];
+                    int idxSreg = mir->ssaRep->uses[start + 1];
+                    BasicBlock* tbb = bb;
+                    squashDupRangeChecks(cUnit, &tbb, mir, arrSreg, idxSreg);
+                }
+                break;
+#if 0
+            // Probably not worthwhile.
             case Instruction::IGET_OBJECT: {
                 // TODO: look for CSE
                     if (mir->optimizationFlags & MIR_DUP) {
@@ -1720,7 +1842,7 @@
                     BasicBlock* tbb = bb;
                     MIR* tm = mir;
                     while (true) {
-                        tm = advanceMIR(cUnit, &tbb, tm, tempBlockV);
+                        tm = advanceMIR(cUnit, &tbb, tm, tempBlockV, true);
                         if ((tm == NULL) || (tm == mir)) {
                             break;
                         }
@@ -1730,9 +1852,11 @@
                             && (tm->dalvikInsn.vC == fieldIdx)) {
                             if (cUnit->printMe) {
                                 LOG(INFO) << "Got DUP IGET_OBJECT @ 0x"
-                                          << std::hex << tm->offset << ", from 0x"
-                                          << std::hex <<mir->offset;
+                                          << std::hex << tm->offset
+                                          << ", from 0x" << std::hex
+                                          << mir->offset;
                             }
+                            tm->optimizationFlags |= MIR_MARK;
                             matches++;
                         } else if ((opcode == Instruction::IPUT_OBJECT)
                             && (tm->ssaRep->uses[0] == objSreg)
@@ -1753,24 +1877,19 @@
                         newMir->ssaRep->defs[0] = dstSreg;
                         newMir->ssaRep->uses[0] = tmpSreg;
                         mir->ssaRep->defs[0] = tmpSreg;
+                        mir->optimizationFlags &= ~MIR_MARK;
                         oatInsertMIRAfter(bb, mir, newMir);
+                        BasicBlock* tbb = bb;
+                        replaceOperands(cUnit, &tbb, mir, dstSreg, tmpSreg);
                     }
                 }
                 break;
-            case Instruction::IF_EQ:
-            case Instruction::IF_NE:
-            case Instruction::IF_LT:
-            case Instruction::IF_GE:
-            case Instruction::IF_GT:
-            case Instruction::IF_LE:
-                // TODO: Check for and fuse preceeding comparison
-                break;
-            case Instruction::IF_EQZ:
-            case Instruction::IF_NEZ:
-            case Instruction::IF_LTZ:
-            case Instruction::IF_GEZ:
-            case Instruction::IF_GTZ:
-            case Instruction::IF_LEZ:
+#endif
+            case Instruction::CMPL_FLOAT:
+            case Instruction::CMPL_DOUBLE:
+            case Instruction::CMPG_FLOAT:
+            case Instruction::CMPG_DOUBLE:
+            case Instruction::CMP_LONG:
                 // TODO: Check for and fuse preceding comparison
                 break;
             default:
@@ -1901,8 +2020,15 @@
 
         // Already nullchecked?
         if (dfAttributes & DF_HAS_NULL_CHKS) {
-            int srcSreg = (dfAttributes & DF_NULL_CHK_1) ?
-                mir->ssaRep->uses[1] : mir->ssaRep->uses[0];
+            int srcIdx;
+            if (dfAttributes & DF_NULL_CHK_1) {
+                srcIdx = 1;
+            } else if (dfAttributes & DF_NULL_CHK_2) {
+                srcIdx = 2;
+            } else {
+                srcIdx = 0;
+            }
+            int srcSreg = mir->ssaRep->uses[srcIdx];
             if (oatIsBitSet(cUnit->tempSSARegisterV, srcSreg)) {
                 // Eliminate the null check
                 mir->optimizationFlags |= MIR_IGNORE_NULL_CHECK;
@@ -1938,13 +2064,202 @@
 
 void oatMethodBasicBlockOptimization(CompilationUnit *cUnit)
 {
-    oatInitGrowableList(cUnit, &cUnit->compilerTemps, 6, kListMisc);
-    DCHECK_EQ(cUnit->numCompilerTemps, 0);
     if (!(cUnit->disableOpt & (1 << kBBOpt))) {
-        oatDataFlowAnalysisDispatcher(cUnit, basicBlockOpt,
-                                      kAllNodes,
-                                      false /* isIterative */);
+        oatInitGrowableList(cUnit, &cUnit->compilerTemps, 6, kListMisc);
+        DCHECK_EQ(cUnit->numCompilerTemps, 0);
+        oatDataFlowAnalysisDispatcher(cUnit, basicBlockOpt,
+                                      kAllNodes, false /* isIterative */);
     }
 }
 
+void addLoopHeader(CompilationUnit* cUnit, BasicBlock* header,
+                   BasicBlock* backEdge)
+{
+    GrowableListIterator iter;
+    oatGrowableListIteratorInit(&cUnit->loopHeaders, &iter);
+    for (LoopInfo* loop = (LoopInfo*)oatGrowableListIteratorNext(&iter);
+         (loop != NULL); loop = (LoopInfo*)oatGrowableListIteratorNext(&iter)) {
+         if (loop->header == header) {
+             oatInsertGrowableList(cUnit, &loop->incomingBackEdges,
+                                   (intptr_t)backEdge);
+             return;
+         }
+    }
+    LoopInfo* info = (LoopInfo*)oatNew(cUnit, sizeof(LoopInfo), true,
+                                       kAllocDFInfo);
+    info->header = header;
+    oatInitGrowableList(cUnit, &info->incomingBackEdges, 2, kListMisc);
+    oatInsertGrowableList(cUnit, &info->incomingBackEdges, (intptr_t)backEdge);
+    oatInsertGrowableList(cUnit, &cUnit->loopHeaders, (intptr_t)info);
+}
+
+bool findBackEdges(struct CompilationUnit* cUnit, struct BasicBlock* bb)
+{
+    if ((bb->dataFlowInfo == NULL) || (bb->lastMIRInsn == NULL)) {
+        return false;
+    }
+    Instruction::Code opcode = bb->lastMIRInsn->dalvikInsn.opcode;
+    if (Instruction::Flags(opcode) & Instruction::kBranch) {
+        if (bb->taken && (bb->taken->startOffset <= bb->startOffset)) {
+            DCHECK(bb->dominators != NULL);
+            if (oatIsBitSet(bb->dominators, bb->taken->id)) {
+                if (cUnit->printMe) {
+                    LOG(INFO) << "Loop backedge from 0x"
+                              << std::hex << bb->lastMIRInsn->offset
+                              << " to 0x" << std::hex << bb->taken->startOffset;
+                }
+                addLoopHeader(cUnit, bb->taken, bb);
+            }
+        }
+    }
+    return false;
+}
+
+void addBlocksToLoop(CompilationUnit* cUnit, ArenaBitVector* blocks,
+                     BasicBlock* bb, int headId)
+{
+    if (!oatIsBitSet(bb->dominators, headId) ||
+        oatIsBitSet(blocks, bb->id)) {
+        return;
+    }
+    oatSetBit(cUnit, blocks, bb->id);
+    GrowableListIterator iter;
+    oatGrowableListIteratorInit(bb->predecessors, &iter);
+    BasicBlock* predBB;
+    for (predBB = (BasicBlock*)oatGrowableListIteratorNext(&iter); predBB;
+         predBB = (BasicBlock*)oatGrowableListIteratorNext(&iter)) {
+             addBlocksToLoop(cUnit, blocks, predBB, headId);
+    }
+}
+
+void oatDumpLoops(CompilationUnit *cUnit)
+{
+    GrowableListIterator iter;
+    oatGrowableListIteratorInit(&cUnit->loopHeaders, &iter);
+    for (LoopInfo* loop = (LoopInfo*)oatGrowableListIteratorNext(&iter);
+         (loop != NULL); loop = (LoopInfo*)oatGrowableListIteratorNext(&iter)) {
+         LOG(INFO) << "Loop head block id " << loop->header->id
+                   << ", offset 0x" << std::hex << loop->header->startOffset
+                   << ", Depth: " << loop->header->nestingDepth;
+         GrowableListIterator iter;
+         oatGrowableListIteratorInit(&loop->incomingBackEdges, &iter);
+         BasicBlock* edgeBB;
+         for (edgeBB = (BasicBlock*)oatGrowableListIteratorNext(&iter); edgeBB;
+              edgeBB = (BasicBlock*)oatGrowableListIteratorNext(&iter)) {
+              LOG(INFO) << "    Backedge block id " << edgeBB->id
+                        << ", offset 0x" << std::hex << edgeBB->startOffset;
+              ArenaBitVectorIterator bIter;
+              oatBitVectorIteratorInit(loop->blocks, &bIter);
+              for (int bbId = oatBitVectorIteratorNext(&bIter); bbId != -1;
+                   bbId = oatBitVectorIteratorNext(&bIter)) {
+                  BasicBlock *bb;
+                  bb = (BasicBlock*)
+                        oatGrowableListGetElement(&cUnit->blockList, bbId);
+                  LOG(INFO) << "        (" << bb->id << ", 0x" << std::hex
+                            << bb->startOffset << ")";
+              }
+         }
+    }
+}
+
+void oatMethodLoopDetection(CompilationUnit *cUnit)
+{
+    if (cUnit->disableOpt & (1 << kPromoteRegs)) {
+        return;
+    }
+    oatInitGrowableList(cUnit, &cUnit->loopHeaders, 6, kListMisc);
+    // Find the loop headers
+    oatDataFlowAnalysisDispatcher(cUnit, findBackEdges,
+                                  kAllNodes, false /* isIterative */);
+    GrowableListIterator iter;
+    oatGrowableListIteratorInit(&cUnit->loopHeaders, &iter);
+    // Add blocks to each header
+    for (LoopInfo* loop = (LoopInfo*)oatGrowableListIteratorNext(&iter);
+         loop; loop = (LoopInfo*)oatGrowableListIteratorNext(&iter)) {
+         loop->blocks = oatAllocBitVector(cUnit, cUnit->numBlocks, true,
+                                          kBitMapMisc);
+         oatSetBit(cUnit, loop->blocks, loop->header->id);
+         GrowableListIterator iter;
+         oatGrowableListIteratorInit(&loop->incomingBackEdges, &iter);
+         BasicBlock* edgeBB;
+         for (edgeBB = (BasicBlock*)oatGrowableListIteratorNext(&iter); edgeBB;
+              edgeBB = (BasicBlock*)oatGrowableListIteratorNext(&iter)) {
+             addBlocksToLoop(cUnit, loop->blocks, edgeBB, loop->header->id);
+         }
+    }
+    // Compute the nesting depth of each header
+    oatGrowableListIteratorInit(&cUnit->loopHeaders, &iter);
+    for (LoopInfo* loop = (LoopInfo*)oatGrowableListIteratorNext(&iter);
+         loop; loop = (LoopInfo*)oatGrowableListIteratorNext(&iter)) {
+        GrowableListIterator iter2;
+        oatGrowableListIteratorInit(&cUnit->loopHeaders, &iter2);
+        LoopInfo* loop2;
+        for (loop2 = (LoopInfo*)oatGrowableListIteratorNext(&iter2);
+             loop2; loop2 = (LoopInfo*)oatGrowableListIteratorNext(&iter2)) {
+            if (oatIsBitSet(loop2->blocks, loop->header->id)) {
+                loop->header->nestingDepth++;
+            }
+        }
+    }
+    // Assign nesting depth to each block in all loops
+    oatGrowableListIteratorInit(&cUnit->loopHeaders, &iter);
+    for (LoopInfo* loop = (LoopInfo*)oatGrowableListIteratorNext(&iter);
+         (loop != NULL); loop = (LoopInfo*)oatGrowableListIteratorNext(&iter)) {
+        ArenaBitVectorIterator bIter;
+        oatBitVectorIteratorInit(loop->blocks, &bIter);
+        for (int bbId = oatBitVectorIteratorNext(&bIter); bbId != -1;
+            bbId = oatBitVectorIteratorNext(&bIter)) {
+            BasicBlock *bb;
+            bb = (BasicBlock*) oatGrowableListGetElement(&cUnit->blockList,
+                                                         bbId);
+            bb->nestingDepth = std::max(bb->nestingDepth,
+                                        loop->header->nestingDepth);
+        }
+    }
+    if (cUnit->printMe) {
+        oatDumpLoops(cUnit);
+    }
+}
+
+/*
+ * Count uses, weighting by loop nesting depth.  This code only
+ * counts explicitly used sRegs.  A later phase will add implicit
+ * counts for things such as Method*, null-checked references, etc.
+ */
+bool countUses(struct CompilationUnit* cUnit, struct BasicBlock* bb)
+{
+    if (bb->blockType != kDalvikByteCode) {
+        return false;
+    }
+    for (MIR* mir = bb->firstMIRInsn; (mir != NULL); mir = mir->next) {
+        if (mir->ssaRep == NULL) {
+            continue;
+        }
+        for (int i = 0; i < mir->ssaRep->numUses; i++) {
+            int sReg = mir->ssaRep->uses[i];
+            uint32_t weight = std::min(16U, (uint32_t)bb->nestingDepth);
+            DCHECK_LT(sReg, (int)cUnit->useCounts.numUsed);
+            cUnit->useCounts.elemList[sReg] += (1 << weight);
+        }
+    }
+    return false;
+}
+
+void oatMethodUseCount(CompilationUnit *cUnit)
+{
+    if (cUnit->disableOpt & (1 << kPromoteRegs)) {
+        return;
+    }
+    oatInitGrowableList(cUnit, &cUnit->useCounts, cUnit->numSSARegs + 32,
+                        kListMisc);
+    // Initialize list
+    for (int i = 0; i < cUnit->numSSARegs; i++) {
+        oatInsertGrowableList(cUnit, &cUnit->useCounts, 0);
+    }
+    oatDataFlowAnalysisDispatcher(cUnit, countUses,
+                                  kAllNodes, false /* isIterative */);
+}
+
 }  // namespace art
diff --git a/src/compiler/Dataflow.h b/src/compiler/Dataflow.h
index 2df9373..77bf756 100644
--- a/src/compiler/Dataflow.h
+++ b/src/compiler/Dataflow.h
@@ -37,15 +37,16 @@
     kFormat35c,
     kFormat3rc,
     kPhi,
-    kNullCheckSrc0,        // Null check of src[0]
-    kNullCheckSrc1,        // Null check of src[1]
+    kNullCheckSrc0,        // Null check of uses[0]
+    kNullCheckSrc1,        // Null check of uses[1]
+    kNullCheckSrc2,        // Null check of uses[2]
     kNullCheckOut0,        // Null check out outgoing arg0
     kDstNonNull,           // May assume dst is non-null
     kRetNonNull,           // May assume retval is non-null
     kNullTransferSrc0,     // Object copy src[0] -> dst
     kNullTransferSrcN,     // Phi null check state transfer
-    kRangeCheckSrc1,       // Range check of src[1]
-    kRangeCheckSrc2,       // Range check of src[2]
+    kRangeCheckSrc1,       // Range check of uses[1]
+    kRangeCheckSrc2,       // Range check of uses[2]
     kFPA,
     kFPB,
     kFPC,
@@ -73,6 +74,7 @@
 #define DF_PHI                  (1 << kPhi)
 #define DF_NULL_CHK_0           (1 << kNullCheckSrc0)
 #define DF_NULL_CHK_1           (1 << kNullCheckSrc1)
+#define DF_NULL_CHK_2           (1 << kNullCheckSrc2)
 #define DF_NULL_CHK_OUT0        (1 << kNullCheckOut0)
 #define DF_NON_NULL_DST         (1 << kDstNonNull)
 #define DF_NON_NULL_RET         (1 << kRetNonNull)
@@ -96,6 +98,7 @@
 
 #define DF_HAS_NULL_CHKS        (DF_NULL_CHK_0 | \
                                  DF_NULL_CHK_1 | \
+                                 DF_NULL_CHK_2 | \
                                  DF_NULL_CHK_OUT0)
 
 #define DF_HAS_NR_CHKS          (DF_HAS_NULL_CHKS | \
@@ -147,6 +150,16 @@
     int minC;                   // For DIV - will affect lower bound checking
 };
 
+struct LoopInfo {
+    BasicBlock* header;
+    GrowableList incomingBackEdges;
+    ArenaBitVector* blocks;
+};
+
+void oatMethodLoopDetection(CompilationUnit*);
+
+void oatMethodUseCount(CompilationUnit*);
+
 void oatMethodNullCheckElimination(CompilationUnit*);
 
 void oatMethodBasicBlockOptimization(CompilationUnit*);
diff --git a/src/compiler/Frontend.cc b/src/compiler/Frontend.cc
index fc7fcb4..d97c20f 100644
--- a/src/compiler/Frontend.cc
+++ b/src/compiler/Frontend.cc
@@ -34,7 +34,7 @@
      //(1 << kTrackLiveTemps) |
      //(1 << kSkipLargeMethodOptimization) |
      //(1 << kSafeOptimizations) |
-     (1 << kBBOpt) |
+     //(1 << kBBOpt) |
      0;
 
 uint32_t compilerDebugFlags = 0 |     // Enable debug/testing modes
@@ -967,13 +967,17 @@
     /* Perform SSA transformation for the whole method */
     oatMethodSSATransformation(cUnit.get());
 
+    /* Detect loops */
+    oatMethodLoopDetection(cUnit.get());
+
+    /* Count uses */
+    oatMethodUseCount(cUnit.get());
+
     /* Perform null check elimination */
     oatMethodNullCheckElimination(cUnit.get());
 
-#if 0
     /* Do some basic block optimizations */
     oatMethodBasicBlockOptimization(cUnit.get());
-#endif
 
     oatInitializeRegAlloc(cUnit.get());  // Needs to happen after SSA naming
 
diff --git a/src/compiler/codegen/CodegenUtil.cc b/src/compiler/codegen/CodegenUtil.cc
index 27433ca..cc31b29 100644
--- a/src/compiler/codegen/CodegenUtil.cc
+++ b/src/compiler/codegen/CodegenUtil.cc
@@ -371,6 +371,7 @@
     LOG(INFO) << "Outs                 : " << cUnit->numOuts;
     LOG(INFO) << "CoreSpills           : " << cUnit->numCoreSpills;
     LOG(INFO) << "FPSpills             : " << cUnit->numFPSpills;
+    LOG(INFO) << "CompilerTemps        : " << cUnit->numCompilerTemps;
     LOG(INFO) << "Frame size           : " << cUnit->frameSize;
     LOG(INFO) << "code size is " << cUnit->totalSize <<
         " bytes, Dalvik size is " << insnsSize * 2;
diff --git a/src/compiler/codegen/GenCommon.cc b/src/compiler/codegen/GenCommon.cc
index bfc5639..34d9fb9 100644
--- a/src/compiler/codegen/GenCommon.cc
+++ b/src/compiler/codegen/GenCommon.cc
@@ -1341,22 +1341,23 @@
         opRegCopy(cUnit, regPtr, rlArray.lowReg);
     }
 
-    if (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK)) {
-        int regLen = oatAllocTemp(cUnit);
+    bool needsRangeCheck = (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK));
+    int regLen = INVALID_REG;
+    if (needsRangeCheck) {
+        regLen = oatAllocTemp(cUnit);
         //NOTE: max live temps(4) here.
         /* Get len */
         loadWordDisp(cUnit, rlArray.lowReg, lenOffset, regLen);
-        /* regPtr -> array data */
-        opRegImm(cUnit, kOpAdd, regPtr, dataOffset);
+    }
+    /* regPtr -> array data */
+    opRegImm(cUnit, kOpAdd, regPtr, dataOffset);
+    /* at this point, regPtr points to array, 2 live temps */
+    rlSrc = loadValue(cUnit, rlSrc, regClass);
+    if (needsRangeCheck) {
         genRegRegCheck(cUnit, kCondCs, rlIndex.lowReg, regLen, mir,
                        kThrowArrayBounds);
         oatFreeTemp(cUnit, regLen);
-    } else {
-        /* regPtr -> array data */
-        opRegImm(cUnit, kOpAdd, regPtr, dataOffset);
     }
-    /* at this point, regPtr points to array, 2 live temps */
-    rlSrc = loadValue(cUnit, rlSrc, regClass);
     storeBaseIndexed(cUnit, regPtr, rlIndex.lowReg, rlSrc.lowReg,
                      scale, kWord);
 }
@@ -1406,21 +1407,15 @@
     }
 #else
     int regPtr = oatAllocTemp(cUnit);
-    if (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK)) {
-        int regLen = oatAllocTemp(cUnit);
+    bool needsRangeCheck = (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK));
+    int regLen = INVALID_REG;
+    if (needsRangeCheck) {
+        regLen = oatAllocTemp(cUnit);
         /* Get len */
         loadWordDisp(cUnit, rlArray.lowReg, lenOffset, regLen);
-        /* regPtr -> array data */
-        opRegRegImm(cUnit, kOpAdd, regPtr, rlArray.lowReg, dataOffset);
-        // TODO: change kCondCS to a more meaningful name, is the sense of
-        // carry-set/clear flipped?
-        genRegRegCheck(cUnit, kCondCs, rlIndex.lowReg, regLen, mir,
-                       kThrowArrayBounds);
-        oatFreeTemp(cUnit, regLen);
-    } else {
-        /* regPtr -> array data */
-        opRegRegImm(cUnit, kOpAdd, regPtr, rlArray.lowReg, dataOffset);
     }
+    /* regPtr -> array data */
+    opRegRegImm(cUnit, kOpAdd, regPtr, rlArray.lowReg, dataOffset);
     oatFreeTemp(cUnit, rlArray.lowReg);
     if ((size == kLong) || (size == kDouble)) {
         if (scale) {
@@ -1434,6 +1429,13 @@
         oatFreeTemp(cUnit, rlIndex.lowReg);
         rlResult = oatEvalLoc(cUnit, rlDest, regClass, true);
 
+        if (needsRangeCheck) {
+            // TODO: change kCondCS to a more meaningful name, is the sense of
+            // carry-set/clear flipped?
+            genRegRegCheck(cUnit, kCondCs, rlIndex.lowReg, regLen, mir,
+                           kThrowArrayBounds);
+            oatFreeTemp(cUnit, regLen);
+        }
         loadPair(cUnit, regPtr, rlResult.lowReg, rlResult.highReg);
 
         oatFreeTemp(cUnit, regPtr);
@@ -1441,6 +1443,13 @@
     } else {
         rlResult = oatEvalLoc(cUnit, rlDest, regClass, true);
 
+        if (needsRangeCheck) {
+            // TODO: change kCondCS to a more meaningful name, is the sense of
+            // carry-set/clear flipped?
+            genRegRegCheck(cUnit, kCondCs, rlIndex.lowReg, regLen, mir,
+                           kThrowArrayBounds);
+            oatFreeTemp(cUnit, regLen);
+        }
         loadBaseIndexed(cUnit, regPtr, rlIndex.lowReg, rlResult.lowReg,
                         scale, size);
 
@@ -1483,20 +1492,16 @@
     /* null object? */
     genNullCheck(cUnit, rlArray.sRegLow, rlArray.lowReg, mir);
 
-    if (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK)) {
-        int regLen = oatAllocTemp(cUnit);
+    bool needsRangeCheck = (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK));
+    int regLen = INVALID_REG;
+    if (needsRangeCheck) {
+        regLen = oatAllocTemp(cUnit);
         //NOTE: max live temps(4) here.
         /* Get len */
         loadWordDisp(cUnit, rlArray.lowReg, lenOffset, regLen);
-        /* regPtr -> array data */
-        opRegImm(cUnit, kOpAdd, regPtr, dataOffset);
-        genRegRegCheck(cUnit, kCondCs, rlIndex.lowReg, regLen, mir,
-                       kThrowArrayBounds);
-        oatFreeTemp(cUnit, regLen);
-    } else {
-        /* regPtr -> array data */
-        opRegImm(cUnit, kOpAdd, regPtr, dataOffset);
     }
+    /* regPtr -> array data */
+    opRegImm(cUnit, kOpAdd, regPtr, dataOffset);
     /* at this point, regPtr points to array, 2 live temps */
     if ((size == kLong) || (size == kDouble)) {
         //TUNING: specific wide routine that can handle fp regs
@@ -1510,12 +1515,22 @@
         }
         rlSrc = loadValueWide(cUnit, rlSrc, regClass);
 
+        if (needsRangeCheck) {
+            genRegRegCheck(cUnit, kCondCs, rlIndex.lowReg, regLen, mir,
+                           kThrowArrayBounds);
+            oatFreeTemp(cUnit, regLen);
+        }
+
         storePair(cUnit, regPtr, rlSrc.lowReg, rlSrc.highReg);
 
         oatFreeTemp(cUnit, regPtr);
     } else {
         rlSrc = loadValue(cUnit, rlSrc, regClass);
-
+        if (needsRangeCheck) {
+            genRegRegCheck(cUnit, kCondCs, rlIndex.lowReg, regLen, mir,
+                           kThrowArrayBounds);
+            oatFreeTemp(cUnit, regLen);
+        }
         storeBaseIndexed(cUnit, regPtr, rlIndex.lowReg, rlSrc.lowReg,
                          scale, size);
     }
diff --git a/src/compiler/codegen/MethodCodegenDriver.cc b/src/compiler/codegen/MethodCodegenDriver.cc
index 64f55c6..45a0c75 100644
--- a/src/compiler/codegen/MethodCodegenDriver.cc
+++ b/src/compiler/codegen/MethodCodegenDriver.cc
@@ -757,9 +757,12 @@
             newLIR1(cUnit, kPseudoSSARep, (int) ssaString);
             break;
         }
-        case kMirOpCopy:
-            UNIMPLEMENTED(FATAL) << "Need kMirOpCopy";
+        case kMirOpCopy: {
+            RegLocation rlSrc = oatGetSrc(cUnit, mir, 0);
+            RegLocation rlDest = oatGetDest(cUnit, mir, 0);
+            storeValue(cUnit, rlDest, rlSrc);
             break;
+        }
         default:
             break;
     }
diff --git a/src/compiler/codegen/RallocUtil.cc b/src/compiler/codegen/RallocUtil.cc
index afbefff..c08b2e8 100644
--- a/src/compiler/codegen/RallocUtil.cc
+++ b/src/compiler/codegen/RallocUtil.cc
@@ -1043,52 +1043,31 @@
 void oatCountRefs(CompilationUnit *cUnit, BasicBlock* bb,
                   RefCounts* coreCounts, RefCounts* fpCounts)
 {
-    MIR* mir;
-    if (bb->blockType != kDalvikByteCode && bb->blockType != kEntryBlock &&
-        bb->blockType != kExitBlock)
+    if ((cUnit->disableOpt & (1 << kPromoteRegs)) ||
+        !((bb->blockType == kEntryBlock) || (bb->blockType == kExitBlock) ||
+          (bb->blockType == kDalvikByteCode))) {
         return;
-
-    for (mir = bb->firstMIRInsn; mir; mir = mir->next) {
-        SSARepresentation *ssaRep = mir->ssaRep;
-        if (ssaRep) {
-            for (int i = 0; i < ssaRep->numDefs;) {
-                RegLocation loc = cUnit->regLocation[ssaRep->defs[i]];
-                RefCounts* counts = loc.fp ? fpCounts : coreCounts;
-                int vReg = SRegToVReg(cUnit, ssaRep->defs[i]);
-                if (loc.defined) {
-                    counts[vReg].count++;
-                }
-                if (loc.wide) {
-                    if (loc.defined) {
-                        if (loc.fp) {
-                            counts[vReg].doubleStart = true;
-                        }
-                        counts[vReg+1].count++;
-                    }
-                    i += 2;
-                } else {
-                    i++;
+    }
+    for (int i = 0; i < cUnit->numSSARegs;) {
+        RegLocation loc = cUnit->regLocation[i];
+        RefCounts* counts = loc.fp ? fpCounts : coreCounts;
+        int vReg = SRegToVReg(cUnit, loc.sRegLow);
+        if (vReg < 0) {
+            vReg = cUnit->numDalvikRegisters - (vReg + 1);
+        }
+        if (loc.defined) {
+            counts[vReg].count += cUnit->useCounts.elemList[i];
+        }
+        if (loc.wide) {
+            if (loc.defined) {
+                counts[vReg+1].count += cUnit->useCounts.elemList[i+1];
+                if (loc.fp) {
+                    counts[vReg].doubleStart = true;
                 }
             }
-            for (int i = 0; i < ssaRep->numUses;) {
-                RegLocation loc = cUnit->regLocation[ssaRep->uses[i]];
-                RefCounts* counts = loc.fp ? fpCounts : coreCounts;
-                int vReg = SRegToVReg(cUnit, ssaRep->uses[i]);
-                if (loc.defined) {
-                    counts[vReg].count++;
-                }
-                if (loc.wide) {
-                    if (loc.defined) {
-                        if (loc.fp) {
-                            counts[vReg].doubleStart = true;
-                        }
-                        counts[vReg+1].count++;
-                    }
-                    i += 2;
-                } else {
-                    i++;
-                }
-            }
+            i += 2;
+        } else {
+            i++;
         }
     }
 }
@@ -1115,7 +1094,9 @@
  */
 extern void oatDoPromotion(CompilationUnit* cUnit)
 {
-    int numRegs = cUnit->numDalvikRegisters;
+    int regBias = cUnit->numCompilerTemps + 1;
+    int dalvikRegs = cUnit->numDalvikRegisters;
+    int numRegs = dalvikRegs + regBias;
 
     // Allow target code to add any special registers
     oatAdjustSpillMask(cUnit);
@@ -1135,9 +1116,14 @@
           oatNew(cUnit, sizeof(RefCounts) * numRegs, true, kAllocRegAlloc);
     RefCounts *fpRegs = (RefCounts *)
           oatNew(cUnit, sizeof(RefCounts) * numRegs, true, kAllocRegAlloc);
-    for (int i = 0; i < numRegs; i++) {
+    // Set ssa names for original Dalvik registers
+    for (int i = 0; i < dalvikRegs; i++) {
         coreRegs[i].sReg = fpRegs[i].sReg = i;
     }
+    // Set ssa names for Method* and compiler temps
+    for (int i = 0; i < regBias; i++) {
+        coreRegs[dalvikRegs + i].sReg = fpRegs[dalvikRegs + i].sReg = (-1 - i);
+    }
     GrowableListIterator iterator;
     oatGrowableListIteratorInit(&cUnit->blockList, &iterator);
     while (true) {
diff --git a/src/compiler/codegen/arm/Thumb2/Factory.cc b/src/compiler/codegen/arm/Thumb2/Factory.cc
index fdf0ca2..c79f7c6 100644
--- a/src/compiler/codegen/arm/Thumb2/Factory.cc
+++ b/src/compiler/codegen/arm/Thumb2/Factory.cc
@@ -1034,7 +1034,7 @@
         }
     }
     LIR* res = rawLIR(cUnit, cUnit->currentDalvikOffset, opcode, rDest, rSrc);
-    if (!(cUnit->disableOpt && (1 << kSafeOptimizations)) && rDest == rSrc) {
+    if (!(cUnit->disableOpt & (1 << kSafeOptimizations)) && rDest == rSrc) {
         res->flags.isNop = true;
     }
     return res;
diff --git a/src/compiler/codegen/mips/Mips32/Gen.cc b/src/compiler/codegen/mips/Mips32/Gen.cc
index e86a942..dc98508 100644
--- a/src/compiler/codegen/mips/Mips32/Gen.cc
+++ b/src/compiler/codegen/mips/Mips32/Gen.cc
@@ -450,7 +450,7 @@
 #endif
     LIR* res = rawLIR(cUnit, cUnit->currentDalvikOffset, kMipsMove,
                       rDest, rSrc);
-    if (!(cUnit->disableOpt && (1 << kSafeOptimizations)) && rDest == rSrc) {
+    if (!(cUnit->disableOpt & (1 << kSafeOptimizations)) && rDest == rSrc) {
         res->flags.isNop = true;
     }
     return res;