Dataflow analysis rework

Rework key portions of the dataflow analysis code to avoid the
massive slowdowns we're seeing with methods containing more than
20,000 basic blocks.

Add a somewhat hacky lookup cache in front of findBlock.  Later
we'll want to rework the "GrowableList" mechanism to incorporate
this properly.

Add memory to the bit vector utilities to speed up the iterators
for large sparse vectors. (Similarly, in the future we'll want
to include a rewrite of the home-grown bit-vector manipluation
utilites to be better with large ones).

With this CL, compilation speed on the degenerate cases is > 10x
faster.  Compilation of typically-sized methods will see a smallish
improvement.

Change-Id: I7f086c1229dd4fe62f0a5fc361234bf204ebc2b1
diff --git a/src/compiler/Compiler.h b/src/compiler/Compiler.h
index ac5e0c9..24bc8af 100644
--- a/src/compiler/Compiler.h
+++ b/src/compiler/Compiler.h
@@ -54,6 +54,7 @@
     kDebugSlowestFieldPath,
     kDebugSlowestStringPath,
     kDebugExerciseResolveMethod,
+    kDebugVerifyDataflow,
 };
 
 extern uint32_t compilerDebugFlags;
@@ -91,6 +92,7 @@
     kPreOrderDFSTraversal,      // Depth-First-Search / Pre-Order
     kPostOrderDFSTraversal,     // Depth-First-Search / Post-Order
     kPostOrderDOMTraversal,     // Dominator tree / Post-Order
+    kReversePostOrderTraversal, // Depth-First-Search / reverse Post-Order
 } DataFlowAnalysisMode;
 
 struct CompilationUnit;
diff --git a/src/compiler/CompilerIR.h b/src/compiler/CompilerIR.h
index f981ccd..317924b 100644
--- a/src/compiler/CompilerIR.h
+++ b/src/compiler/CompilerIR.h
@@ -147,6 +147,7 @@
 
 typedef struct BasicBlock {
     int id;
+    int dfsId;
     bool visited;
     bool hidden;
     bool catchEntry;
@@ -191,6 +192,8 @@
     kRetryHalve
 } AssemblerStatus;
 
+#define NOTVISITED (-1)
+
 typedef struct CompilationUnit {
     int numInsts;
     int numBlocks;
@@ -265,9 +268,11 @@
     BasicBlock* curBlock;
     BasicBlock* nextCodegenBlock;       // for extended trace codegen
     GrowableList dfsOrder;
+    GrowableList dfsPostOrder;
     GrowableList domPostOrderTraversal;
     GrowableList throwLaunchpads;
     GrowableList suspendLaunchpads;
+    int* iDomList;
     ArenaBitVector* tryBlockAddr;
     ArenaBitVector** defBlockMatrix;    // numDalvikRegister x numBlocks
     ArenaBitVector* tempBlockV;
@@ -310,6 +315,7 @@
      GrowableList fillArrayData;
      const u2* insns;
      u4 insnsSize;
+     std::map<unsigned int, BasicBlock*> blockMap; // findBlock lookup cache
 } CompilationUnit;
 
 BasicBlock* oatNewBB(BBType blockType, int blockId);
diff --git a/src/compiler/CompilerUtility.h b/src/compiler/CompilerUtility.h
index d7612ff..a68ee5f 100644
--- a/src/compiler/CompilerUtility.h
+++ b/src/compiler/CompilerUtility.h
@@ -58,8 +58,20 @@
  */
 struct ArenaBitVector {
     bool    expandable;     /* expand bitmap if we run out? */
+    bool    firstDirty;     /* when true, don't believe firstBitSet */
+    bool    lastDirty;      /* when true, don't believe lastBitSet */
     u4      storageSize;    /* current size, in 32-bit words */
     u4*     storage;
+                            /*
+                             * Opportunistically remember first and
+                             * last set bits.  This yeilds a performance
+                             * advantage in cases where large
+                             * sparse vectors are repeatedly scanned
+                             * (something that can happen a lot during
+                             * dataflow analysis.
+                             */
+    int     firstBitSet;
+    int     lastBitSet;
 };
 
 /* Handy iterator to walk through the bit positions set to 1 */
diff --git a/src/compiler/Dataflow.cc b/src/compiler/Dataflow.cc
index 211cffb..3f88ea5 100644
--- a/src/compiler/Dataflow.cc
+++ b/src/compiler/Dataflow.cc
@@ -1509,82 +1509,104 @@
     while (change) {
         change = false;
 
+        switch (dfaMode) {
         /* Scan all blocks and perform the operations specified in func */
-        if (dfaMode == kAllNodes) {
-            GrowableListIterator iterator;
-            oatGrowableListIteratorInit(&cUnit->blockList, &iterator);
-            while (true) {
-                BasicBlock* bb =
-                    (BasicBlock *) oatGrowableListIteratorNext(&iterator);
-                if (bb == NULL) break;
-                if (bb->hidden == true) continue;
-                change |= (*func)(cUnit, bb);
+        case kAllNodes:
+            {
+                GrowableListIterator iterator;
+                oatGrowableListIteratorInit(&cUnit->blockList, &iterator);
+                while (true) {
+                    BasicBlock* bb =
+                        (BasicBlock *) oatGrowableListIteratorNext(&iterator);
+                    if (bb == NULL) break;
+                    if (bb->hidden == true) continue;
+                    change |= (*func)(cUnit, bb);
+                }
             }
-        }
-        /*
-         * Scan all reachable blocks and perform the operations specified in
-         * func.
-         */
-        else if (dfaMode == kReachableNodes) {
-            int numReachableBlocks = cUnit->numReachableBlocks;
-            int idx;
-            const GrowableList *blockList = &cUnit->blockList;
+            break;
+        /* Scan reachable blocks and perform the ops specified in func. */
+        case kReachableNodes:
+            {
+                int numReachableBlocks = cUnit->numReachableBlocks;
+                int idx;
+                const GrowableList *blockList = &cUnit->blockList;
 
-            for (idx = 0; idx < numReachableBlocks; idx++) {
-                int blockIdx = cUnit->dfsOrder.elemList[idx];
-                BasicBlock* bb =
-                    (BasicBlock *) oatGrowableListGetElement(blockList,
-                                                             blockIdx);
-                change |= (*func)(cUnit, bb);
+                for (idx = 0; idx < numReachableBlocks; idx++) {
+                    int blockIdx = cUnit->dfsOrder.elemList[idx];
+                    BasicBlock* bb =
+                        (BasicBlock *) oatGrowableListGetElement(blockList,
+                                                                 blockIdx);
+                    change |= (*func)(cUnit, bb);
+                }
             }
-        }
-        /*
-         * Scan all reachable blocks by the pre-order in the depth-first-search
-         * CFG and perform the operations specified in func.
-         */
-        else if (dfaMode == kPreOrderDFSTraversal) {
-            int numReachableBlocks = cUnit->numReachableBlocks;
-            int idx;
-            const GrowableList *blockList = &cUnit->blockList;
+            break;
 
-            for (idx = 0; idx < numReachableBlocks; idx++) {
-                int dfsIdx = cUnit->dfsOrder.elemList[idx];
-                BasicBlock* bb =
-                    (BasicBlock *) oatGrowableListGetElement(blockList, dfsIdx);
-                change |= (*func)(cUnit, bb);
-            }
-        }
-        /*
-         * Scan all reachable blocks by the post-order in the depth-first-search
-         * CFG and perform the operations specified in func.
-         */
-        else if (dfaMode == kPostOrderDFSTraversal) {
-            int numReachableBlocks = cUnit->numReachableBlocks;
-            int idx;
-            const GrowableList *blockList = &cUnit->blockList;
+        /* Scan reachable blocks by pre-order dfs and invoke func on each. */
+        case kPreOrderDFSTraversal:
+            {
+                int numReachableBlocks = cUnit->numReachableBlocks;
+                int idx;
+                const GrowableList *blockList = &cUnit->blockList;
 
-            for (idx = numReachableBlocks - 1; idx >= 0; idx--) {
-                int dfsIdx = cUnit->dfsOrder.elemList[idx];
-                BasicBlock* bb =
-                    (BasicBlock *) oatGrowableListGetElement(blockList, dfsIdx);
-                change |= (*func)(cUnit, bb);
+                for (idx = 0; idx < numReachableBlocks; idx++) {
+                    int dfsIdx = cUnit->dfsOrder.elemList[idx];
+                    BasicBlock* bb =
+                        (BasicBlock *) oatGrowableListGetElement(blockList,
+                                                                 dfsIdx);
+                    change |= (*func)(cUnit, bb);
+                }
             }
-        }
-        /*
-         * Scan all reachable blocks by the post-order in the dominator tree
-         * and perform the operations specified in func.
-         */
-        else if (dfaMode == kPostOrderDOMTraversal) {
-            int numReachableBlocks = cUnit->numReachableBlocks;
-            int idx;
-            const GrowableList *blockList = &cUnit->blockList;
+            break;
+        /* Scan reachable blocks post-order dfs and invoke func on each. */
+        case kPostOrderDFSTraversal:
+            {
+                int numReachableBlocks = cUnit->numReachableBlocks;
+                int idx;
+                const GrowableList *blockList = &cUnit->blockList;
 
-            for (idx = 0; idx < numReachableBlocks; idx++) {
-                int domIdx = cUnit->domPostOrderTraversal.elemList[idx];
-                BasicBlock* bb =
-                    (BasicBlock *) oatGrowableListGetElement(blockList, domIdx);
-                change |= (*func)(cUnit, bb);
+                for (idx = numReachableBlocks - 1; idx >= 0; idx--) {
+                    int dfsIdx = cUnit->dfsOrder.elemList[idx];
+                    BasicBlock* bb =
+                        (BasicBlock *) oatGrowableListGetElement(blockList,
+                                                                 dfsIdx);
+                    change |= (*func)(cUnit, bb);
+                }
             }
+            break;
+        /* Scan reachable post-order dom tree and invoke func on each. */
+        case kPostOrderDOMTraversal:
+            {
+                int numReachableBlocks = cUnit->numReachableBlocks;
+                int idx;
+                const GrowableList *blockList = &cUnit->blockList;
+
+                for (idx = 0; idx < numReachableBlocks; idx++) {
+                    int domIdx = cUnit->domPostOrderTraversal.elemList[idx];
+                    BasicBlock* bb =
+                        (BasicBlock *) oatGrowableListGetElement(blockList,
+                                                                 domIdx);
+                    change |= (*func)(cUnit, bb);
+                }
+            }
+            break;
+        /* Scan reachable blocks reverse post-order dfs, invoke func on each */
+        case kReversePostOrderTraversal:
+            {
+                int numReachableBlocks = cUnit->numReachableBlocks;
+                int idx;
+                const GrowableList *blockList = &cUnit->blockList;
+
+                for (idx = numReachableBlocks - 1; idx >= 0; idx--) {
+                    int revIdx = cUnit->dfsPostOrder.elemList[idx];
+                    BasicBlock* bb =
+                        (BasicBlock *) oatGrowableListGetElement(blockList,
+                                                                 revIdx);
+                    change |= (*func)(cUnit, bb);
+                }
+            }
+            break;
+        default:
+            LOG(FATAL) << "Unknown traversal mode " << (int)dfaMode;
         }
         /* If isIterative is false, exit the loop after the first iteration */
         change &= isIterative;
diff --git a/src/compiler/Frontend.cc b/src/compiler/Frontend.cc
index 8070e6f..1b1215a 100644
--- a/src/compiler/Frontend.cc
+++ b/src/compiler/Frontend.cc
@@ -44,6 +44,7 @@
      //(1 << kDebugSlowestFieldPath) |
      //(1 << kDebugSlowestStringPath) |
      //(1 << kDebugExerciseResolveMethod) |
+     //(1 << kDebugVerifyDataflow) |
      0;
 
 std::string compilerMethodMatch;      // Method name match to apply above flags
@@ -136,6 +137,10 @@
     bottomBlock->firstMIRInsn = insn;
     bottomBlock->lastMIRInsn = origBlock->lastMIRInsn;
 
+    /* Add it to the quick lookup cache */
+    cUnit->blockMap.insert(std::make_pair(bottomBlock->startOffset,
+                                          bottomBlock));
+
     /* Handle the taken path */
     bottomBlock->taken = origBlock->taken;
     if (bottomBlock->taken) {
@@ -196,6 +201,7 @@
  * is not non-null and is the block being split, update *immedPredBlockP to
  * point to the bottom block so that outgoing edges can be set up properly
  * (by the caller)
+ * Utilizes a map for fast lookup of the typical cases.
  */
 STATIC BasicBlock *findBlock(CompilationUnit* cUnit,
                              unsigned int codeOffset,
@@ -205,28 +211,36 @@
     GrowableList* blockList = &cUnit->blockList;
     BasicBlock* bb;
     unsigned int i;
+    std::map<unsigned int, BasicBlock*>::iterator it;
 
-    for (i = 0; i < blockList->numUsed; i++) {
-        bb = (BasicBlock *) blockList->elemList[i];
-        if (bb->blockType != kDalvikByteCode) continue;
-        if (bb->startOffset == codeOffset) return bb;
-        /* Check if a branch jumps into the middle of an existing block */
-        if ((split == true) && (codeOffset > bb->startOffset) &&
-            (bb->lastMIRInsn != NULL) &&
-            (codeOffset <= bb->lastMIRInsn->offset)) {
-            BasicBlock *newBB = splitBlock(cUnit, codeOffset, bb,
-                                           bb == *immedPredBlockP ?
-                                           immedPredBlockP : NULL);
-            return newBB;
+    it = cUnit->blockMap.find(codeOffset);
+    if (it != cUnit->blockMap.end()) {
+        return it->second;
+    } else if (!create) {
+        return NULL;
+    }
+
+    if (split) {
+        for (i = 0; i < blockList->numUsed; i++) {
+            bb = (BasicBlock *) blockList->elemList[i];
+            if (bb->blockType != kDalvikByteCode) continue;
+            /* Check if a branch jumps into the middle of an existing block */
+            if ((codeOffset > bb->startOffset) && (bb->lastMIRInsn != NULL) &&
+                (codeOffset <= bb->lastMIRInsn->offset)) {
+                BasicBlock *newBB = splitBlock(cUnit, codeOffset, bb,
+                                               bb == *immedPredBlockP ?
+                                               immedPredBlockP : NULL);
+                return newBB;
+            }
         }
     }
-    if (create) {
-          bb = oatNewBB(kDalvikByteCode, cUnit->numBlocks++);
-          oatInsertGrowableList(&cUnit->blockList, (intptr_t) bb);
-          bb->startOffset = codeOffset;
-          return bb;
-    }
-    return NULL;
+
+    /* Create a new one */
+    bb = oatNewBB(kDalvikByteCode, cUnit->numBlocks++);
+    oatInsertGrowableList(&cUnit->blockList, (intptr_t) bb);
+    bb->startOffset = codeOffset;
+    cUnit->blockMap.insert(std::make_pair(bb->startOffset, bb));
+    return bb;
 }
 
 /* Dump the CFG into a DOT graph */
@@ -748,6 +762,8 @@
     cUnit->numOuts = code_item->outs_size_;
     /* Adjust this value accordingly once inlining is performed */
     cUnit->numDalvikRegisters = code_item->registers_size_;
+    cUnit->blockMap = std::map<unsigned int, BasicBlock*>();
+    cUnit->blockMap.clear();
     bool useMatch = compilerMethodMatch.length() != 0;
     bool match = useMatch && (compilerFlipMatch ^
         (PrettyMethod(method_idx, dex_file).find(compilerMethodMatch) != std::string::npos));
@@ -794,6 +810,8 @@
     BasicBlock *curBlock = oatNewBB(kDalvikByteCode, numBlocks++);
     curBlock->startOffset = 0;
     oatInsertGrowableList(&cUnit->blockList, (intptr_t) curBlock);
+    /* Add first block to the fast lookup cache */
+    cUnit->blockMap.insert(std::make_pair(curBlock->startOffset, curBlock));
     entryBlock->fallThrough = curBlock;
     oatSetBit(curBlock->predecessors, entryBlock->id);
 
@@ -885,8 +903,10 @@
         oatDumpCompilationUnit(cUnit.get());
     }
 
-    /* Verify if all blocks are connected as claimed */
-    oatDataFlowAnalysisDispatcher(cUnit.get(), verifyPredInfo, kAllNodes, false /* isIterative */);
+    if (cUnit->enableDebug & (1 << kDebugVerifyDataflow)) {
+        /* Verify if all blocks are connected as claimed */
+        oatDataFlowAnalysisDispatcher(cUnit.get(), verifyPredInfo, kAllNodes, false /* isIterative */);
+    }
 
     /* Perform SSA transformation for the whole method */
     oatMethodSSATransformation(cUnit.get());
diff --git a/src/compiler/SSATransformation.cc b/src/compiler/SSATransformation.cc
index 1f6a3b5..a25d225 100644
--- a/src/compiler/SSATransformation.cc
+++ b/src/compiler/SSATransformation.cc
@@ -20,17 +20,17 @@
 namespace art {
 
 /* Enter the node to the dfsOrder list then visit its successors */
-STATIC void recordDFSPreOrder(CompilationUnit* cUnit, BasicBlock* block)
+STATIC void recordDFSOrders(CompilationUnit* cUnit, BasicBlock* block)
 {
 
     if (block->visited || block->hidden) return;
     block->visited = true;
 
-    /* Enqueue the block id */
+    /* Enqueue the preOrder block id */
     oatInsertGrowableList(&cUnit->dfsOrder, block->id);
 
-    if (block->fallThrough) recordDFSPreOrder(cUnit, block->fallThrough);
-    if (block->taken) recordDFSPreOrder(cUnit, block->taken);
+    if (block->fallThrough) recordDFSOrders(cUnit, block->fallThrough);
+    if (block->taken) recordDFSOrders(cUnit, block->taken);
     if (block->successorBlockList.blockListType != kNotUsed) {
         GrowableListIterator iterator;
         oatGrowableListIteratorInit(&block->successorBlockList.blocks,
@@ -40,16 +40,20 @@
                 (SuccessorBlockInfo *) oatGrowableListIteratorNext(&iterator);
             if (successorBlockInfo == NULL) break;
             BasicBlock* succBB = successorBlockInfo->block;
-            recordDFSPreOrder(cUnit, succBB);
+            recordDFSOrders(cUnit, succBB);
         }
     }
+
+    /* Record postorder in basic block and enqueue normal id in dfsPostOrder */
+    block->dfsId = cUnit->dfsPostOrder.numUsed;
+    oatInsertGrowableList(&cUnit->dfsPostOrder, block->id);
     return;
 }
 
-/* Sort the blocks by the Depth-First-Search pre-order */
-STATIC void computeDFSOrder(CompilationUnit* cUnit)
+/* Sort the blocks by the Depth-First-Search */
+STATIC void computeDFSOrders(CompilationUnit* cUnit)
 {
-    /* Initialize or reset the DFS order list */
+    /* Initialize or reset the DFS preOrder list */
     if (cUnit->dfsOrder.elemList == NULL) {
         oatInitGrowableList(&cUnit->dfsOrder, cUnit->numBlocks);
     } else {
@@ -57,11 +61,19 @@
         cUnit->dfsOrder.numUsed = 0;
     }
 
+    /* Initialize or reset the DFS postOrder list */
+    if (cUnit->dfsPostOrder.elemList == NULL) {
+        oatInitGrowableList(&cUnit->dfsPostOrder, cUnit->numBlocks);
+    } else {
+        /* Just reset the used length on the counter */
+        cUnit->dfsPostOrder.numUsed = 0;
+    }
+
     oatDataFlowAnalysisDispatcher(cUnit, oatClearVisitedFlag,
                                           kAllNodes,
                                           false /* isIterative */);
 
-    recordDFSPreOrder(cUnit, cUnit->entryBlock);
+    recordDFSOrders(cUnit, cUnit->entryBlock);
     cUnit->numReachableBlocks = cUnit->dfsOrder.numUsed;
 }
 
@@ -125,6 +137,7 @@
 
     /* Iterate through the dominated blocks first */
     while (true) {
+        //TUNING: hot call to oatBitVectorIteratorNext
         int bbIdx = oatBitVectorIteratorNext(&bvIterator);
         if (bbIdx == -1) break;
         BasicBlock* dominatedBB =
@@ -184,6 +197,7 @@
     ArenaBitVectorIterator bvIterator;
     oatBitVectorIteratorInit(bb->iDominated, &bvIterator);
     while (true) {
+        //TUNING: hot call to oatBitVectorIteratorNext
         int dominatedIdx = oatBitVectorIteratorNext(&bvIterator);
         if (dominatedIdx == -1) break;
         BasicBlock* dominatedBB = (BasicBlock* )
@@ -191,6 +205,7 @@
         ArenaBitVectorIterator dfIterator;
         oatBitVectorIteratorInit(dominatedBB->domFrontier, &dfIterator);
         while (true) {
+            //TUNING: hot call to oatBitVectorIteratorNext
             int dfUpIdx = oatBitVectorIteratorNext(&dfIterator);
             if (dfUpIdx == -1) break;
             BasicBlock* dfUpBlock = (BasicBlock* )
@@ -225,8 +240,12 @@
     return true;
 }
 
-/* Worker function to compute each block's dominators */
-STATIC bool computeBlockDominators(CompilationUnit* cUnit, BasicBlock* bb)
+/*
+ * Worker function to compute each block's dominators.  This implementation
+ * is only used when kDebugVerifyDataflow is active and should compute
+ * the same dominator sets as computeBlockDominators.
+ */
+STATIC bool slowComputeBlockDominators(CompilationUnit* cUnit, BasicBlock* bb)
 {
     GrowableList* blockList = &cUnit->blockList;
     int numTotalBlocks = blockList->numUsed;
@@ -261,8 +280,12 @@
     return false;
 }
 
-/* Worker function to compute the idom */
-STATIC bool computeImmediateDominator(CompilationUnit* cUnit, BasicBlock* bb)
+/*
+ * Worker function to compute the idom.  This implementation is only
+ * used when kDebugVerifyDataflow is active and should compute the
+ * same iDom as computeBlockIDom.
+ */
+STATIC bool slowComputeBlockIDom(CompilationUnit* cUnit, BasicBlock* bb)
 {
     GrowableList* blockList = &cUnit->blockList;
     ArenaBitVector* tempBlockV = cUnit->tempBlockV;
@@ -304,6 +327,107 @@
     return true;
 }
 
+/*
+ * Walk through the ordered iDom list until we reach common parent.
+ * Given the ordering of iDomList, this common parent represents the
+ * last element of the intersection of block1 and block2 dominators.
+  */
+int findCommonParent(CompilationUnit *cUnit, int block1, int block2)
+{
+    while (block1 != block2) {
+        while (block1 < block2) {
+            block1 = cUnit->iDomList[block1];
+            DCHECK_NE(block1, NOTVISITED);
+        }
+        while (block2 < block1) {
+            block2 = cUnit->iDomList[block2];
+            DCHECK_NE(block2, NOTVISITED);
+        }
+    }
+    return block1;
+}
+
+/* Worker function to compute each block's immediate dominator */
+STATIC bool computeBlockIDom(CompilationUnit* cUnit, BasicBlock* bb)
+{
+    GrowableList* blockList = &cUnit->blockList;
+    ArenaBitVectorIterator bvIterator;
+    int idom = -1;
+
+    /* Special-case entry block */
+    if (bb == cUnit->entryBlock) {
+        return false;
+    }
+
+    /* Iterate through the predecessors */
+    oatBitVectorIteratorInit(bb->predecessors, &bvIterator);
+
+    /* Find the first processed predecessor */
+    while (true) {
+        //TUNING: hot call to oatBitVectorIteratorNext
+        int predIdx = oatBitVectorIteratorNext(&bvIterator);
+        DCHECK_NE(predIdx, -1);  /* Should find one */
+        BasicBlock* predBB = (BasicBlock* ) oatGrowableListGetElement(
+                                 blockList, predIdx);
+        if (cUnit->iDomList[predBB->dfsId] != NOTVISITED) {
+            idom = predBB->dfsId;
+            break;
+        }
+    }
+
+    /* Scan the rest of the predecessors */
+    while (true) {
+        int predIdx = oatBitVectorIteratorNext(&bvIterator);
+        if (predIdx == -1) break;
+        BasicBlock* predBB = (BasicBlock* ) oatGrowableListGetElement(
+                                 blockList, predIdx);
+        if (cUnit->iDomList[predBB->dfsId] == NOTVISITED) {
+            continue;
+        } else {
+            idom = findCommonParent(cUnit, predBB->dfsId, idom);
+        }
+    }
+
+    DCHECK_NE(idom, NOTVISITED);
+
+    /* Did something change? */
+    if (cUnit->iDomList[bb->dfsId] != idom) {
+        cUnit->iDomList[bb->dfsId] = idom;
+        return true;
+    }
+    return false;
+}
+
+/* Worker function to compute each block's domintors */
+STATIC bool computeBlockDominators(CompilationUnit* cUnit, BasicBlock* bb)
+{
+    if (bb == cUnit->entryBlock) {
+        oatClearAllBits(bb->dominators);
+    } else {
+        oatCopyBitVector(bb->dominators, bb->iDom->dominators);
+    }
+    oatSetBit(bb->dominators, bb->id);
+    return false;
+}
+
+STATIC bool setDominators(CompilationUnit* cUnit, BasicBlock* bb)
+{
+    if (bb != cUnit->entryBlock) {
+        int iDomDFSIdx = cUnit->iDomList[bb->dfsId];
+        DCHECK_NE(iDomDFSIdx, NOTVISITED);
+        int iDomIdx = cUnit->dfsPostOrder.elemList[iDomDFSIdx];
+        BasicBlock* iDom = (BasicBlock*)
+              oatGrowableListGetElement(&cUnit->blockList, iDomIdx);
+        if (cUnit->enableDebug & (1 << kDebugVerifyDataflow)) {
+            DCHECK_EQ(bb->iDom->id, iDom->id);
+        }
+        bb->iDom = iDom;
+        /* Add bb to the iDominated set of the immediate dominator block */
+        oatSetBit(iDom->iDominated, bb->id);
+    }
+    return false;
+}
+
 /* Compute dominators, immediate dominator, and dominance fronter */
 STATIC void computeDominators(CompilationUnit* cUnit)
 {
@@ -315,6 +439,23 @@
                                           kReachableNodes,
                                           false /* isIterative */);
 
+    /* Initalize & Clear iDomList */
+    if (cUnit->iDomList == NULL) {
+        cUnit->iDomList = (int*)oatNew(sizeof(int) * numReachableBlocks, false);
+    }
+    for (int i = 0; i < numReachableBlocks; i++) {
+        cUnit->iDomList[i] = NOTVISITED;
+    }
+
+    /* For post-order, last block is entry block.  Set its iDom to istelf */
+    DCHECK_EQ(cUnit->entryBlock->dfsId, numReachableBlocks-1);
+    cUnit->iDomList[cUnit->entryBlock->dfsId] = cUnit->entryBlock->dfsId;
+
+    /* Compute the immediate dominators */
+    oatDataFlowAnalysisDispatcher(cUnit, computeBlockIDom,
+                                  kReversePostOrderTraversal,
+                                  true /* isIterative */);
+
     /* Set the dominator for the root node */
     oatClearAllBits(cUnit->entryBlock->dominators);
     oatSetBit(cUnit->entryBlock->dominators, cUnit->entryBlock->id);
@@ -325,14 +466,27 @@
     } else {
         oatClearAllBits(cUnit->tempBlockV);
     }
-    oatDataFlowAnalysisDispatcher(cUnit, computeBlockDominators,
-                                          kPreOrderDFSTraversal,
-                                          true /* isIterative */);
-
     cUnit->entryBlock->iDom = NULL;
-    oatDataFlowAnalysisDispatcher(cUnit, computeImmediateDominator,
-                                          kReachableNodes,
-                                          false /* isIterative */);
+
+    /* For testing, compute sets using alternate mechanism */
+    if (cUnit->enableDebug & (1 << kDebugVerifyDataflow)) {
+        // Use alternate mechanism to compute dominators for comparison
+        oatDataFlowAnalysisDispatcher(cUnit, slowComputeBlockDominators,
+                                      kPreOrderDFSTraversal,
+                                      true /* isIterative */);
+
+       oatDataFlowAnalysisDispatcher(cUnit, slowComputeBlockIDom,
+                                     kReachableNodes,
+                                     false /* isIterative */);
+    }
+
+    oatDataFlowAnalysisDispatcher(cUnit, setDominators,
+                                  kReachableNodes,
+                                  false /* isIterative */);
+
+    oatDataFlowAnalysisDispatcher(cUnit, computeBlockDominators,
+                                  kReversePostOrderTraversal,
+                                  false /* isIterative */);
 
     /*
      * Now go ahead and compute the post order traversal based on the
@@ -456,6 +610,7 @@
                     (BasicBlock* ) oatGrowableListGetElement(blockList, idx);
 
                 /* Merge the dominance frontier to tmpBlocks */
+                //TUNING: hot call to oatUnifyBitVectors
                 if (defBB->domFrontier != NULL) {
                     oatUnifyBitVectors(tmpBlocks, tmpBlocks, defBB->domFrontier);
                 }
@@ -602,7 +757,7 @@
 void oatMethodSSATransformation(CompilationUnit* cUnit)
 {
     /* Compute the DFS order */
-    computeDFSOrder(cUnit);
+    computeDFSOrders(cUnit);
 
     /* Compute the dominator info */
     computeDominators(cUnit);
diff --git a/src/compiler/Utility.cc b/src/compiler/Utility.cc
index 8d70136..c76143b 100644
--- a/src/compiler/Utility.cc
+++ b/src/compiler/Utility.cc
@@ -231,6 +231,8 @@
     bv->storageSize = count;
     bv->expandable = expandable;
     bv->storage = (u4*) oatNew(count * sizeof(u4), true);
+    bv->firstDirty = true;
+    bv->lastDirty = true;
     return bv;
 }
 
@@ -252,6 +254,8 @@
 {
     unsigned int count = pBits->storageSize;
     memset(pBits->storage, 0, count * sizeof(u4));
+    pBits->firstDirty = true;
+    pBits->lastDirty = true;
 }
 
 /*
@@ -281,6 +285,12 @@
     }
 
     pBits->storage[num >> 5] |= checkMasks[num & 0x1f];
+    if (!pBits->firstDirty && ((int)num < pBits->firstBitSet)) {
+        pBits->firstBitSet = num;
+    }
+    if (!pBits->lastDirty && ((int)num > pBits->lastBitSet)) {
+        pBits->lastBitSet = num;
+    }
     return true;
 }
 
@@ -299,6 +309,8 @@
     }
 
     pBits->storage[num >> 5] &= ~checkMasks[num & 0x1f];
+    pBits->firstDirty = true;
+    pBits->lastDirty = true;
     return true;
 }
 
@@ -309,6 +321,8 @@
 {
     int value = set ? -1 : 0;
     memset(pBits->storage, value, pBits->storageSize * (int)sizeof(u4));
+    pBits->firstDirty = true;
+    pBits->lastDirty = true;
 }
 
 void oatDebugBitVector(char* msg, const ArenaBitVector* bv, int length)
@@ -374,6 +388,10 @@
     checkSizes(dest, src);
 
     memcpy(dest->storage, src->storage, sizeof(u4) * dest->storageSize);
+    dest->firstDirty = src->firstDirty;
+    dest->firstBitSet = src->firstBitSet;
+    dest->lastDirty = src->lastDirty;
+    dest->lastBitSet = src->lastBitSet;
 }
 
 /*
@@ -395,6 +413,8 @@
     for (idx = 0; idx < dest->storageSize; idx++) {
         dest->storage[idx] = src1->storage[idx] & src2->storage[idx];
     }
+    dest->firstDirty = true;
+    dest->lastDirty = true;
     return true;
 }
 
@@ -416,6 +436,8 @@
     for (idx = 0; idx < dest->storageSize; idx++) {
         dest->storage[idx] = src1->storage[idx] | src2->storage[idx];
     }
+    dest->firstDirty = true;
+    dest->lastDirty = true;
     return true;
 }
 
@@ -466,18 +488,36 @@
 /* Return the next position set to 1. -1 means end-of-element reached */
 int oatBitVectorIteratorNext(ArenaBitVectorIterator* iterator)
 {
-    const ArenaBitVector* pBits = iterator->pBits;
+    ArenaBitVector* pBits = iterator->pBits;
     u4 bitIndex = iterator->idx;
 
     DCHECK_EQ(iterator->bitSize, pBits->storageSize * sizeof(u4) * 8);
     if (bitIndex >= iterator->bitSize) return -1;
 
+    /* If we know, skip past leading zeros */
+    if (!pBits->firstDirty && ((int)bitIndex < pBits->firstBitSet)) {
+        iterator->idx = pBits->firstBitSet + 1;
+        return pBits->firstBitSet;
+    }
+
+    /* If we know, skip past trailing zeroes */
+    if (!pBits->lastDirty && ((int)bitIndex > pBits->lastBitSet)) {
+        iterator->idx = iterator->bitSize;
+        return -1;
+    }
+
+    bool firstPass = (bitIndex == 0);
+    u4 startIndex = bitIndex;
     for (; bitIndex < iterator->bitSize;) {
         unsigned int wordIndex = bitIndex >> 5;
         unsigned int bitPos = bitIndex & 0x1f;
         unsigned int word = pBits->storage[wordIndex];
         if (word & checkMasks[bitPos]) {
             iterator->idx = bitIndex+1;
+            if (firstPass && pBits->firstDirty) {
+                pBits->firstBitSet = bitIndex;
+                pBits->firstDirty = false;
+            }
             return bitIndex;
         }
         if (word == 0) {
@@ -488,6 +528,14 @@
         }
     }
     /* No more set bits */
+    if (firstPass) {
+        // Empty
+        pBits->firstBitSet = -1;
+        pBits->firstDirty = false;
+    } else {
+        pBits->lastBitSet = startIndex - 1;
+        pBits->lastDirty = false;
+    }
     return -1;
 }
 
@@ -507,6 +555,8 @@
     if (remNumBits) {
         pBits->storage[idx] = (1 << remNumBits) - 1;
     }
+    pBits->firstDirty = true;
+    pBits->lastDirty = true;
 }
 
 void oatGetBlockName(BasicBlock* bb, char* name)