Branch fusing

A belated birthday gift for irogers.  Fuse cmp-long/if-XXz,
cmp[lg]-[float|double]/if-XXz.

Change-Id: I8fa87f620fcf4e6bcf291bbc7a0ea6c8f5535467
diff --git a/src/compiler/CompilerIR.h b/src/compiler/CompilerIR.h
index 25fc89e..621cccc 100644
--- a/src/compiler/CompilerIR.h
+++ b/src/compiler/CompilerIR.h
@@ -161,10 +161,16 @@
 enum ExtendedMIROpcode {
     kMirOpFirst = kNumPackedOpcodes,
     kMirOpPhi = kMirOpFirst,
+    kMirOpCopy,
+    kMirOpFusedCmplFloat,
+    kMirOpFusedCmpgFloat,
+    kMirOpFusedCmplDouble,
+    kMirOpFusedCmpgDouble,
+    kMirOpFusedCmpLong,
+    kMirOpNop,
     kMirOpNullNRangeUpCheck,
     kMirOpNullNRangeDownCheck,
     kMirOpLowerBound,
-    kMirOpCopy,
     kMirOpLast,
 };
 
@@ -343,7 +349,8 @@
     MIR* phiList;
 
     /* Use counts of ssa names */
-    GrowableList useCounts;
+    GrowableList useCounts;             // Weighted by nesting depth
+    GrowableList rawUseCounts;          // Not weighted
 
     /* Optimization support */
     GrowableList loopHeaders;
diff --git a/src/compiler/Dataflow.cc b/src/compiler/Dataflow.cc
index 72e3dc4..5f632c0 100644
--- a/src/compiler/Dataflow.cc
+++ b/src/compiler/Dataflow.cc
@@ -801,10 +801,36 @@
     // Beginning of extended MIR opcodes
     // 100 MIR_PHI
     DF_PHI | DF_DA | DF_NULL_TRANSFER_N,
-    /*
-     * For extended MIR inserted at the MIR2LIR stage, it is okay to have
-     * undefined values here.
-     */
+
+    // 101 MIR_COPY
+    DF_DA | DF_UB | DF_IS_MOVE,
+
+    // 102 MIR_FUSED_CMPL_FLOAT
+    DF_UA | DF_UB | DF_FP_A | DF_FP_B,
+
+    // 103 MIR_FUSED_CMPG_FLOAT
+    DF_UA | DF_UB | DF_FP_A | DF_FP_B,
+
+    // 104 MIR_FUSED_CMPL_DOUBLE
+    DF_UA_WIDE | DF_UB_WIDE | DF_FP_A | DF_FP_B,
+
+    // 105 MIR_FUSED_CMPG_DOUBLE
+    DF_UA_WIDE | DF_UB_WIDE | DF_FP_A | DF_FP_B,
+
+    // 106 MIR_FUSED_CMP_LONG
+    DF_UA_WIDE | DF_UB_WIDE | DF_CORE_A | DF_CORE_B,
+
+    // 107 MIR_NOP
+    DF_NOP,
+
+    // 108 MIR_NULL_RANGE_UP_CHECK
+    0,
+
+    // 109 MIR_NULL_RANGE_DOWN_CHECK
+    0,
+
+    // 110 MIR_LOWER_BOUND
+    0,
 };
 
 /* Return the base virtual register for a SSA name */
@@ -820,6 +846,13 @@
     return GET_ELEM_N(cUnit->ssaSubscripts, int, ssaReg);
 }
 
+int getSSAUseCount(CompilationUnit* cUnit, int sReg)
+{
+    DCHECK(sReg < (int)cUnit->rawUseCounts.numUsed);
+    return cUnit->rawUseCounts.elemList[sReg];
+}
+
+
 char* oatGetDalvikDisassembly(CompilationUnit* cUnit,
                               const DecodedInstruction& insn, const char* note)
 {
@@ -1827,7 +1860,69 @@
             case Instruction::CMPG_FLOAT:
             case Instruction::CMPG_DOUBLE:
             case Instruction::CMP_LONG:
-                // TODO: Check for and fuse preceeding comparison
+                if (mir->next != NULL) {
+                    MIR* mirNext = mir->next;
+                    Instruction::Code brOpcode = mirNext->dalvikInsn.opcode;
+                    ConditionCode ccode = kCondNv;
+                    switch(brOpcode) {
+                        case Instruction::IF_EQZ:
+                            ccode = kCondEq;
+                            break;
+                        case Instruction::IF_NEZ:
+                           // ccode = kCondNe;
+                            break;
+                        case Instruction::IF_LTZ:
+                            // ccode = kCondLt;
+                            break;
+                        case Instruction::IF_GEZ:
+                            // ccode = kCondGe;
+                            break;
+                        case Instruction::IF_GTZ:
+                            // ccode = kCondGt;
+                            break;
+                        case Instruction::IF_LEZ:
+                            // ccode = kCondLe;
+                            break;
+                        default:
+                            break;
+                    }
+                    // Make sure result of cmp is used by next insn and nowhere else
+                    if ((ccode != kCondNv) &&
+                        (mir->ssaRep->defs[0] == mirNext->ssaRep->uses[0]) &&
+                        (getSSAUseCount(cUnit, mir->ssaRep->defs[0]) == 1)) {
+                        mirNext->dalvikInsn.arg[0] = ccode;
+                        switch(opcode) {
+                            case Instruction::CMPL_FLOAT:
+                                mirNext->dalvikInsn.opcode =
+                                    static_cast<Instruction::Code>(kMirOpFusedCmplFloat);
+                                break;
+                            case Instruction::CMPL_DOUBLE:
+                                mirNext->dalvikInsn.opcode =
+                                    static_cast<Instruction::Code>(kMirOpFusedCmplDouble);
+                                break;
+                            case Instruction::CMPG_FLOAT:
+                                mirNext->dalvikInsn.opcode =
+                                    static_cast<Instruction::Code>(kMirOpFusedCmpgFloat);
+                                break;
+                            case Instruction::CMPG_DOUBLE:
+                                mirNext->dalvikInsn.opcode =
+                                    static_cast<Instruction::Code>(kMirOpFusedCmpgDouble);
+                                break;
+                            case Instruction::CMP_LONG:
+                                mirNext->dalvikInsn.opcode =
+                                    static_cast<Instruction::Code>(kMirOpFusedCmpLong);
+                                break;
+                            default: LOG(ERROR) << "Unexpected opcode: " << (int)opcode;
+                        }
+                        mir->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpNop);
+                        mirNext->ssaRep->numUses = mir->ssaRep->numUses;
+                        mirNext->ssaRep->uses = mir->ssaRep->uses;
+                        mirNext->ssaRep->fpUse = mir->ssaRep->fpUse;
+                        mirNext->ssaRep->numDefs = 0;
+                        mir->ssaRep->numUses = 0;
+                        mir->ssaRep->numDefs = 0;
+                    }
+                }
                 break;
             default:
                 break;
@@ -2230,6 +2325,7 @@
         for (int i = 0; i < mir->ssaRep->numUses; i++) {
             int sReg = mir->ssaRep->uses[i];
             DCHECK_LT(sReg, (int)cUnit->useCounts.numUsed);
+            cUnit->rawUseCounts.elemList[sReg]++;
             cUnit->useCounts.elemList[sReg] += (1 << weight);
         }
         if (!(cUnit->disableOpt & (1 << kPromoteCompilerTemps))) {
@@ -2248,6 +2344,7 @@
                     usesMethodStar &= invokeUsesMethodStar(cUnit, mir);
                 }
                 if (usesMethodStar) {
+                    cUnit->rawUseCounts.elemList[cUnit->methodSReg]++;
                     cUnit->useCounts.elemList[cUnit->methodSReg] += (1 << weight);
                 }
             }
@@ -2258,14 +2355,17 @@
 
 void oatMethodUseCount(CompilationUnit *cUnit)
 {
-    if (cUnit->disableOpt & (1 << kPromoteRegs)) {
-        return;
-    }
     oatInitGrowableList(cUnit, &cUnit->useCounts, cUnit->numSSARegs + 32,
                         kListMisc);
+    oatInitGrowableList(cUnit, &cUnit->rawUseCounts, cUnit->numSSARegs + 32,
+                        kListMisc);
     // Initialize list
     for (int i = 0; i < cUnit->numSSARegs; i++) {
         oatInsertGrowableList(cUnit, &cUnit->useCounts, 0);
+        oatInsertGrowableList(cUnit, &cUnit->rawUseCounts, 0);
+    }
+    if (cUnit->disableOpt & (1 << kPromoteRegs)) {
+        return;
     }
     oatDataFlowAnalysisDispatcher(cUnit, countUses,
                                   kAllNodes, false /* isIterative */);
diff --git a/src/compiler/codegen/CompilerCodegen.h b/src/compiler/codegen/CompilerCodegen.h
index 8f854da..20b2e45 100644
--- a/src/compiler/codegen/CompilerCodegen.h
+++ b/src/compiler/codegen/CompilerCodegen.h
@@ -26,6 +26,11 @@
 
 int oatGetInsnSize(LIR* lir);
 
+void genFusedLongCmpBranch(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir);
+void genFusedFPCmpBranch(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir,
+                         bool gtBias, bool isDouble);
+
+
 /* Lower middle-level IR to low-level IR for the whole method */
 void oatMethodMIR2LIR(CompilationUnit* cUnit);
 
diff --git a/src/compiler/codegen/MethodCodegenDriver.cc b/src/compiler/codegen/MethodCodegenDriver.cc
index d753fcc..1dc11da 100644
--- a/src/compiler/codegen/MethodCodegenDriver.cc
+++ b/src/compiler/codegen/MethodCodegenDriver.cc
@@ -741,10 +741,16 @@
     "kMirOpNullNRangeDownCheck",
     "kMirOpLowerBound",
     "kMirOpCopy",
+    "kMirFusedCmplFloat",
+    "kMirFusedCmpgFloat",
+    "kMirFusedCmplDouble",
+    "kMirFusedCmpgDouble",
+    "kMirFusedCmpLong",
+    "kMirNop",
 };
 
 /* Extended MIR instructions like PHI */
-void handleExtendedMethodMIR(CompilationUnit* cUnit, MIR* mir)
+void handleExtendedMethodMIR(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir)
 {
     int opOffset = mir->dalvikInsn.opcode - kMirOpFirst;
     char* msg = NULL;
@@ -771,6 +777,23 @@
             storeValue(cUnit, rlDest, rlSrc);
             break;
         }
+#if defined(TARGET_ARM)
+        case kMirOpFusedCmplFloat:
+            genFusedFPCmpBranch(cUnit, bb, mir, false /*gt bias*/, false /*double*/);
+            break;
+        case kMirOpFusedCmpgFloat:
+            genFusedFPCmpBranch(cUnit, bb, mir, true /*gt bias*/, false /*double*/);
+            break;
+        case kMirOpFusedCmplDouble:
+            genFusedFPCmpBranch(cUnit, bb, mir, false /*gt bias*/, true /*double*/);
+            break;
+        case kMirOpFusedCmpgDouble:
+            genFusedFPCmpBranch(cUnit, bb, mir, true /*gt bias*/, true /*double*/);
+            break;
+        case kMirOpFusedCmpLong:
+            genFusedLongCmpBranch(cUnit, bb, mir);
+            break;
+#endif
         default:
             break;
     }
@@ -827,11 +850,6 @@
         cUnit->liveSReg = INVALID_SREG;
 #endif
 
-        if ((int)mir->dalvikInsn.opcode >= (int)kMirOpFirst) {
-            handleExtendedMethodMIR(cUnit, mir);
-            continue;
-        }
-
         cUnit->currentDalvikOffset = mir->offset;
 
         Instruction::Code dalvikOpcode = mir->dalvikInsn.opcode;
@@ -864,6 +882,11 @@
             newLIR1(cUnit, kPseudoSSARep, (int) ssaString);
         }
 
+        if ((int)mir->dalvikInsn.opcode >= (int)kMirOpFirst) {
+            handleExtendedMethodMIR(cUnit, bb, mir);
+            continue;
+        }
+
         bool notHandled = compileDalvikInstruction(cUnit, mir, bb, labelList);
         if (notHandled) {
           LOG(FATAL) << StringPrintf("%#06x: Opcode %#x (%s) / Fmt %d not handled",
diff --git a/src/compiler/codegen/arm/FP/Thumb2VFP.cc b/src/compiler/codegen/arm/FP/Thumb2VFP.cc
index 72b4fec..380c014 100644
--- a/src/compiler/codegen/arm/FP/Thumb2VFP.cc
+++ b/src/compiler/codegen/arm/FP/Thumb2VFP.cc
@@ -182,6 +182,60 @@
     return false;
 }
 
+void genFusedFPCmpBranch(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir,
+                         bool gtBias, bool isDouble)
+{
+    LIR* labelList = (LIR*)cUnit->blockLabelList;
+    LIR* target = &labelList[bb->taken->id];
+    RegLocation rlSrc1;
+    RegLocation rlSrc2;
+    if (isDouble) {
+        rlSrc1 = oatGetSrcWide(cUnit, mir, 0, 1);
+        rlSrc2 = oatGetSrcWide(cUnit, mir, 2, 3);
+        rlSrc1 = loadValueWide(cUnit, rlSrc1, kFPReg);
+        rlSrc2 = loadValueWide(cUnit, rlSrc2, kFPReg);
+        newLIR2(cUnit, kThumb2Vcmpd, S2D(rlSrc1.lowReg, r1Src2.highReg),
+                S2D(rlSrc2.lowReg, rlSrc2.highReg));
+    } else {
+        rlSrc1 = oatGetSrc(cUnit, mir, 0);
+        rlSrc2 = oatGetSrc(cUnit, mir, 1);
+        rlSrc1 = loadValue(cUnit, rlSrc1, kFPReg);
+        rlSrc2 = loadValue(cUnit, rlSrc2, kFPReg);
+        newLIR2(cUnit, kThumb2Vcmps, rlSrc1.lowReg, rlSrc2.lowReg);
+    }
+    newLIR0(cUnit, kThumb2Fmstat);
+    ConditionCode ccode = static_cast<ConditionCode>(mir->dalvikInsn.arg[0]);
+    switch(ccode) {
+        case kCondEq:
+        case kCondNe:
+            break;
+        case kCondLt:
+            if (gtBias) {
+                ccode = kCondMi;
+            }
+            break;
+        case kCondLe:
+            if (gtBias) {
+                ccode = kCondLs;
+            }
+            break;
+        case kCondGt:
+            if (gtBias) {
+                ccode = kCondHi;
+            }
+            break;
+        case kCondGe:
+            if (gtBias) {
+                ccode = kCondCs;
+            }
+            break;
+        default:
+            LOG(FATAL) << "Unexpected ccode: " << (int)ccode;
+    }
+    opCondBranch(cUnit, ccode, target);
+}
+
+
 bool genCmpFP(CompilationUnit* cUnit, MIR* mir, RegLocation rlDest,
               RegLocation rlSrc1, RegLocation rlSrc2)
 {
diff --git a/src/compiler/codegen/arm/Thumb2/Gen.cc b/src/compiler/codegen/arm/Thumb2/Gen.cc
index 9477d2c..ea02ca9 100644
--- a/src/compiler/codegen/arm/Thumb2/Gen.cc
+++ b/src/compiler/codegen/arm/Thumb2/Gen.cc
@@ -654,6 +654,48 @@
     branch3->target = branch1->target;
 }
 
+void genFusedLongCmpBranch(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir)
+{
+    LIR* labelList = (LIR*)cUnit->blockLabelList;
+    LIR* taken = &labelList[bb->taken->id];
+    RegLocation rlSrc1 = oatGetSrcWide(cUnit, mir, 0, 1);
+    RegLocation rlSrc2 = oatGetSrcWide(cUnit, mir, 2, 3);
+    rlSrc1 = loadValueWide(cUnit, rlSrc1, kCoreReg);
+    rlSrc2 = loadValueWide(cUnit, rlSrc2, kCoreReg);
+    ConditionCode ccode = static_cast<ConditionCode>(mir->dalvikInsn.arg[0]);
+    LIR* notTaken = rawLIR(cUnit, mir->offset, kPseudoTargetLabel);
+    opRegReg(cUnit, kOpCmp, rlSrc1.highReg, rlSrc2.highReg);
+    switch(ccode) {
+        case kCondEq:
+            opCondBranch(cUnit, kCondNe, notTaken);
+            break;
+        case kCondNe:
+            opCondBranch(cUnit, kCondNe, taken);
+            break;
+        case kCondLt:
+            opCondBranch(cUnit, kCondLt, taken);
+            opCondBranch(cUnit, kCondGt, notTaken);
+            break;
+        case kCondLe:
+            opCondBranch(cUnit, kCondLt, taken);
+            opCondBranch(cUnit, kCondGt, notTaken);
+            break;
+        case kCondGt:
+            opCondBranch(cUnit, kCondGt, taken);
+            opCondBranch(cUnit, kCondLt, notTaken);
+            break;
+        case kCondGe:
+            opCondBranch(cUnit, kCondGt, taken);
+            opCondBranch(cUnit, kCondLt, notTaken);
+            break;
+        default:
+            LOG(FATAL) << "Unexpected ccode: " << (int)ccode;
+    }
+    opRegReg(cUnit, kOpCmp, rlSrc1.lowReg, rlSrc2.lowReg);
+    opCondBranch(cUnit, ccode, taken);
+    oatAppendLIR(cUnit, notTaken);
+}
+
 /*
  * Generate a register comparison to an immediate and branch.  Caller
  * is responsible for setting branch target field.