ARM: Do not allocate temp registers in MulLong if possible.

Just use rl_result if we have enough registers and it is *not* either operand.

Change-Id: I5a6f3ec09653b97e41bbc6dce823aa8534f98a13
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index d22219a..1d959fa 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -816,7 +816,12 @@
 void ArmMir2Lir::GenMulLong(Instruction::Code opcode, RegLocation rl_dest,
                             RegLocation rl_src1, RegLocation rl_src2) {
     /*
-     * To pull off inline multiply, we have a worst-case requirement of 8 temporary
+     * tmp1     = src1.hi * src2.lo;  // src1.hi is no longer needed
+     * dest     = src1.lo * src2.lo;
+     * tmp1    += src1.lo * src2.hi;
+     * dest.hi += tmp1;
+     *
+     * To pull off inline multiply, we have a worst-case requirement of 7 temporary
      * registers.  Normally for Arm, we get 5.  We can get to 6 by including
      * lr in the temp set.  The only problematic case is all operands and result are
      * distinct, and none have been promoted.  In that case, we can succeed by aggressively
@@ -833,57 +838,85 @@
       StoreValueWide(rl_dest, rl_result);
       return;
     }
+
+    rl_src1 = LoadValueWide(rl_src1, kCoreReg);
+    rl_src2 = LoadValueWide(rl_src2, kCoreReg);
+
+    int reg_status = 0;
+    int res_lo = INVALID_REG;
+    int res_hi = INVALID_REG;
+    bool dest_promoted = rl_dest.location == kLocPhysReg && !rl_dest.reg.IsInvalid() &&
+        !IsTemp(rl_dest.reg.GetReg()) && !IsTemp(rl_dest.reg.GetHighReg());
+    bool src1_promoted = !IsTemp(rl_src1.reg.GetReg()) && !IsTemp(rl_src1.reg.GetHighReg());
+    bool src2_promoted = !IsTemp(rl_src2.reg.GetReg()) && !IsTemp(rl_src2.reg.GetHighReg());
+    // Check if rl_dest is *not* either operand and we have enough temp registers.
+    if ((rl_dest.s_reg_low != rl_src1.s_reg_low && rl_dest.s_reg_low != rl_src2.s_reg_low) &&
+        (dest_promoted || src1_promoted || src2_promoted)) {
+      // In this case, we do not need to manually allocate temp registers for result.
+      rl_result = EvalLoc(rl_dest, kCoreReg, true);
+      res_lo = rl_result.reg.GetReg();
+      res_hi = rl_result.reg.GetHighReg();
+    } else {
+      res_lo = AllocTemp();
+      if ((rl_src1.s_reg_low == rl_src2.s_reg_low) || src1_promoted || src2_promoted) {
+        // In this case, we have enough temp registers to be allocated for result.
+        res_hi = AllocTemp();
+        reg_status = 1;
+      } else {
+        // In this case, all temps are now allocated.
+        // res_hi will be allocated after we can free src1_hi.
+        reg_status = 2;
+      }
+    }
+
     // Temporarily add LR to the temp pool, and assign it to tmp1
     MarkTemp(rARM_LR);
     FreeTemp(rARM_LR);
     int tmp1 = rARM_LR;
     LockTemp(rARM_LR);
 
-    rl_src1 = LoadValueWide(rl_src1, kCoreReg);
-    rl_src2 = LoadValueWide(rl_src2, kCoreReg);
-
-    bool special_case = true;
-    // If operands are the same, or any pair has been promoted we're not the special case.
-    if ((rl_src1.s_reg_low == rl_src2.s_reg_low) ||
-        (!IsTemp(rl_src1.reg.GetReg()) && !IsTemp(rl_src1.reg.GetHighReg())) ||
-        (!IsTemp(rl_src2.reg.GetReg()) && !IsTemp(rl_src2.reg.GetHighReg()))) {
-      special_case = false;
-    }
-    // Tuning: if rl_dest has been promoted and is *not* either operand, could use directly.
-    int res_lo = AllocTemp();
-    int res_hi;
     if (rl_src1.reg.GetReg() == rl_src2.reg.GetReg()) {
-      res_hi = AllocTemp();
+      DCHECK_NE(res_hi, INVALID_REG);
+      DCHECK_NE(res_lo, INVALID_REG);
       NewLIR3(kThumb2MulRRR, tmp1, rl_src1.reg.GetReg(), rl_src1.reg.GetHighReg());
       NewLIR4(kThumb2Umull, res_lo, res_hi, rl_src1.reg.GetReg(), rl_src1.reg.GetReg());
       OpRegRegRegShift(kOpAdd, res_hi, res_hi, tmp1, EncodeShift(kArmLsl, 1));
     } else {
-      // In the special case, all temps are now allocated
       NewLIR3(kThumb2MulRRR, tmp1, rl_src2.reg.GetReg(), rl_src1.reg.GetHighReg());
-      if (special_case) {
+      if (reg_status == 2) {
+        DCHECK_EQ(res_hi, INVALID_REG);
         DCHECK_NE(rl_src1.reg.GetReg(), rl_src2.reg.GetReg());
         DCHECK_NE(rl_src1.reg.GetHighReg(), rl_src2.reg.GetHighReg());
         FreeTemp(rl_src1.reg.GetHighReg());
+        res_hi = AllocTemp();
       }
-      res_hi = AllocTemp();
-
+      DCHECK_NE(res_hi, INVALID_REG);
+      DCHECK_NE(res_lo, INVALID_REG);
       NewLIR4(kThumb2Umull, res_lo, res_hi, rl_src2.reg.GetReg(), rl_src1.reg.GetReg());
       NewLIR4(kThumb2Mla, tmp1, rl_src1.reg.GetReg(), rl_src2.reg.GetHighReg(), tmp1);
       NewLIR4(kThumb2AddRRR, res_hi, tmp1, res_hi, 0);
-      if (special_case) {
+      if (reg_status == 2) {
+        // Clobber rl_src1 since it was corrupted.
         FreeTemp(rl_src1.reg.GetReg());
         Clobber(rl_src1.reg.GetReg());
         Clobber(rl_src1.reg.GetHighReg());
       }
     }
-    FreeTemp(tmp1);
-    rl_result = GetReturnWide(false);  // Just using as a template.
-    rl_result.reg.SetReg(res_lo);
-    rl_result.reg.SetHighReg(res_hi);
-    StoreValueWide(rl_dest, rl_result);
+
     // Now, restore lr to its non-temp status.
+    FreeTemp(tmp1);
     Clobber(rARM_LR);
     UnmarkTemp(rARM_LR);
+
+    if (reg_status != 0) {
+      // We had manually allocated registers for rl_result.
+      // Now construct a RegLocation.
+      rl_result = GetReturnWide(false);  // Just using as a template.
+      rl_result.reg.SetReg(res_lo);
+      rl_result.reg.SetHighReg(res_hi);
+    }
+
+    StoreValueWide(rl_dest, rl_result);
 }
 
 void ArmMir2Lir::GenAddLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,