Register promotion support for 64-bit targets

Not sufficiently tested for 64-bit targets, but should be
fairly close.

A significant amount of refactoring could still be done (in
later CLs).

This change leaves the vmap scheme untouched. As a result, if a
vreg is promoted both as a 32-bit view and as the low half of a
64-bit view, the two views must share the same physical register.
We may relax this restriction later to allow more flexibility for
32-bit ARM.

For example, if v4, v5, v4/v5 and v5/v6 are all hot enough to
promote, we'd end up with something like:

v4 (as an int)    -> r10
v4/v5 (as a long) -> r10
v5 (as an int)    -> r11
v5/v6 (as a long) -> r11
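
To make the constraint concrete, the following standalone sketch
(hypothetical names, not ART's actual promotion code) models the
invariant the allocator must maintain under this scheme:

  #include <cassert>
  #include <map>

  // Hypothetical model: map each promoted view to a physical register number.
  // narrow_reg_for[v] holds the 32-bit view of vreg v; wide_reg_for[v] holds
  // the 64-bit pair (v, v+1).
  std::map<int, int> narrow_reg_for;
  std::map<int, int> wide_reg_for;

  // Invariant imposed by keeping the old vmap scheme: the low half of a
  // promoted wide vreg must share a physical register with the promoted
  // 32-bit view of the same vreg.
  void CheckPromotionInvariant() {
    for (const auto& entry : wide_reg_for) {
      auto it = narrow_reg_for.find(entry.first);
      if (it != narrow_reg_for.end()) {
        assert(it->second == entry.second);
      }
    }
  }

  int main() {
    // The example above: v4 -> r10, v4/v5 -> r10, v5 -> r11, v5/v6 -> r11.
    narrow_reg_for[4] = 10;
    wide_reg_for[4] = 10;
    narrow_reg_for[5] = 11;
    wide_reg_for[5] = 11;
    CheckPromotionInvariant();  // Consistent: each wide low half matches.
    return 0;
  }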

Fix a couple of ARM64 bugs on the way...

Change-Id: I6a152b9c164d9f1a053622266e165428045362f3
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index 3e0b3cf..56dcbe5 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -445,17 +445,59 @@
 
   NewLIR0(kPseudoMethodExit);
 
-  /* Need to restore any FP callee saves? */
-  if (fp_spill_mask_) {
-    int spill_offset = frame_size_ - kArm64PointerSize*(num_fp_spills_ + num_core_spills_);
-    UnSpillFPRegs(rs_sp, spill_offset, fp_spill_mask_);
-  }
-  if (core_spill_mask_) {
-    int spill_offset = frame_size_ - kArm64PointerSize*num_core_spills_;
-    UnSpillCoreRegs(rs_sp, spill_offset, core_spill_mask_);
+  // Restore saves and drop stack frame.
+  // Two variants:
+  //
+  // 1. (Original): Try to address the spill area directly, then drop the whole frame.
+  //                Limitation: the ldp offset is a signed 7-bit immediate (scaled by 8).
+  //                There should have been a DCHECK!
+  //
+  // 2. (New): Drop the non-save part of the frame first, then restore as in variant 1,
+  //           which is now guaranteed to be in range, then drop the rest.
+  //
+  // TODO: In methods with few spills but a huge frame, it would be better to use
+  //       non-immediate loads in variant 1.
+
+  if (frame_size_ <= 504) {
+    // "Magic" constant, 63 (max signed 7b) * 8. Do variant 1.
+    // Could be tighter, as the last load is below frame_size_ offset.
+    if (fp_spill_mask_) {
+      int spill_offset = frame_size_ - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
+      UnSpillFPRegs(rs_sp, spill_offset, fp_spill_mask_);
+    }
+    if (core_spill_mask_) {
+      int spill_offset = frame_size_ - kArm64PointerSize * num_core_spills_;
+      UnSpillCoreRegs(rs_sp, spill_offset, core_spill_mask_);
+    }
+
+    OpRegImm64(kOpAdd, rs_sp, frame_size_);
+  } else {
+    // Second variant. Drop the frame part.
+    int drop = 0;
+    // TODO: Always use the first formula, as num_fp_spills_ would be zero?
+    if (fp_spill_mask_) {
+      drop = frame_size_ - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
+    } else {
+      drop = frame_size_ - kArm64PointerSize * num_core_spills_;
+    }
+
+    // The drop amount must be 16-byte aligned so that SP stays aligned.
+    drop = RoundDown(drop, 16);
+
+    OpRegImm64(kOpAdd, rs_sp, drop);
+
+    if (fp_spill_mask_) {
+      int offset = frame_size_ - drop - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
+      UnSpillFPRegs(rs_sp, offset, fp_spill_mask_);
+    }
+    if (core_spill_mask_) {
+      int offset = frame_size_ - drop - kArm64PointerSize * num_core_spills_;
+      UnSpillCoreRegs(rs_sp, offset, core_spill_mask_);
+    }
+
+    OpRegImm64(kOpAdd, rs_sp, frame_size_ - drop);
   }
 
-  OpRegImm64(kOpAdd, rs_sp, frame_size_);
+  // Finally return.
   NewLIR0(kA64Ret);
 }
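
As a sanity check on the arithmetic above, here is a small standalone
sketch (hypothetical helper names, not ART code) that mirrors the
two-variant teardown and verifies that the restore offsets stay within
the ldp immediate range:

  #include <cassert>

  const int kArm64PointerSize = 8;
  const int kMaxLdpByteOffset = 63 * 8;  // Max positive signed 7-bit imm, scaled by 8.

  int RoundDown(int x, int n) { return x - (x % n); }

  // Mirrors the logic above: returns the byte offset at which the FP saves
  // are reloaded, relative to SP at the time of the load.
  int FpRestoreOffset(int frame_size, int num_fp_spills, int num_core_spills) {
    int saves = kArm64PointerSize * (num_fp_spills + num_core_spills);
    if (frame_size <= 504) {
      return frame_size - saves;  // Variant 1: address directly into the full frame.
    }
    // Variant 2: drop the non-save part (kept 16-byte aligned), then restore.
    int drop = RoundDown(frame_size - saves, 16);
    return frame_size - drop - saves;
  }

  int main() {
    // Huge frame: variant 1 would need offset 4096 - 32 = 4064, far beyond
    // the ldp range; variant 2 brings the offset down near zero.
    int offset = FpRestoreOffset(/*frame_size=*/4096, /*num_fp_spills=*/2,
                                 /*num_core_spills=*/2);
    assert(0 <= offset && offset <= kMaxLdpByteOffset);
    return 0;
  }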
 
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index f71713f..7db6ab6 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -123,8 +123,6 @@
     void ClobberCallerSave();
     void FreeCallTemps();
     void LockCallTemps();
-    void MarkPreservedSingle(int v_reg, RegStorage reg);
-    void MarkPreservedDouble(int v_reg, RegStorage reg);
     void CompilerInitializeRegAlloc();
 
     // Required for target - miscellaneous.
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index 18a4e8f..51c8723 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -19,6 +19,7 @@
 #include "arm64_lir.h"
 #include "codegen_arm64.h"
 #include "dex/quick/mir_to_lir-inl.h"
+#include "dex/reg_storage_eq.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "mirror/array.h"
 
@@ -1054,6 +1055,7 @@
     if (UNLIKELY(reg2 < 0)) {
       NewLIR3(WIDE(kA64Ldr3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     } else {
+      DCHECK_LE(offset, 63);
       NewLIR4(WIDE(kA64Ldp4rrXD), RegStorage::Solo64(reg2).GetReg(),
               RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     }
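
The new DCHECK guards the ldp encoding: the offset operand here appears to
already be in 8-byte units, and the instruction's imm7 field is a signed
7-bit value. A minimal sketch of the encodability condition (hypothetical
helper, not the ART assembler):

  #include <cassert>

  // Hypothetical helper: can a byte offset be encoded in the scaled,
  // signed 7-bit immediate field of a 64-bit ldp/stp?
  bool IsLdpOffsetEncodable(int byte_offset) {
    if (byte_offset % 8 != 0) return false;  // Must be a multiple of the access size.
    int scaled = byte_offset / 8;
    return -64 <= scaled && scaled <= 63;    // Signed 7-bit range.
  }

  int main() {
    assert(IsLdpOffsetEncodable(504));    // 63 * 8: the "magic" upper bound.
    assert(!IsLdpOffsetEncodable(512));   // 64 * 8: one slot too far.
    assert(!IsLdpOffsetEncodable(4));     // Not 8-byte aligned.
    return 0;
  }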
diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc
index dcb0050..6985de6 100644
--- a/compiler/dex/quick/arm64/target_arm64.cc
+++ b/compiler/dex/quick/arm64/target_arm64.cc
@@ -22,6 +22,7 @@
 
 #include "dex/compiler_internals.h"
 #include "dex/quick/mir_to_lir-inl.h"
+#include "dex/reg_storage_eq.h"
 
 namespace art {
 
@@ -648,29 +649,6 @@
   num_core_spills_++;
 }
 
-/*
- * Mark a callee-save fp register as promoted.
- */
-void Arm64Mir2Lir::MarkPreservedSingle(int v_reg, RegStorage reg) {
-  DCHECK(reg.IsFloat());
-  int adjusted_reg_num = reg.GetRegNum() - A64_FP_CALLEE_SAVE_BASE;
-  // Ensure fp_vmap_table is large enough
-  int table_size = fp_vmap_table_.size();
-  for (int i = table_size; i < (adjusted_reg_num + 1); i++) {
-    fp_vmap_table_.push_back(INVALID_VREG);
-  }
-  // Add the current mapping
-  fp_vmap_table_[adjusted_reg_num] = v_reg;
-  // Size of fp_vmap_table is high-water mark, use to set mask
-  num_fp_spills_ = fp_vmap_table_.size();
-  fp_spill_mask_ = ((1 << num_fp_spills_) - 1) << A64_FP_CALLEE_SAVE_BASE;
-}
-
-void Arm64Mir2Lir::MarkPreservedDouble(int v_reg, RegStorage reg) {
-  DCHECK(reg.IsDouble());
-  MarkPreservedSingle(v_reg, reg);
-}
-
 /* Clobber all regs that might be used by an external C call */
 void Arm64Mir2Lir::ClobberCallerSave() {
   Clobber(rs_x0);
@@ -904,7 +882,7 @@
     int n = *num_gpr_used;
     if (n < 8) {
       *num_gpr_used = n + 1;
-      if (loc->wide) {
+      if (loc->wide || loc->ref) {
         *op_size = k64;
         return RegStorage::Solo64(n);
       } else {
@@ -965,35 +943,64 @@
   ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
   int start_vreg = cu_->num_dalvik_registers - cu_->num_ins;
   for (int i = 0; i < cu_->num_ins; i++) {
-    PromotionMap* v_map = &promotion_map_[start_vreg + i];
     RegLocation* t_loc = &ArgLocs[i];
     OpSize op_size;
     RegStorage reg = GetArgPhysicalReg(t_loc, &num_gpr_used, &num_fpr_used, &op_size);
 
     if (reg.Valid()) {
-      if ((v_map->core_location == kLocPhysReg) && !t_loc->fp) {
-        OpRegCopy(RegStorage::Solo32(v_map->core_reg), reg);
-      } else if ((v_map->fp_location == kLocPhysReg) && t_loc->fp) {
-        OpRegCopy(RegStorage::Solo32(v_map->FpReg), reg);
+      // If arriving in register.
+
+      // The arg locations have already been updated with promotion info,
+      // so we can rely on them here.
+      if (t_loc->location == kLocPhysReg) {
+        // Just copy it.
+        OpRegCopy(t_loc->reg, reg);
       } else {
-        StoreBaseDisp(TargetReg(kSp), SRegOffset(start_vreg + i), reg, op_size, kNotVolatile);
-        if (reg.Is64Bit()) {
-          if (SRegOffset(start_vreg + i) + 4 != SRegOffset(start_vreg + i + 1)) {
-            LOG(FATAL) << "64 bit value stored in non-consecutive 4 bytes slots";
-          }
-          i += 1;
+        // Needs a flush to the frame.
+        if (t_loc->ref) {
+          StoreRefDisp(TargetReg(kSp), SRegOffset(start_vreg + i), reg, kNotVolatile);
+        } else {
+          StoreBaseDisp(TargetReg(kSp), SRegOffset(start_vreg + i), reg, t_loc->wide ? k64 : k32,
+              kNotVolatile);
         }
       }
     } else {
-      // If arriving in frame & promoted
-      if (v_map->core_location == kLocPhysReg) {
-        LoadWordDisp(TargetReg(kSp), SRegOffset(start_vreg + i),
-                     RegStorage::Solo32(v_map->core_reg));
-      }
-      if (v_map->fp_location == kLocPhysReg) {
-        LoadWordDisp(TargetReg(kSp), SRegOffset(start_vreg + i), RegStorage::Solo32(v_map->FpReg));
+      // If arriving in frame & promoted.
+      if (t_loc->location == kLocPhysReg) {
+        if (t_loc->ref) {
+          LoadRefDisp(TargetReg(kSp), SRegOffset(start_vreg + i), t_loc->reg, kNotVolatile);
+        } else {
+          LoadBaseDisp(TargetReg(kSp), SRegOffset(start_vreg + i), t_loc->reg,
+                       t_loc->wide ? k64 : k32, kNotVolatile);
+        }
       }
     }
+    if (t_loc->wide) {
+      // Increment i to skip the next one.
+      i++;
+    }
   }
 }
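
The rewritten FlushIns loop keys everything off the arg location, which has
already been updated with promotion info. A compact standalone model of the
control flow (hypothetical simplified types; the real code uses ART's
RegLocation):

  #include <cstdio>

  // Hypothetical, simplified stand-in for ART's RegLocation.
  struct ArgLoc {
    bool arrives_in_reg;  // Incoming argument is in a register (simplified).
    bool promoted;        // Vreg is promoted to a physical register.
    bool wide;            // 64-bit value occupying two vreg slots.
    bool ref;             // Object reference.
  };

  void FlushIns(const ArgLoc* args, int num_ins) {
    for (int i = 0; i < num_ins; i++) {
      const ArgLoc& loc = args[i];
      if (loc.arrives_in_reg) {
        if (loc.promoted) {
          printf("in %d: copy incoming reg to promoted reg\n", i);
        } else if (loc.ref) {
          printf("in %d: flush reference to its frame slot\n", i);
        } else {
          printf("in %d: flush %s value to its frame slot\n", i,
                 loc.wide ? "64-bit" : "32-bit");
        }
      } else if (loc.promoted) {
        printf("in %d: load from frame slot into promoted reg\n", i);
      }
      if (loc.wide) {
        i++;  // A wide arg also consumes the next vreg slot; skip it.
      }
    }
  }

  int main() {
    ArgLoc args[] = {
      {true, true, false, false},    // in0: promoted int arriving in a register.
      {true, false, true, false},    // in1/in2: unpromoted long (low half).
      {false, false, false, false},  // in2: high half, skipped by the i++ above.
    };
    FlushIns(args, 3);
    return 0;
  }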
 
@@ -1067,7 +1074,11 @@
         loc = UpdateLoc(loc);
         if (loc.location == kLocPhysReg) {
           ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
-          StoreBaseDisp(TargetReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k32, kNotVolatile);
+          if (loc.ref) {
+            StoreRefDisp(TargetReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, kNotVolatile);
+          } else {
+            StoreBaseDisp(TargetReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k32, kNotVolatile);
+          }
         }
         next_arg++;
       }
diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc
index ca78e5b..aaee91b 100644
--- a/compiler/dex/quick/arm64/utility_arm64.cc
+++ b/compiler/dex/quick/arm64/utility_arm64.cc
@@ -17,6 +17,7 @@
 #include "arm64_lir.h"
 #include "codegen_arm64.h"
 #include "dex/quick/mir_to_lir-inl.h"
+#include "dex/reg_storage_eq.h"
 
 namespace art {