Register promotion support for 64-bit targets

Not sufficiently tested for 64-bit targets, but should be
fairly close.

A significant amount of refactoring could still be done (in
later CLs).

With this change we are not making any changes to the vmap
scheme.  As a result, it is a requirement that if a vreg
is promoted to both a 32-bit view and the low half of a
64-bit view it must share the same physical register.  We
may change this restriction later on to allow for more flexibility
for 32-bit Arm.

For example, if v4, v5, v4/v5 and v5/v6 are all hot enough to
promote, we'd end up with something like:

v4 (as an int)    -> r10
v4/v5 (as a long) -> r10
v5 (as an int)    -> r11
v5/v6 (as a long) -> r11

Fix a couple of ARM64 bugs on the way...

Change-Id: I6a152b9c164d9f1a053622266e165428045362f3
diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc
index 38370ad..6bedae8 100644
--- a/compiler/dex/quick/ralloc_util.cc
+++ b/compiler/dex/quick/ralloc_util.cc
@@ -178,7 +178,7 @@
   } else {
     RegisterInfo* info = GetRegInfo(reg);
     if (info->IsTemp() && !info->IsDead()) {
-      if (info->GetReg() != info->Partner()) {
+      if (info->GetReg().NotExactlyEquals(info->Partner())) {
         ClobberBody(GetRegInfo(info->Partner()));
       }
       ClobberBody(info);
@@ -225,7 +225,7 @@
     GrowableArray<RegisterInfo*>::Iterator iter(&tempreg_info_);
     for (RegisterInfo* info = iter.Next(); info != NULL; info = iter.Next()) {
       if (info->SReg() == s_reg) {
-        if (info->GetReg() != info->Partner()) {
+        if (info->GetReg().NotExactlyEquals(info->Partner())) {
           // Dealing with a pair - clobber the other half.
           DCHECK(!info->IsAliased());
           ClobberBody(GetRegInfo(info->Partner()));
@@ -284,8 +284,13 @@
 
 /* Reserve a callee-save register.  Return InvalidReg if none available */
 RegStorage Mir2Lir::AllocPreservedCoreReg(int s_reg) {
-  // TODO: 64-bit and refreg update
   RegStorage res;
+  /*
+   * Note: it really doesn't matter much whether we allocate from the core or core64
+   * pool for 64-bit targets - but for some targets it does matter whether allocations
+   * happen from the single or double pool.  This entire section of code could stand
+   * a good refactoring.
+   */
   GrowableArray<RegisterInfo*>::Iterator it(&reg_pool_->core_regs_);
   for (RegisterInfo* info = it.Next(); info != nullptr; info = it.Next()) {
     if (!info->IsTemp() && !info->InUse()) {
@@ -297,49 +302,50 @@
   return res;
 }
 
-void Mir2Lir::RecordSinglePromotion(RegStorage reg, int s_reg) {
+void Mir2Lir::RecordFpPromotion(RegStorage reg, int s_reg) {
+  DCHECK_NE(cu_->instruction_set, kThumb2);
   int p_map_idx = SRegToPMap(s_reg);
   int v_reg = mir_graph_->SRegToVReg(s_reg);
+  int reg_num = reg.GetRegNum();
   GetRegInfo(reg)->MarkInUse();
-  MarkPreservedSingle(v_reg, reg);
+  fp_spill_mask_ |= (1 << reg_num);
+  // Include reg for later sort
+  fp_vmap_table_.push_back(reg_num << VREG_NUM_WIDTH | (v_reg & ((1 << VREG_NUM_WIDTH) - 1)));
+  num_fp_spills_++;
   promotion_map_[p_map_idx].fp_location = kLocPhysReg;
-  promotion_map_[p_map_idx].FpReg = reg.GetReg();
+  promotion_map_[p_map_idx].fp_reg = reg.GetReg();
 }
 
-// Reserve a callee-save sp single register.
-RegStorage Mir2Lir::AllocPreservedSingle(int s_reg) {
+// Reserve a callee-save floating point register.
+RegStorage Mir2Lir::AllocPreservedFpReg(int s_reg) {
+  /*
+   * For targets other than Thumb2, it doesn't matter whether we allocate from
+   * the sp_regs_ or dp_regs_ pool.  Some refactoring is in order here.
+   */
+  DCHECK_NE(cu_->instruction_set, kThumb2);
   RegStorage res;
   GrowableArray<RegisterInfo*>::Iterator it(&reg_pool_->sp_regs_);
   for (RegisterInfo* info = it.Next(); info != nullptr; info = it.Next()) {
     if (!info->IsTemp() && !info->InUse()) {
       res = info->GetReg();
-      RecordSinglePromotion(res, s_reg);
+      RecordFpPromotion(res, s_reg);
       break;
     }
   }
   return res;
 }
 
-void Mir2Lir::RecordDoublePromotion(RegStorage reg, int s_reg) {
-  int p_map_idx = SRegToPMap(s_reg);
-  int v_reg = mir_graph_->SRegToVReg(s_reg);
-  GetRegInfo(reg)->MarkInUse();
-  MarkPreservedDouble(v_reg, reg);
-  promotion_map_[p_map_idx].fp_location = kLocPhysReg;
-  promotion_map_[p_map_idx].FpReg = reg.GetReg();
-}
-
-// Reserve a callee-save dp solo register.
+// TODO: this is Thumb2 only.  Remove when DoPromotion refactored.
 RegStorage Mir2Lir::AllocPreservedDouble(int s_reg) {
   RegStorage res;
-  GrowableArray<RegisterInfo*>::Iterator it(&reg_pool_->dp_regs_);
-  for (RegisterInfo* info = it.Next(); info != nullptr; info = it.Next()) {
-    if (!info->IsTemp() && !info->InUse()) {
-      res = info->GetReg();
-      RecordDoublePromotion(res, s_reg);
-      break;
-    }
-  }
+  UNIMPLEMENTED(FATAL) << "Unexpected use of AllocPreservedDouble";
+  return res;
+}
+
+// TODO: this is Thumb2 only.  Remove when DoPromotion refactored.
+RegStorage Mir2Lir::AllocPreservedSingle(int s_reg) {
+  RegStorage res;
+  UNIMPLEMENTED(FATAL) << "Unexpected use of AllocPreservedSingle";
   return res;
 }
 
@@ -736,7 +742,8 @@
     RegisterInfo* info1 = GetRegInfo(reg.GetLow());
     RegisterInfo* info2 = GetRegInfo(reg.GetHigh());
     DCHECK(info1 && info2 && info1->IsWide() && info2->IsWide() &&
-         (info1->Partner() == info2->GetReg()) && (info2->Partner() == info1->GetReg()));
+           (info1->Partner().ExactlyEquals(info2->GetReg())) &&
+           (info2->Partner().ExactlyEquals(info1->GetReg())));
     if ((info1->IsLive() && info1->IsDirty()) || (info2->IsLive() && info2->IsDirty())) {
       if (!(info1->IsTemp() && info2->IsTemp())) {
         /* Should not happen.  If it does, there's a problem in eval_loc */
@@ -872,10 +879,10 @@
     RegisterInfo* info_lo = GetRegInfo(reg.GetLow());
     RegisterInfo* info_hi = GetRegInfo(reg.GetHigh());
     // Unpair any old partners.
-    if (info_lo->IsWide() && info_lo->Partner() != info_hi->GetReg()) {
+    if (info_lo->IsWide() && info_lo->Partner().NotExactlyEquals(info_hi->GetReg())) {
       GetRegInfo(info_lo->Partner())->SetIsWide(false);
     }
-    if (info_hi->IsWide() && info_hi->Partner() != info_lo->GetReg()) {
+    if (info_hi->IsWide() && info_hi->Partner().NotExactlyEquals(info_lo->GetReg())) {
       GetRegInfo(info_hi->Partner())->SetIsWide(false);
     }
     info_lo->SetIsWide(true);
@@ -1039,12 +1046,12 @@
         RegisterInfo* info_hi = GetRegInfo(reg.GetHigh());
         match &= info_lo->IsWide();
         match &= info_hi->IsWide();
-        match &= (info_lo->Partner() == info_hi->GetReg());
-        match &= (info_hi->Partner() == info_lo->GetReg());
+        match &= (info_lo->Partner().ExactlyEquals(info_hi->GetReg()));
+        match &= (info_hi->Partner().ExactlyEquals(info_lo->GetReg()));
       } else {
         RegisterInfo* info = GetRegInfo(reg);
         match &= info->IsWide();
-        match &= (info->GetReg() == info->Partner());
+        match &= (info->GetReg().ExactlyEquals(info->Partner()));
       }
       if (match) {
         loc.location = kLocPhysReg;
@@ -1147,16 +1154,23 @@
     RegLocation loc = mir_graph_->reg_location_[i];
     RefCounts* counts = loc.fp ? fp_counts : core_counts;
     int p_map_idx = SRegToPMap(loc.s_reg_low);
+    int use_count = mir_graph_->GetUseCount(i);
     if (loc.fp) {
       if (loc.wide) {
         // Treat doubles as a unit, using upper half of fp_counts array.
-        counts[p_map_idx + num_regs].count += mir_graph_->GetUseCount(i);
+        counts[p_map_idx + num_regs].count += use_count;
         i++;
       } else {
-        counts[p_map_idx].count += mir_graph_->GetUseCount(i);
+        counts[p_map_idx].count += use_count;
       }
     } else if (!IsInexpensiveConstant(loc)) {
-      counts[p_map_idx].count += mir_graph_->GetUseCount(i);
+      if (loc.wide && cu_->target64) {
+        // Treat long as a unit, using upper half of core_counts array.
+        counts[p_map_idx + num_regs].count += use_count;
+        i++;
+      } else {
+        counts[p_map_idx].count += use_count;
+      }
     }
   }
 }
@@ -1176,10 +1190,10 @@
 void Mir2Lir::DumpCounts(const RefCounts* arr, int size, const char* msg) {
   LOG(INFO) << msg;
   for (int i = 0; i < size; i++) {
-    if ((arr[i].s_reg & STARTING_DOUBLE_SREG) != 0) {
-      LOG(INFO) << "s_reg[D" << (arr[i].s_reg & ~STARTING_DOUBLE_SREG) << "]: " << arr[i].count;
+    if ((arr[i].s_reg & STARTING_WIDE_SREG) != 0) {
+      LOG(INFO) << "s_reg[64_" << (arr[i].s_reg & ~STARTING_WIDE_SREG) << "]: " << arr[i].count;
     } else {
-      LOG(INFO) << "s_reg[" << arr[i].s_reg << "]: " << arr[i].count;
+      LOG(INFO) << "s_reg[32_" << arr[i].s_reg << "]: " << arr[i].count;
     }
   }
 }
@@ -1210,69 +1224,83 @@
    * TUNING: replace with linear scan once we have the ability
    * to describe register live ranges for GC.
    */
+  size_t core_reg_count_size = cu_->target64 ? num_regs * 2 : num_regs;
+  size_t fp_reg_count_size = num_regs * 2;
   RefCounts *core_regs =
-      static_cast<RefCounts*>(arena_->Alloc(sizeof(RefCounts) * num_regs,
+      static_cast<RefCounts*>(arena_->Alloc(sizeof(RefCounts) * core_reg_count_size,
                                             kArenaAllocRegAlloc));
-  RefCounts *FpRegs =
-      static_cast<RefCounts *>(arena_->Alloc(sizeof(RefCounts) * num_regs * 2,
+  RefCounts *fp_regs =
+      static_cast<RefCounts *>(arena_->Alloc(sizeof(RefCounts) * fp_reg_count_size,
                                              kArenaAllocRegAlloc));
   // Set ssa names for original Dalvik registers
   for (int i = 0; i < dalvik_regs; i++) {
-    core_regs[i].s_reg = FpRegs[i].s_reg = i;
+    core_regs[i].s_reg = fp_regs[i].s_reg = i;
   }
 
   // Set ssa names for compiler temporaries
   for (unsigned int ct_idx = 0; ct_idx < mir_graph_->GetNumUsedCompilerTemps(); ct_idx++) {
     CompilerTemp* ct = mir_graph_->GetCompilerTemp(ct_idx);
     core_regs[dalvik_regs + ct_idx].s_reg = ct->s_reg_low;
-    FpRegs[dalvik_regs + ct_idx].s_reg = ct->s_reg_low;
-    FpRegs[num_regs + dalvik_regs + ct_idx].s_reg = ct->s_reg_low;
+    fp_regs[dalvik_regs + ct_idx].s_reg = ct->s_reg_low;
   }
 
-  // Duplicate in upper half to represent possible fp double starting sregs.
-  for (int i = 0; i < num_regs; i++) {
-    FpRegs[num_regs + i].s_reg = FpRegs[i].s_reg | STARTING_DOUBLE_SREG;
+  // Duplicate in upper half to represent possible wide starting sregs.
+  for (size_t i = num_regs; i < fp_reg_count_size; i++) {
+    fp_regs[i].s_reg = fp_regs[i - num_regs].s_reg | STARTING_WIDE_SREG;
+  }
+  for (size_t i = num_regs; i < core_reg_count_size; i++) {
+    core_regs[i].s_reg = core_regs[i - num_regs].s_reg | STARTING_WIDE_SREG;
   }
 
   // Sum use counts of SSA regs by original Dalvik vreg.
-  CountRefs(core_regs, FpRegs, num_regs);
+  CountRefs(core_regs, fp_regs, num_regs);
 
 
   // Sort the count arrays
-  qsort(core_regs, num_regs, sizeof(RefCounts), SortCounts);
-  qsort(FpRegs, num_regs * 2, sizeof(RefCounts), SortCounts);
+  qsort(core_regs, core_reg_count_size, sizeof(RefCounts), SortCounts);
+  qsort(fp_regs, fp_reg_count_size, sizeof(RefCounts), SortCounts);
 
   if (cu_->verbose) {
-    DumpCounts(core_regs, num_regs, "Core regs after sort");
-    DumpCounts(FpRegs, num_regs * 2, "Fp regs after sort");
+    DumpCounts(core_regs, core_reg_count_size, "Core regs after sort");
+    DumpCounts(fp_regs, fp_reg_count_size, "Fp regs after sort");
   }
 
   if (!(cu_->disable_opt & (1 << kPromoteRegs))) {
-    // Promote FpRegs
-    for (int i = 0; (i < (num_regs * 2)) && (FpRegs[i].count >= promotion_threshold); i++) {
-      int p_map_idx = SRegToPMap(FpRegs[i].s_reg & ~STARTING_DOUBLE_SREG);
-      if ((FpRegs[i].s_reg & STARTING_DOUBLE_SREG) != 0) {
-        if ((promotion_map_[p_map_idx].fp_location != kLocPhysReg) &&
-            (promotion_map_[p_map_idx + 1].fp_location != kLocPhysReg)) {
-          int low_sreg = FpRegs[i].s_reg & ~STARTING_DOUBLE_SREG;
-          // Ignore result - if can't alloc double may still be able to alloc singles.
-          AllocPreservedDouble(low_sreg);
+    // Promote fp regs
+    for (size_t i = 0; (i < fp_reg_count_size) && (fp_regs[i].count >= promotion_threshold); i++) {
+      int low_sreg = fp_regs[i].s_reg & ~STARTING_WIDE_SREG;
+      size_t p_map_idx = SRegToPMap(low_sreg);
+      RegStorage reg = RegStorage::InvalidReg();
+      if (promotion_map_[p_map_idx].fp_location != kLocPhysReg) {
+        // TODO: break out the Thumb2-specific code.
+        if (cu_->instruction_set == kThumb2) {
+          bool wide = fp_regs[i].s_reg & STARTING_WIDE_SREG;
+          if (wide) {
+            if (promotion_map_[p_map_idx + 1].fp_location != kLocPhysReg) {
+              // Ignore result - if can't alloc double may still be able to alloc singles.
+              AllocPreservedDouble(low_sreg);
+            }
+            // Continue regardless of success - might still be able to grab a single.
+            continue;
+          } else {
+            reg = AllocPreservedSingle(low_sreg);
+          }
+        } else {
+          reg = AllocPreservedFpReg(low_sreg);
         }
-      } else if (promotion_map_[p_map_idx].fp_location != kLocPhysReg) {
-        RegStorage reg = AllocPreservedSingle(FpRegs[i].s_reg);
         if (!reg.Valid()) {
-          break;  // No more left.
+           break;  // No more left
         }
       }
     }
 
     // Promote core regs
-    for (int i = 0; (i < num_regs) &&
-            (core_regs[i].count >= promotion_threshold); i++) {
-      int p_map_idx = SRegToPMap(core_regs[i].s_reg);
-      if (promotion_map_[p_map_idx].core_location !=
-          kLocPhysReg) {
-        RegStorage reg = AllocPreservedCoreReg(core_regs[i].s_reg);
+    for (size_t i = 0; (i < core_reg_count_size) &&
+         (core_regs[i].count >= promotion_threshold); i++) {
+      int low_sreg = core_regs[i].s_reg & ~STARTING_WIDE_SREG;
+      size_t p_map_idx = SRegToPMap(low_sreg);
+      if (promotion_map_[p_map_idx].core_location != kLocPhysReg) {
+        RegStorage reg = AllocPreservedCoreReg(low_sreg);
         if (!reg.Valid()) {
            break;  // No more left
         }
@@ -1284,51 +1312,35 @@
   for (int i = 0; i < mir_graph_->GetNumSSARegs(); i++) {
     RegLocation *curr = &mir_graph_->reg_location_[i];
     int p_map_idx = SRegToPMap(curr->s_reg_low);
-    if (!curr->wide) {
-      if (curr->fp) {
-        if (promotion_map_[p_map_idx].fp_location == kLocPhysReg) {
-          curr->location = kLocPhysReg;
-          curr->reg = RegStorage::Solo32(promotion_map_[p_map_idx].FpReg);
-          curr->home = true;
-        }
-      } else {
-        if (promotion_map_[p_map_idx].core_location == kLocPhysReg) {
-          curr->location = kLocPhysReg;
-          curr->reg = RegStorage::Solo32(promotion_map_[p_map_idx].core_reg);
-          curr->home = true;
-        }
-      }
-    } else {
-      if (curr->high_word) {
-        continue;
-      }
-      if (curr->fp) {
-        if ((promotion_map_[p_map_idx].fp_location == kLocPhysReg) &&
-            (promotion_map_[p_map_idx+1].fp_location == kLocPhysReg)) {
-          int low_reg = promotion_map_[p_map_idx].FpReg;
-          int high_reg = promotion_map_[p_map_idx+1].FpReg;
-          // Doubles require pair of singles starting at even reg
+    int reg_num = curr->fp ? promotion_map_[p_map_idx].fp_reg : promotion_map_[p_map_idx].core_reg;
+    bool wide = curr->wide || (cu_->target64 && curr->ref && cu_->instruction_set != kX86_64);
+    RegStorage reg = RegStorage::InvalidReg();
+    if (curr->fp && promotion_map_[p_map_idx].fp_location == kLocPhysReg) {
+      if (wide && cu_->instruction_set == kThumb2) {
+        if (promotion_map_[p_map_idx + 1].fp_location == kLocPhysReg) {
+          int high_reg = promotion_map_[p_map_idx+1].fp_reg;
           // TODO: move target-specific restrictions out of here.
-          if (((low_reg & 0x1) == 0) && ((low_reg + 1) == high_reg)) {
-            curr->location = kLocPhysReg;
-            if (cu_->instruction_set == kThumb2) {
-              curr->reg = RegStorage::FloatSolo64(RegStorage::RegNum(low_reg) >> 1);
-            } else {
-              curr->reg = RegStorage(RegStorage::k64BitPair, low_reg, high_reg);
-            }
-            curr->home = true;
+          if (((reg_num & 0x1) == 0) && ((reg_num + 1) == high_reg)) {
+            reg = RegStorage::FloatSolo64(RegStorage::RegNum(reg_num) >> 1);
           }
         }
       } else {
-        if ((promotion_map_[p_map_idx].core_location == kLocPhysReg)
-           && (promotion_map_[p_map_idx+1].core_location ==
-           kLocPhysReg)) {
-          curr->location = kLocPhysReg;
-          curr->reg = RegStorage(RegStorage::k64BitPair, promotion_map_[p_map_idx].core_reg,
-                                 promotion_map_[p_map_idx+1].core_reg);
-          curr->home = true;
-        }
+        reg = wide ? RegStorage::FloatSolo64(reg_num) : RegStorage::FloatSolo32(reg_num);
       }
+    } else if (!curr->fp && promotion_map_[p_map_idx].core_location == kLocPhysReg) {
+      if (wide && !cu_->target64) {
+        if (promotion_map_[p_map_idx + 1].core_location == kLocPhysReg) {
+          int high_reg = promotion_map_[p_map_idx+1].core_reg;
+          reg = RegStorage(RegStorage::k64BitPair, reg_num, high_reg);
+        }
+      } else {
+        reg = wide ? RegStorage::Solo64(reg_num) : RegStorage::Solo32(reg_num);
+      }
+    }
+    if (reg.Valid()) {
+      curr->reg = reg;
+      curr->location = kLocPhysReg;
+      curr->home = true;
     }
   }
   if (cu_->verbose) {