ART: Rework ARM64 entry sequence

Try to fold away one of the two SP adjustments in the ARM64 entry
sequence. When the frame size is small, generate a single sub over the
full frame size and adjust the spill offsets accordingly. If the frame
size is too large, use a pre-indexed store and fill upwards from there.
SpillRegs() reports how many bytes it dropped from SP, so the caller
only emits the remaining sub when the fold did not cover the whole
frame. The exit sequence likewise collapses into a single UnspillRegs()
call.
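
For illustration (registers and sizes are made up for the example), a
small frame now needs only one SP adjustment:

    sub  sp, sp, #64             // whole frame in one sub
    stp  x19, x20, [sp, #48]     // spill offsets adjusted upwards

while a large frame folds its first adjustment into a pre-indexed
store and fills upwards from there before dropping the rest:

    stp  x19, x20, [sp, #-32]!   // pre-indexed store: spill and adjust SP
    stp  x21, x22, [sp, #16]     // fill upwards from there
    sub  sp, sp, #<remaining frame size>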

Change-Id: I1c15ac6276fb62b8164372de02fd92437f605938
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index e584548..6fa8a4a 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -330,19 +330,14 @@
 
   NewLIR0(kPseudoMethodEntry);
 
-  const size_t kStackOverflowReservedUsableBytes = GetStackOverflowReservedBytes(kArm64) -
-      Thread::kStackOverflowSignalReservedBytes;
-  const bool large_frame = static_cast<size_t>(frame_size_) > kStackOverflowReservedUsableBytes;
   const int spill_count = num_core_spills_ + num_fp_spills_;
   const int spill_size = (spill_count * kArm64PointerSize + 15) & ~0xf;  // SP 16 byte alignment.
   const int frame_size_without_spills = frame_size_ - spill_size;
 
   if (!skip_overflow_check) {
     if (!cu_->compiler_driver->GetCompilerOptions().GetImplicitStackOverflowChecks()) {
-      if (!large_frame) {
-        // Load stack limit
-        LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP1);
-      }
+      // Load stack limit
+      LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP1);
     } else {
       // TODO(Arm64) Implement implicit checks.
       // Implicit stack overflow check.
@@ -350,24 +345,24 @@
       // redzone we will get a segmentation fault.
       // Load32Disp(rs_wSP, -Thread::kStackOverflowReservedBytes, rs_wzr);
       // MarkPossibleStackOverflowException();
+      //
+      // TODO: If the frame size is small enough, is it possible to make this a pre-indexed load,
+      //       so that we can avoid the following "sub sp" when spilling?
       LOG(FATAL) << "Implicit stack overflow checks not implemented.";
     }
   }
 
-  if (frame_size_ > 0) {
-    OpRegImm64(kOpSub, rs_sp, spill_size);
+  int spilled_already = 0;
+  if (spill_size > 0) {
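+    // SpillRegs() returns the number of bytes it subtracted from SP: just the
+    // spill area, or the whole frame when the sub could be folded in.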
+    spilled_already = SpillRegs(rs_sp, core_spill_mask_, fp_spill_mask_, frame_size_);
+    DCHECK(spill_size == spilled_already || frame_size_ == spilled_already);
   }
 
-  /* Need to spill any FP regs? */
-  if (fp_spill_mask_) {
-    int spill_offset = spill_size - kArm64PointerSize*(num_fp_spills_ + num_core_spills_);
-    SpillFPRegs(rs_sp, spill_offset, fp_spill_mask_);
-  }
-
-  /* Spill core callee saves. */
-  if (core_spill_mask_) {
-    int spill_offset = spill_size - kArm64PointerSize*num_core_spills_;
-    SpillCoreRegs(rs_sp, spill_offset, core_spill_mask_);
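+  // Drop the part of the frame that SpillRegs() did not already cover.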
+  if (spilled_already != frame_size_) {
+    OpRegImm(kOpSub, rs_sp, frame_size_without_spills);
   }
 
   if (!skip_overflow_check) {
@@ -396,29 +391,11 @@
         const size_t sp_displace_;
       };
 
-      if (large_frame) {
-        // Compare Expected SP against bottom of stack.
-        // Branch to throw target if there is not enough room.
-        OpRegRegImm(kOpSub, rs_xIP1, rs_sp, frame_size_without_spills);
-        LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP0);
-        LIR* branch = OpCmpBranch(kCondUlt, rs_xIP1, rs_xIP0, nullptr);
-        AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, spill_size));
-        OpRegCopy(rs_sp, rs_xIP1);  // Establish stack after checks.
-      } else {
-        /*
-         * If the frame is small enough we are guaranteed to have enough space that remains to
-         * handle signals on the user stack.
-         * Establishes stack before checks.
-         */
-        OpRegRegImm(kOpSub, rs_sp, rs_sp, frame_size_without_spills);
-        LIR* branch = OpCmpBranch(kCondUlt, rs_sp, rs_xIP1, nullptr);
-        AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, frame_size_));
-      }
-    } else {
-      OpRegImm(kOpSub, rs_sp, frame_size_without_spills);
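+      // The frame is fully established at this point, so compare SP directly
+      // against the stack limit loaded into xIP1 above.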
+      LIR* branch = OpCmpBranch(kCondUlt, rs_sp, rs_xIP1, nullptr);
+      AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, frame_size_));
     }
-  } else {
-    OpRegImm(kOpSub, rs_sp, frame_size_without_spills);
   }
 
   FlushIns(ArgLocs, rl_method);
@@ -445,57 +422,8 @@
 
   NewLIR0(kPseudoMethodExit);
 
-  // Restore saves and drop stack frame.
-  // 2 versions:
-  //
-  // 1. (Original): Try to address directly, then drop the whole frame.
-  //                Limitation: ldp is a 7b signed immediate. There should have been a DCHECK!
-  //
-  // 2. (New): Drop the non-save-part. Then do similar to original, which is now guaranteed to be
-  //           in range. Then drop the rest.
-  //
-  // TODO: In methods with few spills but huge frame, it would be better to do non-immediate loads
-  //       in variant 1.
-
-  if (frame_size_ <= 504) {
-    // "Magic" constant, 63 (max signed 7b) * 8. Do variant 1.
-    // Could be tighter, as the last load is below frame_size_ offset.
-    if (fp_spill_mask_) {
-      int spill_offset = frame_size_ - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
-      UnSpillFPRegs(rs_sp, spill_offset, fp_spill_mask_);
-    }
-    if (core_spill_mask_) {
-      int spill_offset = frame_size_ - kArm64PointerSize * num_core_spills_;
-      UnSpillCoreRegs(rs_sp, spill_offset, core_spill_mask_);
-    }
-
-    OpRegImm64(kOpAdd, rs_sp, frame_size_);
-  } else {
-    // Second variant. Drop the frame part.
-    int drop = 0;
-    // TODO: Always use the first formula, as num_fp_spills would be zero?
-    if (fp_spill_mask_) {
-      drop = frame_size_ - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
-    } else {
-      drop = frame_size_ - kArm64PointerSize * num_core_spills_;
-    }
-
-    // Drop needs to be 16B aligned, so that SP keeps aligned.
-    drop = RoundDown(drop, 16);
-
-    OpRegImm64(kOpAdd, rs_sp, drop);
-
-    if (fp_spill_mask_) {
-      int offset = frame_size_ - drop - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
-      UnSpillFPRegs(rs_sp, offset, fp_spill_mask_);
-    }
-    if (core_spill_mask_) {
-      int offset = frame_size_ - drop - kArm64PointerSize * num_core_spills_;
-      UnSpillCoreRegs(rs_sp, offset, core_spill_mask_);
-    }
-
-    OpRegImm64(kOpAdd, rs_sp, frame_size_ - drop);
-  }
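+  // Restore the callee-saves and drop the whole stack frame in one helper call.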
+  UnspillRegs(rs_sp, core_spill_mask_, fp_spill_mask_, frame_size_);
 
   // Finally return.
   NewLIR0(kA64Ret);