-rw-r--r--  compiler/dex/quick/arm64/arm64_lir.h        |   1
-rw-r--r--  compiler/dex/quick/arm64/assemble_arm64.cc  |   5
-rw-r--r--  compiler/dex/quick/arm64/call_arm64.cc      | 106
-rw-r--r--  compiler/dex/quick/arm64/codegen_arm64.h    |  11
-rw-r--r--  compiler/dex/quick/arm64/int_arm64.cc       | 263
5 files changed, 265 insertions, 121 deletions
diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h
index 3a8ea3f96e..90cb156749 100644
--- a/compiler/dex/quick/arm64/arm64_lir.h
+++ b/compiler/dex/quick/arm64/arm64_lir.h
@@ -331,6 +331,7 @@ enum ArmOpcode {
   kA64Stp4ffXD,      // stp [0s10110100] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64Stp4rrXD,      // stp [s010100100] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64StpPost4rrXD,  // stp [s010100010] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
+  kA64StpPre4ffXD,   // stp [0s10110110] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64StpPre4rrXD,   // stp [s010100110] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64Str3fXD,       // str [1s11110100] imm_12[21-10] rn[9-5] rt[4-0].
   kA64Str4fXxG,      // str [1s111100001] rm[20-16] [011] S[12] [10] rn[9-5] rt[4-0].
diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc
index 462be54e57..5351ce50bb 100644
--- a/compiler/dex/quick/arm64/assemble_arm64.cc
+++ b/compiler/dex/quick/arm64/assemble_arm64.cc
@@ -518,6 +518,10 @@ const ArmEncodingMap Arm64Mir2Lir::EncodingMap[kA64Last] = {
                  kFmtRegR, 4, 0, kFmtRegR, 14, 10, kFmtRegXOrSp, 9, 5,
                  kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_DEF2 | REG_USE012 | IS_STORE,
                  "stp", "!0r, !1r, [!2X], #!3D", kFixupNone),
+    ENCODING_MAP(WIDE(kA64StpPre4ffXD), CUSTOM_VARIANTS(0x2d800000, 0x6d800000),
+                 kFmtRegF, 4, 0, kFmtRegF, 14, 10, kFmtRegXOrSp, 9, 5,
+                 kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_DEF2 | REG_USE012 | IS_STORE,
+                 "stp", "!0r, !1f, [!2X, #!3D]!!", kFixupNone),
     ENCODING_MAP(WIDE(kA64StpPre4rrXD), CUSTOM_VARIANTS(0x29800000, 0xa9800000),
                  kFmtRegR, 4, 0, kFmtRegR, 14, 10, kFmtRegXOrSp, 9, 5,
                  kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_DEF2 | REG_USE012 | IS_STORE,
@@ -723,6 +727,7 @@ uint8_t* Arm64Mir2Lir::EncodeLIRs(uint8_t* write_pos, LIR* lir) {
                 << " @ 0x" << std::hex << lir->dalvik_offset;
             if (kFailOnSizeError) {
               LOG(FATAL) << "Bad argument n. " << i << " of " << encoder->name
+                         << "(" << UNWIDE(encoder->opcode) << ", " << encoder->fmt << ")"
                          << ". Expected " << expected << ", got 0x" << std::hex << operand;
             } else {
               LOG(WARNING) << "Bad argument n. " << i << " of " << encoder->name
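Note (not part of the patch): the new kA64StpPre4ffXD entry combines the skeleton words from CUSTOM_VARIANTS with the operand fields documented in arm64_lir.h (imm_7[21-15], rt2[14-10], rn[9-5], rt[4-0]). The following standalone C++ sketch shows how those pieces form one concrete pre-indexed FP store-pair instruction; the register numbers and offset are made-up example operands.

// Standalone illustration of the kA64StpPre4ffXD field layout (not ART code).
#include <cstdint>
#include <cstdio>

constexpr uint32_t kStpPreFp64Skeleton = 0x6d800000;  // 64-bit variant from CUSTOM_VARIANTS.

// Encode "stp d<rt>, d<rt2>, [x<rn>, #offset]!" with offset a multiple of 8 in [-512, 504].
constexpr uint32_t EncodeStpPreFp64(unsigned rt, unsigned rt2, unsigned rn, int offset) {
  return kStpPreFp64Skeleton |
         ((static_cast<uint32_t>(offset / 8) & 0x7f) << 15) |  // imm_7, scaled by 8 bytes.
         (rt2 << 10) | (rn << 5) | rt;
}

int main() {
  // stp d8, d9, [sp, #-64]!  (sp is encoded as register number 31 here)
  std::printf("0x%08x\n", EncodeStpPreFp64(8, 9, 31, -64));
  return 0;
}

The negative, 8-byte-scaled immediate is what lets a single stp both carve out the spill area and store the first register pair, which is what the new prologue path below relies on.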
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index e584548558..6fa8a4aca5 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -330,19 +330,14 @@ void Arm64Mir2Lir::GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method)
 
   NewLIR0(kPseudoMethodEntry);
 
-  const size_t kStackOverflowReservedUsableBytes = GetStackOverflowReservedBytes(kArm64) -
-      Thread::kStackOverflowSignalReservedBytes;
-  const bool large_frame = static_cast<size_t>(frame_size_) > kStackOverflowReservedUsableBytes;
   const int spill_count = num_core_spills_ + num_fp_spills_;
   const int spill_size = (spill_count * kArm64PointerSize + 15) & ~0xf;  // SP 16 byte alignment.
   const int frame_size_without_spills = frame_size_ - spill_size;
 
   if (!skip_overflow_check) {
     if (!cu_->compiler_driver->GetCompilerOptions().GetImplicitStackOverflowChecks()) {
-      if (!large_frame) {
-        // Load stack limit
-        LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP1);
-      }
+      // Load stack limit
+      LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP1);
     } else {
       // TODO(Arm64) Implement implicit checks.
       // Implicit stack overflow check.
@@ -350,24 +345,21 @@ void Arm64Mir2Lir::GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method)
       // redzone we will get a segmentation fault.
      // Load32Disp(rs_wSP, -Thread::kStackOverflowReservedBytes, rs_wzr);
      // MarkPossibleStackOverflowException();
+      //
+      // TODO: If the frame size is small enough, is it possible to make this a pre-indexed load,
+      //       so that we can avoid the following "sub sp" when spilling?
       LOG(FATAL) << "Implicit stack overflow checks not implemented.";
     }
   }
 
-  if (frame_size_ > 0) {
-    OpRegImm64(kOpSub, rs_sp, spill_size);
+  int spilled_already = 0;
+  if (spill_size > 0) {
+    spilled_already = SpillRegs(rs_sp, core_spill_mask_, fp_spill_mask_, frame_size_);
+    DCHECK(spill_size == spilled_already || frame_size_ == spilled_already);
   }
 
-  /* Need to spill any FP regs? */
-  if (fp_spill_mask_) {
-    int spill_offset = spill_size - kArm64PointerSize*(num_fp_spills_ + num_core_spills_);
-    SpillFPRegs(rs_sp, spill_offset, fp_spill_mask_);
-  }
-
-  /* Spill core callee saves. */
-  if (core_spill_mask_) {
-    int spill_offset = spill_size - kArm64PointerSize*num_core_spills_;
-    SpillCoreRegs(rs_sp, spill_offset, core_spill_mask_);
+  if (spilled_already != frame_size_) {
+    OpRegImm(kOpSub, rs_sp, frame_size_without_spills);
   }
 
   if (!skip_overflow_check) {
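Note (not part of the patch): the prologue bookkeeping in the hunk above can be summarized with a small standalone C++ sketch; the spill counts and frame size below are made-up example values.

// Standalone arithmetic sketch of the new entry-sequence bookkeeping (not ART code).
#include <cassert>

int main() {
  const int kArm64PointerSize = 8;
  const int num_core_spills = 3, num_fp_spills = 2;
  const int frame_size = 96;  // Total frame, already 16-byte aligned (example value).

  const int spill_count = num_core_spills + num_fp_spills;
  const int spill_size = (spill_count * kArm64PointerSize + 15) & ~0xf;  // 40 -> 48.
  const int frame_size_without_spills = frame_size - spill_size;         // 48.
  assert(spill_size == 48 && frame_size_without_spills == 48);

  // SpillRegs() reports how far it already moved SP: either just the spill area (the
  // pre-indexed path) or the whole frame (the "sub sp" path). Only the remainder, if any,
  // still needs an explicit "sub sp".
  const int spilled_already = spill_size;  // Pretend the pre-indexed path was taken.
  if (spilled_already != frame_size) {
    // The real code would emit: sub sp, sp, #frame_size_without_spills
  }
  return 0;
}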
@@ -396,29 +388,9 @@ void Arm64Mir2Lir::GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method)
       const size_t sp_displace_;
     };
 
-    if (large_frame) {
-      // Compare Expected SP against bottom of stack.
-      // Branch to throw target if there is not enough room.
-      OpRegRegImm(kOpSub, rs_xIP1, rs_sp, frame_size_without_spills);
-      LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP0);
-      LIR* branch = OpCmpBranch(kCondUlt, rs_xIP1, rs_xIP0, nullptr);
-      AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, spill_size));
-      OpRegCopy(rs_sp, rs_xIP1);  // Establish stack after checks.
-    } else {
-      /*
-       * If the frame is small enough we are guaranteed to have enough space that remains to
-       * handle signals on the user stack.
-       * Establishes stack before checks.
-       */
-      OpRegRegImm(kOpSub, rs_sp, rs_sp, frame_size_without_spills);
-      LIR* branch = OpCmpBranch(kCondUlt, rs_sp, rs_xIP1, nullptr);
-      AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, frame_size_));
-    }
-  } else {
-    OpRegImm(kOpSub, rs_sp, frame_size_without_spills);
+      LIR* branch = OpCmpBranch(kCondUlt, rs_sp, rs_xIP1, nullptr);
+      AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, frame_size_));
     }
-  } else {
-    OpRegImm(kOpSub, rs_sp, frame_size_without_spills);
   }
 
   FlushIns(ArgLocs, rl_method);
@@ -445,57 +417,7 @@ void Arm64Mir2Lir::GenExitSequence() {
 
   NewLIR0(kPseudoMethodExit);
 
-  // Restore saves and drop stack frame.
-  // 2 versions:
-  //
-  // 1. (Original): Try to address directly, then drop the whole frame.
-  //                Limitation: ldp is a 7b signed immediate. There should have been a DCHECK!
-  //
-  // 2. (New): Drop the non-save-part. Then do similar to original, which is now guaranteed to be
-  //           in range. Then drop the rest.
-  //
-  // TODO: In methods with few spills but huge frame, it would be better to do non-immediate loads
-  //       in variant 1.
-
-  if (frame_size_ <= 504) {
-    // "Magic" constant, 63 (max signed 7b) * 8. Do variant 1.
-    // Could be tighter, as the last load is below frame_size_ offset.
-    if (fp_spill_mask_) {
-      int spill_offset = frame_size_ - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
-      UnSpillFPRegs(rs_sp, spill_offset, fp_spill_mask_);
-    }
-    if (core_spill_mask_) {
-      int spill_offset = frame_size_ - kArm64PointerSize * num_core_spills_;
-      UnSpillCoreRegs(rs_sp, spill_offset, core_spill_mask_);
-    }
-
-    OpRegImm64(kOpAdd, rs_sp, frame_size_);
-  } else {
-    // Second variant. Drop the frame part.
-    int drop = 0;
-    // TODO: Always use the first formula, as num_fp_spills would be zero?
-    if (fp_spill_mask_) {
-      drop = frame_size_ - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
-    } else {
-      drop = frame_size_ - kArm64PointerSize * num_core_spills_;
-    }
-
-    // Drop needs to be 16B aligned, so that SP keeps aligned.
-    drop = RoundDown(drop, 16);
-
-    OpRegImm64(kOpAdd, rs_sp, drop);
-
-    if (fp_spill_mask_) {
-      int offset = frame_size_ - drop - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
-      UnSpillFPRegs(rs_sp, offset, fp_spill_mask_);
-    }
-    if (core_spill_mask_) {
-      int offset = frame_size_ - drop - kArm64PointerSize * num_core_spills_;
-      UnSpillCoreRegs(rs_sp, offset, core_spill_mask_);
-    }
-
-    OpRegImm64(kOpAdd, rs_sp, frame_size_ - drop);
-  }
+  UnspillRegs(rs_sp, core_spill_mask_, fp_spill_mask_, frame_size_);
 
   // Finally return.
   NewLIR0(kA64Ret);
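Note (not part of the patch): the removed "magic" 504 constant, and the kMaxFramesizeForOffset limit that replaces it in UnspillRegs() further down, both come from the signed 7-bit, 8-byte-scaled ldp/stp immediate. A standalone C++ sketch of that arithmetic; the frame size used is a made-up example.

// Standalone sketch of why 63 * 8 = 504 bounds the direct-addressing variant (not ART code).
#include <cassert>

int main() {
  const int kArm64PointerSize = 8;
  const int kMaxSignedImm7 = 63;  // Largest positive value of a signed 7-bit immediate.
  const int kMaxFramesizeForOffset = kMaxSignedImm7 * kArm64PointerSize;
  assert(kMaxFramesizeForOffset == 504);

  // Any frame larger than this needs the second variant: drop part of the frame first, so the
  // remaining unspill offsets fit back into the scaled 7-bit immediate.
  const int frame_size = 1024;  // Example value.
  if (frame_size > kMaxFramesizeForOffset) {
    // The real code would emit: add sp, sp, #early_drop   before the ldp sequence.
  }
  return 0;
}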
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index ac3651942d..bc12c455e7 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -219,11 +219,12 @@ class Arm64Mir2Lir FINAL : public Mir2Lir {
     void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
     void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
 
-    uint32_t GenPairWise(uint32_t reg_mask, int* reg1, int* reg2);
-    void UnSpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask);
-    void SpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask);
-    void UnSpillFPRegs(RegStorage base, int offset, uint32_t reg_mask);
-    void SpillFPRegs(RegStorage base, int offset, uint32_t reg_mask);
+    // Spill core and FP registers. Returns the SP difference: either spill size, or whole
+    // frame size.
+    int SpillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask, int frame_size);
+
+    // Unspill core and FP registers.
+    void UnspillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask, int frame_size);
 
     // Required for target - single operation generators.
     LIR* OpUnconditionalBranch(LIR* target);
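Note (not part of the patch): a minimal standalone mock of the new interface's contract. SpillRegs() returns how far it already moved SP, so the caller only subtracts the remainder. The mock's signature, the 48-byte spill area, and the 96-byte frame are assumptions for illustration only.

// Minimal mock of the SpillRegs() return-value contract (not ART code).
#include <cassert>
#include <cstdint>

struct MockCodegen {
  int sp_moved = 0;
  // Stand-in for Arm64Mir2Lir::SpillRegs(): pretend the spill area is 48 bytes and that the
  // pre-indexed path was used, so only the spill area has been carved out so far.
  int SpillRegs(uint32_t /*core_mask*/, uint32_t /*fp_mask*/, int /*frame_size*/) {
    sp_moved += 48;
    return sp_moved;
  }
  void SubSp(int bytes) { sp_moved += bytes; }  // Stand-in for OpRegImm(kOpSub, rs_sp, ...).
};

int main() {
  MockCodegen cg;
  const int frame_size = 96;
  int spilled_already = cg.SpillRegs(0x180000u, 0x300u, frame_size);
  if (spilled_already != frame_size) {
    cg.SubSp(frame_size - spilled_already);  // frame_size_without_spills in the real code.
  }
  assert(cg.sp_moved == frame_size);
  return 0;
}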
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index f9f85f4223..d8df30fb08 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -22,6 +22,7 @@
 #include "dex/reg_storage_eq.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "mirror/array.h"
+#include "utils.h"
 
 namespace art {
 
@@ -1237,6 +1238,14 @@ void Arm64Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_de
   StoreValueWide(rl_dest, rl_result);
 }
 
+static uint32_t ExtractReg(uint32_t reg_mask, int* reg) {
+  // Find first register.
+  int first_bit_set = CTZ(reg_mask) + 1;
+  *reg = *reg + first_bit_set;
+  reg_mask >>= first_bit_set;
+  return reg_mask;
+}
+
 /**
  * @brief Split a register list in pairs or registers.
  *
@@ -1253,15 +1262,15 @@ void Arm64Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_de
  * }
  * @endcode
  */
-uint32_t Arm64Mir2Lir::GenPairWise(uint32_t reg_mask, int* reg1, int* reg2) {
+static uint32_t GenPairWise(uint32_t reg_mask, int* reg1, int* reg2) {
   // Find first register.
-  int first_bit_set = __builtin_ctz(reg_mask) + 1;
+  int first_bit_set = CTZ(reg_mask) + 1;
   int reg = *reg1 + first_bit_set;
   reg_mask >>= first_bit_set;
 
   if (LIKELY(reg_mask)) {
     // Save the first register, find the second and use the pair opcode.
-    int second_bit_set = __builtin_ctz(reg_mask) + 1;
+    int second_bit_set = CTZ(reg_mask) + 1;
     *reg2 = reg;
     reg_mask >>= second_bit_set;
     *reg1 = reg + second_bit_set;
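Note (not part of the patch): to see how these helpers walk a spill mask, here is a standalone C++ re-implementation mirroring the code above (a sketch, not the ART sources; CTZ and LIKELY come from ART's utils.h and are replaced here). The x19/x20/x22 mask is an arbitrary example.

// Standalone sketch of the pairwise mask decomposition used by the spill/unspill code.
#include <cstdint>
#include <cstdio>

static int CTZ(uint32_t x) { return __builtin_ctz(x); }  // ART gets CTZ() from utils.h.

static uint32_t GenPairWise(uint32_t reg_mask, int* reg1, int* reg2) {
  int first_bit_set = CTZ(reg_mask) + 1;
  int reg = *reg1 + first_bit_set;
  reg_mask >>= first_bit_set;
  if (reg_mask != 0) {
    // Two registers available: report a pair.
    int second_bit_set = CTZ(reg_mask) + 1;
    *reg2 = reg;
    reg_mask >>= second_bit_set;
    *reg1 = reg + second_bit_set;
  } else {
    // Only one register left over: report it as a single.
    *reg1 = reg;
    *reg2 = -1;
  }
  return reg_mask;
}

int main() {
  // Example: core spill mask with x19, x20 and x22 (bits 19, 20, 22 set).
  uint32_t mask = (1u << 19) | (1u << 20) | (1u << 22);
  int reg1 = -1, reg2 = -1;
  while (mask != 0) {
    mask = GenPairWise(mask, &reg1, &reg2);
    if (reg2 < 0) {
      std::printf("single: x%d\n", reg1);             // -> single: x22
    } else {
      std::printf("pair:   x%d, x%d\n", reg2, reg1);  // -> pair:   x19, x20
    }
  }
  return 0;
}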
@@ -1274,68 +1283,274 @@ void Arm64Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_de
   return reg_mask;
 }
 
-void Arm64Mir2Lir::UnSpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask) {
+static void SpillCoreRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
   int reg1 = -1, reg2 = -1;
   const int reg_log2_size = 3;
 
   for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
-    reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
+    reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
     if (UNLIKELY(reg2 < 0)) {
-      NewLIR3(WIDE(kA64Ldr3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR3(WIDE(kA64Str3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     } else {
-      DCHECK_LE(offset, 63);
-      NewLIR4(WIDE(kA64Ldp4rrXD), RegStorage::Solo64(reg2).GetReg(),
-              RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR4(WIDE(kA64Stp4rrXD), RegStorage::Solo64(reg2).GetReg(),
+                   RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     }
   }
 }
 
-void Arm64Mir2Lir::SpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask) {
+// TODO(Arm64): consider using ld1 and st1?
+static void SpillFPRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
   int reg1 = -1, reg2 = -1;
   const int reg_log2_size = 3;
 
   for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
     reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
     if (UNLIKELY(reg2 < 0)) {
-      NewLIR3(WIDE(kA64Str3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR3(FWIDE(kA64Str3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(),
+                   offset);
     } else {
-      NewLIR4(WIDE(kA64Stp4rrXD), RegStorage::Solo64(reg2).GetReg(),
-              RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR4(WIDE(kA64Stp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+                   RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
     }
   }
 }
 
-void Arm64Mir2Lir::UnSpillFPRegs(RegStorage base, int offset, uint32_t reg_mask) {
+static int SpillRegsPreSub(Arm64Mir2Lir* m2l, RegStorage base, uint32_t core_reg_mask,
+                           uint32_t fp_reg_mask, int frame_size) {
+  m2l->OpRegRegImm(kOpSub, rs_sp, rs_sp, frame_size);
+
+  int core_count = POPCOUNT(core_reg_mask);
+
+  if (fp_reg_mask != 0) {
+    // Spill FP regs.
+    int fp_count = POPCOUNT(fp_reg_mask);
+    int spill_offset = frame_size - (core_count + fp_count) * kArm64PointerSize;
+    SpillFPRegs(m2l, rs_sp, spill_offset, fp_reg_mask);
+  }
+
+  if (core_reg_mask != 0) {
+    // Spill core regs.
+    int spill_offset = frame_size - (core_count * kArm64PointerSize);
+    SpillCoreRegs(m2l, rs_sp, spill_offset, core_reg_mask);
+  }
+
+  return frame_size;
+}
+
+static int SpillRegsPreIndexed(Arm64Mir2Lir* m2l, RegStorage base, uint32_t core_reg_mask,
+                               uint32_t fp_reg_mask, int frame_size) {
+  // Otherwise, spill both core and fp regs at the same time.
+  // The very first instruction will be an stp with pre-indexed address, moving the stack pointer
+  // down. From then on, we fill upwards. This will generate overall the same number of instructions
+  // as the specialized code above in most cases (exception being odd number of core and even
+  // non-zero fp spills), but is more flexible, as the offsets are guaranteed small.
+  //
+  // Some demonstrative fill cases : (c) = core, (f) = fp
+  // cc    44   cc    44   cc    22   cc    33   fc => 1[1/2]
+  // fc => 23   fc => 23   ff => 11   ff => 22
+  // ff    11   f     11   f     11
+  //
+  int reg1 = -1, reg2 = -1;
+  int core_count = POPCOUNT(core_reg_mask);
+  int fp_count = POPCOUNT(fp_reg_mask);
+
+  int combined = fp_count + core_count;
+  int all_offset = RoundUp(combined, 2);  // Needs to be 16B = 2-reg aligned.
+
+  int cur_offset = 2;  // What's the starting offset after the first stp? We expect the base slot
+                       // to be filled.
+
+  // First figure out whether the bottom is FP or core.
+  if (fp_count > 0) {
+    // Some FP spills.
+    //
+    // Four cases: (d0 is dummy to fill up stp)
+    // 1) Single FP, even number of core -> stp d0, fp_reg
+    // 2) Single FP, odd number of core -> stp fp_reg, d0
+    // 3) More FP, even number combined -> stp fp_reg1, fp_reg2
+    // 4) More FP, odd number combined -> stp d0, fp_reg
+    if (fp_count == 1) {
+      fp_reg_mask = ExtractReg(fp_reg_mask, &reg1);
+      DCHECK_EQ(fp_reg_mask, 0U);
+      if (core_count % 2 == 0) {
+        m2l->NewLIR4(WIDE(kA64StpPre4ffXD),
+                     RegStorage::FloatSolo64(reg1).GetReg(),
+                     RegStorage::FloatSolo64(reg1).GetReg(),
+                     base.GetReg(), -all_offset);
+      } else {
+        m2l->NewLIR4(WIDE(kA64StpPre4ffXD),
+                     RegStorage::FloatSolo64(reg1).GetReg(),
+                     RegStorage::FloatSolo64(reg1).GetReg(),
+                     base.GetReg(), -all_offset);
+        cur_offset = 0;  // That core reg needs to go into the upper half.
+      }
+    } else {
+      if (combined % 2 == 0) {
+        fp_reg_mask = GenPairWise(fp_reg_mask, &reg1, &reg2);
+        m2l->NewLIR4(WIDE(kA64StpPre4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+                     RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), -all_offset);
+      } else {
+        fp_reg_mask = ExtractReg(fp_reg_mask, &reg1);
+        m2l->NewLIR4(WIDE(kA64StpPre4ffXD), rs_d0.GetReg(), RegStorage::FloatSolo64(reg1).GetReg(),
+                     base.GetReg(), -all_offset);
+      }
+    }
+  } else {
+    // No FP spills.
+    //
+    // Two cases:
+    // 1) Even number of core -> stp core1, core2
+    // 2) Odd number of core -> stp xzr, core1
+    if (core_count % 2 == 1) {
+      core_reg_mask = ExtractReg(core_reg_mask, &reg1);
+      m2l->NewLIR4(WIDE(kA64StpPre4rrXD), rs_xzr.GetReg(),
+                   RegStorage::Solo64(reg1).GetReg(), base.GetReg(), -all_offset);
+    } else {
+      core_reg_mask = GenPairWise(core_reg_mask, &reg1, &reg2);
+      m2l->NewLIR4(WIDE(kA64StpPre4rrXD), RegStorage::Solo64(reg2).GetReg(),
+                   RegStorage::Solo64(reg1).GetReg(), base.GetReg(), -all_offset);
+    }
+  }
+
+  if (fp_count != 0) {
+    for (; fp_reg_mask != 0;) {
+      // Have some FP regs to do.
+      fp_reg_mask = GenPairWise(fp_reg_mask, &reg1, &reg2);
+      if (UNLIKELY(reg2 < 0)) {
+        m2l->NewLIR3(FWIDE(kA64Str3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(),
+                     cur_offset);
+        // Do not increment offset here, as the second half will be filled by a core reg.
+      } else {
+        m2l->NewLIR4(WIDE(kA64Stp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+                     RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), cur_offset);
+        cur_offset += 2;
+      }
+    }
+
+    // Reset counting.
+    reg1 = -1;
+
+    // If there is an odd number of core registers, we need to store the bottom now.
+    if (core_count % 2 == 1) {
+      core_reg_mask = ExtractReg(core_reg_mask, &reg1);
+      m2l->NewLIR3(WIDE(kA64Str3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(),
+                   cur_offset + 1);
+      cur_offset += 2;  // Half-slot filled now.
+    }
+  }
+
+  // Spill the rest of the core regs. They are guaranteed to be even.
+  DCHECK_EQ(POPCOUNT(core_reg_mask) % 2, 0);
+  for (; core_reg_mask != 0; cur_offset += 2) {
+    core_reg_mask = GenPairWise(core_reg_mask, &reg1, &reg2);
+    m2l->NewLIR4(WIDE(kA64Stp4rrXD), RegStorage::Solo64(reg2).GetReg(),
+                 RegStorage::Solo64(reg1).GetReg(), base.GetReg(), cur_offset);
+  }
+
+  DCHECK_EQ(cur_offset, all_offset);
+
+  return all_offset * 8;
+}
+
+int Arm64Mir2Lir::SpillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask,
+                            int frame_size) {
+  // If the frame size is small enough that all offsets would fit into the immediates, use that
+  // setup, as it decrements sp early (kind of instruction scheduling), and is not worse
+  // instruction-count wise than the complicated code below.
+  //
+  // This case is also optimal when we have an odd number of core spills, and an even (non-zero)
+  // number of fp spills.
+  if ((RoundUp(frame_size, 8) / 8 <= 63)) {
+    return SpillRegsPreSub(this, base, core_reg_mask, fp_reg_mask, frame_size);
+  } else {
+    return SpillRegsPreIndexed(this, base, core_reg_mask, fp_reg_mask, frame_size);
+  }
+}
+
+static void UnSpillCoreRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
   int reg1 = -1, reg2 = -1;
   const int reg_log2_size = 3;
 
   for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
-    reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
+    reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
     if (UNLIKELY(reg2 < 0)) {
-      NewLIR3(FWIDE(kA64Ldr3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR3(WIDE(kA64Ldr3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     } else {
-      NewLIR4(WIDE(kA64Ldp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
-              RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+      DCHECK_LE(offset, 63);
+      m2l->NewLIR4(WIDE(kA64Ldp4rrXD), RegStorage::Solo64(reg2).GetReg(),
+                   RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     }
   }
 }
 
-// TODO(Arm64): consider using ld1 and st1?
-void Arm64Mir2Lir::SpillFPRegs(RegStorage base, int offset, uint32_t reg_mask) {
+static void UnSpillFPRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
   int reg1 = -1, reg2 = -1;
   const int reg_log2_size = 3;
 
   for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
-    reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
+    reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
     if (UNLIKELY(reg2 < 0)) {
-      NewLIR3(FWIDE(kA64Str3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR3(FWIDE(kA64Ldr3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(),
+                   offset);
     } else {
-      NewLIR4(WIDE(kA64Stp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
-              RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR4(WIDE(kA64Ldp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+                   RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
     }
   }
 }
 
+void Arm64Mir2Lir::UnspillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask,
+                               int frame_size) {
+  // Restore saves and drop stack frame.
+  // 2 versions:
+  //
+  // 1. (Original): Try to address directly, then drop the whole frame.
+  //                Limitation: ldp is a 7b signed immediate.
+  //
+  // 2. (New): Drop the non-save-part. Then do similar to original, which is now guaranteed to be
+  //           in range. Then drop the rest.
+  //
+  // TODO: In methods with few spills but huge frame, it would be better to do non-immediate loads
+  //       in variant 1.
+
+  // "Magic" constant, 63 (max signed 7b) * 8.
+  static constexpr int kMaxFramesizeForOffset = 63 * kArm64PointerSize;
+
+  const int num_core_spills = POPCOUNT(core_reg_mask);
+  const int num_fp_spills = POPCOUNT(fp_reg_mask);
+
+  int early_drop = 0;
+
+  if (frame_size > kMaxFramesizeForOffset) {
+    // Second variant. Drop the frame part.
+
+    // TODO: Always use the first formula, as num_fp_spills would be zero?
+    if (fp_reg_mask != 0) {
+      early_drop = frame_size - kArm64PointerSize * (num_fp_spills + num_core_spills);
+    } else {
+      early_drop = frame_size - kArm64PointerSize * num_core_spills;
+    }
+
+    // Drop needs to be 16B aligned, so that SP keeps aligned.
+    early_drop = RoundDown(early_drop, 16);
+
+    OpRegImm64(kOpAdd, rs_sp, early_drop);
+  }
+
+  // Unspill.
+  if (fp_reg_mask != 0) {
+    int offset = frame_size - early_drop - kArm64PointerSize * (num_fp_spills + num_core_spills);
+    UnSpillFPRegs(this, rs_sp, offset, fp_reg_mask);
+  }
+  if (core_reg_mask != 0) {
+    int offset = frame_size - early_drop - kArm64PointerSize * num_core_spills;
+    UnSpillCoreRegs(this, rs_sp, offset, core_reg_mask);
+  }
+
+  // Drop the (rest of) the frame.
+  OpRegImm64(kOpAdd, rs_sp, frame_size - early_drop);
+}
+
 bool Arm64Mir2Lir::GenInlinedReverseBits(CallInfo* info, OpSize size) {
   ArmOpcode wide = (size == k64) ? WIDE(0) : UNWIDE(0);
   RegLocation rl_src_i = info->args[0];
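Note (not part of the patch): a worked example of the pre-indexed spill path. Assume a hypothetical method that saves x19, x20, d8 and d9 and has a frame larger than 504 bytes, so SpillRegs() picks SpillRegsPreIndexed(). The helper functions below are local stand-ins for ART's utils.h macros; the expected instructions in the comment follow the algorithm above.

// Standalone worked example of the pre-indexed spill layout (not ART code).
#include <cassert>

static int POPCOUNT(unsigned x) { return __builtin_popcount(x); }  // utils.h stand-in.
static int RoundUp(int x, int n) { return (x + n - 1) / n * n; }    // utils.h stand-in.

int main() {
  const unsigned core_mask = (1u << 19) | (1u << 20);  // x19, x20
  const unsigned fp_mask = (1u << 8) | (1u << 9);      // d8, d9

  const int combined = POPCOUNT(core_mask) + POPCOUNT(fp_mask);  // 4 registers.
  const int all_offset = RoundUp(combined, 2);  // 4 slots of 8 bytes; keeps SP 16B aligned.
  assert(all_offset == 4);

  // Expected code, filling upwards from the new SP:
  //   stp d8, d9,   [sp, #-32]!   // all_offset * 8 == 32; fills slots 0 and 1.
  //   stp x19, x20, [sp, #16]     // fills slots 2 and 3.
  // SpillRegs() then reports 32 bytes as already subtracted from SP, and GenEntrySequence
  // subtracts the rest of the frame with a separate "sub sp".
  assert(all_offset * 8 == 32);
  return 0;
}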