ART: Rework ARM64 entry sequence
Try to fold one sub of SP in the ARM64 entry sequence. When the
frame size is small, generate a single sub over the full frame size
and adjust the spill offsets accordingly. If the frame size is too
large, use a pre-indexed store and fill upwards from there.
Change-Id: I1c15ac6276fb62b8164372de02fd92437f605938
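
A minimal standalone sketch of how the entry strategy is now chosen (not ART
code; kPointerSize, kMaxStpSlots and UsePreSub are illustrative names) -- it
mirrors the RoundUp(frame_size, 8) / 8 <= 63 check that SpillRegs uses below:

  #include <cstdio>

  constexpr int kPointerSize = 8;   // kArm64PointerSize
  constexpr int kMaxStpSlots = 63;  // largest positive scaled imm7 an stp/ldp can address

  // True: drop the whole frame with one sub and store spills at small positive offsets.
  // False: start with "stp ..., [sp, #-spill_size]!", fill upwards, then sub the rest.
  bool UsePreSub(int frame_size) {
    return (frame_size + kPointerSize - 1) / kPointerSize <= kMaxStpSlots;
  }

  int main() {
    for (int frame_size : {96, 496, 504, 512, 4096}) {
      std::printf("frame_size=%4d -> %s\n", frame_size,
                  UsePreSub(frame_size) ? "single sub, direct offsets"
                                        : "pre-indexed stp + second sub");
    }
    return 0;
  }

With this split, frames up to 504 bytes (the old "magic" constant, 63 * 8) keep
the direct-offset form; anything larger takes the pre-indexed path.
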
diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h
index 3a8ea3f..90cb156 100644
--- a/compiler/dex/quick/arm64/arm64_lir.h
+++ b/compiler/dex/quick/arm64/arm64_lir.h
@@ -331,6 +331,7 @@
kA64Stp4ffXD, // stp [0s10110100] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
kA64Stp4rrXD, // stp [s010100100] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
kA64StpPost4rrXD, // stp [s010100010] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
+ kA64StpPre4ffXD, // stp [0s10110110] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
kA64StpPre4rrXD, // stp [s010100110] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
kA64Str3fXD, // str [1s11110100] imm_12[21-10] rn[9-5] rt[4-0].
kA64Str4fXxG, // str [1s111100001] rm[20-16] [011] S[12] [10] rn[9-5] rt[4-0].
diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc
index 462be54..5351ce5 100644
--- a/compiler/dex/quick/arm64/assemble_arm64.cc
+++ b/compiler/dex/quick/arm64/assemble_arm64.cc
@@ -518,6 +518,10 @@
kFmtRegR, 4, 0, kFmtRegR, 14, 10, kFmtRegXOrSp, 9, 5,
kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_DEF2 | REG_USE012 | IS_STORE,
"stp", "!0r, !1r, [!2X], #!3D", kFixupNone),
+ ENCODING_MAP(WIDE(kA64StpPre4ffXD), CUSTOM_VARIANTS(0x2d800000, 0x6d800000),
+ kFmtRegF, 4, 0, kFmtRegF, 14, 10, kFmtRegXOrSp, 9, 5,
+ kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_DEF2 | REG_USE012 | IS_STORE,
+ "stp", "!0r, !1f, [!2X, #!3D]!!", kFixupNone),
ENCODING_MAP(WIDE(kA64StpPre4rrXD), CUSTOM_VARIANTS(0x29800000, 0xa9800000),
kFmtRegR, 4, 0, kFmtRegR, 14, 10, kFmtRegXOrSp, 9, 5,
kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_DEF2 | REG_USE012 | IS_STORE,
@@ -723,6 +727,7 @@
<< " @ 0x" << std::hex << lir->dalvik_offset;
if (kFailOnSizeError) {
LOG(FATAL) << "Bad argument n. " << i << " of " << encoder->name
+ << "(" << UNWIDE(encoder->opcode) << ", " << encoder->fmt << ")"
<< ". Expected " << expected << ", got 0x" << std::hex << operand;
} else {
LOG(WARNING) << "Bad argument n. " << i << " of " << encoder->name
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index e584548..6fa8a4a 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -330,19 +330,14 @@
NewLIR0(kPseudoMethodEntry);
- const size_t kStackOverflowReservedUsableBytes = GetStackOverflowReservedBytes(kArm64) -
- Thread::kStackOverflowSignalReservedBytes;
- const bool large_frame = static_cast<size_t>(frame_size_) > kStackOverflowReservedUsableBytes;
const int spill_count = num_core_spills_ + num_fp_spills_;
const int spill_size = (spill_count * kArm64PointerSize + 15) & ~0xf; // SP 16 byte alignment.
const int frame_size_without_spills = frame_size_ - spill_size;
if (!skip_overflow_check) {
if (!cu_->compiler_driver->GetCompilerOptions().GetImplicitStackOverflowChecks()) {
- if (!large_frame) {
- // Load stack limit
- LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP1);
- }
+ // Load stack limit
+ LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP1);
} else {
// TODO(Arm64) Implement implicit checks.
// Implicit stack overflow check.
@@ -350,24 +345,21 @@
// redzone we will get a segmentation fault.
// Load32Disp(rs_wSP, -Thread::kStackOverflowReservedBytes, rs_wzr);
// MarkPossibleStackOverflowException();
+ //
+ // TODO: If the frame size is small enough, is it possible to make this a pre-indexed load,
+ // so that we can avoid the following "sub sp" when spilling?
LOG(FATAL) << "Implicit stack overflow checks not implemented.";
}
}
- if (frame_size_ > 0) {
- OpRegImm64(kOpSub, rs_sp, spill_size);
+ int spilled_already = 0;
+ if (spill_size > 0) {
+ spilled_already = SpillRegs(rs_sp, core_spill_mask_, fp_spill_mask_, frame_size_);
+ DCHECK(spill_size == spilled_already || frame_size_ == spilled_already);
}
- /* Need to spill any FP regs? */
- if (fp_spill_mask_) {
- int spill_offset = spill_size - kArm64PointerSize*(num_fp_spills_ + num_core_spills_);
- SpillFPRegs(rs_sp, spill_offset, fp_spill_mask_);
- }
-
- /* Spill core callee saves. */
- if (core_spill_mask_) {
- int spill_offset = spill_size - kArm64PointerSize*num_core_spills_;
- SpillCoreRegs(rs_sp, spill_offset, core_spill_mask_);
+ if (spilled_already != frame_size_) {
+ OpRegImm(kOpSub, rs_sp, frame_size_without_spills);
}
if (!skip_overflow_check) {
@@ -396,29 +388,9 @@
const size_t sp_displace_;
};
- if (large_frame) {
- // Compare Expected SP against bottom of stack.
- // Branch to throw target if there is not enough room.
- OpRegRegImm(kOpSub, rs_xIP1, rs_sp, frame_size_without_spills);
- LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP0);
- LIR* branch = OpCmpBranch(kCondUlt, rs_xIP1, rs_xIP0, nullptr);
- AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, spill_size));
- OpRegCopy(rs_sp, rs_xIP1); // Establish stack after checks.
- } else {
- /*
- * If the frame is small enough we are guaranteed to have enough space that remains to
- * handle signals on the user stack.
- * Establishes stack before checks.
- */
- OpRegRegImm(kOpSub, rs_sp, rs_sp, frame_size_without_spills);
- LIR* branch = OpCmpBranch(kCondUlt, rs_sp, rs_xIP1, nullptr);
- AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, frame_size_));
- }
- } else {
- OpRegImm(kOpSub, rs_sp, frame_size_without_spills);
+ LIR* branch = OpCmpBranch(kCondUlt, rs_sp, rs_xIP1, nullptr);
+ AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, frame_size_));
}
- } else {
- OpRegImm(kOpSub, rs_sp, frame_size_without_spills);
}
FlushIns(ArgLocs, rl_method);
@@ -445,57 +417,7 @@
NewLIR0(kPseudoMethodExit);
- // Restore saves and drop stack frame.
- // 2 versions:
- //
- // 1. (Original): Try to address directly, then drop the whole frame.
- // Limitation: ldp is a 7b signed immediate. There should have been a DCHECK!
- //
- // 2. (New): Drop the non-save-part. Then do similar to original, which is now guaranteed to be
- // in range. Then drop the rest.
- //
- // TODO: In methods with few spills but huge frame, it would be better to do non-immediate loads
- // in variant 1.
-
- if (frame_size_ <= 504) {
- // "Magic" constant, 63 (max signed 7b) * 8. Do variant 1.
- // Could be tighter, as the last load is below frame_size_ offset.
- if (fp_spill_mask_) {
- int spill_offset = frame_size_ - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
- UnSpillFPRegs(rs_sp, spill_offset, fp_spill_mask_);
- }
- if (core_spill_mask_) {
- int spill_offset = frame_size_ - kArm64PointerSize * num_core_spills_;
- UnSpillCoreRegs(rs_sp, spill_offset, core_spill_mask_);
- }
-
- OpRegImm64(kOpAdd, rs_sp, frame_size_);
- } else {
- // Second variant. Drop the frame part.
- int drop = 0;
- // TODO: Always use the first formula, as num_fp_spills would be zero?
- if (fp_spill_mask_) {
- drop = frame_size_ - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
- } else {
- drop = frame_size_ - kArm64PointerSize * num_core_spills_;
- }
-
- // Drop needs to be 16B aligned, so that SP keeps aligned.
- drop = RoundDown(drop, 16);
-
- OpRegImm64(kOpAdd, rs_sp, drop);
-
- if (fp_spill_mask_) {
- int offset = frame_size_ - drop - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
- UnSpillFPRegs(rs_sp, offset, fp_spill_mask_);
- }
- if (core_spill_mask_) {
- int offset = frame_size_ - drop - kArm64PointerSize * num_core_spills_;
- UnSpillCoreRegs(rs_sp, offset, core_spill_mask_);
- }
-
- OpRegImm64(kOpAdd, rs_sp, frame_size_ - drop);
- }
+ UnspillRegs(rs_sp, core_spill_mask_, fp_spill_mask_, frame_size_);
// Finally return.
NewLIR0(kA64Ret);
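
A worked sketch of the teardown arithmetic now done by UnspillRegs (added in
int_arm64.cc below). It always folds the fp count into the early drop, which
the TODO in that function suggests is equivalent; SketchExit and
kMaxFrameForOffset are illustrative names, not ART code:

  #include <cstdio>

  constexpr int kPointerSize = 8;                        // kArm64PointerSize
  constexpr int kMaxFrameForOffset = 63 * kPointerSize;  // ldp scaled imm7 limit

  void SketchExit(int frame_size, int core_spills, int fp_spills) {
    int early_drop = 0;
    if (frame_size > kMaxFrameForOffset) {
      early_drop = frame_size - kPointerSize * (core_spills + fp_spills);
      early_drop &= ~15;  // RoundDown(early_drop, 16) so SP stays 16-byte aligned
      std::printf("add sp, sp, #%d\n", early_drop);
    }
    if (fp_spills > 0) {
      std::printf("unspill %d fp regs from [sp, #%d]\n", fp_spills,
                  frame_size - early_drop - kPointerSize * (core_spills + fp_spills));
    }
    if (core_spills > 0) {
      std::printf("unspill %d core regs from [sp, #%d]\n", core_spills,
                  frame_size - early_drop - kPointerSize * core_spills);
    }
    std::printf("add sp, sp, #%d\n\n", frame_size - early_drop);
  }

  int main() {
    SketchExit(96, 3, 0);    // small frame: no early drop, direct offsets
    SketchExit(2048, 5, 2);  // large frame: early drop keeps the ldp offsets in range
    return 0;
  }
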
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index ac36519..bc12c45 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -219,11 +219,12 @@
void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
- uint32_t GenPairWise(uint32_t reg_mask, int* reg1, int* reg2);
- void UnSpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask);
- void SpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask);
- void UnSpillFPRegs(RegStorage base, int offset, uint32_t reg_mask);
- void SpillFPRegs(RegStorage base, int offset, uint32_t reg_mask);
+ // Spill core and FP registers. Returns the SP difference: either spill size, or whole
+ // frame size.
+ int SpillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask, int frame_size);
+
+ // Unspill core and FP registers.
+ void UnspillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask, int frame_size);
// Required for target - single operation generators.
LIR* OpUnconditionalBranch(LIR* target);
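
The SpillRegs/UnspillRegs helpers declared here walk the register masks
pair-wise. Below is a standalone re-statement with a usage demo, not the ART
sources; the single-register tail of GenPairWise is reconstructed from how the
callers test reg2 < 0:

  #include <cstdint>
  #include <cstdio>

  // Pair-wise walk over a register mask, as used by the new spill/unspill helpers.
  static uint32_t GenPairWise(uint32_t reg_mask, int* reg1, int* reg2) {
    int first_bit_set = __builtin_ctz(reg_mask) + 1;  // stand-in for CTZ()
    int reg = *reg1 + first_bit_set;
    reg_mask >>= first_bit_set;
    if (reg_mask != 0) {
      int second_bit_set = __builtin_ctz(reg_mask) + 1;
      *reg2 = reg;
      reg_mask >>= second_bit_set;
      *reg1 = reg + second_bit_set;
    } else {
      *reg1 = reg;
      *reg2 = -1;  // odd register count: callers emit a single str/ldr for reg1
    }
    return reg_mask;
  }

  int main() {
    uint32_t mask = (1u << 19) | (1u << 20) | (1u << 22) | (1u << 29) | (1u << 30);
    int reg1 = -1, reg2 = -1;
    while (mask != 0) {
      mask = GenPairWise(mask, &reg1, &reg2);
      if (reg2 < 0) {
        std::printf("str x%d\n", reg1);
      } else {
        std::printf("stp x%d, x%d\n", reg2, reg1);  // reg2 lands in the lower slot
      }
    }
    return 0;  // prints: stp x19, x20 / stp x22, x29 / str x30
  }
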
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index f9f85f4..d8df30f 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -22,6 +22,7 @@
#include "dex/reg_storage_eq.h"
#include "entrypoints/quick/quick_entrypoints.h"
#include "mirror/array.h"
+#include "utils.h"
namespace art {
@@ -1237,6 +1238,14 @@
StoreValueWide(rl_dest, rl_result);
}
+static uint32_t ExtractReg(uint32_t reg_mask, int* reg) {
+ // Find first register.
+ int first_bit_set = CTZ(reg_mask) + 1;
+ *reg = *reg + first_bit_set;
+ reg_mask >>= first_bit_set;
+ return reg_mask;
+}
+
/**
* @brief Split a register list in pairs or registers.
*
@@ -1253,15 +1262,15 @@
* }
* @endcode
*/
-uint32_t Arm64Mir2Lir::GenPairWise(uint32_t reg_mask, int* reg1, int* reg2) {
+static uint32_t GenPairWise(uint32_t reg_mask, int* reg1, int* reg2) {
// Find first register.
- int first_bit_set = __builtin_ctz(reg_mask) + 1;
+ int first_bit_set = CTZ(reg_mask) + 1;
int reg = *reg1 + first_bit_set;
reg_mask >>= first_bit_set;
if (LIKELY(reg_mask)) {
// Save the first register, find the second and use the pair opcode.
- int second_bit_set = __builtin_ctz(reg_mask) + 1;
+ int second_bit_set = CTZ(reg_mask) + 1;
*reg2 = reg;
reg_mask >>= second_bit_set;
*reg1 = reg + second_bit_set;
@@ -1274,68 +1283,274 @@
return reg_mask;
}
-void Arm64Mir2Lir::UnSpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask) {
- int reg1 = -1, reg2 = -1;
- const int reg_log2_size = 3;
-
- for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
- reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
- if (UNLIKELY(reg2 < 0)) {
- NewLIR3(WIDE(kA64Ldr3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
- } else {
- DCHECK_LE(offset, 63);
- NewLIR4(WIDE(kA64Ldp4rrXD), RegStorage::Solo64(reg2).GetReg(),
- RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
- }
- }
-}
-
-void Arm64Mir2Lir::SpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask) {
+static void SpillCoreRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
int reg1 = -1, reg2 = -1;
const int reg_log2_size = 3;
for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
if (UNLIKELY(reg2 < 0)) {
- NewLIR3(WIDE(kA64Str3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+ m2l->NewLIR3(WIDE(kA64Str3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
} else {
- NewLIR4(WIDE(kA64Stp4rrXD), RegStorage::Solo64(reg2).GetReg(),
- RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
- }
- }
-}
-
-void Arm64Mir2Lir::UnSpillFPRegs(RegStorage base, int offset, uint32_t reg_mask) {
- int reg1 = -1, reg2 = -1;
- const int reg_log2_size = 3;
-
- for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
- reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
- if (UNLIKELY(reg2 < 0)) {
- NewLIR3(FWIDE(kA64Ldr3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
- } else {
- NewLIR4(WIDE(kA64Ldp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
- RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+ m2l->NewLIR4(WIDE(kA64Stp4rrXD), RegStorage::Solo64(reg2).GetReg(),
+ RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
}
}
}
// TODO(Arm64): consider using ld1 and st1?
-void Arm64Mir2Lir::SpillFPRegs(RegStorage base, int offset, uint32_t reg_mask) {
+static void SpillFPRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
int reg1 = -1, reg2 = -1;
const int reg_log2_size = 3;
for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
if (UNLIKELY(reg2 < 0)) {
- NewLIR3(FWIDE(kA64Str3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+ m2l->NewLIR3(FWIDE(kA64Str3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(),
+ offset);
} else {
- NewLIR4(WIDE(kA64Stp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
- RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+ m2l->NewLIR4(WIDE(kA64Stp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+ RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
}
}
}
+static int SpillRegsPreSub(Arm64Mir2Lir* m2l, RegStorage base, uint32_t core_reg_mask,
+ uint32_t fp_reg_mask, int frame_size) {
+ m2l->OpRegRegImm(kOpSub, rs_sp, rs_sp, frame_size);
+
+ int core_count = POPCOUNT(core_reg_mask);
+
+ if (fp_reg_mask != 0) {
+ // Spill FP regs.
+ int fp_count = POPCOUNT(fp_reg_mask);
+ int spill_offset = frame_size - (core_count + fp_count) * kArm64PointerSize;
+ SpillFPRegs(m2l, rs_sp, spill_offset, fp_reg_mask);
+ }
+
+ if (core_reg_mask != 0) {
+ // Spill core regs.
+ int spill_offset = frame_size - (core_count * kArm64PointerSize);
+ SpillCoreRegs(m2l, rs_sp, spill_offset, core_reg_mask);
+ }
+
+ return frame_size;
+}
+
+static int SpillRegsPreIndexed(Arm64Mir2Lir* m2l, RegStorage base, uint32_t core_reg_mask,
+ uint32_t fp_reg_mask, int frame_size) {
+ // Otherwise, spill both core and fp regs at the same time.
+ // The very first instruction will be an stp with a pre-indexed address, moving the stack
+ // pointer down. From then on, we fill upwards. This will generate overall the same number of
+ // instructions as the specialized code above in most cases (the exception being an odd number
+ // of core spills with an even, non-zero number of fp spills), but is more flexible, as the
+ // offsets are guaranteed to be small.
+ //
+ // Some demonstrative fill cases: (c) = core, (f) = fp
+ //  cc    44   cc    44   cc    22   cc    33      fc => 1[1/2]
+ //  fc => 23   fc => 23   ff => 11   ff => 22
+ //  ff    11    f    11    f    11
+ //
+ int reg1 = -1, reg2 = -1;
+ int core_count = POPCOUNT(core_reg_mask);
+ int fp_count = POPCOUNT(fp_reg_mask);
+
+ int combined = fp_count + core_count;
+ int all_offset = RoundUp(combined, 2); // Needs to be 16B = 2-reg aligned.
+
+ int cur_offset = 2; // What's the starting offset after the first stp? We expect the base slot
+ // to be filled.
+
+ // First figure out whether the bottom is FP or core.
+ if (fp_count > 0) {
+ // Some FP spills.
+ //
+ // Four cases: (d0 is dummy to fill up stp)
+ // 1) Single FP, even number of core -> stp d0, fp_reg
+ // 2) Single FP, odd number of core -> stp fp_reg, d0
+ // 3) More FP, even number combined -> stp fp_reg1, fp_reg2
+ // 4) More FP, odd number combined -> stp d0, fp_reg
+ if (fp_count == 1) {
+ fp_reg_mask = ExtractReg(fp_reg_mask, &reg1);
+ DCHECK_EQ(fp_reg_mask, 0U);
+ if (core_count % 2 == 0) {
+ m2l->NewLIR4(WIDE(kA64StpPre4ffXD),
+ RegStorage::FloatSolo64(reg1).GetReg(),
+ RegStorage::FloatSolo64(reg1).GetReg(),
+ base.GetReg(), -all_offset);
+ } else {
+ m2l->NewLIR4(WIDE(kA64StpPre4ffXD),
+ RegStorage::FloatSolo64(reg1).GetReg(),
+ RegStorage::FloatSolo64(reg1).GetReg(),
+ base.GetReg(), -all_offset);
+ cur_offset = 0; // That core reg needs to go into the upper half.
+ }
+ } else {
+ if (combined % 2 == 0) {
+ fp_reg_mask = GenPairWise(fp_reg_mask, &reg1, &reg2);
+ m2l->NewLIR4(WIDE(kA64StpPre4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+ RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), -all_offset);
+ } else {
+ fp_reg_mask = ExtractReg(fp_reg_mask, &reg1);
+ m2l->NewLIR4(WIDE(kA64StpPre4ffXD), rs_d0.GetReg(), RegStorage::FloatSolo64(reg1).GetReg(),
+ base.GetReg(), -all_offset);
+ }
+ }
+ } else {
+ // No FP spills.
+ //
+ // Two cases:
+ // 1) Even number of core -> stp core1, core2
+ // 2) Odd number of core -> stp xzr, core1
+ if (core_count % 2 == 1) {
+ core_reg_mask = ExtractReg(core_reg_mask, &reg1);
+ m2l->NewLIR4(WIDE(kA64StpPre4rrXD), rs_xzr.GetReg(),
+ RegStorage::Solo64(reg1).GetReg(), base.GetReg(), -all_offset);
+ } else {
+ core_reg_mask = GenPairWise(core_reg_mask, &reg1, &reg2);
+ m2l->NewLIR4(WIDE(kA64StpPre4rrXD), RegStorage::Solo64(reg2).GetReg(),
+ RegStorage::Solo64(reg1).GetReg(), base.GetReg(), -all_offset);
+ }
+ }
+
+ if (fp_count != 0) {
+ for (; fp_reg_mask != 0;) {
+ // Have some FP regs to do.
+ fp_reg_mask = GenPairWise(fp_reg_mask, &reg1, &reg2);
+ if (UNLIKELY(reg2 < 0)) {
+ m2l->NewLIR3(FWIDE(kA64Str3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(),
+ cur_offset);
+ // Do not increment offset here, as the second half will be filled by a core reg.
+ } else {
+ m2l->NewLIR4(WIDE(kA64Stp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+ RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), cur_offset);
+ cur_offset += 2;
+ }
+ }
+
+ // Reset counting.
+ reg1 = -1;
+
+ // If there is an odd number of core registers, we need to store the bottom now.
+ if (core_count % 2 == 1) {
+ core_reg_mask = ExtractReg(core_reg_mask, &reg1);
+ m2l->NewLIR3(WIDE(kA64Str3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(),
+ cur_offset + 1);
+ cur_offset += 2; // Half-slot filled now.
+ }
+ }
+
+ // Spill the rest of the core regs. Their number is guaranteed to be even.
+ DCHECK_EQ(POPCOUNT(core_reg_mask) % 2, 0);
+ for (; core_reg_mask != 0; cur_offset += 2) {
+ core_reg_mask = GenPairWise(core_reg_mask, &reg1, &reg2);
+ m2l->NewLIR4(WIDE(kA64Stp4rrXD), RegStorage::Solo64(reg2).GetReg(),
+ RegStorage::Solo64(reg1).GetReg(), base.GetReg(), cur_offset);
+ }
+
+ DCHECK_EQ(cur_offset, all_offset);
+
+ return all_offset * 8;
+}
+
+int Arm64Mir2Lir::SpillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask,
+ int frame_size) {
+ // If the frame size is small enough that all offsets fit into the immediates, use that
+ // setup, as it decrements sp early (a kind of instruction scheduling) and is no worse,
+ // instruction-count wise, than the more complicated code below.
+ //
+ // This case is also optimal when we have an odd number of core spills, and an even (non-zero)
+ // number of fp spills.
+ if ((RoundUp(frame_size, 8) / 8 <= 63)) {
+ return SpillRegsPreSub(this, base, core_reg_mask, fp_reg_mask, frame_size);
+ } else {
+ return SpillRegsPreIndexed(this, base, core_reg_mask, fp_reg_mask, frame_size);
+ }
+}
+
+static void UnSpillCoreRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
+ int reg1 = -1, reg2 = -1;
+ const int reg_log2_size = 3;
+
+ for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
+ reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
+ if (UNLIKELY(reg2 < 0)) {
+ m2l->NewLIR3(WIDE(kA64Ldr3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+ } else {
+ DCHECK_LE(offset, 63);
+ m2l->NewLIR4(WIDE(kA64Ldp4rrXD), RegStorage::Solo64(reg2).GetReg(),
+ RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+ }
+ }
+}
+
+static void UnSpillFPRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
+ int reg1 = -1, reg2 = -1;
+ const int reg_log2_size = 3;
+
+ for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
+ reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
+ if (UNLIKELY(reg2 < 0)) {
+ m2l->NewLIR3(FWIDE(kA64Ldr3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(),
+ offset);
+ } else {
+ m2l->NewLIR4(WIDE(kA64Ldp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+ RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+ }
+ }
+}
+
+void Arm64Mir2Lir::UnspillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask,
+ int frame_size) {
+ // Restore saves and drop stack frame.
+ // 2 versions:
+ //
+ // 1. (Original): Try to address directly, then drop the whole frame.
+ // Limitation: ldp is a 7b signed immediate.
+ //
+ // 2. (New): Drop the non-save-part. Then do similar to original, which is now guaranteed to be
+ // in range. Then drop the rest.
+ //
+ // TODO: In methods with few spills but huge frame, it would be better to do non-immediate loads
+ // in variant 1.
+
+ // "Magic" constant, 63 (max signed 7b) * 8.
+ static constexpr int kMaxFramesizeForOffset = 63 * kArm64PointerSize;
+
+ const int num_core_spills = POPCOUNT(core_reg_mask);
+ const int num_fp_spills = POPCOUNT(fp_reg_mask);
+
+ int early_drop = 0;
+
+ if (frame_size > kMaxFramesizeForOffset) {
+ // Second variant. Drop the frame part.
+
+ // TODO: Always use the first formula, as num_fp_spills would be zero?
+ if (fp_reg_mask != 0) {
+ early_drop = frame_size - kArm64PointerSize * (num_fp_spills + num_core_spills);
+ } else {
+ early_drop = frame_size - kArm64PointerSize * num_core_spills;
+ }
+
+ // Drop needs to be 16B aligned, so that SP keeps aligned.
+ early_drop = RoundDown(early_drop, 16);
+
+ OpRegImm64(kOpAdd, rs_sp, early_drop);
+ }
+
+ // Unspill.
+ if (fp_reg_mask != 0) {
+ int offset = frame_size - early_drop - kArm64PointerSize * (num_fp_spills + num_core_spills);
+ UnSpillFPRegs(this, rs_sp, offset, fp_reg_mask);
+ }
+ if (core_reg_mask != 0) {
+ int offset = frame_size - early_drop - kArm64PointerSize * num_core_spills;
+ UnSpillCoreRegs(this, rs_sp, offset, core_reg_mask);
+ }
+
+ // Drop the (rest of) the frame.
+ OpRegImm64(kOpAdd, rs_sp, frame_size - early_drop);
+}
+
bool Arm64Mir2Lir::GenInlinedReverseBits(CallInfo* info, OpSize size) {
ArmOpcode wide = (size == k64) ? WIDE(0) : UNWIDE(0);
RegLocation rl_src_i = info->args[0];
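
Summarizing the layout SpillRegsPreIndexed produces: the spill area is the
combined count rounded up to two slots, any padding slot sits at the bottom
(filled with a dummy register or xzr by the first stp), and fp spills sit below
the core spills. A small sketch derived from that code, with SketchSpillLayout
being a made-up name:

  #include <cstdio>

  // Print the spill-area layout, bottom (lowest address, where sp ends up) to top.
  void SketchSpillLayout(int core_count, int fp_count) {
    int combined = core_count + fp_count;
    int slots = (combined + 1) & ~1;  // RoundUp(combined, 2): 16-byte aligned area
    std::printf("core=%d fp=%d -> %3d bytes: [", core_count, fp_count, slots * 8);
    for (int i = 0; i < slots - combined; ++i) std::printf(" pad");  // 0 or 1 bottom slot
    for (int i = 0; i < fp_count; ++i) std::printf(" f");
    for (int i = 0; i < core_count; ++i) std::printf(" c");
    std::printf(" ]\n");
  }

  int main() {
    SketchSpillLayout(2, 1);  // "Single FP, even number of core" case
    SketchSpillLayout(3, 1);  // "Single FP, odd number of core" case
    SketchSpillLayout(3, 3);  // first column of the fill-case diagram
    SketchSpillLayout(3, 0);  // no fp: stp xzr, core1 fills the padded bottom pair
    return 0;
  }
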