-rw-r--r--  compiler/dex/quick/arm64/arm64_lir.h        |   1
-rw-r--r--  compiler/dex/quick/arm64/assemble_arm64.cc  |   5
-rw-r--r--  compiler/dex/quick/arm64/call_arm64.cc      | 106
-rw-r--r--  compiler/dex/quick/arm64/codegen_arm64.h    |  11
-rw-r--r--  compiler/dex/quick/arm64/int_arm64.cc       | 263
5 files changed, 265 insertions, 121 deletions
diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h
index 3a8ea3f96e..90cb156749 100644
--- a/compiler/dex/quick/arm64/arm64_lir.h
+++ b/compiler/dex/quick/arm64/arm64_lir.h
@@ -331,6 +331,7 @@ enum ArmOpcode {
   kA64Stp4ffXD,      // stp [0s10110100] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64Stp4rrXD,      // stp [s010100100] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64StpPost4rrXD,  // stp [s010100010] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
+  kA64StpPre4ffXD,   // stp [0s10110110] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64StpPre4rrXD,   // stp [s010100110] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64Str3fXD,       // str [1s11110100] imm_12[21-10] rn[9-5] rt[4-0].
   kA64Str4fXxG,      // str [1s111100001] rm[20-16] [011] S[12] [10] rn[9-5] rt[4-0].
diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc
index 462be54e57..5351ce50bb 100644
--- a/compiler/dex/quick/arm64/assemble_arm64.cc
+++ b/compiler/dex/quick/arm64/assemble_arm64.cc
@@ -518,6 +518,10 @@ const ArmEncodingMap Arm64Mir2Lir::EncodingMap[kA64Last] = {
                  kFmtRegR, 4, 0, kFmtRegR, 14, 10, kFmtRegXOrSp, 9, 5,
                  kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_DEF2 | REG_USE012 | IS_STORE,
                  "stp", "!0r, !1r, [!2X], #!3D", kFixupNone),
+    ENCODING_MAP(WIDE(kA64StpPre4ffXD), CUSTOM_VARIANTS(0x2d800000, 0x6d800000),
+                 kFmtRegF, 4, 0, kFmtRegF, 14, 10, kFmtRegXOrSp, 9, 5,
+                 kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_DEF2 | REG_USE012 | IS_STORE,
+                 "stp", "!0r, !1f, [!2X, #!3D]!!", kFixupNone),
     ENCODING_MAP(WIDE(kA64StpPre4rrXD), CUSTOM_VARIANTS(0x29800000, 0xa9800000),
                  kFmtRegR, 4, 0, kFmtRegR, 14, 10, kFmtRegXOrSp, 9, 5,
                  kFmtBitBlt, 21, 15, IS_QUAD_OP | REG_DEF2 | REG_USE012 | IS_STORE,
@@ -723,6 +727,7 @@ uint8_t* Arm64Mir2Lir::EncodeLIRs(uint8_t* write_pos, LIR* lir) {
                 << " @ 0x" << std::hex << lir->dalvik_offset;
             if (kFailOnSizeError) {
               LOG(FATAL) << "Bad argument n. " << i << " of " << encoder->name
+                         << "(" << UNWIDE(encoder->opcode) << ", " << encoder->fmt << ")"
                          << ". Expected " << expected << ", got 0x" << std::hex << operand;
             } else {
               LOG(WARNING) << "Bad argument n. " << i << " of " << encoder->name
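Note (not part of the patch): the new kA64StpPre4ffXD entry combines the skeleton words from CUSTOM_VARIANTS with the operand fields documented in arm64_lir.h (imm_7[21-15], rt2[14-10], rn[9-5], rt[4-0]). The following standalone C++ sketch shows how those pieces form one concrete pre-indexed FP store-pair instruction; the register numbers and offset are made-up example operands.

// Standalone illustration of the kA64StpPre4ffXD field layout (not ART code).
#include <cstdint>
#include <cstdio>

constexpr uint32_t kStpPreFp64Skeleton = 0x6d800000;  // 64-bit variant from CUSTOM_VARIANTS.

// Encode "stp d<rt>, d<rt2>, [x<rn>, #offset]!" with offset a multiple of 8 in [-512, 504].
constexpr uint32_t EncodeStpPreFp64(unsigned rt, unsigned rt2, unsigned rn, int offset) {
  return kStpPreFp64Skeleton |
         ((static_cast<uint32_t>(offset / 8) & 0x7f) << 15) |  // imm_7, scaled by 8 bytes.
         (rt2 << 10) | (rn << 5) | rt;
}

int main() {
  // stp d8, d9, [sp, #-64]!  (sp is encoded as register number 31 here)
  std::printf("0x%08x\n", EncodeStpPreFp64(8, 9, 31, -64));
  return 0;
}

The negative, 8-byte-scaled immediate is what lets a single stp both carve out the spill area and store the first register pair, which is what the new prologue path below relies on.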
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index e584548558..6fa8a4aca5 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -330,19 +330,14 @@ void Arm64Mir2Lir::GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method)
 
   NewLIR0(kPseudoMethodEntry);
 
-  const size_t kStackOverflowReservedUsableBytes = GetStackOverflowReservedBytes(kArm64) -
-      Thread::kStackOverflowSignalReservedBytes;
-  const bool large_frame = static_cast<size_t>(frame_size_) > kStackOverflowReservedUsableBytes;
   const int spill_count = num_core_spills_ + num_fp_spills_;
   const int spill_size = (spill_count * kArm64PointerSize + 15) & ~0xf;  // SP 16 byte alignment.
   const int frame_size_without_spills = frame_size_ - spill_size;
 
   if (!skip_overflow_check) {
     if (!cu_->compiler_driver->GetCompilerOptions().GetImplicitStackOverflowChecks()) {
-      if (!large_frame) {
-        // Load stack limit
-        LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP1);
-      }
+      // Load stack limit
+      LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP1);
     } else {
       // TODO(Arm64) Implement implicit checks.
       // Implicit stack overflow check.
@@ -350,24 +345,21 @@ void Arm64Mir2Lir::GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method)
       // redzone we will get a segmentation fault.
      // Load32Disp(rs_wSP, -Thread::kStackOverflowReservedBytes, rs_wzr);
      // MarkPossibleStackOverflowException();
+      //
+      // TODO: If the frame size is small enough, is it possible to make this a pre-indexed load,
+      //       so that we can avoid the following "sub sp" when spilling?
       LOG(FATAL) << "Implicit stack overflow checks not implemented.";
     }
   }
 
-  if (frame_size_ > 0) {
-    OpRegImm64(kOpSub, rs_sp, spill_size);
+  int spilled_already = 0;
+  if (spill_size > 0) {
+    spilled_already = SpillRegs(rs_sp, core_spill_mask_, fp_spill_mask_, frame_size_);
+    DCHECK(spill_size == spilled_already || frame_size_ == spilled_already);
   }
 
-  /* Need to spill any FP regs? */
-  if (fp_spill_mask_) {
-    int spill_offset = spill_size - kArm64PointerSize*(num_fp_spills_ + num_core_spills_);
-    SpillFPRegs(rs_sp, spill_offset, fp_spill_mask_);
-  }
-
-  /* Spill core callee saves. */
-  if (core_spill_mask_) {
-    int spill_offset = spill_size - kArm64PointerSize*num_core_spills_;
-    SpillCoreRegs(rs_sp, spill_offset, core_spill_mask_);
+  if (spilled_already != frame_size_) {
+    OpRegImm(kOpSub, rs_sp, frame_size_without_spills);
   }
 
   if (!skip_overflow_check) {
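Note (not part of the patch): the prologue bookkeeping in the hunk above can be summarized with a small standalone C++ sketch; the spill counts and frame size below are made-up example values.

// Standalone arithmetic sketch of the new entry-sequence bookkeeping (not ART code).
#include <cassert>

int main() {
  const int kArm64PointerSize = 8;
  const int num_core_spills = 3, num_fp_spills = 2;
  const int frame_size = 96;  // Total frame, already 16-byte aligned (example value).

  const int spill_count = num_core_spills + num_fp_spills;
  const int spill_size = (spill_count * kArm64PointerSize + 15) & ~0xf;  // 40 -> 48.
  const int frame_size_without_spills = frame_size - spill_size;         // 48.
  assert(spill_size == 48 && frame_size_without_spills == 48);

  // SpillRegs() reports how far it already moved SP: either just the spill area (the
  // pre-indexed path) or the whole frame (the "sub sp" path). Only the remainder, if any,
  // still needs an explicit "sub sp".
  const int spilled_already = spill_size;  // Pretend the pre-indexed path was taken.
  if (spilled_already != frame_size) {
    // The real code would emit: sub sp, sp, #frame_size_without_spills
  }
  return 0;
}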
@@ -396,29 +388,9 @@ void Arm64Mir2Lir::GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method)
       const size_t sp_displace_;
     };
 
-    if (large_frame) {
-      // Compare Expected SP against bottom of stack.
-      // Branch to throw target if there is not enough room.
-      OpRegRegImm(kOpSub, rs_xIP1, rs_sp, frame_size_without_spills);
-      LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP0);
-      LIR* branch = OpCmpBranch(kCondUlt, rs_xIP1, rs_xIP0, nullptr);
-      AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, spill_size));
-      OpRegCopy(rs_sp, rs_xIP1);  // Establish stack after checks.
-    } else {
-      /*
-       * If the frame is small enough we are guaranteed to have enough space that remains to
-       * handle signals on the user stack.
-       * Establishes stack before checks.
-       */
-      OpRegRegImm(kOpSub, rs_sp, rs_sp, frame_size_without_spills);
-      LIR* branch = OpCmpBranch(kCondUlt, rs_sp, rs_xIP1, nullptr);
-      AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, frame_size_));
-    }
-  } else {
-    OpRegImm(kOpSub, rs_sp, frame_size_without_spills);
+      LIR* branch = OpCmpBranch(kCondUlt, rs_sp, rs_xIP1, nullptr);
+      AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, frame_size_));
     }
-  } else {
-    OpRegImm(kOpSub, rs_sp, frame_size_without_spills);
   }
 
   FlushIns(ArgLocs, rl_method);
@@ -445,57 +417,7 @@ void Arm64Mir2Lir::GenExitSequence() {
 
   NewLIR0(kPseudoMethodExit);
 
-  // Restore saves and drop stack frame.
-  // 2 versions:
-  //
-  // 1. (Original): Try to address directly, then drop the whole frame.
-  //                Limitation: ldp is a 7b signed immediate. There should have been a DCHECK!
-  //
-  // 2. (New): Drop the non-save-part. Then do similar to original, which is now guaranteed to be
-  //           in range. Then drop the rest.
-  //
-  // TODO: In methods with few spills but huge frame, it would be better to do non-immediate loads
-  //       in variant 1.
-
-  if (frame_size_ <= 504) {
-    // "Magic" constant, 63 (max signed 7b) * 8. Do variant 1.
-    // Could be tighter, as the last load is below frame_size_ offset.
-    if (fp_spill_mask_) {
-      int spill_offset = frame_size_ - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
-      UnSpillFPRegs(rs_sp, spill_offset, fp_spill_mask_);
-    }
-    if (core_spill_mask_) {
-      int spill_offset = frame_size_ - kArm64PointerSize * num_core_spills_;
-      UnSpillCoreRegs(rs_sp, spill_offset, core_spill_mask_);
-    }
-
-    OpRegImm64(kOpAdd, rs_sp, frame_size_);
-  } else {
-    // Second variant. Drop the frame part.
-    int drop = 0;
-    // TODO: Always use the first formula, as num_fp_spills would be zero?
-    if (fp_spill_mask_) {
-      drop = frame_size_ - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
-    } else {
-      drop = frame_size_ - kArm64PointerSize * num_core_spills_;
-    }
-
-    // Drop needs to be 16B aligned, so that SP keeps aligned.
-    drop = RoundDown(drop, 16);
-
-    OpRegImm64(kOpAdd, rs_sp, drop);
-
-    if (fp_spill_mask_) {
-      int offset = frame_size_ - drop - kArm64PointerSize * (num_fp_spills_ + num_core_spills_);
-      UnSpillFPRegs(rs_sp, offset, fp_spill_mask_);
-    }
-    if (core_spill_mask_) {
-      int offset = frame_size_ - drop - kArm64PointerSize * num_core_spills_;
-      UnSpillCoreRegs(rs_sp, offset, core_spill_mask_);
-    }
-
-    OpRegImm64(kOpAdd, rs_sp, frame_size_ - drop);
-  }
+  UnspillRegs(rs_sp, core_spill_mask_, fp_spill_mask_, frame_size_);
 
   // Finally return.
   NewLIR0(kA64Ret);
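Note (not part of the patch): the removed "magic" 504 constant, and the kMaxFramesizeForOffset limit that replaces it in UnspillRegs() further down, both come from the signed 7-bit, 8-byte-scaled ldp/stp immediate. A standalone C++ sketch of that arithmetic; the frame size used is a made-up example.

// Standalone sketch of why 63 * 8 = 504 bounds the direct-addressing variant (not ART code).
#include <cassert>

int main() {
  const int kArm64PointerSize = 8;
  const int kMaxSignedImm7 = 63;  // Largest positive value of a signed 7-bit immediate.
  const int kMaxFramesizeForOffset = kMaxSignedImm7 * kArm64PointerSize;
  assert(kMaxFramesizeForOffset == 504);

  // Any frame larger than this needs the second variant: drop part of the frame first, so the
  // remaining unspill offsets fit back into the scaled 7-bit immediate.
  const int frame_size = 1024;  // Example value.
  if (frame_size > kMaxFramesizeForOffset) {
    // The real code would emit: add sp, sp, #early_drop   before the ldp sequence.
  }
  return 0;
}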
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index ac3651942d..bc12c455e7 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -219,11 +219,12 @@ class Arm64Mir2Lir FINAL : public Mir2Lir {
     void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
     void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
 
-    uint32_t GenPairWise(uint32_t reg_mask, int* reg1, int* reg2);
-    void UnSpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask);
-    void SpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask);
-    void UnSpillFPRegs(RegStorage base, int offset, uint32_t reg_mask);
-    void SpillFPRegs(RegStorage base, int offset, uint32_t reg_mask);
+    // Spill core and FP registers. Returns the SP difference: either spill size, or whole
+    // frame size.
+    int SpillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask, int frame_size);
+
+    // Unspill core and FP registers.
+    void UnspillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask, int frame_size);
 
     // Required for target - single operation generators.
     LIR* OpUnconditionalBranch(LIR* target);
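Note (not part of the patch): a minimal standalone mock of the new interface's contract. SpillRegs() returns how far it already moved SP, so the caller only subtracts the remainder. The mock's signature, the 48-byte spill area, and the 96-byte frame are assumptions for illustration only.

// Minimal mock of the SpillRegs() return-value contract (not ART code).
#include <cassert>
#include <cstdint>

struct MockCodegen {
  int sp_moved = 0;
  // Stand-in for Arm64Mir2Lir::SpillRegs(): pretend the spill area is 48 bytes and that the
  // pre-indexed path was used, so only the spill area has been carved out so far.
  int SpillRegs(uint32_t /*core_mask*/, uint32_t /*fp_mask*/, int /*frame_size*/) {
    sp_moved += 48;
    return sp_moved;
  }
  void SubSp(int bytes) { sp_moved += bytes; }  // Stand-in for OpRegImm(kOpSub, rs_sp, ...).
};

int main() {
  MockCodegen cg;
  const int frame_size = 96;
  int spilled_already = cg.SpillRegs(0x180000u, 0x300u, frame_size);
  if (spilled_already != frame_size) {
    cg.SubSp(frame_size - spilled_already);  // frame_size_without_spills in the real code.
  }
  assert(cg.sp_moved == frame_size);
  return 0;
}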
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index f9f85f4223..d8df30fb08 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -22,6 +22,7 @@
 #include "dex/reg_storage_eq.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "mirror/array.h"
+#include "utils.h"
 
 namespace art {
 
@@ -1237,6 +1238,14 @@ void Arm64Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_de
   StoreValueWide(rl_dest, rl_result);
 }
 
+static uint32_t ExtractReg(uint32_t reg_mask, int* reg) {
+  // Find first register.
+  int first_bit_set = CTZ(reg_mask) + 1;
+  *reg = *reg + first_bit_set;
+  reg_mask >>= first_bit_set;
+  return reg_mask;
+}
+
 /**
  * @brief Split a register list in pairs or registers.
  *
@@ -1253,15 +1262,15 @@ void Arm64Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_de
  * }
  * @endcode
  */
-uint32_t Arm64Mir2Lir::GenPairWise(uint32_t reg_mask, int* reg1, int* reg2) {
+static uint32_t GenPairWise(uint32_t reg_mask, int* reg1, int* reg2) {
   // Find first register.
-  int first_bit_set = __builtin_ctz(reg_mask) + 1;
+  int first_bit_set = CTZ(reg_mask) + 1;
   int reg = *reg1 + first_bit_set;
   reg_mask >>= first_bit_set;
 
   if (LIKELY(reg_mask)) {
     // Save the first register, find the second and use the pair opcode.
-    int second_bit_set = __builtin_ctz(reg_mask) + 1;
+    int second_bit_set = CTZ(reg_mask) + 1;
     *reg2 = reg;
     reg_mask >>= second_bit_set;
     *reg1 = reg + second_bit_set;
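Note (not part of the patch): to see how these helpers walk a spill mask, here is a standalone C++ re-implementation mirroring the code above (a sketch, not the ART sources; CTZ and LIKELY come from ART's utils.h and are replaced here). The x19/x20/x22 mask is an arbitrary example.

// Standalone sketch of the pairwise mask decomposition used by the spill/unspill code.
#include <cstdint>
#include <cstdio>

static int CTZ(uint32_t x) { return __builtin_ctz(x); }  // ART gets CTZ() from utils.h.

static uint32_t GenPairWise(uint32_t reg_mask, int* reg1, int* reg2) {
  int first_bit_set = CTZ(reg_mask) + 1;
  int reg = *reg1 + first_bit_set;
  reg_mask >>= first_bit_set;
  if (reg_mask != 0) {
    // Two registers available: report a pair.
    int second_bit_set = CTZ(reg_mask) + 1;
    *reg2 = reg;
    reg_mask >>= second_bit_set;
    *reg1 = reg + second_bit_set;
  } else {
    // Only one register left over: report it as a single.
    *reg1 = reg;
    *reg2 = -1;
  }
  return reg_mask;
}

int main() {
  // Example: core spill mask with x19, x20 and x22 (bits 19, 20, 22 set).
  uint32_t mask = (1u << 19) | (1u << 20) | (1u << 22);
  int reg1 = -1, reg2 = -1;
  while (mask != 0) {
    mask = GenPairWise(mask, &reg1, &reg2);
    if (reg2 < 0) {
      std::printf("single: x%d\n", reg1);             // -> single: x22
    } else {
      std::printf("pair:   x%d, x%d\n", reg2, reg1);  // -> pair:   x19, x20
    }
  }
  return 0;
}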
@@ -1274,68 +1283,274 @@ void Arm64Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_de
   return reg_mask;
 }
 
-void Arm64Mir2Lir::UnSpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask) {
+static void SpillCoreRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
   int reg1 = -1, reg2 = -1;
   const int reg_log2_size = 3;
 
   for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
-    reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
+    reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
     if (UNLIKELY(reg2 < 0)) {
-      NewLIR3(WIDE(kA64Ldr3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR3(WIDE(kA64Str3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     } else {
-      DCHECK_LE(offset, 63);
-      NewLIR4(WIDE(kA64Ldp4rrXD), RegStorage::Solo64(reg2).GetReg(),
-              RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR4(WIDE(kA64Stp4rrXD), RegStorage::Solo64(reg2).GetReg(),
+                   RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     }
   }
 }
 
-void Arm64Mir2Lir::SpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask) {
+// TODO(Arm64): consider using ld1 and st1?
+static void SpillFPRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
   int reg1 = -1, reg2 = -1;
   const int reg_log2_size = 3;
 
   for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
     reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
     if (UNLIKELY(reg2 < 0)) {
-      NewLIR3(WIDE(kA64Str3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR3(FWIDE(kA64Str3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(),
+                   offset);
     } else {
-      NewLIR4(WIDE(kA64Stp4rrXD), RegStorage::Solo64(reg2).GetReg(),
-              RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR4(WIDE(kA64Stp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+                   RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
     }
   }
 }
 
-void Arm64Mir2Lir::UnSpillFPRegs(RegStorage base, int offset, uint32_t reg_mask) {
+static int SpillRegsPreSub(Arm64Mir2Lir* m2l, RegStorage base, uint32_t core_reg_mask,
+                           uint32_t fp_reg_mask, int frame_size) {
+  m2l->OpRegRegImm(kOpSub, rs_sp, rs_sp, frame_size);
+
+  int core_count = POPCOUNT(core_reg_mask);
+
+  if (fp_reg_mask != 0) {
+    // Spill FP regs.
+    int fp_count = POPCOUNT(fp_reg_mask);
+    int spill_offset = frame_size - (core_count + fp_count) * kArm64PointerSize;
+    SpillFPRegs(m2l, rs_sp, spill_offset, fp_reg_mask);
+  }
+
+  if (core_reg_mask != 0) {
+    // Spill core regs.
+    int spill_offset = frame_size - (core_count * kArm64PointerSize);
+    SpillCoreRegs(m2l, rs_sp, spill_offset, core_reg_mask);
+  }
+
+  return frame_size;
+}
+
+static int SpillRegsPreIndexed(Arm64Mir2Lir* m2l, RegStorage base, uint32_t core_reg_mask,
+                               uint32_t fp_reg_mask, int frame_size) {
+  // Otherwise, spill both core and fp regs at the same time.
+  // The very first instruction will be an stp with pre-indexed address, moving the stack pointer
+  // down. From then on, we fill upwards. This will generate overall the same number of instructions
+  // as the specialized code above in most cases (exception being odd number of core and even
+  // non-zero fp spills), but is more flexible, as the offsets are guaranteed small.
+  //
+  // Some demonstrative fill cases : (c) = core, (f) = fp
+  // cc    44   cc    44   cc    22   cc    33   fc => 1[1/2]
+  // fc => 23   fc => 23   ff => 11   ff => 22
+  // ff    11   f     11   f     11
+  //
+  int reg1 = -1, reg2 = -1;
+  int core_count = POPCOUNT(core_reg_mask);
+  int fp_count = POPCOUNT(fp_reg_mask);
+
+  int combined = fp_count + core_count;
+  int all_offset = RoundUp(combined, 2);  // Needs to be 16B = 2-reg aligned.
+
+  int cur_offset = 2;  // What's the starting offset after the first stp? We expect the base slot
+                       // to be filled.
+
+  // First figure out whether the bottom is FP or core.
+  if (fp_count > 0) {
+    // Some FP spills.
+    //
+    // Four cases: (d0 is dummy to fill up stp)
+    // 1) Single FP, even number of core -> stp d0, fp_reg
+    // 2) Single FP, odd number of core -> stp fp_reg, d0
+    // 3) More FP, even number combined -> stp fp_reg1, fp_reg2
+    // 4) More FP, odd number combined -> stp d0, fp_reg
+    if (fp_count == 1) {
+      fp_reg_mask = ExtractReg(fp_reg_mask, &reg1);
+      DCHECK_EQ(fp_reg_mask, 0U);
+      if (core_count % 2 == 0) {
+        m2l->NewLIR4(WIDE(kA64StpPre4ffXD),
+                     RegStorage::FloatSolo64(reg1).GetReg(),
+                     RegStorage::FloatSolo64(reg1).GetReg(),
+                     base.GetReg(), -all_offset);
+      } else {
+        m2l->NewLIR4(WIDE(kA64StpPre4ffXD),
+                     RegStorage::FloatSolo64(reg1).GetReg(),
+                     RegStorage::FloatSolo64(reg1).GetReg(),
+                     base.GetReg(), -all_offset);
+        cur_offset = 0;  // That core reg needs to go into the upper half.
+      }
+    } else {
+      if (combined % 2 == 0) {
+        fp_reg_mask = GenPairWise(fp_reg_mask, &reg1, &reg2);
+        m2l->NewLIR4(WIDE(kA64StpPre4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+                     RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), -all_offset);
+      } else {
+        fp_reg_mask = ExtractReg(fp_reg_mask, &reg1);
+        m2l->NewLIR4(WIDE(kA64StpPre4ffXD), rs_d0.GetReg(), RegStorage::FloatSolo64(reg1).GetReg(),
+                     base.GetReg(), -all_offset);
+      }
+    }
+  } else {
+    // No FP spills.
+    //
+    // Two cases:
+    // 1) Even number of core -> stp core1, core2
+    // 2) Odd number of core -> stp xzr, core1
+    if (core_count % 2 == 1) {
+      core_reg_mask = ExtractReg(core_reg_mask, &reg1);
+      m2l->NewLIR4(WIDE(kA64StpPre4rrXD), rs_xzr.GetReg(),
+                   RegStorage::Solo64(reg1).GetReg(), base.GetReg(), -all_offset);
+    } else {
+      core_reg_mask = GenPairWise(core_reg_mask, &reg1, &reg2);
+      m2l->NewLIR4(WIDE(kA64StpPre4rrXD), RegStorage::Solo64(reg2).GetReg(),
+                   RegStorage::Solo64(reg1).GetReg(), base.GetReg(), -all_offset);
+    }
+  }
+
+  if (fp_count != 0) {
+    for (; fp_reg_mask != 0;) {
+      // Have some FP regs to do.
+      fp_reg_mask = GenPairWise(fp_reg_mask, &reg1, &reg2);
+      if (UNLIKELY(reg2 < 0)) {
+        m2l->NewLIR3(FWIDE(kA64Str3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(),
+                     cur_offset);
+        // Do not increment offset here, as the second half will be filled by a core reg.
+      } else {
+        m2l->NewLIR4(WIDE(kA64Stp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+                     RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), cur_offset);
+        cur_offset += 2;
+      }
+    }
+
+    // Reset counting.
+    reg1 = -1;
+
+    // If there is an odd number of core registers, we need to store the bottom now.
+    if (core_count % 2 == 1) {
+      core_reg_mask = ExtractReg(core_reg_mask, &reg1);
+      m2l->NewLIR3(WIDE(kA64Str3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(),
+                   cur_offset + 1);
+      cur_offset += 2;  // Half-slot filled now.
+    }
+  }
+
+  // Spill the rest of the core regs. They are guaranteed to be even.
+  DCHECK_EQ(POPCOUNT(core_reg_mask) % 2, 0);
+  for (; core_reg_mask != 0; cur_offset += 2) {
+    core_reg_mask = GenPairWise(core_reg_mask, &reg1, &reg2);
+    m2l->NewLIR4(WIDE(kA64Stp4rrXD), RegStorage::Solo64(reg2).GetReg(),
+                 RegStorage::Solo64(reg1).GetReg(), base.GetReg(), cur_offset);
+  }
+
+  DCHECK_EQ(cur_offset, all_offset);
+
+  return all_offset * 8;
+}
+
+int Arm64Mir2Lir::SpillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask,
+                            int frame_size) {
+  // If the frame size is small enough that all offsets would fit into the immediates, use that
+  // setup, as it decrements sp early (kind of instruction scheduling), and is not worse
+  // instruction-count wise than the complicated code below.
+  //
+  // This case is also optimal when we have an odd number of core spills, and an even (non-zero)
+  // number of fp spills.
+  if ((RoundUp(frame_size, 8) / 8 <= 63)) {
+    return SpillRegsPreSub(this, base, core_reg_mask, fp_reg_mask, frame_size);
+  } else {
+    return SpillRegsPreIndexed(this, base, core_reg_mask, fp_reg_mask, frame_size);
+  }
+}
+
+static void UnSpillCoreRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
   int reg1 = -1, reg2 = -1;
   const int reg_log2_size = 3;
 
   for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
-    reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
+    reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
     if (UNLIKELY(reg2 < 0)) {
-      NewLIR3(FWIDE(kA64Ldr3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR3(WIDE(kA64Ldr3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     } else {
-      NewLIR4(WIDE(kA64Ldp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
-              RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+      DCHECK_LE(offset, 63);
+      m2l->NewLIR4(WIDE(kA64Ldp4rrXD), RegStorage::Solo64(reg2).GetReg(),
+                   RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     }
   }
 }
 
-// TODO(Arm64): consider using ld1 and st1?
-void Arm64Mir2Lir::SpillFPRegs(RegStorage base, int offset, uint32_t reg_mask) {
+static void UnSpillFPRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
   int reg1 = -1, reg2 = -1;
   const int reg_log2_size = 3;
 
   for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
-    reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
+    reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
     if (UNLIKELY(reg2 < 0)) {
-      NewLIR3(FWIDE(kA64Str3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR3(FWIDE(kA64Ldr3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(),
+                   offset);
     } else {
-      NewLIR4(WIDE(kA64Stp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
-              RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR4(WIDE(kA64Ldp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+                   RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
     }
   }
 }
 
+void Arm64Mir2Lir::UnspillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask,
+                               int frame_size) {
+  // Restore saves and drop stack frame.
+  // 2 versions:
+  //
+  // 1. (Original): Try to address directly, then drop the whole frame.
+  //                Limitation: ldp is a 7b signed immediate.
+  //
+  // 2. (New): Drop the non-save-part. Then do similar to original, which is now guaranteed to be
+  //           in range. Then drop the rest.
+  //
+  // TODO: In methods with few spills but huge frame, it would be better to do non-immediate loads
+  //       in variant 1.
+
+  // "Magic" constant, 63 (max signed 7b) * 8.
+  static constexpr int kMaxFramesizeForOffset = 63 * kArm64PointerSize;
+
+  const int num_core_spills = POPCOUNT(core_reg_mask);
+  const int num_fp_spills = POPCOUNT(fp_reg_mask);
+
+  int early_drop = 0;
+
+  if (frame_size > kMaxFramesizeForOffset) {
+    // Second variant. Drop the frame part.
+
+    // TODO: Always use the first formula, as num_fp_spills would be zero?
+    if (fp_reg_mask != 0) {
+      early_drop = frame_size - kArm64PointerSize * (num_fp_spills + num_core_spills);
+    } else {
+      early_drop = frame_size - kArm64PointerSize * num_core_spills;
+    }
+
+    // Drop needs to be 16B aligned, so that SP keeps aligned.
+    early_drop = RoundDown(early_drop, 16);
+
+    OpRegImm64(kOpAdd, rs_sp, early_drop);
+  }
+
+  // Unspill.
+  if (fp_reg_mask != 0) {
+    int offset = frame_size - early_drop - kArm64PointerSize * (num_fp_spills + num_core_spills);
+    UnSpillFPRegs(this, rs_sp, offset, fp_reg_mask);
+  }
+  if (core_reg_mask != 0) {
+    int offset = frame_size - early_drop - kArm64PointerSize * num_core_spills;
+    UnSpillCoreRegs(this, rs_sp, offset, core_reg_mask);
+  }
+
+  // Drop the (rest of) the frame.
+  OpRegImm64(kOpAdd, rs_sp, frame_size - early_drop);
+}
+
 bool Arm64Mir2Lir::GenInlinedReverseBits(CallInfo* info, OpSize size) {
   ArmOpcode wide = (size == k64) ? WIDE(0) : UNWIDE(0);
   RegLocation rl_src_i = info->args[0];
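Note (not part of the patch): a worked example of the pre-indexed spill path. Assume a hypothetical method that saves x19, x20, d8 and d9 and has a frame larger than 504 bytes, so SpillRegs() picks SpillRegsPreIndexed(). The helper functions below are local stand-ins for ART's utils.h macros; the expected instructions in the comment follow the algorithm above.

// Standalone worked example of the pre-indexed spill layout (not ART code).
#include <cassert>

static int POPCOUNT(unsigned x) { return __builtin_popcount(x); }  // utils.h stand-in.
static int RoundUp(int x, int n) { return (x + n - 1) / n * n; }    // utils.h stand-in.

int main() {
  const unsigned core_mask = (1u << 19) | (1u << 20);  // x19, x20
  const unsigned fp_mask = (1u << 8) | (1u << 9);      // d8, d9

  const int combined = POPCOUNT(core_mask) + POPCOUNT(fp_mask);  // 4 registers.
  const int all_offset = RoundUp(combined, 2);  // 4 slots of 8 bytes; keeps SP 16B aligned.
  assert(all_offset == 4);

  // Expected code, filling upwards from the new SP:
  //   stp d8, d9,   [sp, #-32]!   // all_offset * 8 == 32; fills slots 0 and 1.
  //   stp x19, x20, [sp, #16]     // fills slots 2 and 3.
  // SpillRegs() then reports 32 bytes as already subtracted from SP, and GenEntrySequence
  // subtracts the rest of the frame with a separate "sub sp".
  assert(all_offset * 8 == 32);
  return 0;
}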