author Vladimir Marko <vmarko@google.com> 2019-07-05 13:37:42 +0100
committer Vladimir Marko <vmarko@google.com> 2019-07-18 13:37:15 +0000
commit 1a225a76ee6bc29833aee048b6cfae20242bdc8b (patch)
tree 069bfc01d827fcbf9aa4415c4d63d354648f396c
parent 323844002e54243e295497e7f829e46a533da621 (diff)
ARM/ARM64: Improve frame entry/exit codegen.
On ARM64, use STP pre-index for the method and the lowest spilled core register for method entry if there's no gap or FP spills in between. On exit, use LDP post-index to restore in this case, ignoring the method by loading to XZR. Thus, we save one instruction for both entry and exit for such methods and the performance should be the same or better.

On ARM, use a single PUSH/POP for method entry and core spills if the gap between them is 2 words or less and we have one or no FP spill, spilling argument registers as filler if needed. On exit, load the FP spill if any and do a single POP for core registers and return in this situation, clobbering as many registers from r2-r4 as needed; these caller-save registers are not used to pass return values. If we cannot do this because of FP spills but the gap between the method and FP spills is 2 words or less, we adjust SP and save the method in one PUSH after spilling; there is no similar handling for method exit as the method does not need to be restored. This may improve or degrade performance a bit depending on the particular situation; in the worst case we PUSH/POP three additional registers as a cost for smaller code size.

aosp_taimen-userdebug prebuilts:
- before:
  arm/boot*.oat: 19147484
  arm64/boot*.oat: 22558344
  oat/arm/services.odex: 21922256
- after:
  arm/boot*.oat: 19105436 (-41KiB, -0.2%)
  arm64/boot*.oat: 22549624 (-9KiB, -0.04%)
  oat/arm/services.odex: 21914128 (-8KiB, -0.04%)

Test: aosp_taimen-userdebug boots.
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 136144107
Change-Id: Id36c67b4e735418fb18bcd3269b72b25695fbaa2
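As a concrete illustration of the ARM64 change described above, the sketch below contrasts the old and new prologue/epilogue for a hypothetical method whose frame is 16 bytes with lr as its only core spill and no FP spills. It drives a standalone VIXL AArch64 macro assembler directly rather than ART's CodeGeneratorARM64, and the frame size and register choices are assumptions made for the example, not taken from this change.

// Minimal sketch (assumed example): old vs. new AArch64 frame entry/exit for
// a 16-byte frame whose only core spill is lr, i.e. the lowest core spill
// sits immediately above the ArtMethod* slot at sp[0].
#include "aarch64/macro-assembler-aarch64.h"

using namespace vixl::aarch64;

void EmitOldAndNewArm64Sequences(MacroAssembler* masm) {
  // Old entry: claim the frame and store the method, then spill lr separately.
  //   str x0, [sp, #-16]!
  //   str lr, [sp, #8]
  masm->Str(x0, MemOperand(sp, -16, PreIndex));
  masm->Str(lr, MemOperand(sp, 8));

  // New entry: a single STP pre-index stores the method and the lowest spill.
  //   stp x0, lr, [sp, #-16]!
  masm->Stp(x0, lr, MemOperand(sp, -16, PreIndex));

  // Old exit: reload lr, drop the frame, return.
  //   ldr lr, [sp, #8]
  //   add sp, sp, #16
  //   ret
  masm->Ldr(lr, MemOperand(sp, 8));
  masm->Add(sp, sp, 16);
  masm->Ret();

  // New exit: a single LDP post-index; the unused method slot is discarded by
  // loading it into xzr.
  //   ldp xzr, lr, [sp], #16
  //   ret
  masm->Ldp(xzr, lr, MemOperand(sp, 16, PostIndex));
  masm->Ret();
}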
-rw-r--r--  compiler/optimizing/code_generator_arm64.cc         |  65
-rw-r--r--  compiler/optimizing/code_generator_arm_vixl.cc      | 149
-rw-r--r--  compiler/optimizing/common_arm.h                    |   9
-rw-r--r--  compiler/utils/arm/assembler_arm_vixl.h             |   9
-rw-r--r--  compiler/utils/arm/jni_macro_assembler_arm_vixl.cc  |   8
-rw-r--r--  compiler/utils/arm64/assembler_arm64.cc             |   9
-rw-r--r--  compiler/utils/arm64/assembler_arm64.h              |  10
-rw-r--r--  libelffile/dwarf/debug_frame_opcode_writer.h        |   6
8 files changed, 177 insertions(+), 88 deletions(-)
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 749350741d..cf596c7bf1 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1090,27 +1090,42 @@ void CodeGeneratorARM64::GenerateFrameEntry() {
}
if (!HasEmptyFrame()) {
- int frame_size = GetFrameSize();
// Stack layout:
// sp[frame_size - 8] : lr.
// ... : other preserved core registers.
// ... : other preserved fp registers.
// ... : reserved frame space.
// sp[0] : current method.
-
- // Save the current method if we need it. Note that we do not
- // do this in HCurrentMethod, as the instruction might have been removed
- // in the SSA graph.
- if (RequiresCurrentMethod()) {
+ int32_t frame_size = dchecked_integral_cast<int32_t>(GetFrameSize());
+ uint32_t core_spills_offset = frame_size - GetCoreSpillSize();
+ CPURegList preserved_core_registers = GetFramePreservedCoreRegisters();
+ DCHECK(!preserved_core_registers.IsEmpty());
+ uint32_t fp_spills_offset = frame_size - FrameEntrySpillSize();
+ CPURegList preserved_fp_registers = GetFramePreservedFPRegisters();
+
+ // Save the current method if we need it, or if using STP reduces code
+ // size. Note that we do not do this in HCurrentMethod, as the
+ // instruction might have been removed in the SSA graph.
+ CPURegister lowest_spill;
+ if (core_spills_offset == kXRegSizeInBytes) {
+ // If there is no gap between the method and the lowest core spill, use
+ // aligned STP pre-index to store both. Max difference is 512. We do
+ // that to reduce code size even if we do not have to save the method.
+ DCHECK_LE(frame_size, 512); // 32 core registers are only 256 bytes.
+ lowest_spill = preserved_core_registers.PopLowestIndex();
+ __ Stp(kArtMethodRegister, lowest_spill, MemOperand(sp, -frame_size, PreIndex));
+ } else if (RequiresCurrentMethod()) {
__ Str(kArtMethodRegister, MemOperand(sp, -frame_size, PreIndex));
} else {
__ Claim(frame_size);
}
GetAssembler()->cfi().AdjustCFAOffset(frame_size);
- GetAssembler()->SpillRegisters(GetFramePreservedCoreRegisters(),
- frame_size - GetCoreSpillSize());
- GetAssembler()->SpillRegisters(GetFramePreservedFPRegisters(),
- frame_size - FrameEntrySpillSize());
+ if (lowest_spill.IsValid()) {
+ GetAssembler()->cfi().RelOffset(DWARFReg(lowest_spill), core_spills_offset);
+ core_spills_offset += kXRegSizeInBytes;
+ }
+ GetAssembler()->SpillRegisters(preserved_core_registers, core_spills_offset);
+ GetAssembler()->SpillRegisters(preserved_fp_registers, fp_spills_offset);
if (GetGraph()->HasShouldDeoptimizeFlag()) {
// Initialize should_deoptimize flag to 0.
@@ -1125,12 +1140,30 @@ void CodeGeneratorARM64::GenerateFrameEntry() {
void CodeGeneratorARM64::GenerateFrameExit() {
GetAssembler()->cfi().RememberState();
if (!HasEmptyFrame()) {
- int frame_size = GetFrameSize();
- GetAssembler()->UnspillRegisters(GetFramePreservedFPRegisters(),
- frame_size - FrameEntrySpillSize());
- GetAssembler()->UnspillRegisters(GetFramePreservedCoreRegisters(),
- frame_size - GetCoreSpillSize());
- __ Drop(frame_size);
+ int32_t frame_size = dchecked_integral_cast<int32_t>(GetFrameSize());
+ uint32_t core_spills_offset = frame_size - GetCoreSpillSize();
+ CPURegList preserved_core_registers = GetFramePreservedCoreRegisters();
+ DCHECK(!preserved_core_registers.IsEmpty());
+ uint32_t fp_spills_offset = frame_size - FrameEntrySpillSize();
+ CPURegList preserved_fp_registers = GetFramePreservedFPRegisters();
+
+ CPURegister lowest_spill;
+ if (core_spills_offset == kXRegSizeInBytes) {
+ // If there is no gap between the method and the lowest core spill, use
+ // aligned LDP post-index to pop both. Max difference is 504. We do
+ // that to reduce code size even though the loaded method is unused.
+ DCHECK_LE(frame_size, 504); // 32 core registers are only 256 bytes.
+ lowest_spill = preserved_core_registers.PopLowestIndex();
+ core_spills_offset += kXRegSizeInBytes;
+ }
+ GetAssembler()->UnspillRegisters(preserved_fp_registers, fp_spills_offset);
+ GetAssembler()->UnspillRegisters(preserved_core_registers, core_spills_offset);
+ if (lowest_spill.IsValid()) {
+ __ Ldp(xzr, lowest_spill, MemOperand(sp, frame_size, PostIndex));
+ GetAssembler()->cfi().Restore(DWARFReg(lowest_spill));
+ } else {
+ __ Drop(frame_size);
+ }
GetAssembler()->cfi().AdjustCFAOffset(-frame_size);
}
__ Ret();
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index f4f2aa3d3f..49e7695e31 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -47,7 +47,6 @@ namespace vixl32 = vixl::aarch32;
using namespace vixl32; // NOLINT(build/namespaces)
using helpers::DRegisterFrom;
-using helpers::DWARFReg;
using helpers::HighRegisterFrom;
using helpers::InputDRegisterAt;
using helpers::InputOperandAt;
@@ -2126,32 +2125,66 @@ void CodeGeneratorARMVIXL::GenerateFrameEntry() {
RecordPcInfo(nullptr, 0);
}
- __ Push(RegisterList(core_spill_mask_));
- GetAssembler()->cfi().AdjustCFAOffset(kArmWordSize * POPCOUNT(core_spill_mask_));
- GetAssembler()->cfi().RelOffsetForMany(DWARFReg(kMethodRegister),
- 0,
- core_spill_mask_,
- kArmWordSize);
- if (fpu_spill_mask_ != 0) {
- uint32_t first = LeastSignificantBit(fpu_spill_mask_);
-
- // Check that list is contiguous.
- DCHECK_EQ(fpu_spill_mask_ >> CTZ(fpu_spill_mask_), ~0u >> (32 - POPCOUNT(fpu_spill_mask_)));
-
- __ Vpush(SRegisterList(vixl32::SRegister(first), POPCOUNT(fpu_spill_mask_)));
- GetAssembler()->cfi().AdjustCFAOffset(kArmWordSize * POPCOUNT(fpu_spill_mask_));
- GetAssembler()->cfi().RelOffsetForMany(DWARFReg(s0), 0, fpu_spill_mask_, kArmWordSize);
- }
-
- int adjust = GetFrameSize() - FrameEntrySpillSize();
- __ Sub(sp, sp, adjust);
- GetAssembler()->cfi().AdjustCFAOffset(adjust);
-
- // Save the current method if we need it. Note that we do not
- // do this in HCurrentMethod, as the instruction might have been removed
- // in the SSA graph.
- if (RequiresCurrentMethod()) {
- GetAssembler()->StoreToOffset(kStoreWord, kMethodRegister, sp, 0);
+ uint32_t frame_size = GetFrameSize();
+ uint32_t core_spills_offset = frame_size - GetCoreSpillSize();
+ uint32_t fp_spills_offset = frame_size - FrameEntrySpillSize();
+ if ((fpu_spill_mask_ == 0u || IsPowerOfTwo(fpu_spill_mask_)) &&
+ core_spills_offset <= 3u * kArmWordSize) {
+ // Do a single PUSH for core registers including the method and up to two
+ // filler registers. Then store the single FP spill if any.
+ // (The worst case is when the method is not required and we actually
+ // store 3 extra registers but they are stored in the same properly
+ // aligned 16-byte chunk where we're already writing anyway.)
+ DCHECK_EQ(kMethodRegister.GetCode(), 0u);
+ uint32_t extra_regs = MaxInt<uint32_t>(core_spills_offset / kArmWordSize);
+ DCHECK_LT(MostSignificantBit(extra_regs), LeastSignificantBit(core_spill_mask_));
+ __ Push(RegisterList(core_spill_mask_ | extra_regs));
+ GetAssembler()->cfi().AdjustCFAOffset(frame_size);
+ GetAssembler()->cfi().RelOffsetForMany(DWARFReg(kMethodRegister),
+ core_spills_offset,
+ core_spill_mask_,
+ kArmWordSize);
+ if (fpu_spill_mask_ != 0u) {
+ DCHECK(IsPowerOfTwo(fpu_spill_mask_));
+ vixl::aarch32::SRegister sreg(LeastSignificantBit(fpu_spill_mask_));
+ GetAssembler()->StoreSToOffset(sreg, sp, fp_spills_offset);
+ GetAssembler()->cfi().RelOffset(DWARFReg(sreg), /*offset=*/ fp_spills_offset);
+ }
+ } else {
+ __ Push(RegisterList(core_spill_mask_));
+ GetAssembler()->cfi().AdjustCFAOffset(kArmWordSize * POPCOUNT(core_spill_mask_));
+ GetAssembler()->cfi().RelOffsetForMany(DWARFReg(kMethodRegister),
+ /*offset=*/ 0,
+ core_spill_mask_,
+ kArmWordSize);
+ if (fpu_spill_mask_ != 0) {
+ uint32_t first = LeastSignificantBit(fpu_spill_mask_);
+
+ // Check that list is contiguous.
+ DCHECK_EQ(fpu_spill_mask_ >> CTZ(fpu_spill_mask_), ~0u >> (32 - POPCOUNT(fpu_spill_mask_)));
+
+ __ Vpush(SRegisterList(vixl32::SRegister(first), POPCOUNT(fpu_spill_mask_)));
+ GetAssembler()->cfi().AdjustCFAOffset(kArmWordSize * POPCOUNT(fpu_spill_mask_));
+ GetAssembler()->cfi().RelOffsetForMany(DWARFReg(s0),
+ /*offset=*/ 0,
+ fpu_spill_mask_,
+ kArmWordSize);
+ }
+
+ // Adjust SP and save the current method if we need it. Note that we do
+ // not save the method in HCurrentMethod, as the instruction might have
+ // been removed in the SSA graph.
+ if (RequiresCurrentMethod() && fp_spills_offset <= 3 * kArmWordSize) {
+ DCHECK_EQ(kMethodRegister.GetCode(), 0u);
+ __ Push(RegisterList(MaxInt<uint32_t>(fp_spills_offset / kArmWordSize)));
+ GetAssembler()->cfi().AdjustCFAOffset(fp_spills_offset);
+ } else {
+ __ Sub(sp, sp, dchecked_integral_cast<int32_t>(fp_spills_offset));
+ GetAssembler()->cfi().AdjustCFAOffset(fp_spills_offset);
+ if (RequiresCurrentMethod()) {
+ GetAssembler()->StoreToOffset(kStoreWord, kMethodRegister, sp, 0);
+ }
+ }
}
if (GetGraph()->HasShouldDeoptimizeFlag()) {
@@ -2170,27 +2203,55 @@ void CodeGeneratorARMVIXL::GenerateFrameExit() {
__ Bx(lr);
return;
}
- GetAssembler()->cfi().RememberState();
- int adjust = GetFrameSize() - FrameEntrySpillSize();
- __ Add(sp, sp, adjust);
- GetAssembler()->cfi().AdjustCFAOffset(-adjust);
- if (fpu_spill_mask_ != 0) {
- uint32_t first = LeastSignificantBit(fpu_spill_mask_);
-
- // Check that list is contiguous.
- DCHECK_EQ(fpu_spill_mask_ >> CTZ(fpu_spill_mask_), ~0u >> (32 - POPCOUNT(fpu_spill_mask_)));
- __ Vpop(SRegisterList(vixl32::SRegister(first), POPCOUNT(fpu_spill_mask_)));
- GetAssembler()->cfi().AdjustCFAOffset(
- -static_cast<int>(kArmWordSize) * POPCOUNT(fpu_spill_mask_));
- GetAssembler()->cfi().RestoreMany(DWARFReg(vixl32::SRegister(0)), fpu_spill_mask_);
- }
// Pop LR into PC to return.
DCHECK_NE(core_spill_mask_ & (1 << kLrCode), 0U);
uint32_t pop_mask = (core_spill_mask_ & (~(1 << kLrCode))) | 1 << kPcCode;
- __ Pop(RegisterList(pop_mask));
- GetAssembler()->cfi().RestoreState();
- GetAssembler()->cfi().DefCFAOffset(GetFrameSize());
+
+ uint32_t frame_size = GetFrameSize();
+ uint32_t core_spills_offset = frame_size - GetCoreSpillSize();
+ uint32_t fp_spills_offset = frame_size - FrameEntrySpillSize();
+ if ((fpu_spill_mask_ == 0u || IsPowerOfTwo(fpu_spill_mask_)) &&
+ // r4 is blocked by TestCodeGeneratorARMVIXL used by some tests.
+ core_spills_offset <= (blocked_core_registers_[r4.GetCode()] ? 2u : 3u) * kArmWordSize) {
+ // Load the FP spill if any and then do a single POP including the method
+ // and up to two filler registers. If we have no FP spills, this also has
+ // the advantage that we do not need to emit CFI directives.
+ if (fpu_spill_mask_ != 0u) {
+ DCHECK(IsPowerOfTwo(fpu_spill_mask_));
+ vixl::aarch32::SRegister sreg(LeastSignificantBit(fpu_spill_mask_));
+ GetAssembler()->cfi().RememberState();
+ GetAssembler()->LoadSFromOffset(sreg, sp, fp_spills_offset);
+ GetAssembler()->cfi().Restore(DWARFReg(sreg));
+ }
+ // Clobber registers r2-r4 as they are caller-save in ART managed ABI and
+ // never hold the return value.
+ uint32_t extra_regs = MaxInt<uint32_t>(core_spills_offset / kArmWordSize) << r2.GetCode();
+ DCHECK_EQ(extra_regs & kCoreCalleeSaves.GetList(), 0u);
+ DCHECK_LT(MostSignificantBit(extra_regs), LeastSignificantBit(pop_mask));
+ __ Pop(RegisterList(pop_mask | extra_regs));
+ if (fpu_spill_mask_ != 0u) {
+ GetAssembler()->cfi().RestoreState();
+ }
+ } else {
+ GetAssembler()->cfi().RememberState();
+ __ Add(sp, sp, fp_spills_offset);
+ GetAssembler()->cfi().AdjustCFAOffset(-dchecked_integral_cast<int32_t>(fp_spills_offset));
+ if (fpu_spill_mask_ != 0) {
+ uint32_t first = LeastSignificantBit(fpu_spill_mask_);
+
+ // Check that list is contiguous.
+ DCHECK_EQ(fpu_spill_mask_ >> CTZ(fpu_spill_mask_), ~0u >> (32 - POPCOUNT(fpu_spill_mask_)));
+
+ __ Vpop(SRegisterList(vixl32::SRegister(first), POPCOUNT(fpu_spill_mask_)));
+ GetAssembler()->cfi().AdjustCFAOffset(
+ -static_cast<int>(kArmWordSize) * POPCOUNT(fpu_spill_mask_));
+ GetAssembler()->cfi().RestoreMany(DWARFReg(vixl32::SRegister(0)), fpu_spill_mask_);
+ }
+ __ Pop(RegisterList(pop_mask));
+ GetAssembler()->cfi().RestoreState();
+ GetAssembler()->cfi().DefCFAOffset(GetFrameSize());
+ }
}
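For the ARM side handled above, the sketch below (an assumed example, not part of this change) shows the combined PUSH/POP path for a hypothetical method spilling {r5, r6, r7, lr} with a 24-byte frame and no FP spills, so there is a two-word gap between the ArtMethod* slot at sp[0] and the lowest core spill: on entry a single PUSH stores r0 (the method) plus r1 as filler, and on exit a single POP discards the two low words into the caller-save registers r2 and r3, which never carry a return value in the ART managed ABI. It uses a standalone VIXL AArch32 macro assembler rather than CodeGeneratorARMVIXL.

// Minimal sketch (assumed example): old vs. new ARM frame entry/exit for a
// 24-byte frame spilling {r5, r6, r7, lr}, leaving a two-word gap between the
// ArtMethod* slot and the lowest core spill.
#include "aarch32/macro-assembler-aarch32.h"

using namespace vixl::aarch32;

void EmitOldAndNewArmSequences(MacroAssembler* masm) {
  // Bit mask over r0-r15 for the callee-save core spills, mirroring how
  // core_spill_mask_ is used in the code generator.
  const uint32_t core_spills = (1u << r5.GetCode()) | (1u << r6.GetCode()) |
                               (1u << r7.GetCode()) | (1u << lr.GetCode());

  // Old entry: push the callee-saves, open the 8-byte gap, store the method.
  //   push {r5, r6, r7, lr}
  //   sub sp, sp, #8
  //   str r0, [sp]
  masm->Push(RegisterList(core_spills));
  masm->Sub(sp, sp, 8);
  masm->Str(r0, MemOperand(sp, 0));

  // New entry: one PUSH; r0 is the ArtMethod* and r1 fills the gap word.
  //   push {r0, r1, r5, r6, r7, lr}
  masm->Push(RegisterList(core_spills | (1u << r0.GetCode()) | (1u << r1.GetCode())));

  // Old exit: drop the gap, then pop the callee-saves with lr going into pc.
  //   add sp, sp, #8
  //   pop {r5, r6, r7, pc}
  const uint32_t pop_mask = (core_spills & ~(1u << lr.GetCode())) | (1u << pc.GetCode());
  masm->Add(sp, sp, 8);
  masm->Pop(RegisterList(pop_mask));

  // New exit: one POP; the method slot and the filler word land in the
  // caller-save registers r2 and r3, which never hold the return value.
  //   pop {r2, r3, r5, r6, r7, pc}
  masm->Pop(RegisterList(pop_mask | (1u << r2.GetCode()) | (1u << r3.GetCode())));
}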
void CodeGeneratorARMVIXL::Bind(HBasicBlock* block) {
diff --git a/compiler/optimizing/common_arm.h b/compiler/optimizing/common_arm.h
index 7d3af9521a..320915ee57 100644
--- a/compiler/optimizing/common_arm.h
+++ b/compiler/optimizing/common_arm.h
@@ -17,7 +17,6 @@
#ifndef ART_COMPILER_OPTIMIZING_COMMON_ARM_H_
#define ART_COMPILER_OPTIMIZING_COMMON_ARM_H_
-#include "dwarf/register.h"
#include "instruction_simplifier_shared.h"
#include "locations.h"
#include "nodes.h"
@@ -38,14 +37,6 @@ namespace helpers {
static_assert(vixl::aarch32::kSpCode == SP, "vixl::aarch32::kSpCode must equal ART's SP");
-inline dwarf::Reg DWARFReg(vixl::aarch32::Register reg) {
- return dwarf::Reg::ArmCore(static_cast<int>(reg.GetCode()));
-}
-
-inline dwarf::Reg DWARFReg(vixl::aarch32::SRegister reg) {
- return dwarf::Reg::ArmFp(static_cast<int>(reg.GetCode()));
-}
-
inline vixl::aarch32::Register HighRegisterFrom(Location location) {
DCHECK(location.IsRegisterPair()) << location;
return vixl::aarch32::Register(location.AsRegisterPairHigh<vixl::aarch32::Register>());
diff --git a/compiler/utils/arm/assembler_arm_vixl.h b/compiler/utils/arm/assembler_arm_vixl.h
index 98c0191679..59d7eddc63 100644
--- a/compiler/utils/arm/assembler_arm_vixl.h
+++ b/compiler/utils/arm/assembler_arm_vixl.h
@@ -22,6 +22,7 @@
#include "base/arena_containers.h"
#include "base/macros.h"
#include "constants_arm.h"
+#include "dwarf/register.h"
#include "offsets.h"
#include "utils/arm/assembler_arm_shared.h"
#include "utils/arm/managed_register_arm.h"
@@ -39,6 +40,14 @@ namespace vixl32 = vixl::aarch32;
namespace art {
namespace arm {
+inline dwarf::Reg DWARFReg(vixl32::Register reg) {
+ return dwarf::Reg::ArmCore(static_cast<int>(reg.GetCode()));
+}
+
+inline dwarf::Reg DWARFReg(vixl32::SRegister reg) {
+ return dwarf::Reg::ArmFp(static_cast<int>(reg.GetCode()));
+}
+
class ArmVIXLMacroAssembler final : public vixl32::MacroAssembler {
public:
// Most methods fit in a 1KB code buffer, which results in more optimal alloc/realloc and
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index c6c764e3a9..47a067b2a7 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -68,14 +68,6 @@ void ArmVIXLJNIMacroAssembler::FinalizeCode() {
asm_.FinalizeCode();
}
-static dwarf::Reg DWARFReg(vixl32::Register reg) {
- return dwarf::Reg::ArmCore(static_cast<int>(reg.GetCode()));
-}
-
-static dwarf::Reg DWARFReg(vixl32::SRegister reg) {
- return dwarf::Reg::ArmFp(static_cast<int>(reg.GetCode()));
-}
-
static constexpr size_t kFramePointerSize = static_cast<size_t>(kArmPointerSize);
void ArmVIXLJNIMacroAssembler::BuildFrame(size_t frame_size,
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index d7ade058a4..0523797322 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -103,15 +103,6 @@ void Arm64Assembler::JumpTo(ManagedRegister m_base, Offset offs, ManagedRegister
___ Br(reg_x(scratch.AsXRegister()));
}
-static inline dwarf::Reg DWARFReg(CPURegister reg) {
- if (reg.IsFPRegister()) {
- return dwarf::Reg::Arm64Fp(reg.GetCode());
- } else {
- DCHECK_LT(reg.GetCode(), 31u); // X0 - X30.
- return dwarf::Reg::Arm64Core(reg.GetCode());
- }
-}
-
void Arm64Assembler::SpillRegisters(CPURegList registers, int offset) {
int size = registers.GetRegisterSizeInBytes();
const Register sp = vixl_masm_.StackPointer();
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index 9e01a70ea9..594c6b4b75 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -25,6 +25,7 @@
#include "base/arena_containers.h"
#include "base/macros.h"
+#include "dwarf/register.h"
#include "offsets.h"
#include "utils/arm64/managed_register_arm64.h"
#include "utils/assembler.h"
@@ -42,6 +43,15 @@ class Arm64InstructionSetFeatures;
namespace arm64 {
+static inline dwarf::Reg DWARFReg(vixl::aarch64::CPURegister reg) {
+ if (reg.IsFPRegister()) {
+ return dwarf::Reg::Arm64Fp(reg.GetCode());
+ } else {
+ DCHECK_LT(reg.GetCode(), 31u); // X0 - X30.
+ return dwarf::Reg::Arm64Core(reg.GetCode());
+ }
+}
+
#define MEM_OP(...) vixl::aarch64::MemOperand(__VA_ARGS__)
enum LoadOperandType {
diff --git a/libelffile/dwarf/debug_frame_opcode_writer.h b/libelffile/dwarf/debug_frame_opcode_writer.h
index b255f9c1f4..65ca6bf1f4 100644
--- a/libelffile/dwarf/debug_frame_opcode_writer.h
+++ b/libelffile/dwarf/debug_frame_opcode_writer.h
@@ -80,8 +80,10 @@ class DebugFrameOpCodeWriter : private Writer<Vector> {
}
// Custom alias - spill many registers based on bitmask.
- void ALWAYS_INLINE RelOffsetForMany(Reg reg_base, int offset,
- uint32_t reg_mask, int reg_size) {
+ void ALWAYS_INLINE RelOffsetForMany(Reg reg_base,
+ int32_t offset,
+ uint32_t reg_mask,
+ int32_t reg_size) {
DCHECK(reg_size == 4 || reg_size == 8);
if (UNLIKELY(enabled_)) {
for (int i = 0; reg_mask != 0u; reg_mask >>= 1, i++) {