Enable core callee-save on x64.
Support for other architectures and for FP registers will follow in other CLs.
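
For readers, a minimal standalone sketch of the safepoint invariant this
change introduces (illustrative only, not part of the patch; the function
name is hypothetical): at a safepoint that only calls on a slow path, the
recorded register mask is intersected with the callee-save mask, because
caller-save registers are clobbered by the callee.

  #include <cassert>
  #include <cstdint>

  // Keep only callee-save registers in the set of registers holding objects;
  // caller-save registers do not survive the call, so they must not appear
  // in the stack map's register mask.
  uint32_t FilterSafepointRegisterMask(uint32_t object_regs,
                                       uint32_t callee_save_mask) {
    uint32_t mask = object_regs & callee_save_mask;
    // Mirrors the DCHECK in RecordPcInfo: the result is a subset of the
    // callee-save registers.
    assert((mask & callee_save_mask) == mask);
    return mask;
  }
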
Change-Id: I8cef0343eedc7202d206f5217fdf0349035f0e4d
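
An illustrative note on the new frame layout (annotation, not part of the
patch; the allocation below is hypothetical). With kCoreCalleeSaves =
{ RBX, RBP, R12, R13, R14, R15 }, suppose linear scan allocated RBX and R12:

  // Assuming kNumberOfPushedRegistersAtEntry == 1 (the return PC slot) and
  // kX86_64WordSize == 8:
  //   mask = allocated_registers_ & core_callee_save_mask_;  // bits 3 and 12
  //   FrameEntrySpillSize() = 1 * 8 + popcount(mask) * 8 = 8 + 16 = 24 bytes
  // GenerateFrameEntry() walks kCoreCalleeSaves in reverse, so it pushes R12
  // then RBX before the subq; GenerateFrameExit() pops RBX then R12 after
  // the addq restores RSP.
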
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index bc9649f..0af70f9 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -236,7 +236,8 @@
}
}
- SetupBlockedRegisters();
+ static constexpr bool kBaseline = true;
+ SetupBlockedRegisters(kBaseline);
// Allocate all unallocated input locations.
for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
@@ -547,8 +548,18 @@
size_t environment_size = instruction->EnvironmentSize();
- size_t register_mask = 0;
size_t inlining_depth = 0;
+ uint32_t register_mask = locations->GetRegisterMask();
+ if (locations->OnlyCallsOnSlowPath()) {
+ // In the case of a slow path, we currently set the location of caller-save
+ // registers to register (instead of their stack location when pushed before
+ // the slow-path call). Therefore register_mask contains both callee-save and
+ // caller-save registers that hold objects. We must remove the caller-save
+ // registers from the mask, since they will be overwritten by the callee.
+ register_mask &= core_callee_save_mask_;
+ }
+ // The register mask must be a subset of the callee-save registers.
+ DCHECK_EQ(register_mask & core_callee_save_mask_, register_mask);
stack_map_stream_.AddStackMapEntry(
dex_pc, pc_info.native_pc, register_mask,
locations->GetStackMask(), environment_size, inlining_depth);
@@ -684,20 +695,24 @@
RegisterSet* register_set = locations->GetLiveRegisters();
size_t stack_offset = first_register_slot_in_slow_path_;
for (size_t i = 0, e = GetNumberOfCoreRegisters(); i < e; ++i) {
- if (register_set->ContainsCoreRegister(i)) {
- // If the register holds an object, update the stack mask.
- if (locations->RegisterContainsObject(i)) {
- locations->SetStackBit(stack_offset / kVRegSize);
+ if (!IsCoreCalleeSaveRegister(i)) {
+ if (register_set->ContainsCoreRegister(i)) {
+ // If the register holds an object, update the stack mask.
+ if (locations->RegisterContainsObject(i)) {
+ locations->SetStackBit(stack_offset / kVRegSize);
+ }
+ DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
+ stack_offset += SaveCoreRegister(stack_offset, i);
}
- DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
- stack_offset += SaveCoreRegister(stack_offset, i);
}
}
for (size_t i = 0, e = GetNumberOfFloatingPointRegisters(); i < e; ++i) {
- if (register_set->ContainsFloatingPointRegister(i)) {
- DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
- stack_offset += SaveFloatingPointRegister(stack_offset, i);
+ if (!IsFloatingPointCalleeSaveRegister(i)) {
+ if (register_set->ContainsFloatingPointRegister(i)) {
+ DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
+ stack_offset += SaveFloatingPointRegister(stack_offset, i);
+ }
}
}
}
@@ -706,16 +721,20 @@
RegisterSet* register_set = locations->GetLiveRegisters();
size_t stack_offset = first_register_slot_in_slow_path_;
for (size_t i = 0, e = GetNumberOfCoreRegisters(); i < e; ++i) {
- if (register_set->ContainsCoreRegister(i)) {
- DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
- stack_offset += RestoreCoreRegister(stack_offset, i);
+ if (!IsCoreCalleeSaveRegister(i)) {
+ if (register_set->ContainsCoreRegister(i)) {
+ DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
+ stack_offset += RestoreCoreRegister(stack_offset, i);
+ }
}
}
for (size_t i = 0, e = GetNumberOfFloatingPointRegisters(); i < e; ++i) {
- if (register_set->ContainsFloatingPointRegister(i)) {
- DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
- stack_offset += RestoreFloatingPointRegister(stack_offset, i);
+ if (!IsFloatingPointCalleeSaveRegister(i)) {
+ if (register_set->ContainsFloatingPointRegister(i)) {
+ DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
+ stack_offset += RestoreFloatingPointRegister(stack_offset, i);
+ }
}
}
}
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index f66aed9..16080a4 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -127,7 +127,7 @@
size_t GetNumberOfCoreRegisters() const { return number_of_core_registers_; }
size_t GetNumberOfFloatingPointRegisters() const { return number_of_fpu_registers_; }
- virtual void SetupBlockedRegisters() const = 0;
+ virtual void SetupBlockedRegisters(bool is_baseline) const = 0;
virtual void DumpCoreRegister(std::ostream& stream, int reg) const = 0;
virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const = 0;
@@ -151,6 +151,14 @@
}
virtual bool NeedsTwoRegisters(Primitive::Type type) const = 0;
+ bool IsCoreCalleeSaveRegister(int reg) const {
+ return (core_callee_save_mask_ & (1 << reg)) != 0;
+ }
+
+ bool IsFloatingPointCalleeSaveRegister(int reg) const {
+ return (fpu_callee_save_mask_ & (1 << reg)) != 0;
+ }
+
void RecordPcInfo(HInstruction* instruction, uint32_t dex_pc);
bool CanMoveNullCheckToUser(HNullCheck* null_check);
void MaybeRecordImplicitNullCheck(HInstruction* instruction);
@@ -203,11 +211,17 @@
return type == Primitive::kPrimNot && !value->IsIntConstant();
}
+ void AddAllocatedRegister(Location location) {
+ allocated_registers_.Add(location);
+ }
+
protected:
CodeGenerator(HGraph* graph,
size_t number_of_core_registers,
size_t number_of_fpu_registers,
size_t number_of_register_pairs,
+ uint32_t core_callee_save_mask,
+ uint32_t fpu_callee_save_mask,
const CompilerOptions& compiler_options)
: frame_size_(kUninitializedFrameSize),
core_spill_mask_(0),
@@ -218,6 +232,8 @@
number_of_core_registers_(number_of_core_registers),
number_of_fpu_registers_(number_of_fpu_registers),
number_of_register_pairs_(number_of_register_pairs),
+ core_callee_save_mask_(core_callee_save_mask),
+ fpu_callee_save_mask_(fpu_callee_save_mask),
graph_(graph),
compiler_options_(compiler_options),
pc_infos_(graph->GetArena(), 32),
@@ -243,6 +259,9 @@
uint32_t core_spill_mask_;
uint32_t first_register_slot_in_slow_path_;
+ // Registers that were allocated during linear scan.
+ RegisterSet allocated_registers_;
+
// Arrays used when doing register allocation to know which
// registers we can allocate. `SetupBlockedRegisters` updates the
// arrays.
@@ -252,6 +271,8 @@
size_t number_of_core_registers_;
size_t number_of_fpu_registers_;
size_t number_of_register_pairs_;
+ const uint32_t core_callee_save_mask_;
+ const uint32_t fpu_callee_save_mask_;
private:
void InitLocations(HInstruction* instruction);
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index c6a6974..bc8858b 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -390,7 +390,7 @@
const ArmInstructionSetFeatures& isa_features,
const CompilerOptions& compiler_options)
: CodeGenerator(graph, kNumberOfCoreRegisters, kNumberOfSRegisters,
- kNumberOfRegisterPairs, compiler_options),
+ kNumberOfRegisterPairs, 0, 0, compiler_options),
block_labels_(graph->GetArena(), 0),
location_builder_(graph, this),
instruction_visitor_(graph, this),
@@ -453,7 +453,7 @@
return Location();
}
-void CodeGeneratorARM::SetupBlockedRegisters() const {
+void CodeGeneratorARM::SetupBlockedRegisters(bool is_baseline ATTRIBUTE_UNUSED) const {
// Don't allocate the dalvik style register pair passing.
blocked_register_pairs_[R1_R2] = true;
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 0de6669..f3b1ff5 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -197,7 +197,7 @@
return GetLabelOf(block)->Position();
}
- void SetupBlockedRegisters() const OVERRIDE;
+ void SetupBlockedRegisters(bool is_baseline) const OVERRIDE;
Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 760d2be..21c1e9c 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -567,6 +567,8 @@
kNumberOfAllocatableRegisters,
kNumberOfAllocatableFPRegisters,
kNumberOfAllocatableRegisterPairs,
+ 0,
+ 0,
compiler_options),
block_labels_(nullptr),
location_builder_(graph, this),
@@ -729,7 +731,7 @@
__ Bind(&done);
}
-void CodeGeneratorARM64::SetupBlockedRegisters() const {
+void CodeGeneratorARM64::SetupBlockedRegisters(bool is_baseline ATTRIBUTE_UNUSED) const {
// Block reserved registers:
// ip0 (VIXL temporary)
// ip1 (VIXL temporary)
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 27c6fbd..d81e481 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -217,7 +217,7 @@
// Register allocation.
- void SetupBlockedRegisters() const OVERRIDE;
+ void SetupBlockedRegisters(bool is_baseline) const OVERRIDE;
// AllocateFreeRegister() is only used when allocating registers locally
// during CompileBaseline().
Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE;
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 2d30412..9e26ddd 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -375,7 +375,7 @@
CodeGeneratorX86::CodeGeneratorX86(HGraph* graph, const CompilerOptions& compiler_options)
: CodeGenerator(graph, kNumberOfCpuRegisters, kNumberOfXmmRegisters,
- kNumberOfRegisterPairs, compiler_options),
+ kNumberOfRegisterPairs, 0, 0, compiler_options),
block_labels_(graph->GetArena(), 0),
location_builder_(graph, this),
instruction_visitor_(graph, this),
@@ -431,7 +431,7 @@
return Location();
}
-void CodeGeneratorX86::SetupBlockedRegisters() const {
+void CodeGeneratorX86::SetupBlockedRegisters(bool is_baseline ATTRIBUTE_UNUSED) const {
// Don't allocate the dalvik style register pair passing.
blocked_register_pairs_[ECX_EDX] = true;
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index a9086f8..dcfeb2f 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -196,7 +196,7 @@
return GetLabelOf(block)->Position();
}
- void SetupBlockedRegisters() const OVERRIDE;
+ void SetupBlockedRegisters(bool is_baseline) const OVERRIDE;
Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index da83b76..285003d 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -46,6 +46,7 @@
static constexpr FloatRegister kRuntimeParameterFpuRegisters[] = { XMM0, XMM1 };
static constexpr size_t kRuntimeParameterFpuRegistersLength =
arraysize(kRuntimeParameterFpuRegisters);
+static constexpr Register kCoreCalleeSaves[] = { RBX, RBP, R12, R13, R14, R15 };
static constexpr int kC2ConditionMask = 0x400;
@@ -416,17 +417,27 @@
return kX86_64WordSize;
}
+static uint32_t ComputeCoreCalleeSaveMask() {
+ uint32_t mask = 0;
+ for (size_t i = 0, e = arraysize(kCoreCalleeSaves); i < e; ++i) {
+ mask |= (1 << kCoreCalleeSaves[i]);
+ }
+ return mask;
+}
+
CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, const CompilerOptions& compiler_options)
- : CodeGenerator(graph, kNumberOfCpuRegisters, kNumberOfFloatRegisters, 0, compiler_options),
+ : CodeGenerator(graph,
+ kNumberOfCpuRegisters,
+ kNumberOfFloatRegisters,
+ 0,
+ ComputeCoreCalleeSaveMask(),
+ 0,
+ compiler_options),
block_labels_(graph->GetArena(), 0),
location_builder_(graph, this),
instruction_visitor_(graph, this),
move_resolver_(graph->GetArena(), this) {}
-size_t CodeGeneratorX86_64::FrameEntrySpillSize() const {
- return kNumberOfPushedRegistersAtEntry * kX86_64WordSize;
-}
-
InstructionCodeGeneratorX86_64::InstructionCodeGeneratorX86_64(HGraph* graph,
CodeGeneratorX86_64* codegen)
: HGraphVisitor(graph),
@@ -459,21 +470,26 @@
return Location();
}
-void CodeGeneratorX86_64::SetupBlockedRegisters() const {
+size_t CodeGeneratorX86_64::FrameEntrySpillSize() const {
+ uint32_t mask = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_;
+ return kNumberOfPushedRegistersAtEntry * kX86_64WordSize
+ + __builtin_popcount(mask) * kX86_64WordSize;
+}
+
+void CodeGeneratorX86_64::SetupBlockedRegisters(bool is_baseline) const {
// Stack register is always reserved.
blocked_core_registers_[RSP] = true;
// Block the register used as TMP.
blocked_core_registers_[TMP] = true;
- // TODO: We currently don't use Quick's callee saved registers.
- blocked_core_registers_[RBX] = true;
- blocked_core_registers_[RBP] = true;
- blocked_core_registers_[R12] = true;
- blocked_core_registers_[R13] = true;
- blocked_core_registers_[R14] = true;
- blocked_core_registers_[R15] = true;
+ if (is_baseline) {
+ for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) {
+ blocked_core_registers_[kCoreCalleeSaves[i]] = true;
+ }
+ }
+ // TODO: We currently don't use Quick's FP callee-saved registers.
blocked_fpu_registers_[XMM12] = true;
blocked_fpu_registers_[XMM13] = true;
blocked_fpu_registers_[XMM14] = true;
@@ -484,6 +500,7 @@
// Create a fake register to mimic Quick.
static const int kFakeReturnRegister = 16;
core_spill_mask_ |= (1 << kFakeReturnRegister);
+ core_spill_mask_ |= (allocated_registers_.GetCoreRegisters() & core_callee_save_mask_);
bool skip_overflow_check = IsLeafMethod()
&& !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kX86_64);
@@ -494,10 +511,14 @@
CpuRegister(RSP), -static_cast<int32_t>(GetStackOverflowReservedBytes(kX86_64))));
RecordPcInfo(nullptr, 0);
}
+
+ for (int i = arraysize(kCoreCalleeSaves) - 1; i >= 0; --i) {
+ if (allocated_registers_.ContainsCoreRegister(kCoreCalleeSaves[i])) {
+ __ pushq(CpuRegister(kCoreCalleeSaves[i]));
+ }
+ }
- // The return PC has already been pushed on the stack.
- __ subq(CpuRegister(RSP),
- Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86_64WordSize));
+ __ subq(CpuRegister(RSP), Immediate(GetFrameSize() - FrameEntrySpillSize()));
if (!skip_overflow_check && !implicitStackOverflowChecks) {
SlowPathCodeX86_64* slow_path = new (GetGraph()->GetArena()) StackOverflowCheckSlowPathX86_64();
@@ -512,8 +533,13 @@
}
void CodeGeneratorX86_64::GenerateFrameExit() {
- __ addq(CpuRegister(RSP),
- Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86_64WordSize));
+ __ addq(CpuRegister(RSP), Immediate(GetFrameSize() - FrameEntrySpillSize()));
+
+ for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) {
+ if (allocated_registers_.ContainsCoreRegister(kCoreCalleeSaves[i])) {
+ __ popq(CpuRegister(kCoreCalleeSaves[i]));
+ }
+ }
}
void CodeGeneratorX86_64::Bind(HBasicBlock* block) {
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index ead771a..645fb17 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -218,7 +218,7 @@
Location GetStackLocation(HLoadLocal* load) const OVERRIDE;
- void SetupBlockedRegisters() const OVERRIDE;
+ void SetupBlockedRegisters(bool is_baseline) const OVERRIDE;
Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE;
void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE;
void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE;
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index dda6c94..6bf8f77 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -431,6 +431,14 @@
return __builtin_popcount(core_registers_) + __builtin_popcount(floating_point_registers_);
}
+ uint32_t GetCoreRegisters() const {
+ return core_registers_;
+ }
+
+ uint32_t GetFloatingPointRegisters() const {
+ return floating_point_registers_;
+ }
+
private:
uint32_t core_registers_;
uint32_t floating_point_registers_;
@@ -529,6 +537,10 @@
register_mask_ |= (1 << reg_id);
}
+ uint32_t GetRegisterMask() const {
+ return register_mask_;
+ }
+
bool RegisterContainsObject(uint32_t reg_id) {
return RegisterSet::Contains(register_mask_, reg_id);
}
@@ -557,7 +569,14 @@
return false;
}
Location input = inputs_.Get(input_index);
- if (input.IsRegister() || input.IsFpuRegister() || input.IsPair()) {
+ if (input.IsRegister()
+ || input.IsFpuRegister()
+ || input.IsPair()
+ || input.IsStackSlot()
+ || input.IsDoubleStackSlot()) {
+ // For fixed locations, the register allocator requires inputs to die before
+ // the instruction, so that input moves use the location of the input just
+ // before that instruction (and not a location produced by moves due to
+ // interval splitting).
return false;
}
return true;
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index e120bc6..260076a 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -58,7 +58,8 @@
reserved_out_slots_(0),
maximum_number_of_live_core_registers_(0),
maximum_number_of_live_fp_registers_(0) {
- codegen->SetupBlockedRegisters();
+ static constexpr bool kIsBaseline = false;
+ codegen->SetupBlockedRegisters(kIsBaseline);
physical_core_register_intervals_.SetSize(codegen->GetNumberOfCoreRegisters());
physical_fp_register_intervals_.SetSize(codegen->GetNumberOfFloatingPointRegisters());
// Always reserve for the current method and the graph's max out registers.
@@ -278,14 +279,18 @@
if (locations->WillCall()) {
// Block all registers.
for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) {
- BlockRegister(Location::RegisterLocation(i),
- position,
- position + 1);
+ if (!codegen_->IsCoreCalleeSaveRegister(i)) {
+ BlockRegister(Location::RegisterLocation(i),
+ position,
+ position + 1);
+ }
}
for (size_t i = 0; i < codegen_->GetNumberOfFloatingPointRegisters(); ++i) {
- BlockRegister(Location::FpuRegisterLocation(i),
- position,
- position + 1);
+ if (!codegen_->IsFloatingPointCalleeSaveRegister(i)) {
+ BlockRegister(Location::FpuRegisterLocation(i),
+ position,
+ position + 1);
+ }
}
}
@@ -627,6 +632,9 @@
// (6) If the interval had a register allocated, add it to the list of active
// intervals.
if (success) {
+ codegen_->AddAllocatedRegister(processing_core_registers_
+ ? Location::RegisterLocation(current->GetRegister())
+ : Location::FpuRegisterLocation(current->GetRegister()));
active_.Add(current);
if (current->HasHighInterval() && !current->GetHighInterval()->HasRegister()) {
current->GetHighInterval()->SetRegister(GetHighForLowRegister(current->GetRegister()));
@@ -1357,9 +1365,11 @@
switch (source.GetKind()) {
case Location::kRegister: {
locations->AddLiveRegister(source);
- DCHECK_LE(locations->GetNumberOfLiveRegisters(),
- maximum_number_of_live_core_registers_ +
- maximum_number_of_live_fp_registers_);
+ if (kIsDebugBuild && locations->OnlyCallsOnSlowPath()) {
+ DCHECK_LE(locations->GetNumberOfLiveRegisters(),
+ maximum_number_of_live_core_registers_ +
+ maximum_number_of_live_fp_registers_);
+ }
if (current->GetType() == Primitive::kPrimNot) {
locations->SetRegisterBit(source.reg());
}