Diffstat (limited to 'compiler/optimizing')
70 files changed, 9852 insertions, 4176 deletions
diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc index 1fc247faf1..8aefd9ea1f 100644 --- a/compiler/optimizing/bounds_check_elimination.cc +++ b/compiler/optimizing/bounds_check_elimination.cc @@ -533,9 +533,6 @@ class BCEVisitor : public HGraphVisitor { first_index_bounds_check_map_( std::less<int>(), graph->GetArena()->Adapter(kArenaAllocBoundsCheckElimination)), - dynamic_bce_standby_( - graph->GetArena()->Adapter(kArenaAllocBoundsCheckElimination)), - record_dynamic_bce_standby_(true), early_exit_loop_( std::less<uint32_t>(), graph->GetArena()->Adapter(kArenaAllocBoundsCheckElimination)), @@ -560,14 +557,6 @@ class BCEVisitor : public HGraphVisitor { } void Finish() { - // Retry dynamic bce candidates on standby that are still in the graph. - record_dynamic_bce_standby_ = false; - for (HBoundsCheck* bounds_check : dynamic_bce_standby_) { - if (bounds_check->IsInBlock()) { - TryDynamicBCE(bounds_check); - } - } - // Preserve SSA structure which may have been broken by adding one or more // new taken-test structures (see TransformLoopForDeoptimizationIfNeeded()). InsertPhiNodes(); @@ -576,7 +565,6 @@ class BCEVisitor : public HGraphVisitor { early_exit_loop_.clear(); taken_test_loop_.clear(); finite_loop_.clear(); - dynamic_bce_standby_.clear(); } private: @@ -832,7 +820,6 @@ class BCEVisitor : public HGraphVisitor { array_length->IsArrayLength() || array_length->IsPhi()); bool try_dynamic_bce = true; - // Analyze index range. if (!index->IsIntConstant()) { // Non-constant index. @@ -896,10 +883,20 @@ class BCEVisitor : public HGraphVisitor { // If static analysis fails, and OOB is not certain, try dynamic elimination. if (try_dynamic_bce) { // Try loop-based dynamic elimination. - if (TryDynamicBCE(bounds_check)) { + HLoopInformation* loop = bounds_check->GetBlock()->GetLoopInformation(); + bool needs_finite_test = false; + bool needs_taken_test = false; + if (DynamicBCESeemsProfitable(loop, bounds_check->GetBlock()) && + induction_range_.CanGenerateCode( + bounds_check, index, &needs_finite_test, &needs_taken_test) && + CanHandleInfiniteLoop(loop, index, needs_finite_test) && + // Do this test last, since it may generate code. + CanHandleLength(loop, array_length, needs_taken_test)) { + TransformLoopForDeoptimizationIfNeeded(loop, needs_taken_test); + TransformLoopForDynamicBCE(loop, bounds_check); return; } - // Prepare dominator-based dynamic elimination. + // Otherwise, prepare dominator-based dynamic elimination. if (first_index_bounds_check_map_.find(array_length->GetId()) == first_index_bounds_check_map_.end()) { // Remember the first bounds check against each array_length. That bounds check @@ -1180,7 +1177,7 @@ class BCEVisitor : public HGraphVisitor { } } - // Perform dominator-based dynamic elimination on suitable set of bounds checks. + /** Performs dominator-based dynamic elimination on suitable set of bounds checks. */ void AddCompareWithDeoptimization(HBasicBlock* block, HInstruction* array_length, HInstruction* base, @@ -1190,6 +1187,12 @@ class BCEVisitor : public HGraphVisitor { // Construct deoptimization on single or double bounds on range [base-min_c,base+max_c], // for example either for a[0]..a[3] just 3 or for a[base-1]..a[base+3] both base-1 // and base+3, since we made the assumption any in between value may occur too. 
+ // In code, using unsigned comparisons: + // (1) constants only + // if (max_c >= a.length) deoptimize; + // (2) general case + // if (base-min_c > base+max_c) deoptimize; + // if (base+max_c >= a.length ) deoptimize; static_assert(kMaxLengthForAddingDeoptimize < std::numeric_limits<int32_t>::max(), "Incorrect max length may be subject to arithmetic wrap-around"); HInstruction* upper = GetGraph()->GetIntConstant(max_c); @@ -1208,7 +1211,7 @@ class BCEVisitor : public HGraphVisitor { has_dom_based_dynamic_bce_ = true; } - // Attempt dominator-based dynamic elimination on remaining candidates. + /** Attempts dominator-based dynamic elimination on remaining candidates. */ void AddComparesWithDeoptimization(HBasicBlock* block) { for (const auto& entry : first_index_bounds_check_map_) { HBoundsCheck* bounds_check = entry.second; @@ -1272,17 +1275,19 @@ class BCEVisitor : public HGraphVisitor { candidates.push_back(other_bounds_check); } } - // Perform dominator-based deoptimization if it seems profitable. Note that we reject cases - // where the distance min_c:max_c range gets close to the maximum possible array length, - // since those cases are likely to always deopt (such situations do not necessarily go - // OOB, though, since the programmer could rely on wrap-around from max to min). + // Perform dominator-based deoptimization if it seems profitable, where we eliminate + // bounds checks and replace these with deopt checks that guard against any possible + // OOB. Note that we reject cases where the distance min_c:max_c range gets close to + // the maximum possible array length, since those cases are likely to always deopt + // (such situations do not necessarily go OOB, though, since the array could be really + // large, or the programmer could rely on arithmetic wrap-around from max to min). size_t threshold = kThresholdForAddingDeoptimize + (base == nullptr ? 0 : 1); // extra test? uint32_t distance = static_cast<uint32_t>(max_c) - static_cast<uint32_t>(min_c); if (candidates.size() >= threshold && (base != nullptr || min_c >= 0) && // reject certain OOB distance <= kMaxLengthForAddingDeoptimize) { // reject likely/certain deopt AddCompareWithDeoptimization(block, array_length, base, min_c, max_c); - for (HInstruction* other_bounds_check : candidates) { + for (HBoundsCheck* other_bounds_check : candidates) { // Only replace if still in the graph. This avoids visiting the same // bounds check twice if it occurred multiple times in the use list. if (other_bounds_check->IsInBlock()) { @@ -1328,45 +1333,127 @@ class BCEVisitor : public HGraphVisitor { } /** - * When the compiler fails to remove a bounds check statically, we try to remove the bounds - * check dynamically by adding runtime tests that trigger a deoptimization in case bounds - * will go out of range (we want to be rather certain of that given the slowdown of - * deoptimization). If no deoptimization occurs, the loop is executed with all corresponding - * bounds checks and related null checks removed. + * Performs loop-based dynamic elimination on a bounds check. In order to minimize the + * number of eventually generated tests, related bounds checks with tests that can be + * combined with tests for the given bounds check are collected first. 
*/ - bool TryDynamicBCE(HBoundsCheck* instruction) { - HLoopInformation* loop = instruction->GetBlock()->GetLoopInformation(); - HInstruction* index = instruction->InputAt(0); - HInstruction* length = instruction->InputAt(1); - // If dynamic bounds check elimination seems profitable and is possible, then proceed. - bool needs_finite_test = false; - bool needs_taken_test = false; - if (DynamicBCESeemsProfitable(loop, instruction->GetBlock()) && - induction_range_.CanGenerateCode( - instruction, index, &needs_finite_test, &needs_taken_test) && - CanHandleInfiniteLoop(loop, instruction, index, needs_finite_test) && - CanHandleLength(loop, length, needs_taken_test)) { // do this test last (may code gen) - HInstruction* lower = nullptr; - HInstruction* upper = nullptr; - // Generate the following unsigned comparisons - // if (lower > upper) deoptimize; - // if (upper >= length) deoptimize; - // or, for a non-induction index, just the unsigned comparison on its 'upper' value - // if (upper >= length) deoptimize; - // as runtime test. By restricting dynamic bce to unit strides (with a maximum of 32-bit - // iterations) and by not combining access (e.g. a[i], a[i-3], a[i+5] etc.), these tests - // correctly guard against any possible OOB (including arithmetic wrap-around cases). - TransformLoopForDeoptimizationIfNeeded(loop, needs_taken_test); - HBasicBlock* block = GetPreHeader(loop, instruction); - induction_range_.GenerateRangeCode(instruction, index, GetGraph(), block, &lower, &upper); - if (lower != nullptr) { - InsertDeoptInLoop(loop, block, new (GetGraph()->GetArena()) HAbove(lower, upper)); - } - InsertDeoptInLoop(loop, block, new (GetGraph()->GetArena()) HAboveOrEqual(upper, length)); - ReplaceInstruction(instruction, index); - return true; + void TransformLoopForDynamicBCE(HLoopInformation* loop, HBoundsCheck* bounds_check) { + HInstruction* index = bounds_check->InputAt(0); + HInstruction* array_length = bounds_check->InputAt(1); + DCHECK(loop->IsDefinedOutOfTheLoop(array_length)); // pre-checked + DCHECK(loop->DominatesAllBackEdges(bounds_check->GetBlock())); + // Collect all bounds checks in the same loop that are related as "a[base + constant]" + // for a base instruction (possibly absent) and various constants. + ValueBound value = ValueBound::AsValueBound(index); + HInstruction* base = value.GetInstruction(); + int32_t min_c = base == nullptr ? 0 : value.GetConstant(); + int32_t max_c = value.GetConstant(); + ArenaVector<HBoundsCheck*> candidates( + GetGraph()->GetArena()->Adapter(kArenaAllocBoundsCheckElimination)); + ArenaVector<HBoundsCheck*> standby( + GetGraph()->GetArena()->Adapter(kArenaAllocBoundsCheckElimination)); + for (const HUseListNode<HInstruction*>& use : array_length->GetUses()) { + HInstruction* user = use.GetUser(); + if (user->IsBoundsCheck() && loop == user->GetBlock()->GetLoopInformation()) { + HBoundsCheck* other_bounds_check = user->AsBoundsCheck(); + HInstruction* other_index = other_bounds_check->InputAt(0); + HInstruction* other_array_length = other_bounds_check->InputAt(1); + ValueBound other_value = ValueBound::AsValueBound(other_index); + int32_t other_c = other_value.GetConstant(); + if (array_length == other_array_length && base == other_value.GetInstruction()) { + // Does the current basic block dominate all back edges? If not, + // add this candidate later only if it falls into the range. 
+ if (!loop->DominatesAllBackEdges(user->GetBlock())) { + standby.push_back(other_bounds_check); + continue; + } + min_c = std::min(min_c, other_c); + max_c = std::max(max_c, other_c); + candidates.push_back(other_bounds_check); + } + } + } + // Add standby candidates that fall in selected range. + for (HBoundsCheck* other_bounds_check : standby) { + HInstruction* other_index = other_bounds_check->InputAt(0); + int32_t other_c = ValueBound::AsValueBound(other_index).GetConstant(); + if (min_c <= other_c && other_c <= max_c) { + candidates.push_back(other_bounds_check); + } + } + // Perform loop-based deoptimization if it seems profitable, where we eliminate bounds + // checks and replace these with deopt checks that guard against any possible OOB. + DCHECK_LT(0u, candidates.size()); + uint32_t distance = static_cast<uint32_t>(max_c) - static_cast<uint32_t>(min_c); + if ((base != nullptr || min_c >= 0) && // reject certain OOB + distance <= kMaxLengthForAddingDeoptimize) { // reject likely/certain deopt + HBasicBlock* block = GetPreHeader(loop, bounds_check); + HInstruction* min_lower = nullptr; + HInstruction* min_upper = nullptr; + HInstruction* max_lower = nullptr; + HInstruction* max_upper = nullptr; + // Iterate over all bounds checks. + for (HBoundsCheck* other_bounds_check : candidates) { + // Only handle if still in the graph. This avoids visiting the same + // bounds check twice if it occurred multiple times in the use list. + if (other_bounds_check->IsInBlock()) { + HInstruction* other_index = other_bounds_check->InputAt(0); + int32_t other_c = ValueBound::AsValueBound(other_index).GetConstant(); + // Generate code for either the maximum or minimum. Range analysis already was queried + // whether code generation on the original and, thus, related bounds check was possible. + // It handles either loop invariants (lower is not set) or unit strides. + if (other_c == max_c) { + induction_range_.GenerateRangeCode( + other_bounds_check, other_index, GetGraph(), block, &max_lower, &max_upper); + } else if (other_c == min_c && base != nullptr) { + induction_range_.GenerateRangeCode( + other_bounds_check, other_index, GetGraph(), block, &min_lower, &min_upper); + } + ReplaceInstruction(other_bounds_check, other_index); + } + } + // In code, using unsigned comparisons: + // (1) constants only + // if (max_upper >= a.length ) deoptimize; + // (2) two symbolic invariants + // if (min_upper > max_upper) deoptimize; unless min_c == max_c + // if (max_upper >= a.length ) deoptimize; + // (3) general case, unit strides (where lower would exceed upper for arithmetic wrap-around) + // if (min_lower > max_lower) deoptimize; unless min_c == max_c + // if (max_lower > max_upper) deoptimize; + // if (max_upper >= a.length ) deoptimize; + if (base == nullptr) { + // Constants only. + DCHECK_GE(min_c, 0); + DCHECK(min_lower == nullptr && min_upper == nullptr && + max_lower == nullptr && max_upper != nullptr); + } else if (max_lower == nullptr) { + // Two symbolic invariants. + if (min_c != max_c) { + DCHECK(min_lower == nullptr && min_upper != nullptr && + max_lower == nullptr && max_upper != nullptr); + InsertDeoptInLoop(loop, block, new (GetGraph()->GetArena()) HAbove(min_upper, max_upper)); + } else { + DCHECK(min_lower == nullptr && min_upper == nullptr && + max_lower == nullptr && max_upper != nullptr); + } + } else { + // General case, unit strides. 
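Both the dominator-based guards in AddCompareWithDeoptimization above and the loop-based guards being built here rely on the same unsigned-comparison trick: a single unsigned `x >= length` test also rejects negative `x`, and the `lower > upper` test catches arithmetic wrap-around. A minimal standalone sketch of that reasoning, not ART code, with all concrete values chosen purely for illustration:

```cpp
#include <cstdint>
#include <cstdio>

// Standalone sketch (not ART code) of the guards in case (2) above, with
// hypothetical base/min_c/max_c/length values.
bool WouldDeoptimize(int32_t base, int32_t min_c, int32_t max_c, uint32_t length) {
  // Unsigned arithmetic mirrors the generated code and keeps wrap-around well defined.
  uint32_t lower = static_cast<uint32_t>(base) + static_cast<uint32_t>(min_c);  // base+min_c
  uint32_t upper = static_cast<uint32_t>(base) + static_cast<uint32_t>(max_c);  // base+max_c
  // if (base+min_c > base+max_c) deoptimize;  -- catches arithmetic wrap-around.
  if (lower > upper) {
    return true;
  }
  // if (base+max_c >= a.length) deoptimize;   -- a "negative" upper bound shows up as a
  // huge unsigned value, so this single test also rejects negative indices.
  return upper >= length;
}

int main() {
  std::printf("%d\n", WouldDeoptimize(5, -1, 3, 10));   // 0: a[4]..a[8] fit in length 10
  std::printf("%d\n", WouldDeoptimize(8, -1, 3, 10));   // 1: a[11] would be out of bounds
  std::printf("%d\n", WouldDeoptimize(-2, 0, 3, 10));   // 1: wrap-around, caught by lower > upper
  return 0;
}
```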
+ if (min_c != max_c) { + DCHECK(min_lower != nullptr && min_upper != nullptr && + max_lower != nullptr && max_upper != nullptr); + InsertDeoptInLoop(loop, block, new (GetGraph()->GetArena()) HAbove(min_lower, max_lower)); + } else { + DCHECK(min_lower == nullptr && min_upper == nullptr && + max_lower != nullptr && max_upper != nullptr); + } + InsertDeoptInLoop(loop, block, new (GetGraph()->GetArena()) HAbove(max_lower, max_upper)); + } + InsertDeoptInLoop( + loop, block, new (GetGraph()->GetArena()) HAboveOrEqual(max_upper, array_length)); + } else { + // TODO: if rejected, avoid doing this again for subsequent instructions in this set? } - return false; } /** @@ -1474,8 +1561,7 @@ class BCEVisitor : public HGraphVisitor { * of the loop to use, dynamic bce in such cases is only allowed if other tests * ensure the loop is finite. */ - bool CanHandleInfiniteLoop( - HLoopInformation* loop, HBoundsCheck* check, HInstruction* index, bool needs_infinite_test) { + bool CanHandleInfiniteLoop(HLoopInformation* loop, HInstruction* index, bool needs_infinite_test) { if (needs_infinite_test) { // If we already forced the loop to be finite, allow directly. const uint32_t loop_id = loop->GetHeader()->GetBlockId(); @@ -1497,11 +1583,6 @@ class BCEVisitor : public HGraphVisitor { } } } - // If bounds check made it this far, it is worthwhile to check later if - // the loop was forced finite by another candidate. - if (record_dynamic_bce_standby_) { - dynamic_bce_standby_.push_back(check); - } return false; } return true; @@ -1727,10 +1808,6 @@ class BCEVisitor : public HGraphVisitor { // in a block that checks an index against that HArrayLength. ArenaSafeMap<int, HBoundsCheck*> first_index_bounds_check_map_; - // Stand by list for dynamic bce. - ArenaVector<HBoundsCheck*> dynamic_bce_standby_; - bool record_dynamic_bce_standby_; - // Early-exit loop bookkeeping. ArenaSafeMap<uint32_t, bool> early_exit_loop_; diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index 4520f9b3e3..c532e72465 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -137,7 +137,7 @@ size_t CodeGenerator::GetCacheOffset(uint32_t index) { size_t CodeGenerator::GetCachePointerOffset(uint32_t index) { auto pointer_size = InstructionSetPointerSize(GetInstructionSet()); - return pointer_size * index; + return static_cast<size_t>(pointer_size) * index; } uint32_t CodeGenerator::GetArrayLengthOffset(HArrayLength* array_length) { @@ -291,7 +291,8 @@ void CodeGenerator::InitializeCodeGeneration(size_t number_of_spill_slots, DCHECK(!block_order.empty()); DCHECK(block_order[0] == GetGraph()->GetEntryBlock()); ComputeSpillMask(); - first_register_slot_in_slow_path_ = (number_of_out_slots + number_of_spill_slots) * kVRegSize; + first_register_slot_in_slow_path_ = RoundUp( + (number_of_out_slots + number_of_spill_slots) * kVRegSize, GetPreferredSlotsAlignment()); if (number_of_spill_slots == 0 && !HasAllocatedCalleeSaveRegisters() @@ -302,8 +303,7 @@ void CodeGenerator::InitializeCodeGeneration(size_t number_of_spill_slots, SetFrameSize(CallPushesPC() ? 
GetWordSize() : 0); } else { SetFrameSize(RoundUp( - number_of_spill_slots * kVRegSize - + number_of_out_slots * kVRegSize + first_register_slot_in_slow_path_ + maximum_number_of_live_core_registers * GetWordSize() + maximum_number_of_live_fpu_registers * GetFloatingPointSpillSlotSize() + FrameEntrySpillSize(), @@ -314,7 +314,8 @@ void CodeGenerator::InitializeCodeGeneration(size_t number_of_spill_slots, void CodeGenerator::CreateCommonInvokeLocationSummary( HInvoke* invoke, InvokeDexCallingConventionVisitor* visitor) { ArenaAllocator* allocator = invoke->GetBlock()->GetGraph()->GetArena(); - LocationSummary* locations = new (allocator) LocationSummary(invoke, LocationSummary::kCall); + LocationSummary* locations = new (allocator) LocationSummary(invoke, + LocationSummary::kCallOnMainOnly); for (size_t i = 0; i < invoke->GetNumberOfArguments(); i++) { HInstruction* input = invoke->InputAt(i); @@ -378,7 +379,7 @@ void CodeGenerator::CreateUnresolvedFieldLocationSummary( ArenaAllocator* allocator = field_access->GetBlock()->GetGraph()->GetArena(); LocationSummary* locations = - new (allocator) LocationSummary(field_access, LocationSummary::kCall); + new (allocator) LocationSummary(field_access, LocationSummary::kCallOnMainOnly); locations->AddTemp(calling_convention.GetFieldIndexLocation()); @@ -499,7 +500,7 @@ void CodeGenerator::CreateLoadClassLocationSummary(HLoadClass* cls, bool code_generator_supports_read_barrier) { ArenaAllocator* allocator = cls->GetBlock()->GetGraph()->GetArena(); LocationSummary::CallKind call_kind = cls->NeedsAccessCheck() - ? LocationSummary::kCall + ? LocationSummary::kCallOnMainOnly : (((code_generator_supports_read_barrier && kEmitCompilerReadBarrier) || cls->CanCallRuntime()) ? LocationSummary::kCallOnSlowPath @@ -764,16 +765,24 @@ void CodeGenerator::RecordPcInfo(HInstruction* instruction, LocationSummary* locations = instruction->GetLocations(); uint32_t register_mask = locations->GetRegisterMask(); - if (locations->OnlyCallsOnSlowPath()) { - // In case of slow path, we currently set the location of caller-save registers - // to register (instead of their stack location when pushed before the slow-path - // call). Therefore register_mask contains both callee-save and caller-save - // registers that hold objects. We must remove the caller-save from the mask, since - // they will be overwritten by the callee. - register_mask &= core_callee_save_mask_; + if (instruction->IsSuspendCheck()) { + // Suspend check has special ABI that saves the caller-save registers in callee, + // so we want to emit stack maps containing the registers. + // TODO: Register allocator still reserves space for the caller-save registers. + // We should add slow-path-specific caller-save information into LocationSummary + // and refactor the code here as well as in the register allocator to use it. + } else { + if (locations->OnlyCallsOnSlowPath()) { + // In case of slow path, we currently set the location of caller-save registers + // to register (instead of their stack location when pushed before the slow-path + // call). Therefore register_mask contains both callee-save and caller-save + // registers that hold objects. We must remove the caller-save from the mask, since + // they will be overwritten by the callee. + register_mask &= core_callee_save_mask_; + } + // The register mask must be a subset of callee-save registers. + DCHECK_EQ(register_mask & core_callee_save_mask_, register_mask); } - // The register mask must be a subset of callee-save registers. 
- DCHECK_EQ(register_mask & core_callee_save_mask_, register_mask); stack_map_stream_.BeginStackMapEntry(outer_dex_pc, native_pc, register_mask, @@ -1173,23 +1182,23 @@ void CodeGenerator::ValidateInvokeRuntime(HInstruction* instruction, SlowPathCod << "instruction->DebugName()=" << instruction->DebugName() << " instruction->GetSideEffects().ToString()=" << instruction->GetSideEffects().ToString(); } else { - DCHECK(instruction->GetLocations()->OnlyCallsOnSlowPath() || slow_path->IsFatal()) + DCHECK(instruction->GetLocations()->CallsOnSlowPath() || slow_path->IsFatal()) << "instruction->DebugName()=" << instruction->DebugName() << " slow_path->GetDescription()=" << slow_path->GetDescription(); DCHECK(instruction->GetSideEffects().Includes(SideEffects::CanTriggerGC()) || - // When read barriers are enabled, some instructions use a - // slow path to emit a read barrier, which does not trigger - // GC, is not fatal, nor is emitted by HDeoptimize - // instructions. + // When (non-Baker) read barriers are enabled, some instructions + // use a slow path to emit a read barrier, which does not trigger + // GC. (kEmitCompilerReadBarrier && + !kUseBakerReadBarrier && (instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet() || - instruction->IsArraySet() || instruction->IsArrayGet() || instruction->IsLoadClass() || instruction->IsLoadString() || instruction->IsInstanceOf() || - instruction->IsCheckCast()))) + instruction->IsCheckCast() || + (instruction->IsInvokeVirtual() && instruction->GetLocations()->Intrinsified())))) << "instruction->DebugName()=" << instruction->DebugName() << " instruction->GetSideEffects().ToString()=" << instruction->GetSideEffects().ToString() << " slow_path->GetDescription()=" << slow_path->GetDescription(); @@ -1203,6 +1212,28 @@ void CodeGenerator::ValidateInvokeRuntime(HInstruction* instruction, SlowPathCod << instruction->DebugName() << ((slow_path != nullptr) ? slow_path->GetDescription() : ""); } +void CodeGenerator::ValidateInvokeRuntimeWithoutRecordingPcInfo(HInstruction* instruction, + SlowPathCode* slow_path) { + DCHECK(instruction->GetLocations()->OnlyCallsOnSlowPath()) + << "instruction->DebugName()=" << instruction->DebugName() + << " slow_path->GetDescription()=" << slow_path->GetDescription(); + // Only the Baker read barrier marking slow path used by certains + // instructions is expected to invoke the runtime without recording + // PC-related information. 
+ DCHECK(kUseBakerReadBarrier); + DCHECK(instruction->IsInstanceFieldGet() || + instruction->IsStaticFieldGet() || + instruction->IsArrayGet() || + instruction->IsLoadClass() || + instruction->IsLoadString() || + instruction->IsInstanceOf() || + instruction->IsCheckCast() || + (instruction->IsInvokeVirtual() && instruction->GetLocations()->Intrinsified()) || + (instruction->IsInvokeStaticOrDirect() && instruction->GetLocations()->Intrinsified())) + << "instruction->DebugName()=" << instruction->DebugName() + << " slow_path->GetDescription()=" << slow_path->GetDescription(); +} + void SlowPathCode::SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) { RegisterSet* live_registers = locations->GetLiveRegisters(); size_t stack_offset = codegen->GetFirstRegisterSlotInSlowPath(); diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index 9364be35ff..fd396c474c 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -22,6 +22,7 @@ #include "base/arena_containers.h" #include "base/arena_object.h" #include "base/bit_field.h" +#include "base/enums.h" #include "compiled_method.h" #include "driver/compiler_options.h" #include "globals.h" @@ -80,7 +81,11 @@ class SlowPathCode : public DeletableArenaObject<kArenaAllocSlowPaths> { virtual void EmitNativeCode(CodeGenerator* codegen) = 0; + // Save live core and floating-point caller-save registers and + // update the stack mask in `locations` for registers holding object + // references. virtual void SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations); + // Restore live core and floating-point caller-save registers. virtual void RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary* locations); bool IsCoreRegisterSaved(int reg) const { @@ -187,7 +192,7 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { size_t GetStackSlotOfParameter(HParameterValue* parameter) const { // Note that this follows the current calling convention. return GetFrameSize() - + InstructionSetPointerSize(GetInstructionSet()) // Art method + + static_cast<size_t>(InstructionSetPointerSize(GetInstructionSet())) // Art method + parameter->GetIndex() * kVRegSize; } @@ -211,6 +216,8 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { size_t maximum_number_of_live_fpu_registers, size_t number_of_out_slots, const ArenaVector<HBasicBlock*>& block_order); + // Backends can override this as necessary. For most, no special alignment is required. + virtual uint32_t GetPreferredSlotsAlignment() const { return 1; } uint32_t GetFrameSize() const { return frame_size_; } void SetFrameSize(uint32_t size) { frame_size_ = size; } @@ -333,6 +340,9 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { bool* GetBlockedCoreRegisters() const { return blocked_core_registers_; } bool* GetBlockedFloatingPointRegisters() const { return blocked_fpu_registers_; } + bool IsBlockedCoreRegister(size_t i) { return blocked_core_registers_[i]; } + bool IsBlockedFloatingPointRegister(size_t i) { return blocked_fpu_registers_[i]; } + // Helper that returns the pointer offset of an index in an object array. // Note: this method assumes we always have the same pointer size, regardless // of the architecture. @@ -350,6 +360,17 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { // accessing the String's `value` field in String intrinsics. 
static uint32_t GetArrayDataOffset(HArrayGet* array_get); + // Return the entry point offset for ReadBarrierMarkRegX, where X is `reg`. + template <PointerSize pointer_size> + static int32_t GetReadBarrierMarkEntryPointsOffset(size_t reg) { + // The entry point list defines 30 ReadBarrierMarkRegX entry points. + DCHECK_LT(reg, 30u); + // The ReadBarrierMarkRegX entry points are ordered by increasing + // register number in Thread::tls_Ptr_.quick_entrypoints. + return QUICK_ENTRYPOINT_OFFSET(pointer_size, pReadBarrierMarkReg00).Int32Value() + + static_cast<size_t>(pointer_size) * reg; + } + void EmitParallelMoves(Location from1, Location to1, Primitive::Type type1, @@ -363,8 +384,14 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { return type == Primitive::kPrimNot && !value->IsNullConstant(); } + + // Perfoms checks pertaining to an InvokeRuntime call. void ValidateInvokeRuntime(HInstruction* instruction, SlowPathCode* slow_path); + // Perfoms checks pertaining to an InvokeRuntimeWithoutRecordingPcInfo call. + static void ValidateInvokeRuntimeWithoutRecordingPcInfo(HInstruction* instruction, + SlowPathCode* slow_path); + void AddAllocatedRegister(Location location) { allocated_registers_.Add(location); } @@ -677,7 +704,7 @@ class CallingConvention { size_t number_of_registers, const F* fpu_registers, size_t number_of_fpu_registers, - size_t pointer_size) + PointerSize pointer_size) : registers_(registers), number_of_registers_(number_of_registers), fpu_registers_(fpu_registers), @@ -700,7 +727,7 @@ class CallingConvention { size_t GetStackOffsetOf(size_t index) const { // We still reserve the space for parameters passed by registers. // Add space for the method pointer. - return pointer_size_ + index * kVRegSize; + return static_cast<size_t>(pointer_size_) + index * kVRegSize; } private: @@ -708,7 +735,7 @@ class CallingConvention { const size_t number_of_registers_; const F* fpu_registers_; const size_t number_of_fpu_registers_; - const size_t pointer_size_; + const PointerSize pointer_size_; DISALLOW_COPY_AND_ASSIGN(CallingConvention); }; diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index e441f825bc..404f044cef 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -59,9 +59,9 @@ static constexpr DRegister DTMP = D31; static constexpr uint32_t kPackedSwitchCompareJumpThreshold = 7; -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<ArmAssembler*>(codegen->GetAssembler())-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArmWordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
+#define __ down_cast<ArmAssembler*>(codegen->GetAssembler())-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, x).Int32Value() class NullCheckSlowPathARM : public SlowPathCode { public: @@ -119,11 +119,9 @@ class SuspendCheckSlowPathARM : public SlowPathCode { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, instruction_->GetLocations()); arm_codegen->InvokeRuntime( QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); - RestoreLiveRegisters(codegen, instruction_->GetLocations()); if (successor_ == nullptr) { __ b(GetReturnLabel()); } else { @@ -316,7 +314,7 @@ class TypeCheckSlowPathARM : public SlowPathCode { instruction_->GetDexPc(), this); CheckEntrypointTypes< - kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>(); + kQuickInstanceofNonTrivial, size_t, const mirror::Class*, const mirror::Class*>(); arm_codegen->Move32(locations->Out(), Location::RegisterLocation(R0)); } else { DCHECK(instruction_->IsCheckCast()); @@ -412,8 +410,8 @@ class ArraySetSlowPathARM : public SlowPathCode { // Slow path marking an object during a read barrier. class ReadBarrierMarkSlowPathARM : public SlowPathCode { public: - ReadBarrierMarkSlowPathARM(HInstruction* instruction, Location out, Location obj) - : SlowPathCode(instruction), out_(out), obj_(obj) { + ReadBarrierMarkSlowPathARM(HInstruction* instruction, Location obj) + : SlowPathCode(instruction), obj_(obj) { DCHECK(kEmitCompilerReadBarrier); } @@ -421,9 +419,9 @@ class ReadBarrierMarkSlowPathARM : public SlowPathCode { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); - Register reg_out = out_.AsRegister<Register>(); + Register reg = obj_.AsRegister<Register>(); DCHECK(locations->CanCall()); - DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg)); DCHECK(instruction_->IsInstanceFieldGet() || instruction_->IsStaticFieldGet() || instruction_->IsArrayGet() || @@ -431,30 +429,45 @@ class ReadBarrierMarkSlowPathARM : public SlowPathCode { instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking slow path: " << instruction_->DebugName(); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, locations); - - InvokeRuntimeCallingConvention calling_convention; + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. 
CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); - arm_codegen->Move32(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), obj_); - arm_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierMark), - instruction_, - instruction_->GetDexPc(), - this); - CheckEntrypointTypes<kQuickReadBarrierMark, mirror::Object*, mirror::Object*>(); - arm_codegen->Move32(out_, Location::RegisterLocation(R0)); - - RestoreLiveRegisters(codegen, locations); + DCHECK_NE(reg, SP); + DCHECK_NE(reg, LR); + DCHECK_NE(reg, PC); + // IP is used internally by the ReadBarrierMarkRegX entry point + // as a temporary, it cannot be the entry point's input/output. + DCHECK_NE(reg, IP); + DCHECK(0 <= reg && reg < kNumberOfCoreRegisters) << reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in R0): + // + // R0 <- obj + // R0 <- ReadBarrierMark(R0) + // obj <- R0 + // + // we just use rX (the register holding `obj`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(reg); + // This runtime call does not require a stack map. + arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); __ b(GetExitLabel()); } private: - const Location out_; const Location obj_; DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM); @@ -500,8 +513,7 @@ class ReadBarrierForHeapReferenceSlowPathARM : public SlowPathCode { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); @@ -688,8 +700,8 @@ class ReadBarrierForRootSlowPathARM : public SlowPathCode { }; #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<ArmAssembler*>(GetAssembler())-> // NOLINT +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<ArmAssembler*>(GetAssembler())-> // NOLINT inline Condition ARMCondition(IfCondition cond) { switch (cond) { @@ -956,7 +968,7 @@ void CodeGeneratorARM::GenerateFrameExit() { if (fpu_spill_mask_ != 0) { SRegister start_register = SRegister(LeastSignificantBit(fpu_spill_mask_)); __ vpops(start_register, POPCOUNT(fpu_spill_mask_)); - __ cfi().AdjustCFAOffset(-kArmPointerSize * POPCOUNT(fpu_spill_mask_)); + __ cfi().AdjustCFAOffset(-static_cast<int>(kArmPointerSize) * POPCOUNT(fpu_spill_mask_)); __ cfi().RestoreMany(DWARFReg(SRegister(0)), fpu_spill_mask_); } // Pop LR into PC to return. 
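The compact ReadBarrierMarkRegX calling convention above, together with GetReadBarrierMarkEntryPointsOffset added in code_generator.h, works because the mark entry points are laid out contiguously by register number, so the offset is simply base + pointer_size * reg. A minimal sketch of that arithmetic with a placeholder base offset (the real value comes from QUICK_ENTRYPOINT_OFFSET and the Thread entrypoint layout, which is not reproduced here):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Placeholder for QUICK_ENTRYPOINT_OFFSET(pointer_size, pReadBarrierMarkReg00);
// the real value depends on the Thread::tlsPtr_ layout and is assumed here.
constexpr int32_t kAssumedMarkReg00Offset = 0x200;

constexpr int32_t ReadBarrierMarkEntryPointOffset(size_t pointer_size, size_t reg) {
  // Entry points are ordered by increasing register number, so RegX lives at
  // base + X * pointer_size.
  return kAssumedMarkReg00Offset + static_cast<int32_t>(pointer_size * reg);
}

int main() {
  // On a 32-bit target (pointer size 4), marking the object held in r5 would
  // call the entry point at base + 20.
  std::printf("0x%x\n", static_cast<unsigned>(ReadBarrierMarkEntryPointOffset(4u, 5u)));  // 0x214
  return 0;
}
```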
@@ -1208,7 +1220,7 @@ void CodeGeneratorARM::InvokeRuntime(QuickEntrypointEnum entrypoint, HInstruction* instruction, uint32_t dex_pc, SlowPathCode* slow_path) { - InvokeRuntime(GetThreadOffset<kArmWordSize>(entrypoint).Int32Value(), + InvokeRuntime(GetThreadOffset<kArmPointerSize>(entrypoint).Int32Value(), instruction, dex_pc, slow_path); @@ -1224,6 +1236,14 @@ void CodeGeneratorARM::InvokeRuntime(int32_t entry_point_offset, RecordPcInfo(instruction, dex_pc, slow_path); } +void CodeGeneratorARM::InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path) { + ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction, slow_path); + __ LoadFromOffset(kLoadWord, LR, TR, entry_point_offset); + __ blx(LR); +} + void InstructionCodeGeneratorARM::HandleGoto(HInstruction* got, HBasicBlock* successor) { DCHECK(!successor->IsExitBlock()); @@ -1271,6 +1291,44 @@ void LocationsBuilderARM::VisitExit(HExit* exit) { void InstructionCodeGeneratorARM::VisitExit(HExit* exit ATTRIBUTE_UNUSED) { } +void InstructionCodeGeneratorARM::GenerateVcmp(HInstruction* instruction) { + Primitive::Type type = instruction->InputAt(0)->GetType(); + Location lhs_loc = instruction->GetLocations()->InAt(0); + Location rhs_loc = instruction->GetLocations()->InAt(1); + if (rhs_loc.IsConstant()) { + // 0.0 is the only immediate that can be encoded directly in + // a VCMP instruction. + // + // Both the JLS (section 15.20.1) and the JVMS (section 6.5) + // specify that in a floating-point comparison, positive zero + // and negative zero are considered equal, so we can use the + // literal 0.0 for both cases here. + // + // Note however that some methods (Float.equal, Float.compare, + // Float.compareTo, Double.equal, Double.compare, + // Double.compareTo, Math.max, Math.min, StrictMath.max, + // StrictMath.min) consider 0.0 to be (strictly) greater than + // -0.0. So if we ever translate calls to these methods into a + // HCompare instruction, we must handle the -0.0 case with + // care here. + DCHECK(rhs_loc.GetConstant()->IsArithmeticZero()); + if (type == Primitive::kPrimFloat) { + __ vcmpsz(lhs_loc.AsFpuRegister<SRegister>()); + } else { + DCHECK_EQ(type, Primitive::kPrimDouble); + __ vcmpdz(FromLowSToD(lhs_loc.AsFpuRegisterPairLow<SRegister>())); + } + } else { + if (type == Primitive::kPrimFloat) { + __ vcmps(lhs_loc.AsFpuRegister<SRegister>(), rhs_loc.AsFpuRegister<SRegister>()); + } else { + DCHECK_EQ(type, Primitive::kPrimDouble); + __ vcmpd(FromLowSToD(lhs_loc.AsFpuRegisterPairLow<SRegister>()), + FromLowSToD(rhs_loc.AsFpuRegisterPairLow<SRegister>())); + } + } +} + void InstructionCodeGeneratorARM::GenerateFPJumps(HCondition* cond, Label* true_label, Label* false_label ATTRIBUTE_UNUSED) { @@ -1371,22 +1429,14 @@ void InstructionCodeGeneratorARM::GenerateCompareTestAndBranch(HCondition* condi Label* true_target = true_target_in == nullptr ? &fallthrough_target : true_target_in; Label* false_target = false_target_in == nullptr ? 
&fallthrough_target : false_target_in; - LocationSummary* locations = condition->GetLocations(); - Location left = locations->InAt(0); - Location right = locations->InAt(1); - Primitive::Type type = condition->InputAt(0)->GetType(); switch (type) { case Primitive::kPrimLong: GenerateLongComparesAndJumps(condition, true_target, false_target); break; case Primitive::kPrimFloat: - __ vcmps(left.AsFpuRegister<SRegister>(), right.AsFpuRegister<SRegister>()); - GenerateFPJumps(condition, true_target, false_target); - break; case Primitive::kPrimDouble: - __ vcmpd(FromLowSToD(left.AsFpuRegisterPairLow<SRegister>()), - FromLowSToD(right.AsFpuRegisterPairLow<SRegister>())); + GenerateVcmp(condition); GenerateFPJumps(condition, true_target, false_target); break; default: @@ -1567,7 +1617,7 @@ void LocationsBuilderARM::HandleCondition(HCondition* cond) { case Primitive::kPrimFloat: case Primitive::kPrimDouble: locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, ArithmeticZeroOrFpuRegister(cond->InputAt(1))); if (!cond->IsEmittedAtUseSite()) { locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); } @@ -1614,12 +1664,8 @@ void InstructionCodeGeneratorARM::HandleCondition(HCondition* cond) { GenerateLongComparesAndJumps(cond, &true_label, &false_label); break; case Primitive::kPrimFloat: - __ vcmps(left.AsFpuRegister<SRegister>(), right.AsFpuRegister<SRegister>()); - GenerateFPJumps(cond, &true_label, &false_label); - break; case Primitive::kPrimDouble: - __ vcmpd(FromLowSToD(left.AsFpuRegisterPairLow<SRegister>()), - FromLowSToD(right.AsFpuRegisterPairLow<SRegister>())); + GenerateVcmp(cond); GenerateFPJumps(cond, &true_label, &false_label); break; } @@ -1889,8 +1935,6 @@ void InstructionCodeGeneratorARM::VisitInvokeInterface(HInvokeInterface* invoke) LocationSummary* locations = invoke->GetLocations(); Register temp = locations->GetTemp(0).AsRegister<Register>(); Register hidden_reg = locations->GetTemp(1).AsRegister<Register>(); - uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset( - invoke->GetImtIndex() % mirror::Class::kImtSize, kArmPointerSize).Uint32Value(); Location receiver = locations->InAt(0); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); @@ -1916,10 +1960,14 @@ void InstructionCodeGeneratorARM::VisitInvokeInterface(HInvokeInterface* invoke) // intact/accessible until the end of the marking phase (the // concurrent copying collector may not in the future). __ MaybeUnpoisonHeapReference(temp); + __ LoadFromOffset(kLoadWord, temp, temp, + mirror::Class::ImtPtrOffset(kArmPointerSize).Uint32Value()); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + invoke->GetImtIndex(), kArmPointerSize)); // temp = temp->GetImtEntryAt(method_offset); - uint32_t entry_point = - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmWordSize).Int32Value(); __ LoadFromOffset(kLoadWord, temp, temp, method_offset); + uint32_t entry_point = + ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize).Int32Value(); // LR = temp->GetEntryPoint(); __ LoadFromOffset(kLoadWord, LR, temp, entry_point); // LR(); @@ -2012,7 +2060,7 @@ void LocationsBuilderARM::VisitTypeConversion(HTypeConversion* conversion) { (((input_type == Primitive::kPrimFloat || input_type == Primitive::kPrimDouble) && result_type == Primitive::kPrimLong) || (input_type == Primitive::kPrimLong && result_type == Primitive::kPrimFloat)) - ? LocationSummary::kCall + ? 
LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(conversion, call_kind); @@ -2477,7 +2525,7 @@ void LocationsBuilderARM::VisitAdd(HAdd* add) { case Primitive::kPrimLong: { locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, ArmEncodableConstantOrRegister(add->InputAt(1), ADD)); locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); break; } @@ -2514,13 +2562,18 @@ void InstructionCodeGeneratorARM::VisitAdd(HAdd* add) { break; case Primitive::kPrimLong: { - DCHECK(second.IsRegisterPair()); - __ adds(out.AsRegisterPairLow<Register>(), - first.AsRegisterPairLow<Register>(), - ShifterOperand(second.AsRegisterPairLow<Register>())); - __ adc(out.AsRegisterPairHigh<Register>(), - first.AsRegisterPairHigh<Register>(), - ShifterOperand(second.AsRegisterPairHigh<Register>())); + if (second.IsConstant()) { + uint64_t value = static_cast<uint64_t>(Int64FromConstant(second.GetConstant())); + GenerateAddLongConst(out, first, value); + } else { + DCHECK(second.IsRegisterPair()); + __ adds(out.AsRegisterPairLow<Register>(), + first.AsRegisterPairLow<Register>(), + ShifterOperand(second.AsRegisterPairLow<Register>())); + __ adc(out.AsRegisterPairHigh<Register>(), + first.AsRegisterPairHigh<Register>(), + ShifterOperand(second.AsRegisterPairHigh<Register>())); + } break; } @@ -2554,7 +2607,7 @@ void LocationsBuilderARM::VisitSub(HSub* sub) { case Primitive::kPrimLong: { locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, ArmEncodableConstantOrRegister(sub->InputAt(1), SUB)); locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); break; } @@ -2590,13 +2643,18 @@ void InstructionCodeGeneratorARM::VisitSub(HSub* sub) { } case Primitive::kPrimLong: { - DCHECK(second.IsRegisterPair()); - __ subs(out.AsRegisterPairLow<Register>(), - first.AsRegisterPairLow<Register>(), - ShifterOperand(second.AsRegisterPairLow<Register>())); - __ sbc(out.AsRegisterPairHigh<Register>(), - first.AsRegisterPairHigh<Register>(), - ShifterOperand(second.AsRegisterPairHigh<Register>())); + if (second.IsConstant()) { + uint64_t value = static_cast<uint64_t>(Int64FromConstant(second.GetConstant())); + GenerateAddLongConst(out, first, -value); + } else { + DCHECK(second.IsRegisterPair()); + __ subs(out.AsRegisterPairLow<Register>(), + first.AsRegisterPairLow<Register>(), + ShifterOperand(second.AsRegisterPairLow<Register>())); + __ sbc(out.AsRegisterPairHigh<Register>(), + first.AsRegisterPairHigh<Register>(), + ShifterOperand(second.AsRegisterPairHigh<Register>())); + } break; } @@ -2831,13 +2889,13 @@ void LocationsBuilderARM::VisitDiv(HDiv* div) { LocationSummary::CallKind call_kind = LocationSummary::kNoCall; if (div->GetResultType() == Primitive::kPrimLong) { // pLdiv runtime call. - call_kind = LocationSummary::kCall; + call_kind = LocationSummary::kCallOnMainOnly; } else if (div->GetResultType() == Primitive::kPrimInt && div->InputAt(1)->IsConstant()) { // sdiv will be replaced by other instruction sequence. } else if (div->GetResultType() == Primitive::kPrimInt && !codegen_->GetInstructionSetFeatures().HasDivideInstruction()) { // pIdivmod runtime call. 
- call_kind = LocationSummary::kCall; + call_kind = LocationSummary::kCallOnMainOnly; } LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(div, call_kind); @@ -2956,7 +3014,7 @@ void LocationsBuilderARM::VisitRem(HRem* rem) { Primitive::Type type = rem->GetResultType(); // Most remainders are implemented in the runtime. - LocationSummary::CallKind call_kind = LocationSummary::kCall; + LocationSummary::CallKind call_kind = LocationSummary::kCallOnMainOnly; if (rem->GetResultType() == Primitive::kPrimInt && rem->InputAt(1)->IsConstant()) { // sdiv will be replaced by other instruction sequence. call_kind = LocationSummary::kNoCall; @@ -3493,7 +3551,7 @@ void InstructionCodeGeneratorARM::VisitUShr(HUShr* ushr) { void LocationsBuilderARM::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); if (instruction->IsStringAlloc()) { locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); } else { @@ -3510,7 +3568,7 @@ void InstructionCodeGeneratorARM::VisitNewInstance(HNewInstance* instruction) { if (instruction->IsStringAlloc()) { // String is allocated through StringFactory. Call NewEmptyString entry point. Register temp = instruction->GetLocations()->GetTemp(0).AsRegister<Register>(); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmWordSize); + MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize); __ LoadFromOffset(kLoadWord, temp, TR, QUICK_ENTRY_POINT(pNewEmptyString)); __ LoadFromOffset(kLoadWord, LR, temp, code_offset.Int32Value()); __ blx(LR); @@ -3526,7 +3584,7 @@ void InstructionCodeGeneratorARM::VisitNewInstance(HNewInstance* instruction) { void LocationsBuilderARM::VisitNewArray(HNewArray* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); locations->SetOut(Location::RegisterLocation(R0)); @@ -3634,7 +3692,7 @@ void LocationsBuilderARM::VisitCompare(HCompare* compare) { case Primitive::kPrimFloat: case Primitive::kPrimDouble: { locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, ArithmeticZeroOrFpuRegister(compare->InputAt(1))); locations->SetOut(Location::RequiresRegister()); break; } @@ -3679,12 +3737,7 @@ void InstructionCodeGeneratorARM::VisitCompare(HCompare* compare) { case Primitive::kPrimFloat: case Primitive::kPrimDouble: { __ LoadImmediate(out, 0); - if (type == Primitive::kPrimFloat) { - __ vcmps(left.AsFpuRegister<SRegister>(), right.AsFpuRegister<SRegister>()); - } else { - __ vcmpd(FromLowSToD(left.AsFpuRegisterPairLow<SRegister>()), - FromLowSToD(right.AsFpuRegisterPairLow<SRegister>())); - } + GenerateVcmp(compare); __ vmstat(); // transfer FP status register to ARM APSR. 
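The GenerateVcmp comment above rests on the IEEE-754 rule that positive and negative zero compare equal, while sign-sensitive helpers such as Float.compare tell them apart. A small standalone C++ check of both facts (not ART code):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  float pz = 0.0f;
  float nz = -0.0f;
  // Ordinary floating-point comparison (what VCMP implements): +0.0 == -0.0,
  // so comparing against the literal 0.0 covers both zeros.
  std::printf("equal: %d\n", pz == nz);  // equal: 1
  // The two zeros are still distinguishable via the sign bit, which is what
  // Float.compare/Math.min-style code cares about.
  std::printf("signbit(+0.0)=%d signbit(-0.0)=%d\n",
              static_cast<int>(std::signbit(pz)), static_cast<int>(std::signbit(nz)));  // 0 and 1
  return 0;
}
```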
less_cond = ARMFPCondition(kCondLT, compare->IsGtBias()); break; @@ -3978,6 +4031,17 @@ void LocationsBuilderARM::HandleFieldGet(HInstruction* instruction, const FieldI } } +Location LocationsBuilderARM::ArithmeticZeroOrFpuRegister(HInstruction* input) { + DCHECK(input->GetType() == Primitive::kPrimDouble || input->GetType() == Primitive::kPrimFloat) + << input->GetType(); + if ((input->IsFloatConstant() && (input->AsFloatConstant()->IsArithmeticZero())) || + (input->IsDoubleConstant() && (input->AsDoubleConstant()->IsArithmeticZero()))) { + return Location::ConstantLocation(input->AsConstant()); + } else { + return Location::RequiresFpuRegister(); + } +} + Location LocationsBuilderARM::ArmEncodableConstantOrRegister(HInstruction* constant, Opcode opcode) { DCHECK(!Primitive::IsFloatingPointType(constant->GetType())); @@ -3992,31 +4056,51 @@ bool LocationsBuilderARM::CanEncodeConstantAsImmediate(HConstant* input_cst, Opcode opcode) { uint64_t value = static_cast<uint64_t>(Int64FromConstant(input_cst)); if (Primitive::Is64BitType(input_cst->GetType())) { - return CanEncodeConstantAsImmediate(Low32Bits(value), opcode) && - CanEncodeConstantAsImmediate(High32Bits(value), opcode); + Opcode high_opcode = opcode; + SetCc low_set_cc = kCcDontCare; + switch (opcode) { + case SUB: + // Flip the operation to an ADD. + value = -value; + opcode = ADD; + FALLTHROUGH_INTENDED; + case ADD: + if (Low32Bits(value) == 0u) { + return CanEncodeConstantAsImmediate(High32Bits(value), opcode, kCcDontCare); + } + high_opcode = ADC; + low_set_cc = kCcSet; + break; + default: + break; + } + return CanEncodeConstantAsImmediate(Low32Bits(value), opcode, low_set_cc) && + CanEncodeConstantAsImmediate(High32Bits(value), high_opcode, kCcDontCare); } else { return CanEncodeConstantAsImmediate(Low32Bits(value), opcode); } } -bool LocationsBuilderARM::CanEncodeConstantAsImmediate(uint32_t value, Opcode opcode) { +bool LocationsBuilderARM::CanEncodeConstantAsImmediate(uint32_t value, + Opcode opcode, + SetCc set_cc) { ShifterOperand so; ArmAssembler* assembler = codegen_->GetAssembler(); - if (assembler->ShifterOperandCanHold(kNoRegister, kNoRegister, opcode, value, &so)) { + if (assembler->ShifterOperandCanHold(kNoRegister, kNoRegister, opcode, value, set_cc, &so)) { return true; } Opcode neg_opcode = kNoOperand; switch (opcode) { - case AND: - neg_opcode = BIC; - break; - case ORR: - neg_opcode = ORN; - break; + case AND: neg_opcode = BIC; value = ~value; break; + case ORR: neg_opcode = ORN; value = ~value; break; + case ADD: neg_opcode = SUB; value = -value; break; + case ADC: neg_opcode = SBC; value = ~value; break; + case SUB: neg_opcode = ADD; value = -value; break; + case SBC: neg_opcode = ADC; value = ~value; break; default: return false; } - return assembler->ShifterOperandCanHold(kNoRegister, kNoRegister, neg_opcode, ~value, &so); + return assembler->ShifterOperandCanHold(kNoRegister, kNoRegister, neg_opcode, value, set_cc, &so); } void InstructionCodeGeneratorARM::HandleFieldGet(HInstruction* instruction, @@ -4264,6 +4348,122 @@ void InstructionCodeGeneratorARM::VisitNullCheck(HNullCheck* instruction) { codegen_->GenerateNullCheck(instruction); } +static LoadOperandType GetLoadOperandType(Primitive::Type type) { + switch (type) { + case Primitive::kPrimNot: + return kLoadWord; + case Primitive::kPrimBoolean: + return kLoadUnsignedByte; + case Primitive::kPrimByte: + return kLoadSignedByte; + case Primitive::kPrimChar: + return kLoadUnsignedHalfword; + case Primitive::kPrimShort: + return kLoadSignedHalfword; + 
case Primitive::kPrimInt: + return kLoadWord; + case Primitive::kPrimLong: + return kLoadWordPair; + case Primitive::kPrimFloat: + return kLoadSWord; + case Primitive::kPrimDouble: + return kLoadDWord; + default: + LOG(FATAL) << "Unreachable type " << type; + UNREACHABLE(); + } +} + +static StoreOperandType GetStoreOperandType(Primitive::Type type) { + switch (type) { + case Primitive::kPrimNot: + return kStoreWord; + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + return kStoreByte; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + return kStoreHalfword; + case Primitive::kPrimInt: + return kStoreWord; + case Primitive::kPrimLong: + return kStoreWordPair; + case Primitive::kPrimFloat: + return kStoreSWord; + case Primitive::kPrimDouble: + return kStoreDWord; + default: + LOG(FATAL) << "Unreachable type " << type; + UNREACHABLE(); + } +} + +void CodeGeneratorARM::LoadFromShiftedRegOffset(Primitive::Type type, + Location out_loc, + Register base, + Register reg_offset, + Condition cond) { + uint32_t shift_count = Primitive::ComponentSizeShift(type); + Address mem_address(base, reg_offset, Shift::LSL, shift_count); + + switch (type) { + case Primitive::kPrimByte: + __ ldrsb(out_loc.AsRegister<Register>(), mem_address, cond); + break; + case Primitive::kPrimBoolean: + __ ldrb(out_loc.AsRegister<Register>(), mem_address, cond); + break; + case Primitive::kPrimShort: + __ ldrsh(out_loc.AsRegister<Register>(), mem_address, cond); + break; + case Primitive::kPrimChar: + __ ldrh(out_loc.AsRegister<Register>(), mem_address, cond); + break; + case Primitive::kPrimNot: + case Primitive::kPrimInt: + __ ldr(out_loc.AsRegister<Register>(), mem_address, cond); + break; + // T32 doesn't support LoadFromShiftedRegOffset mem address mode for these types. + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + default: + LOG(FATAL) << "Unreachable type " << type; + UNREACHABLE(); + } +} + +void CodeGeneratorARM::StoreToShiftedRegOffset(Primitive::Type type, + Location loc, + Register base, + Register reg_offset, + Condition cond) { + uint32_t shift_count = Primitive::ComponentSizeShift(type); + Address mem_address(base, reg_offset, Shift::LSL, shift_count); + + switch (type) { + case Primitive::kPrimByte: + case Primitive::kPrimBoolean: + __ strb(loc.AsRegister<Register>(), mem_address, cond); + break; + case Primitive::kPrimShort: + case Primitive::kPrimChar: + __ strh(loc.AsRegister<Register>(), mem_address, cond); + break; + case Primitive::kPrimNot: + case Primitive::kPrimInt: + __ str(loc.AsRegister<Register>(), mem_address, cond); + break; + // T32 doesn't support StoreToShiftedRegOffset mem address mode for these types. 
+ case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + default: + LOG(FATAL) << "Unreachable type " << type; + UNREACHABLE(); + } +} + void LocationsBuilderARM::VisitArrayGet(HArrayGet* instruction) { bool object_array_get_with_read_barrier = kEmitCompilerReadBarrier && (instruction->GetType() == Primitive::kPrimNot); @@ -4298,70 +4498,40 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { Location index = locations->InAt(1); Location out_loc = locations->Out(); uint32_t data_offset = CodeGenerator::GetArrayDataOffset(instruction); - Primitive::Type type = instruction->GetType(); - switch (type) { - case Primitive::kPrimBoolean: { - Register out = out_loc.AsRegister<Register>(); - if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; - __ LoadFromOffset(kLoadUnsignedByte, out, obj, offset); - } else { - __ add(IP, obj, ShifterOperand(index.AsRegister<Register>())); - __ LoadFromOffset(kLoadUnsignedByte, out, IP, data_offset); - } - break; - } - - case Primitive::kPrimByte: { - Register out = out_loc.AsRegister<Register>(); - if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; - __ LoadFromOffset(kLoadSignedByte, out, obj, offset); - } else { - __ add(IP, obj, ShifterOperand(index.AsRegister<Register>())); - __ LoadFromOffset(kLoadSignedByte, out, IP, data_offset); - } - break; - } - - case Primitive::kPrimShort: { - Register out = out_loc.AsRegister<Register>(); - if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset; - __ LoadFromOffset(kLoadSignedHalfword, out, obj, offset); - } else { - __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_2)); - __ LoadFromOffset(kLoadSignedHalfword, out, IP, data_offset); - } - break; - } - - case Primitive::kPrimChar: { - Register out = out_loc.AsRegister<Register>(); - if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset; - __ LoadFromOffset(kLoadUnsignedHalfword, out, obj, offset); - } else { - __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_2)); - __ LoadFromOffset(kLoadUnsignedHalfword, out, IP, data_offset); - } - break; - } + HInstruction* array_instr = instruction->GetArray(); + bool has_intermediate_address = array_instr->IsIntermediateAddress(); + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. 
+ DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier)); + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: case Primitive::kPrimInt: { - Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; - __ LoadFromOffset(kLoadWord, out, obj, offset); + int32_t const_index = index.GetConstant()->AsIntConstant()->GetValue(); + uint32_t full_offset = data_offset + (const_index << Primitive::ComponentSizeShift(type)); + + LoadOperandType load_type = GetLoadOperandType(type); + __ LoadFromOffset(load_type, out_loc.AsRegister<Register>(), obj, full_offset); } else { - __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); - __ LoadFromOffset(kLoadWord, out, IP, data_offset); + Register temp = IP; + + if (has_intermediate_address) { + // We do not need to compute the intermediate address from the array: the + // input instruction has done it already. See the comment in + // `TryExtractArrayAccessAddress()`. + if (kIsDebugBuild) { + HIntermediateAddress* tmp = array_instr->AsIntermediateAddress(); + DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), data_offset); + } + temp = obj; + } else { + __ add(temp, obj, ShifterOperand(data_offset)); + } + codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, index.AsRegister<Register>()); } break; } @@ -4390,8 +4560,22 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { // reference, if heap poisoning is enabled). codegen_->MaybeGenerateReadBarrierSlow(instruction, out_loc, out_loc, obj_loc, offset); } else { - __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); - __ LoadFromOffset(kLoadWord, out, IP, data_offset); + Register temp = IP; + + if (has_intermediate_address) { + // We do not need to compute the intermediate address from the array: the + // input instruction has done it already. See the comment in + // `TryExtractArrayAccessAddress()`. + if (kIsDebugBuild) { + HIntermediateAddress* tmp = array_instr->AsIntermediateAddress(); + DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), data_offset); + } + temp = obj; + } else { + __ add(temp, obj, ShifterOperand(data_offset)); + } + codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, index.AsRegister<Register>()); + codegen_->MaybeRecordImplicitNullCheck(instruction); // If read barriers are enabled, emit read barriers other than // Baker's using a slow path (and also unpoison the loaded @@ -4490,54 +4674,68 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck(); bool needs_write_barrier = CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue()); + uint32_t data_offset = + mirror::Array::DataOffset(Primitive::ComponentSize(value_type)).Uint32Value(); + Location value_loc = locations->InAt(2); + HInstruction* array_instr = instruction->GetArray(); + bool has_intermediate_address = array_instr->IsIntermediateAddress(); + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. 
+ DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier)); switch (value_type) { case Primitive::kPrimBoolean: - case Primitive::kPrimByte: { - uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint8_t)).Uint32Value(); - Register value = locations->InAt(2).AsRegister<Register>(); - if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; - __ StoreToOffset(kStoreByte, value, array, offset); - } else { - __ add(IP, array, ShifterOperand(index.AsRegister<Register>())); - __ StoreToOffset(kStoreByte, value, IP, data_offset); - } - break; - } - + case Primitive::kPrimByte: case Primitive::kPrimShort: - case Primitive::kPrimChar: { - uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Uint32Value(); - Register value = locations->InAt(2).AsRegister<Register>(); + case Primitive::kPrimChar: + case Primitive::kPrimInt: { if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset; - __ StoreToOffset(kStoreHalfword, value, array, offset); + int32_t const_index = index.GetConstant()->AsIntConstant()->GetValue(); + uint32_t full_offset = + data_offset + (const_index << Primitive::ComponentSizeShift(value_type)); + StoreOperandType store_type = GetStoreOperandType(value_type); + __ StoreToOffset(store_type, value_loc.AsRegister<Register>(), array, full_offset); } else { - __ add(IP, array, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_2)); - __ StoreToOffset(kStoreHalfword, value, IP, data_offset); + Register temp = IP; + + if (has_intermediate_address) { + // We do not need to compute the intermediate address from the array: the + // input instruction has done it already. See the comment in + // `TryExtractArrayAccessAddress()`. + if (kIsDebugBuild) { + HIntermediateAddress* tmp = array_instr->AsIntermediateAddress(); + DCHECK(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64() == data_offset); + } + temp = array; + } else { + __ add(temp, array, ShifterOperand(data_offset)); + } + codegen_->StoreToShiftedRegOffset(value_type, + value_loc, + temp, + index.AsRegister<Register>()); } break; } case Primitive::kPrimNot: { - uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); - Location value_loc = locations->InAt(2); Register value = value_loc.AsRegister<Register>(); - Register source = value; + // TryExtractArrayAccessAddress optimization is never applied for non-primitive ArraySet. + // See the comment in instruction_simplifier_shared.cc. + DCHECK(!has_intermediate_address); if (instruction->InputAt(2)->IsNullConstant()) { // Just setting null. 
if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; - __ StoreToOffset(kStoreWord, source, array, offset); + __ StoreToOffset(kStoreWord, value, array, offset); } else { DCHECK(index.IsRegister()) << index; - __ add(IP, array, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); - __ StoreToOffset(kStoreWord, source, IP, data_offset); + __ add(IP, array, ShifterOperand(data_offset)); + codegen_->StoreToShiftedRegOffset(value_type, + value_loc, + IP, + index.AsRegister<Register>()); } codegen_->MaybeRecordImplicitNullCheck(instruction); DCHECK(!needs_write_barrier); @@ -4566,8 +4764,11 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { __ StoreToOffset(kStoreWord, value, array, offset); } else { DCHECK(index.IsRegister()) << index; - __ add(IP, array, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); - __ StoreToOffset(kStoreWord, value, IP, data_offset); + __ add(IP, array, ShifterOperand(data_offset)); + codegen_->StoreToShiftedRegOffset(value_type, + value_loc, + IP, + index.AsRegister<Register>()); } codegen_->MaybeRecordImplicitNullCheck(instruction); __ b(&done); @@ -4634,6 +4835,7 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { } } + Register source = value; if (kPoisonHeapReferences) { // Note that in the case where `value` is a null reference, // we do not enter this block, as a null reference does not @@ -4650,8 +4852,12 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { __ StoreToOffset(kStoreWord, source, array, offset); } else { DCHECK(index.IsRegister()) << index; - __ add(IP, array, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); - __ StoreToOffset(kStoreWord, source, IP, data_offset); + + __ add(IP, array, ShifterOperand(data_offset)); + codegen_->StoreToShiftedRegOffset(value_type, + Location::RegisterLocation(source), + IP, + index.AsRegister<Register>()); } if (!may_need_runtime_call_for_type_check) { @@ -4671,23 +4877,7 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { break; } - case Primitive::kPrimInt: { - uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); - Register value = locations->InAt(2).AsRegister<Register>(); - if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; - __ StoreToOffset(kStoreWord, value, array, offset); - } else { - DCHECK(index.IsRegister()) << index; - __ add(IP, array, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); - __ StoreToOffset(kStoreWord, value, IP, data_offset); - } - break; - } - case Primitive::kPrimLong: { - uint32_t data_offset = mirror::Array::DataOffset(sizeof(int64_t)).Uint32Value(); Location value = locations->InAt(2); if (index.IsConstant()) { size_t offset = @@ -4701,7 +4891,6 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { } case Primitive::kPrimFloat: { - uint32_t data_offset = mirror::Array::DataOffset(sizeof(float)).Uint32Value(); Location value = locations->InAt(2); DCHECK(value.IsFpuRegister()); if (index.IsConstant()) { @@ -4715,7 +4904,6 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { } case Primitive::kPrimDouble: { - uint32_t data_offset = mirror::Array::DataOffset(sizeof(double)).Uint32Value(); Location value = locations->InAt(2); DCHECK(value.IsFpuRegisterPair()); if (index.IsConstant()) { @@ -4756,6 +4944,37 @@ void 
InstructionCodeGeneratorARM::VisitArrayLength(HArrayLength* instruction) { codegen_->MaybeRecordImplicitNullCheck(instruction); } +void LocationsBuilderARM::VisitIntermediateAddress(HIntermediateAddress* instruction) { + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. + DCHECK(!kEmitCompilerReadBarrier); + LocationSummary* locations = + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); + + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(instruction->GetOffset())); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); +} + +void InstructionCodeGeneratorARM::VisitIntermediateAddress(HIntermediateAddress* instruction) { + LocationSummary* locations = instruction->GetLocations(); + Location out = locations->Out(); + Location first = locations->InAt(0); + Location second = locations->InAt(1); + + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. + DCHECK(!kEmitCompilerReadBarrier); + + if (second.IsRegister()) { + __ add(out.AsRegister<Register>(), + first.AsRegister<Register>(), + ShifterOperand(second.AsRegister<Register>())); + } else { + __ AddConstant(out.AsRegister<Register>(), + first.AsRegister<Register>(), + second.GetConstant()->AsIntConstant()->GetValue()); + } +} + void LocationsBuilderARM::VisitBoundsCheck(HBoundsCheck* instruction) { LocationSummary::CallKind call_kind = instruction->CanThrowIntoCatchBlock() ? LocationSummary::kCallOnSlowPath @@ -4790,7 +5009,7 @@ void CodeGeneratorARM::MarkGCCard(Register temp, if (can_be_null) { __ CompareAndBranchIfZero(value, &is_null); } - __ LoadFromOffset(kLoadWord, card, TR, Thread::CardTableOffset<kArmWordSize>().Int32Value()); + __ LoadFromOffset(kLoadWord, card, TR, Thread::CardTableOffset<kArmPointerSize>().Int32Value()); __ Lsr(temp, object, gc::accounting::CardTable::kCardShift); __ strb(card, Address(card, temp)); if (can_be_null) { @@ -4841,7 +5060,7 @@ void InstructionCodeGeneratorARM::GenerateSuspendCheck(HSuspendCheck* instructio } __ LoadFromOffset( - kLoadUnsignedHalfword, IP, TR, Thread::ThreadFlagsOffset<kArmWordSize>().Int32Value()); + kLoadUnsignedHalfword, IP, TR, Thread::ThreadFlagsOffset<kArmPointerSize>().Int32Value()); if (successor == nullptr) { __ CompareAndBranchIfNonZero(IP, slow_path->GetEntryLabel()); __ Bind(slow_path->GetReturnLabel()); @@ -5370,59 +5589,19 @@ void InstructionCodeGeneratorARM::VisitLoadString(HLoadString* load) { __ LoadLiteral(out, codegen_->DeduplicateBootImageAddressLiteral(address)); return; // No dex cache slow path. } - case HLoadString::LoadKind::kDexCacheAddress: { - DCHECK_NE(load->GetAddress(), 0u); - uint32_t address = dchecked_integral_cast<uint32_t>(load->GetAddress()); - // 16-bit LDR immediate has a 5-bit offset multiplied by the size and that gives - // a 128B range. To try and reduce the number of literals if we load multiple strings, - // simply split the dex cache address to a 128B aligned base loaded from a literal - // and the remaining offset embedded in the load. 
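For context on the (now removed) kDexCacheAddress path above: a 16-bit Thumb-2 LDR encodes only a 5-bit immediate scaled by the access size, i.e. 32 * 4 = 128 bytes of reach, so the address was split into a 128-byte-aligned base loaded from a literal and the remaining low bits folded into the load itself. A small sketch of that split, assuming 4-byte GC roots as in the deleted code (illustrative only, not ART code):

#include <cstdint>
#include <cstdio>

// Split a 32-bit address into a 128-byte-aligned base and a small offset
// that fits the 5-bit (scaled-by-4) immediate of a 16-bit LDR.
struct SplitAddress {
  uint32_t base;    // 128-byte aligned, loaded from a literal pool entry
  uint32_t offset;  // 0..124, multiple of 4, embedded in the load itself
};

static SplitAddress SplitDexCacheAddress(uint32_t address) {
  constexpr uint32_t kOffsetBits = 5 + 2;                    // 5 encoded bits, scale 2 (x4)
  constexpr uint32_t kOffsetMask = (1u << kOffsetBits) - 1;  // 0x7f -> 128B range
  return SplitAddress{address & ~kOffsetMask, address & kOffsetMask};
}

int main() {
  SplitAddress split = SplitDexCacheAddress(0x12345678u);
  std::printf("base=%#x offset=%u\n", split.base, split.offset);  // base=0x12345600 offset=120
  return 0;
}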
- static_assert(sizeof(GcRoot<mirror::String>) == 4u, "Expected GC root to be 4 bytes."); - DCHECK_ALIGNED(load->GetAddress(), 4u); - constexpr size_t offset_bits = /* encoded bits */ 5 + /* scale */ 2; - uint32_t base_address = address & ~MaxInt<uint32_t>(offset_bits); - uint32_t offset = address & MaxInt<uint32_t>(offset_bits); - __ LoadLiteral(out, codegen_->DeduplicateDexCacheAddressLiteral(base_address)); - // /* GcRoot<mirror::String> */ out = *(base_address + offset) - GenerateGcRootFieldLoad(load, out_loc, out, offset); - break; - } - case HLoadString::LoadKind::kDexCachePcRelative: { - Register base_reg = locations->InAt(0).AsRegister<Register>(); - HArmDexCacheArraysBase* base = load->InputAt(0)->AsArmDexCacheArraysBase(); - int32_t offset = load->GetDexCacheElementOffset() - base->GetElementOffset(); - // /* GcRoot<mirror::String> */ out = *(dex_cache_arrays_base + offset) - GenerateGcRootFieldLoad(load, out_loc, base_reg, offset); - break; - } - case HLoadString::LoadKind::kDexCacheViaMethod: { - Register current_method = locations->InAt(0).AsRegister<Register>(); - - // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ - GenerateGcRootFieldLoad( - load, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value()); - // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_ - __ LoadFromOffset(kLoadWord, out, out, mirror::Class::DexCacheStringsOffset().Int32Value()); - // /* GcRoot<mirror::String> */ out = out[string_index] - GenerateGcRootFieldLoad( - load, out_loc, out, CodeGenerator::GetCacheOffset(load->GetStringIndex())); - break; - } default: - LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind(); - UNREACHABLE(); + break; } - if (!load->IsInDexCache()) { - SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM(load); - codegen_->AddSlowPath(slow_path); - __ CompareAndBranchIfZero(out, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); - } + // TODO: Re-add the compiler code to do string dex cache lookup again. 
+ SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM(load); + codegen_->AddSlowPath(slow_path); + __ b(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); } static int32_t GetExceptionTlsOffset() { - return Thread::ExceptionOffset<kArmWordSize>().Int32Value(); + return Thread::ExceptionOffset<kArmPointerSize>().Int32Value(); } void LocationsBuilderARM::VisitLoadException(HLoadException* load) { @@ -5447,7 +5626,7 @@ void InstructionCodeGeneratorARM::VisitClearException(HClearException* clear ATT void LocationsBuilderARM::VisitThrow(HThrow* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -5848,7 +6027,7 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { void LocationsBuilderARM::VisitMonitorOperation(HMonitorOperation* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -6011,6 +6190,34 @@ void InstructionCodeGeneratorARM::GenerateEorConst(Register out, Register first, __ eor(out, first, ShifterOperand(value)); } +void InstructionCodeGeneratorARM::GenerateAddLongConst(Location out, + Location first, + uint64_t value) { + Register out_low = out.AsRegisterPairLow<Register>(); + Register out_high = out.AsRegisterPairHigh<Register>(); + Register first_low = first.AsRegisterPairLow<Register>(); + Register first_high = first.AsRegisterPairHigh<Register>(); + uint32_t value_low = Low32Bits(value); + uint32_t value_high = High32Bits(value); + if (value_low == 0u) { + if (out_low != first_low) { + __ mov(out_low, ShifterOperand(first_low)); + } + __ AddConstant(out_high, first_high, value_high); + return; + } + __ AddConstantSetFlags(out_low, first_low, value_low); + ShifterOperand so; + if (__ ShifterOperandCanHold(out_high, first_high, ADC, value_high, kCcDontCare, &so)) { + __ adc(out_high, first_high, so); + } else if (__ ShifterOperandCanHold(out_low, first_low, SBC, ~value_high, kCcDontCare, &so)) { + __ sbc(out_high, first_high, so); + } else { + LOG(FATAL) << "Unexpected constant " << value_high; + UNREACHABLE(); + } +} + void InstructionCodeGeneratorARM::HandleBitwiseOperation(HBinaryOperation* instruction) { LocationSummary* locations = instruction->GetLocations(); Location first = locations->InAt(0); @@ -6172,12 +6379,12 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct // Slow path used to mark the GC root `root`. 
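One detail worth spelling out about the new GenerateAddLongConst helper above: after ADDS on the low words, the high word needs value_high plus the carry. When ADC cannot encode value_high as an immediate but SBC can encode its complement, the two are interchangeable, since rn - ~v - (1 - C) equals rn + v + C modulo 2^32. A tiny standalone check of that identity (plain C++, not the codegen itself):

#include <cassert>
#include <cstdint>
#include <initializer_list>

// ADC semantics: rn + op + carry (modulo 2^32).
static uint32_t Adc(uint32_t rn, uint32_t op, uint32_t carry) {
  return rn + op + carry;
}

// SBC semantics on ARM: rn - op - (1 - carry) (modulo 2^32).
static uint32_t Sbc(uint32_t rn, uint32_t op, uint32_t carry) {
  return rn - op - (1u - carry);
}

int main() {
  // For any rn, v and carry, SBC with ~v computes the same result as ADC with v,
  // which is why the helper may pick whichever immediate form is encodable.
  for (uint32_t carry = 0; carry <= 1; ++carry) {
    for (uint32_t v : {0u, 1u, 0x000000ffu, 0x12345678u, 0xffffff00u, 0xffffffffu}) {
      uint32_t rn = 0xdeadbeefu;
      assert(Adc(rn, v, carry) == Sbc(rn, ~v, carry));
    }
  }
  return 0;
}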
SlowPathCode* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, root, root); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, root); codegen_->AddSlowPath(slow_path); // IP = Thread::Current()->GetIsGcMarking() __ LoadFromOffset( - kLoadWord, IP, TR, Thread::IsGcMarkingOffset<kArmWordSize>().Int32Value()); + kLoadWord, IP, TR, Thread::IsGcMarkingOffset<kArmPointerSize>().Int32Value()); __ CompareAndBranchIfNonZero(IP, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } else { @@ -6275,21 +6482,13 @@ void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // /* LockWord */ lock_word = LockWord(monitor) static_assert(sizeof(LockWord) == sizeof(int32_t), "art::LockWord and int32_t have different sizes."); - // /* uint32_t */ rb_state = lock_word.ReadBarrierState() - __ Lsr(temp_reg, temp_reg, LockWord::kReadBarrierStateShift); - __ and_(temp_reg, temp_reg, ShifterOperand(LockWord::kReadBarrierStateMask)); - static_assert( - LockWord::kReadBarrierStateMask == ReadBarrier::rb_ptr_mask_, - "art::LockWord::kReadBarrierStateMask is not equal to art::ReadBarrier::rb_ptr_mask_."); - // Introduce a dependency on the high bits of rb_state, which shall - // be all zeroes, to prevent load-load reordering, and without using + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using // a memory barrier (which would be more expensive). - // IP = rb_state & ~LockWord::kReadBarrierStateMask = 0 - __ bic(IP, temp_reg, ShifterOperand(LockWord::kReadBarrierStateMask)); - // obj is unchanged by this operation, but its value now depends on - // IP, which depends on temp_reg. - __ add(obj, obj, ShifterOperand(IP)); + // `obj` is unchanged by this operation, but its value now depends + // on `temp_reg`. + __ add(obj, obj, ShifterOperand(temp_reg, LSR, 32)); // The actual reference load. if (index.IsValid()) { @@ -6321,13 +6520,19 @@ void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // Slow path used to mark the object `ref` when it is gray. SlowPathCode* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, ref, ref); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, ref); AddSlowPath(slow_path); // if (rb_state == ReadBarrier::gray_ptr_) // ref = ReadBarrier::Mark(ref); - __ cmp(temp_reg, ShifterOperand(ReadBarrier::gray_ptr_)); - __ b(slow_path->GetEntryLabel(), EQ); + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ Lsrs(temp_reg, temp_reg, LockWord::kReadBarrierStateShift + 1); + __ b(slow_path->GetEntryLabel(), CS); // Carry flag is the last bit shifted out by LSRS. 
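A note on the two tricks in the Baker read barrier fast path just above (an illustrative aside, not ART code): shifting the lock word right by kReadBarrierStateShift + 1 drops the gray bit into the carry flag, so a single LSRS plus B.CS replaces the old mask-and-compare; and adding the lock word shifted right by 32 (always zero for a 32-bit value) to the object register creates a data dependency that orders the two loads without a memory barrier. A small sketch of the state check, with the shift position assumed and the state values taken from the static_asserts in the patch (white=0, gray=1, black=2):

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Assumed layout for illustration: the 2-bit read barrier state sits at this shift.
constexpr uint32_t kReadBarrierStateShift = 28;
constexpr uint32_t kWhite = 0, kGray = 1, kBlack = 2;

// What the old sequence did: shift + mask, then compare against gray.
static bool IsGrayMaskAndCompare(uint32_t lock_word) {
  uint32_t rb_state = (lock_word >> kReadBarrierStateShift) & 0x3u;
  return rb_state == kGray;
}

// What the new sequence does: LSRS by (shift + 1) leaves bit kReadBarrierStateShift,
// i.e. the low bit of the state, as the last bit shifted out (the carry flag).
// Given white=0/gray=1/black=2, that bit is set exactly when the state is gray.
static bool IsGrayCarryCheck(uint32_t lock_word) {
  uint32_t carry = (lock_word >> kReadBarrierStateShift) & 0x1u;
  return carry != 0;
}

int main() {
  for (uint32_t state : {kWhite, kGray, kBlack}) {
    uint32_t lock_word = (state << kReadBarrierStateShift) | 0x1234u;
    assert(IsGrayMaskAndCompare(lock_word) == IsGrayCarryCheck(lock_word));
  }
  return 0;
}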
__ Bind(slow_path->GetExitLabel()); } @@ -6539,7 +6744,7 @@ void CodeGeneratorARM::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, // LR = callee_method->entry_point_from_quick_compiled_code_ __ LoadFromOffset( kLoadWord, LR, callee_method.AsRegister<Register>(), - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmWordSize).Int32Value()); + ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize).Int32Value()); // LR() __ blx(LR); break; @@ -6573,7 +6778,7 @@ void CodeGeneratorARM::GenerateVirtualCall(HInvokeVirtual* invoke, Location temp __ MaybeUnpoisonHeapReference(temp); // temp = temp->GetMethodAt(method_offset); uint32_t entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset( - kArmWordSize).Int32Value(); + kArmPointerSize).Int32Value(); __ LoadFromOffset(kLoadWord, temp, temp, method_offset); // LR = temp->GetEntryPoint(); __ LoadFromOffset(kLoadWord, LR, temp, entry_point); @@ -6951,18 +7156,25 @@ void LocationsBuilderARM::VisitClassTableGet(HClassTableGet* instruction) { void InstructionCodeGeneratorARM::VisitClassTableGet(HClassTableGet* instruction) { LocationSummary* locations = instruction->GetLocations(); - uint32_t method_offset = 0; if (instruction->GetTableKind() == HClassTableGet::TableKind::kVTable) { - method_offset = mirror::Class::EmbeddedVTableEntryOffset( + uint32_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( instruction->GetIndex(), kArmPointerSize).SizeValue(); + __ LoadFromOffset(kLoadWord, + locations->Out().AsRegister<Register>(), + locations->InAt(0).AsRegister<Register>(), + method_offset); } else { - method_offset = mirror::Class::EmbeddedImTableEntryOffset( - instruction->GetIndex() % mirror::Class::kImtSize, kArmPointerSize).Uint32Value(); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + instruction->GetIndex(), kArmPointerSize)); + __ LoadFromOffset(kLoadWord, + locations->Out().AsRegister<Register>(), + locations->InAt(0).AsRegister<Register>(), + mirror::Class::ImtPtrOffset(kArmPointerSize).Uint32Value()); + __ LoadFromOffset(kLoadWord, + locations->Out().AsRegister<Register>(), + locations->Out().AsRegister<Register>(), + method_offset); } - __ LoadFromOffset(kLoadWord, - locations->Out().AsRegister<Register>(), - locations->InAt(0).AsRegister<Register>(), - method_offset); } #undef __ diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index 477c4f18c1..5d9b2dce1c 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -17,13 +17,13 @@ #ifndef ART_COMPILER_OPTIMIZING_CODE_GENERATOR_ARM_H_ #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_ARM_H_ +#include "base/enums.h" #include "code_generator.h" -#include "dex/compiler_enums.h" #include "driver/compiler_options.h" #include "nodes.h" +#include "string_reference.h" #include "parallel_move_resolver.h" #include "utils/arm/assembler_thumb2.h" -#include "utils/string_reference.h" #include "utils/type_reference.h" namespace art { @@ -32,7 +32,7 @@ namespace arm { class CodeGeneratorARM; // Use a local definition to prevent copying mistakes. 
-static constexpr size_t kArmWordSize = kArmPointerSize; +static constexpr size_t kArmWordSize = static_cast<size_t>(kArmPointerSize); static constexpr size_t kArmBitsPerWord = kArmWordSize * kBitsPerByte; static constexpr Register kParameterCoreRegisters[] = { R1, R2, R3 }; @@ -180,9 +180,10 @@ class LocationsBuilderARM : public HGraphVisitor { void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); + Location ArithmeticZeroOrFpuRegister(HInstruction* input); Location ArmEncodableConstantOrRegister(HInstruction* constant, Opcode opcode); bool CanEncodeConstantAsImmediate(HConstant* input_cst, Opcode opcode); - bool CanEncodeConstantAsImmediate(uint32_t value, Opcode opcode); + bool CanEncodeConstantAsImmediate(uint32_t value, Opcode opcode, SetCc set_cc = kCcDontCare); CodeGeneratorARM* const codegen_; InvokeDexCallingConventionVisitorARM parameter_visitor_; @@ -219,6 +220,7 @@ class InstructionCodeGeneratorARM : public InstructionCodeGenerator { void GenerateAndConst(Register out, Register first, uint32_t value); void GenerateOrrConst(Register out, Register first, uint32_t value); void GenerateEorConst(Register out, Register first, uint32_t value); + void GenerateAddLongConst(Location out, Location first, uint64_t value); void HandleBitwiseOperation(HBinaryOperation* operation); void HandleCondition(HCondition* condition); void HandleIntegerRotate(LocationSummary* locations); @@ -281,6 +283,7 @@ class InstructionCodeGeneratorARM : public InstructionCodeGenerator { void GenerateCompareTestAndBranch(HCondition* condition, Label* true_target, Label* false_target); + void GenerateVcmp(HInstruction* instruction); void GenerateFPJumps(HCondition* cond, Label* true_label, Label* false_label); void GenerateLongComparesAndJumps(HCondition* cond, Label* true_label, Label* false_label); void DivRemOneOrMinusOne(HBinaryOperation* instruction); @@ -365,6 +368,24 @@ class CodeGeneratorARM : public CodeGenerator { // Helper method to move a 64bits value between two locations. void Move64(Location destination, Location source); + void LoadOrStoreToOffset(Primitive::Type type, + Location loc, + Register base, + int32_t offset, + bool is_load, + Condition cond = AL); + + void LoadFromShiftedRegOffset(Primitive::Type type, + Location out_loc, + Register base, + Register reg_offset, + Condition cond = AL); + void StoreToShiftedRegOffset(Primitive::Type type, + Location out_loc, + Register base, + Register reg_offset, + Condition cond = AL); + // Generate code to invoke a runtime entry point. void InvokeRuntime(QuickEntrypointEnum entrypoint, HInstruction* instruction, @@ -376,6 +397,12 @@ class CodeGeneratorARM : public CodeGenerator { uint32_t dex_pc, SlowPathCode* slow_path); + // Generate code to invoke a runtime entry point, but do not record + // PC-related information in a stack map. + void InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path); + // Emit a write barrier. 
void MarkGCCard(Register temp, Register card, Register object, Register value, bool can_be_null); diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index fc2c2c34aa..122c174eae 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -33,8 +33,7 @@ #include "utils/assembler.h" #include "utils/stack_checks.h" - -using namespace vixl; // NOLINT(build/namespaces) +using namespace vixl::aarch64; // NOLINT(build/namespaces) #ifdef __ #error "ARM64 Codegen VIXL macro-assembler macro already defined." @@ -132,9 +131,9 @@ Location InvokeRuntimeCallingConvention::GetReturnLocation(Primitive::Type retur return ARM64ReturnLocation(return_type); } -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler()-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler()-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, x).Int32Value() // Calculate memory accessing operand for save/restore live registers. static void SaveRestoreLiveRegistersHelper(CodeGenerator* codegen, @@ -147,20 +146,20 @@ static void SaveRestoreLiveRegistersHelper(CodeGenerator* codegen, codegen->GetNumberOfFloatingPointRegisters())); CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, - register_set->GetCoreRegisters() & (~callee_saved_core_registers.list())); + register_set->GetCoreRegisters() & (~callee_saved_core_registers.GetList())); CPURegList fp_list = CPURegList(CPURegister::kFPRegister, kDRegSize, - register_set->GetFloatingPointRegisters() & (~callee_saved_fp_registers.list())); + register_set->GetFloatingPointRegisters() & (~callee_saved_fp_registers.GetList())); MacroAssembler* masm = down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler(); UseScratchRegisterScope temps(masm); Register base = masm->StackPointer(); - int64_t core_spill_size = core_list.TotalSizeInBytes(); - int64_t fp_spill_size = fp_list.TotalSizeInBytes(); + int64_t core_spill_size = core_list.GetTotalSizeInBytes(); + int64_t fp_spill_size = fp_list.GetTotalSizeInBytes(); int64_t reg_size = kXRegSizeInBytes; int64_t max_ls_pair_offset = spill_offset + core_spill_size + fp_spill_size - 2 * reg_size; uint32_t ls_access_size = WhichPowerOf2(reg_size); - if (((core_list.Count() > 1) || (fp_list.Count() > 1)) && + if (((core_list.GetCount() > 1) || (fp_list.GetCount() > 1)) && !masm->IsImmLSPair(max_ls_pair_offset, ls_access_size)) { // If the offset does not fit in the instruction's immediate field, use an alternate register // to compute the base address(float point registers spill base address). 
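For a sense of what the IsImmLSPair check above guards against (an illustrative aside, not ART code): the spill area is laid out as the core registers followed by the FP registers, so the highest-addressed pair sits at spill_offset + core_spill_size + fp_spill_size - 2 * reg_size. Assuming the usual A64 LDP/STP encoding of a signed 7-bit immediate scaled by the access size, 8-byte registers reach only -512..+504 bytes, beyond which a separately computed base register is needed. A rough sketch under those assumptions:

#include <cstdint>
#include <cstdio>

// Signed 7-bit immediate scaled by the access size (assumed LDP/STP encoding),
// so for 8-byte registers the reach is -512..+504.
static bool FitsLdpStpImmediate(int64_t offset, uint32_t access_size_log2) {
  int64_t scale = int64_t{1} << access_size_log2;
  if (offset % scale != 0) return false;
  int64_t scaled = offset / scale;
  return scaled >= -64 && scaled <= 63;
}

int main() {
  const int64_t reg_size = 8;                 // X / D registers
  const int64_t spill_offset = 400;           // hypothetical frame offset
  const int64_t core_spill_size = 10 * reg_size;
  const int64_t fp_spill_size = 8 * reg_size;
  // Offset of the highest-addressed pair in the save area.
  int64_t max_ls_pair_offset =
      spill_offset + core_spill_size + fp_spill_size - 2 * reg_size;
  std::printf("max pair offset %lld fits: %d\n",
              static_cast<long long>(max_ls_pair_offset),
              FitsLdpStpImmediate(max_ls_pair_offset, /*access_size_log2=*/3));
  return 0;
}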
@@ -399,11 +398,9 @@ class SuspendCheckSlowPathARM64 : public SlowPathCodeARM64 { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, instruction_->GetLocations()); arm64_codegen->InvokeRuntime( QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); - RestoreLiveRegisters(codegen, instruction_->GetLocations()); if (successor_ == nullptr) { __ B(GetReturnLabel()); } else { @@ -411,7 +408,7 @@ class SuspendCheckSlowPathARM64 : public SlowPathCodeARM64 { } } - vixl::Label* GetReturnLabel() { + vixl::aarch64::Label* GetReturnLabel() { DCHECK(successor_ == nullptr); return &return_label_; } @@ -427,7 +424,7 @@ class SuspendCheckSlowPathARM64 : public SlowPathCodeARM64 { HBasicBlock* const successor_; // If `successor_` is null, the label to branch to after the suspend check. - vixl::Label return_label_; + vixl::aarch64::Label return_label_; DISALLOW_COPY_AND_ASSIGN(SuspendCheckSlowPathARM64); }; @@ -463,7 +460,7 @@ class TypeCheckSlowPathARM64 : public SlowPathCodeARM64 { if (instruction_->IsInstanceOf()) { arm64_codegen->InvokeRuntime( QUICK_ENTRY_POINT(pInstanceofNonTrivial), instruction_, dex_pc, this); - CheckEntrypointTypes<kQuickInstanceofNonTrivial, uint32_t, + CheckEntrypointTypes<kQuickInstanceofNonTrivial, size_t, const mirror::Class*, const mirror::Class*>(); Primitive::Type ret_type = instruction_->GetType(); Location ret_loc = calling_convention.GetReturnLocation(ret_type); @@ -567,9 +564,9 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) { __ Bind(&table_start_); const ArenaVector<HBasicBlock*>& successors = switch_instr_->GetBlock()->GetSuccessors(); for (uint32_t i = 0; i < num_entries; i++) { - vixl::Label* target_label = codegen->GetLabelOf(successors[i]); + vixl::aarch64::Label* target_label = codegen->GetLabelOf(successors[i]); DCHECK(target_label->IsBound()); - ptrdiff_t jump_offset = target_label->location() - table_start_.location(); + ptrdiff_t jump_offset = target_label->GetLocation() - table_start_.GetLocation(); DCHECK_GT(jump_offset, std::numeric_limits<int32_t>::min()); DCHECK_LE(jump_offset, std::numeric_limits<int32_t>::max()); Literal<int32_t> literal(jump_offset); @@ -580,8 +577,8 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) { // Slow path marking an object during a read barrier. 
class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { public: - ReadBarrierMarkSlowPathARM64(HInstruction* instruction, Location out, Location obj) - : SlowPathCodeARM64(instruction), out_(out), obj_(obj) { + ReadBarrierMarkSlowPathARM64(HInstruction* instruction, Location obj) + : SlowPathCodeARM64(instruction), obj_(obj) { DCHECK(kEmitCompilerReadBarrier); } @@ -589,9 +586,8 @@ class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); - Primitive::Type type = Primitive::kPrimNot; DCHECK(locations->CanCall()); - DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(out_.reg())); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(obj_.reg())); DCHECK(instruction_->IsInstanceFieldGet() || instruction_->IsStaticFieldGet() || instruction_->IsArrayGet() || @@ -599,30 +595,45 @@ class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking slow path: " << instruction_->DebugName(); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, locations); - - InvokeRuntimeCallingConvention calling_convention; + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); - arm64_codegen->MoveLocation(LocationFrom(calling_convention.GetRegisterAt(0)), obj_, type); - arm64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierMark), - instruction_, - instruction_->GetDexPc(), - this); - CheckEntrypointTypes<kQuickReadBarrierMark, mirror::Object*, mirror::Object*>(); - arm64_codegen->MoveLocation(out_, calling_convention.GetReturnLocation(type), type); - - RestoreLiveRegisters(codegen, locations); + DCHECK_NE(obj_.reg(), LR); + DCHECK_NE(obj_.reg(), WSP); + DCHECK_NE(obj_.reg(), WZR); + // IP0 is used internally by the ReadBarrierMarkRegX entry point + // as a temporary, it cannot be the entry point's input/output. + DCHECK_NE(obj_.reg(), IP0); + DCHECK(0 <= obj_.reg() && obj_.reg() < kNumberOfWRegisters) << obj_.reg(); + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in W0): + // + // W0 <- obj + // W0 <- ReadBarrierMark(W0) + // obj <- W0 + // + // we just use rX (the register holding `obj`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(obj_.reg()); + // This runtime call does not require a stack map. 
+ arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); __ B(GetExitLabel()); } private: - const Location out_; const Location obj_; DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM64); @@ -668,14 +679,12 @@ class ReadBarrierForHeapReferenceSlowPathARM64 : public SlowPathCodeARM64 { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); - // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. DCHECK(!(instruction_->IsArrayGet() && - instruction_->AsArrayGet()->GetArray()->IsArm64IntermediateAddress())); + instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); __ Bind(GetEntryLabel()); @@ -744,10 +753,7 @@ class ReadBarrierForHeapReferenceSlowPathARM64 : public SlowPathCodeARM64 { (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile)) << instruction_->AsInvoke()->GetIntrinsic(); DCHECK_EQ(offset_, 0U); - DCHECK(index_.IsRegisterPair()); - // UnsafeGet's offset location is a register pair, the low - // part contains the correct offset. - index = index_.ToLow(); + DCHECK(index_.IsRegister()); } } @@ -790,8 +796,8 @@ class ReadBarrierForHeapReferenceSlowPathARM64 : public SlowPathCodeARM64 { private: Register FindAvailableCallerSaveRegister(CodeGenerator* codegen) { - size_t ref = static_cast<int>(XRegisterFrom(ref_).code()); - size_t obj = static_cast<int>(XRegisterFrom(obj_).code()); + size_t ref = static_cast<int>(XRegisterFrom(ref_).GetCode()); + size_t obj = static_cast<int>(XRegisterFrom(obj_).GetCode()); for (size_t i = 0, e = codegen->GetNumberOfCoreRegisters(); i < e; ++i) { if (i != ref && i != obj && !codegen->IsCoreCalleeSaveRegister(i)) { return Register(VIXLRegCodeFromART(i), kXRegSize); @@ -909,8 +915,8 @@ CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph, kNumberOfAllocatableRegisters, kNumberOfAllocatableFPRegisters, kNumberOfAllocatableRegisterPairs, - callee_saved_core_registers.list(), - callee_saved_fp_registers.list(), + callee_saved_core_registers.GetList(), + callee_saved_fp_registers.GetList(), compiler_options, stats), block_labels_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), @@ -1060,17 +1066,17 @@ void CodeGeneratorARM64::GenerateFrameExit() { GetAssembler()->cfi().DefCFAOffset(GetFrameSize()); } -vixl::CPURegList CodeGeneratorARM64::GetFramePreservedCoreRegisters() const { +CPURegList CodeGeneratorARM64::GetFramePreservedCoreRegisters() const { DCHECK(ArtVixlRegCodeCoherentForRegSet(core_spill_mask_, GetNumberOfCoreRegisters(), 0, 0)); - return vixl::CPURegList(vixl::CPURegister::kRegister, vixl::kXRegSize, - core_spill_mask_); + return CPURegList(CPURegister::kRegister, kXRegSize, + core_spill_mask_); } -vixl::CPURegList CodeGeneratorARM64::GetFramePreservedFPRegisters() const { +CPURegList CodeGeneratorARM64::GetFramePreservedFPRegisters() const { DCHECK(ArtVixlRegCodeCoherentForRegSet(0, 0, fpu_spill_mask_, GetNumberOfFloatingPointRegisters())); - return vixl::CPURegList(vixl::CPURegister::kFPRegister, vixl::kDRegSize, - fpu_spill_mask_); + return 
CPURegList(CPURegister::kFPRegister, kDRegSize, + fpu_spill_mask_); } void CodeGeneratorARM64::Bind(HBasicBlock* block) { @@ -1094,11 +1100,11 @@ void CodeGeneratorARM64::MarkGCCard(Register object, Register value, bool value_ UseScratchRegisterScope temps(GetVIXLAssembler()); Register card = temps.AcquireX(); Register temp = temps.AcquireW(); // Index within the CardTable - 32bit. - vixl::Label done; + vixl::aarch64::Label done; if (value_can_be_null) { __ Cbz(value, &done); } - __ Ldr(card, MemOperand(tr, Thread::CardTableOffset<kArm64WordSize>().Int32Value())); + __ Ldr(card, MemOperand(tr, Thread::CardTableOffset<kArm64PointerSize>().Int32Value())); __ Lsr(temp, object, gc::accounting::CardTable::kCardShift); __ Strb(card, MemOperand(card, temp.X())); if (value_can_be_null) { @@ -1119,12 +1125,12 @@ void CodeGeneratorARM64::SetupBlockedRegisters() const { CPURegList reserved_core_registers = vixl_reserved_core_registers; reserved_core_registers.Combine(runtime_reserved_core_registers); while (!reserved_core_registers.IsEmpty()) { - blocked_core_registers_[reserved_core_registers.PopLowestIndex().code()] = true; + blocked_core_registers_[reserved_core_registers.PopLowestIndex().GetCode()] = true; } CPURegList reserved_fp_registers = vixl_reserved_fp_registers; while (!reserved_fp_registers.IsEmpty()) { - blocked_fpu_registers_[reserved_fp_registers.PopLowestIndex().code()] = true; + blocked_fpu_registers_[reserved_fp_registers.PopLowestIndex().GetCode()] = true; } if (GetGraph()->IsDebuggable()) { @@ -1133,7 +1139,7 @@ void CodeGeneratorARM64::SetupBlockedRegisters() const { // now, just block them. CPURegList reserved_fp_registers_debuggable = callee_saved_fp_registers; while (!reserved_fp_registers_debuggable.IsEmpty()) { - blocked_fpu_registers_[reserved_fp_registers_debuggable.PopLowestIndex().code()] = true; + blocked_fpu_registers_[reserved_fp_registers_debuggable.PopLowestIndex().GetCode()] = true; } } } @@ -1277,17 +1283,21 @@ void CodeGeneratorARM64::MoveLocation(Location destination, UseScratchRegisterScope temps(GetVIXLAssembler()); HConstant* src_cst = source.GetConstant(); CPURegister temp; - if (src_cst->IsIntConstant() || src_cst->IsNullConstant()) { - temp = temps.AcquireW(); - } else if (src_cst->IsLongConstant()) { - temp = temps.AcquireX(); - } else if (src_cst->IsFloatConstant()) { - temp = temps.AcquireS(); + if (src_cst->IsZeroBitPattern()) { + temp = (src_cst->IsLongConstant() || src_cst->IsDoubleConstant()) ? xzr : wzr; } else { - DCHECK(src_cst->IsDoubleConstant()); - temp = temps.AcquireD(); + if (src_cst->IsIntConstant()) { + temp = temps.AcquireW(); + } else if (src_cst->IsLongConstant()) { + temp = temps.AcquireX(); + } else if (src_cst->IsFloatConstant()) { + temp = temps.AcquireS(); + } else { + DCHECK(src_cst->IsDoubleConstant()); + temp = temps.AcquireD(); + } + MoveConstant(temp, src_cst); } - MoveConstant(temp, src_cst); __ Str(temp, StackOperandFrom(destination)); } else { DCHECK(source.IsStackSlot() || source.IsDoubleStackSlot()); @@ -1344,7 +1354,7 @@ void CodeGeneratorARM64::LoadAcquire(HInstruction* instruction, DCHECK(!src.IsPostIndex()); // TODO(vixl): Let the MacroAssembler handle MemOperand. - __ Add(temp_base, src.base(), OperandFromMemOperand(src)); + __ Add(temp_base, src.GetBaseRegister(), OperandFromMemOperand(src)); MemOperand base = MemOperand(temp_base); switch (type) { case Primitive::kPrimBoolean: @@ -1436,7 +1446,7 @@ void CodeGeneratorARM64::StoreRelease(Primitive::Type type, // TODO(vixl): Let the MacroAssembler handle this. 
Operand op = OperandFromMemOperand(dst); - __ Add(temp_base, dst.base(), op); + __ Add(temp_base, dst.GetBaseRegister(), op); MemOperand base = MemOperand(temp_base); switch (type) { case Primitive::kPrimBoolean: @@ -1472,7 +1482,7 @@ void CodeGeneratorARM64::InvokeRuntime(QuickEntrypointEnum entrypoint, HInstruction* instruction, uint32_t dex_pc, SlowPathCode* slow_path) { - InvokeRuntime(GetThreadOffset<kArm64WordSize>(entrypoint).Int32Value(), + InvokeRuntime(GetThreadOffset<kArm64PointerSize>(entrypoint).Int32Value(), instruction, dex_pc, slow_path); @@ -1489,8 +1499,17 @@ void CodeGeneratorARM64::InvokeRuntime(int32_t entry_point_offset, RecordPcInfo(instruction, dex_pc, slow_path); } +void CodeGeneratorARM64::InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path) { + ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction, slow_path); + BlockPoolsScope block_pools(GetVIXLAssembler()); + __ Ldr(lr, MemOperand(tr, entry_point_offset)); + __ Blr(lr); +} + void InstructionCodeGeneratorARM64::GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path, - vixl::Register class_reg) { + Register class_reg) { UseScratchRegisterScope temps(GetVIXLAssembler()); Register temp = temps.AcquireW(); size_t status_offset = mirror::Class::StatusOffset().SizeValue(); @@ -1546,7 +1565,7 @@ void InstructionCodeGeneratorARM64::GenerateSuspendCheck(HSuspendCheck* instruct UseScratchRegisterScope temps(codegen_->GetVIXLAssembler()); Register temp = temps.AcquireW(); - __ Ldrh(temp, MemOperand(tr, Thread::ThreadFlagsOffset<kArm64WordSize>().SizeValue())); + __ Ldrh(temp, MemOperand(tr, Thread::ThreadFlagsOffset<kArm64PointerSize>().SizeValue())); if (successor == nullptr) { __ Cbnz(temp, slow_path->GetEntryLabel()); __ Bind(slow_path->GetReturnLabel()); @@ -1755,7 +1774,7 @@ void InstructionCodeGeneratorARM64::HandleBinaryOp(HBinaryOperation* instr) { __ Sub(dst, lhs, rhs); } else if (instr->IsRor()) { if (rhs.IsImmediate()) { - uint32_t shift = rhs.immediate() & (lhs.SizeInBits() - 1); + uint32_t shift = rhs.GetImmediate() & (lhs.GetSizeInBits() - 1); __ Ror(dst, lhs, shift); } else { // Ensure shift distance is in the same size register as the result. If @@ -1818,7 +1837,7 @@ void InstructionCodeGeneratorARM64::HandleShift(HBinaryOperation* instr) { Register lhs = InputRegisterAt(instr, 0); Operand rhs = InputOperandAt(instr, 1); if (rhs.IsImmediate()) { - uint32_t shift_value = rhs.immediate() & + uint32_t shift_value = rhs.GetImmediate() & (type == Primitive::kPrimInt ? kMaxIntShiftDistance : kMaxLongShiftDistance); if (instr->IsShl()) { __ Lsl(dst, lhs, shift_value); @@ -1828,7 +1847,7 @@ void InstructionCodeGeneratorARM64::HandleShift(HBinaryOperation* instr) { __ Lsr(dst, lhs, shift_value); } } else { - Register rhs_reg = dst.IsX() ? rhs.reg().X() : rhs.reg().W(); + Register rhs_reg = dst.IsX() ? rhs.GetRegister().X() : rhs.GetRegister().W(); if (instr->IsShl()) { __ Lsl(dst, lhs, rhs_reg); @@ -1965,9 +1984,8 @@ void InstructionCodeGeneratorARM64::VisitArm64DataProcWithShifterOp( } } -void LocationsBuilderARM64::VisitArm64IntermediateAddress(HArm64IntermediateAddress* instruction) { - // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. +void LocationsBuilderARM64::VisitIntermediateAddress(HIntermediateAddress* instruction) { + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. 
DCHECK(!kEmitCompilerReadBarrier); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); @@ -1976,10 +1994,9 @@ void LocationsBuilderARM64::VisitArm64IntermediateAddress(HArm64IntermediateAddr locations->SetOut(Location::RequiresRegister()); } -void InstructionCodeGeneratorARM64::VisitArm64IntermediateAddress( - HArm64IntermediateAddress* instruction) { - // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. +void InstructionCodeGeneratorARM64::VisitIntermediateAddress( + HIntermediateAddress* instruction) { + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. DCHECK(!kEmitCompilerReadBarrier); __ Add(OutputRegister(instruction), InputRegisterAt(instruction, 0), @@ -2014,13 +2031,14 @@ void InstructionCodeGeneratorARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* if (instr->GetType() == Primitive::kPrimLong && codegen_->GetInstructionSetFeatures().NeedFixCortexA53_835769()) { MacroAssembler* masm = down_cast<CodeGeneratorARM64*>(codegen_)->GetVIXLAssembler(); - vixl::Instruction* prev = masm->GetCursorAddress<vixl::Instruction*>() - vixl::kInstructionSize; + vixl::aarch64::Instruction* prev = + masm->GetCursorAddress<vixl::aarch64::Instruction*>() - kInstructionSize; if (prev->IsLoadOrStore()) { // Make sure we emit only exactly one nop. - vixl::CodeBufferCheckScope scope(masm, - vixl::kInstructionSize, - vixl::CodeBufferCheckScope::kCheck, - vixl::CodeBufferCheckScope::kExactSize); + vixl::aarch64::CodeBufferCheckScope scope(masm, + kInstructionSize, + vixl::aarch64::CodeBufferCheckScope::kCheck, + vixl::aarch64::CodeBufferCheckScope::kExactSize); __ nop(); } } @@ -2078,9 +2096,8 @@ void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) { if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { // Object ArrayGet with Baker's read barrier case. Register temp = temps.AcquireW(); - // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. - DCHECK(!instruction->GetArray()->IsArm64IntermediateAddress()); + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. + DCHECK(!instruction->GetArray()->IsIntermediateAddress()); // Note that a potential implicit null check is handled in the // CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier call. codegen_->GenerateArrayLoadWithBakerReadBarrier( @@ -2093,15 +2110,15 @@ void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) { source = HeapOperand(obj, offset); } else { Register temp = temps.AcquireSameSizeAs(obj); - if (instruction->GetArray()->IsArm64IntermediateAddress()) { + if (instruction->GetArray()->IsIntermediateAddress()) { // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. + // HIntermediateAddress instruction yet. DCHECK(!kEmitCompilerReadBarrier); // We do not need to compute the intermediate address from the array: the // input instruction has done it already. See the comment in - // `InstructionSimplifierArm64::TryExtractArrayAccessAddress()`. + // `TryExtractArrayAccessAddress()`. 
if (kIsDebugBuild) { - HArm64IntermediateAddress* tmp = instruction->GetArray()->AsArm64IntermediateAddress(); + HIntermediateAddress* tmp = instruction->GetArray()->AsIntermediateAddress(); DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), offset); } temp = obj; @@ -2185,15 +2202,15 @@ void InstructionCodeGeneratorARM64::VisitArraySet(HArraySet* instruction) { } else { UseScratchRegisterScope temps(masm); Register temp = temps.AcquireSameSizeAs(array); - if (instruction->GetArray()->IsArm64IntermediateAddress()) { + if (instruction->GetArray()->IsIntermediateAddress()) { // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. + // HIntermediateAddress instruction yet. DCHECK(!kEmitCompilerReadBarrier); // We do not need to compute the intermediate address from the array: the // input instruction has done it already. See the comment in - // `InstructionSimplifierArm64::TryExtractArrayAccessAddress()`. + // `TryExtractArrayAccessAddress()`. if (kIsDebugBuild) { - HArm64IntermediateAddress* tmp = instruction->GetArray()->AsArm64IntermediateAddress(); + HIntermediateAddress* tmp = instruction->GetArray()->AsIntermediateAddress(); DCHECK(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64() == offset); } temp = array; @@ -2209,8 +2226,8 @@ void InstructionCodeGeneratorARM64::VisitArraySet(HArraySet* instruction) { codegen_->MaybeRecordImplicitNullCheck(instruction); } else { DCHECK(needs_write_barrier); - DCHECK(!instruction->GetArray()->IsArm64IntermediateAddress()); - vixl::Label done; + DCHECK(!instruction->GetArray()->IsIntermediateAddress()); + vixl::aarch64::Label done; SlowPathCodeARM64* slow_path = nullptr; { // We use a block to end the scratch scope before the write barrier, thus @@ -2235,7 +2252,7 @@ void InstructionCodeGeneratorARM64::VisitArraySet(HArraySet* instruction) { slow_path = new (GetGraph()->GetArena()) ArraySetSlowPathARM64(instruction); codegen_->AddSlowPath(slow_path); if (instruction->GetValueCanBeNull()) { - vixl::Label non_zero; + vixl::aarch64::Label non_zero; __ Cbnz(Register(value), &non_zero); if (!index.IsConstant()) { __ Add(temp, array, offset); @@ -2289,7 +2306,7 @@ void InstructionCodeGeneratorARM64::VisitArraySet(HArraySet* instruction) { __ Cmp(temp, temp2); if (instruction->StaticTypeOfArrayIsObjectArray()) { - vixl::Label do_put; + vixl::aarch64::Label do_put; __ B(eq, &do_put); // If heap poisoning is enabled, the `temp` reference has // not been unpoisoned yet; unpoison it now. @@ -2822,11 +2839,11 @@ void InstructionCodeGeneratorARM64::VisitTryBoundary(HTryBoundary* try_boundary) void InstructionCodeGeneratorARM64::GenerateTestAndBranch(HInstruction* instruction, size_t condition_input_index, - vixl::Label* true_target, - vixl::Label* false_target) { + vixl::aarch64::Label* true_target, + vixl::aarch64::Label* false_target) { // FP branching requires both targets to be explicit. If either of the targets // is nullptr (fallthrough) use and bind `fallthrough_target` instead. 
- vixl::Label fallthrough_target; + vixl::aarch64::Label fallthrough_target; HInstruction* cond = instruction->InputAt(condition_input_index); if (true_target == nullptr && false_target == nullptr) { @@ -2884,7 +2901,7 @@ void InstructionCodeGeneratorARM64::GenerateTestAndBranch(HInstruction* instruct Operand rhs = InputOperandAt(condition, 1); Condition arm64_cond; - vixl::Label* non_fallthrough_target; + vixl::aarch64::Label* non_fallthrough_target; if (true_target == nullptr) { arm64_cond = ARM64Condition(condition->GetOppositeCondition()); non_fallthrough_target = false_target; @@ -2894,7 +2911,7 @@ void InstructionCodeGeneratorARM64::GenerateTestAndBranch(HInstruction* instruct } if ((arm64_cond == eq || arm64_cond == ne || arm64_cond == lt || arm64_cond == ge) && - rhs.IsImmediate() && (rhs.immediate() == 0)) { + rhs.IsImmediate() && (rhs.GetImmediate() == 0)) { switch (arm64_cond) { case eq: __ Cbz(lhs, non_fallthrough_target); @@ -2943,10 +2960,14 @@ void LocationsBuilderARM64::VisitIf(HIf* if_instr) { void InstructionCodeGeneratorARM64::VisitIf(HIf* if_instr) { HBasicBlock* true_successor = if_instr->IfTrueSuccessor(); HBasicBlock* false_successor = if_instr->IfFalseSuccessor(); - vixl::Label* true_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), true_successor) ? - nullptr : codegen_->GetLabelOf(true_successor); - vixl::Label* false_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), false_successor) ? - nullptr : codegen_->GetLabelOf(false_successor); + vixl::aarch64::Label* true_target = codegen_->GetLabelOf(true_successor); + if (codegen_->GoesToNextBlock(if_instr->GetBlock(), true_successor)) { + true_target = nullptr; + } + vixl::aarch64::Label* false_target = codegen_->GetLabelOf(false_successor); + if (codegen_->GoesToNextBlock(if_instr->GetBlock(), false_successor)) { + false_target = nullptr; + } GenerateTestAndBranch(if_instr, /* condition_input_index */ 0, true_target, false_target); } @@ -3130,7 +3151,7 @@ void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) { uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); - vixl::Label done, zero; + vixl::aarch64::Label done, zero; SlowPathCodeARM64* slow_path = nullptr; // Return 0 if `obj` is null. @@ -3155,7 +3176,7 @@ void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kAbstractClassCheck: { // If the class is abstract, we eagerly fetch the super class of the // object to avoid doing a comparison we know will fail. - vixl::Label loop, success; + vixl::aarch64::Label loop, success; __ Bind(&loop); // /* HeapReference<Class> */ out = out->super_class_ GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc); @@ -3172,7 +3193,7 @@ void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kClassHierarchyCheck: { // Walk over the class hierarchy to find a match. - vixl::Label loop, success; + vixl::aarch64::Label loop, success; __ Bind(&loop); __ Cmp(out, cls); __ B(eq, &success); @@ -3191,7 +3212,7 @@ void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kArrayObjectCheck: { // Do an exact check. - vixl::Label exact_check; + vixl::aarch64::Label exact_check; __ Cmp(out, cls); __ B(eq, &exact_check); // Otherwise, we need to check that the object's class is a non-primitive array. 
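The loops emitted above for kAbstractClassCheck and kClassHierarchyCheck boil down to walking the super-class chain until the target class is found or the chain ends. A plain C++ sketch of that walk, using a hypothetical Class type rather than the runtime's mirror::Class:

#include <cassert>

// Hypothetical stand-in for a class object with a super-class pointer.
struct Class {
  const Class* super_class;
};

// What the generated hierarchy-walk loop computes: is `target` equal to
// `klass` or reachable somewhere up its super-class chain?
static bool IsSubclassOf(const Class* klass, const Class* target) {
  for (const Class* k = klass; k != nullptr; k = k->super_class) {
    if (k == target) return true;  // match found: instanceof succeeds
  }
  return false;  // walked off the top of the hierarchy: instanceof fails
}

int main() {
  Class object_class{nullptr};            // root of the hierarchy
  Class list_class{&object_class};        // List extends Object
  Class array_list_class{&list_class};    // ArrayList extends List
  assert(IsSubclassOf(&array_list_class, &object_class));
  assert(!IsSubclassOf(&object_class, &list_class));
  return 0;
}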
@@ -3328,7 +3349,7 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { is_type_check_slow_path_fatal); codegen_->AddSlowPath(type_check_slow_path); - vixl::Label done; + vixl::aarch64::Label done; // Avoid null check if we know obj is not null. if (instruction->MustDoNullCheck()) { __ Cbz(obj, &done); @@ -3350,7 +3371,7 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kAbstractClassCheck: { // If the class is abstract, we eagerly fetch the super class of the // object to avoid doing a comparison we know will fail. - vixl::Label loop, compare_classes; + vixl::aarch64::Label loop, compare_classes; __ Bind(&loop); // /* HeapReference<Class> */ temp = temp->super_class_ GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc); @@ -3377,7 +3398,7 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kClassHierarchyCheck: { // Walk over the class hierarchy to find a match. - vixl::Label loop; + vixl::aarch64::Label loop; __ Bind(&loop); __ Cmp(temp, cls); __ B(eq, &done); @@ -3402,7 +3423,7 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kArrayObjectCheck: { // Do an exact check. - vixl::Label check_non_primitive_component_type; + vixl::aarch64::Label check_non_primitive_component_type; __ Cmp(temp, cls); __ B(eq, &done); @@ -3506,11 +3527,9 @@ void InstructionCodeGeneratorARM64::VisitInvokeInterface(HInvokeInterface* invok // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError. LocationSummary* locations = invoke->GetLocations(); Register temp = XRegisterFrom(locations->GetTemp(0)); - uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset( - invoke->GetImtIndex() % mirror::Class::kImtSize, kArm64PointerSize).Uint32Value(); Location receiver = locations->InAt(0); Offset class_offset = mirror::Object::ClassOffset(); - Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize); + Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize); // The register ip1 is required to be used for the hidden argument in // art_quick_imt_conflict_trampoline, so prevent VIXL from using it. @@ -3537,6 +3556,10 @@ void InstructionCodeGeneratorARM64::VisitInvokeInterface(HInvokeInterface* invok // intact/accessible until the end of the marking phase (the // concurrent copying collector may not in the future). GetAssembler()->MaybeUnpoisonHeapReference(temp.W()); + __ Ldr(temp, + MemOperand(temp, mirror::Class::ImtPtrOffset(kArm64PointerSize).Uint32Value())); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + invoke->GetImtIndex(), kArm64PointerSize)); // temp = temp->GetImtEntryAt(method_offset); __ Ldr(temp, MemOperand(temp, method_offset)); // lr = temp->GetEntryPoint(); @@ -3626,17 +3649,17 @@ void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invok // Add ADRP with its PC-relative DexCache access patch. 
const DexFile& dex_file = *invoke->GetTargetMethod().dex_file; uint32_t element_offset = invoke->GetDexCacheArrayOffset(); - vixl::Label* adrp_label = NewPcRelativeDexCacheArrayPatch(dex_file, element_offset); + vixl::aarch64::Label* adrp_label = NewPcRelativeDexCacheArrayPatch(dex_file, element_offset); { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(adrp_label); __ adrp(XRegisterFrom(temp), /* offset placeholder */ 0); } // Add LDR with its PC-relative DexCache access patch. - vixl::Label* ldr_label = + vixl::aarch64::Label* ldr_label = NewPcRelativeDexCacheArrayPatch(dex_file, element_offset, adrp_label); { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(ldr_label); __ ldr(XRegisterFrom(temp), MemOperand(XRegisterFrom(temp), /* offset placeholder */ 0)); } @@ -3658,7 +3681,7 @@ void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invok // /* ArtMethod*[] */ temp = temp.ptr_sized_fields_->dex_cache_resolved_methods_; __ Ldr(reg.X(), MemOperand(method_reg.X(), - ArtMethod::DexCacheResolvedMethodsOffset(kArm64WordSize).Int32Value())); + ArtMethod::DexCacheResolvedMethodsOffset(kArm64PointerSize).Int32Value())); // temp = temp[index_in_cache]; // Note: Don't use invoke->GetTargetMethod() as it may point to a different dex file. uint32_t index_in_cache = invoke->GetDexMethodIndex(); @@ -3673,8 +3696,8 @@ void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invok break; case HInvokeStaticOrDirect::CodePtrLocation::kCallPCRelative: { relative_call_patches_.emplace_back(invoke->GetTargetMethod()); - vixl::Label* label = &relative_call_patches_.back().label; - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + vixl::aarch64::Label* label = &relative_call_patches_.back().label; + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(label); __ bl(0); // Branch and link to itself. This will be overriden at link time. 
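In the ADRP/LDR sequences above, each NewPcRelative*Patch call records a label, and the label is bound inside a SingleEmissionCheckScope so that exactly one instruction sits at the bound location; the linker later rewrites that instruction word using the label's GetLocation(). A rough sketch of the record-then-bind pattern, with made-up names standing in for the real recorder and assembler:

    // Sketch: a patch label must pin down exactly one instruction word.
    #include <cstdint>
    #include <deque>
    #include <vector>

    struct PatchInfo {
      uint32_t target;        // e.g. dex cache element offset or string index
      uint32_t code_offset;   // filled in when the label is "bound"
    };

    struct PatchRecorder {
      std::deque<PatchInfo> patches;   // deque: element addresses stay stable as we append

      PatchInfo* NewPatch(uint32_t target) {
        patches.push_back(PatchInfo{target, 0u});
        return &patches.back();
      }
    };

    void EmitAdrpWithPatch(PatchRecorder* recorder, std::vector<uint32_t>* code, uint32_t target) {
      PatchInfo* info = recorder->NewPatch(target);
      info->code_offset = static_cast<uint32_t>(code->size() * 4u);  // "bind" here
      code->push_back(0x90000000u);  // placeholder ADRP; the linker patches this word later
    }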
break; @@ -3690,7 +3713,7 @@ void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invok // LR = callee_method->entry_point_from_quick_compiled_code_; __ Ldr(lr, MemOperand( XRegisterFrom(callee_method), - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize).Int32Value())); + ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize).Int32Value())); // lr() __ Blr(lr); break; @@ -3710,7 +3733,7 @@ void CodeGeneratorARM64::GenerateVirtualCall(HInvokeVirtual* invoke, Location te size_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( invoke->GetVTableIndex(), kArm64PointerSize).SizeValue(); Offset class_offset = mirror::Object::ClassOffset(); - Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize); + Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize); BlockPoolsScope block_pools(GetVIXLAssembler()); @@ -3733,58 +3756,64 @@ void CodeGeneratorARM64::GenerateVirtualCall(HInvokeVirtual* invoke, Location te __ Blr(lr); } -vixl::Label* CodeGeneratorARM64::NewPcRelativeStringPatch(const DexFile& dex_file, - uint32_t string_index, - vixl::Label* adrp_label) { +vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeStringPatch( + const DexFile& dex_file, + uint32_t string_index, + vixl::aarch64::Label* adrp_label) { return NewPcRelativePatch(dex_file, string_index, adrp_label, &pc_relative_string_patches_); } -vixl::Label* CodeGeneratorARM64::NewPcRelativeTypePatch(const DexFile& dex_file, - uint32_t type_index, - vixl::Label* adrp_label) { +vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeTypePatch( + const DexFile& dex_file, + uint32_t type_index, + vixl::aarch64::Label* adrp_label) { return NewPcRelativePatch(dex_file, type_index, adrp_label, &pc_relative_type_patches_); } -vixl::Label* CodeGeneratorARM64::NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, - uint32_t element_offset, - vixl::Label* adrp_label) { +vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeDexCacheArrayPatch( + const DexFile& dex_file, + uint32_t element_offset, + vixl::aarch64::Label* adrp_label) { return NewPcRelativePatch(dex_file, element_offset, adrp_label, &pc_relative_dex_cache_patches_); } -vixl::Label* CodeGeneratorARM64::NewPcRelativePatch(const DexFile& dex_file, - uint32_t offset_or_index, - vixl::Label* adrp_label, - ArenaDeque<PcRelativePatchInfo>* patches) { +vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativePatch( + const DexFile& dex_file, + uint32_t offset_or_index, + vixl::aarch64::Label* adrp_label, + ArenaDeque<PcRelativePatchInfo>* patches) { // Add a patch entry and return the label. patches->emplace_back(dex_file, offset_or_index); PcRelativePatchInfo* info = &patches->back(); - vixl::Label* label = &info->label; + vixl::aarch64::Label* label = &info->label; // If adrp_label is null, this is the ADRP patch and needs to point to its own label. info->pc_insn_label = (adrp_label != nullptr) ? 
adrp_label : label; return label; } -vixl::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageStringLiteral( +vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageStringLiteral( const DexFile& dex_file, uint32_t string_index) { return boot_image_string_patches_.GetOrCreate( StringReference(&dex_file, string_index), [this]() { return __ CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u); }); } -vixl::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageTypeLiteral( +vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageTypeLiteral( const DexFile& dex_file, uint32_t type_index) { return boot_image_type_patches_.GetOrCreate( TypeReference(&dex_file, type_index), [this]() { return __ CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u); }); } -vixl::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageAddressLiteral(uint64_t address) { +vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageAddressLiteral( + uint64_t address) { bool needs_patch = GetCompilerOptions().GetIncludePatchInformation(); Uint32ToLiteralMap* map = needs_patch ? &boot_image_address_patches_ : &uint32_literals_; return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map); } -vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateDexCacheAddressLiteral(uint64_t address) { +vixl::aarch64::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateDexCacheAddressLiteral( + uint64_t address) { return DeduplicateUint64Literal(address); } @@ -3803,76 +3832,76 @@ void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patc linker_patches->reserve(size); for (const auto& entry : method_patches_) { const MethodReference& target_method = entry.first; - vixl::Literal<uint64_t>* literal = entry.second; - linker_patches->push_back(LinkerPatch::MethodPatch(literal->offset(), + vixl::aarch64::Literal<uint64_t>* literal = entry.second; + linker_patches->push_back(LinkerPatch::MethodPatch(literal->GetOffset(), target_method.dex_file, target_method.dex_method_index)); } for (const auto& entry : call_patches_) { const MethodReference& target_method = entry.first; - vixl::Literal<uint64_t>* literal = entry.second; - linker_patches->push_back(LinkerPatch::CodePatch(literal->offset(), + vixl::aarch64::Literal<uint64_t>* literal = entry.second; + linker_patches->push_back(LinkerPatch::CodePatch(literal->GetOffset(), target_method.dex_file, target_method.dex_method_index)); } - for (const MethodPatchInfo<vixl::Label>& info : relative_call_patches_) { - linker_patches->push_back(LinkerPatch::RelativeCodePatch(info.label.location(), + for (const MethodPatchInfo<vixl::aarch64::Label>& info : relative_call_patches_) { + linker_patches->push_back(LinkerPatch::RelativeCodePatch(info.label.GetLocation(), info.target_method.dex_file, info.target_method.dex_method_index)); } for (const PcRelativePatchInfo& info : pc_relative_dex_cache_patches_) { - linker_patches->push_back(LinkerPatch::DexCacheArrayPatch(info.label.location(), + linker_patches->push_back(LinkerPatch::DexCacheArrayPatch(info.label.GetLocation(), &info.target_dex_file, - info.pc_insn_label->location(), + info.pc_insn_label->GetLocation(), info.offset_or_index)); } for (const auto& entry : boot_image_string_patches_) { const StringReference& target_string = entry.first; - vixl::Literal<uint32_t>* literal = entry.second; - linker_patches->push_back(LinkerPatch::StringPatch(literal->offset(), + vixl::aarch64::Literal<uint32_t>* literal = entry.second; + 
linker_patches->push_back(LinkerPatch::StringPatch(literal->GetOffset(), target_string.dex_file, target_string.string_index)); } for (const PcRelativePatchInfo& info : pc_relative_string_patches_) { - linker_patches->push_back(LinkerPatch::RelativeStringPatch(info.label.location(), + linker_patches->push_back(LinkerPatch::RelativeStringPatch(info.label.GetLocation(), &info.target_dex_file, - info.pc_insn_label->location(), + info.pc_insn_label->GetLocation(), info.offset_or_index)); } for (const auto& entry : boot_image_type_patches_) { const TypeReference& target_type = entry.first; - vixl::Literal<uint32_t>* literal = entry.second; - linker_patches->push_back(LinkerPatch::TypePatch(literal->offset(), + vixl::aarch64::Literal<uint32_t>* literal = entry.second; + linker_patches->push_back(LinkerPatch::TypePatch(literal->GetOffset(), target_type.dex_file, target_type.type_index)); } for (const PcRelativePatchInfo& info : pc_relative_type_patches_) { - linker_patches->push_back(LinkerPatch::RelativeTypePatch(info.label.location(), + linker_patches->push_back(LinkerPatch::RelativeTypePatch(info.label.GetLocation(), &info.target_dex_file, - info.pc_insn_label->location(), + info.pc_insn_label->GetLocation(), info.offset_or_index)); } for (const auto& entry : boot_image_address_patches_) { DCHECK(GetCompilerOptions().GetIncludePatchInformation()); - vixl::Literal<uint32_t>* literal = entry.second; - linker_patches->push_back(LinkerPatch::RecordPosition(literal->offset())); + vixl::aarch64::Literal<uint32_t>* literal = entry.second; + linker_patches->push_back(LinkerPatch::RecordPosition(literal->GetOffset())); } } -vixl::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateUint32Literal(uint32_t value, +vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map) { return map->GetOrCreate( value, [this, value]() { return __ CreateLiteralDestroyedWithPool<uint32_t>(value); }); } -vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateUint64Literal(uint64_t value) { +vixl::aarch64::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateUint64Literal(uint64_t value) { return uint64_literals_.GetOrCreate( value, [this, value]() { return __ CreateLiteralDestroyedWithPool<uint64_t>(value); }); } -vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodLiteral( +vixl::aarch64::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodLiteral( MethodReference target_method, MethodToLiteralMap* map) { return map->GetOrCreate( @@ -3880,12 +3909,12 @@ vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodLiteral( [this]() { return __ CreateLiteralDestroyedWithPool<uint64_t>(/* placeholder */ 0u); }); } -vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodAddressLiteral( +vixl::aarch64::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodAddressLiteral( MethodReference target_method) { return DeduplicateMethodLiteral(target_method, &method_patches_); } -vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodCodeLiteral( +vixl::aarch64::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodCodeLiteral( MethodReference target_method) { return DeduplicateMethodLiteral(target_method, &call_patches_); } @@ -3959,7 +3988,7 @@ void LocationsBuilderARM64::VisitLoadClass(HLoadClass* cls) { CodeGenerator::CreateLoadClassLocationSummary( cls, LocationFrom(calling_convention.GetRegisterAt(0)), - LocationFrom(vixl::x0), + LocationFrom(vixl::aarch64::x0), /* code_generator_supports_read_barrier */ true); return; } @@ -4011,16 +4040,17 
@@ void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) { // Add ADRP with its PC-relative type patch. const DexFile& dex_file = cls->GetDexFile(); uint32_t type_index = cls->GetTypeIndex(); - vixl::Label* adrp_label = codegen_->NewPcRelativeTypePatch(dex_file, type_index); + vixl::aarch64::Label* adrp_label = codegen_->NewPcRelativeTypePatch(dex_file, type_index); { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(adrp_label); __ adrp(out.X(), /* offset placeholder */ 0); } // Add ADD with its PC-relative type patch. - vixl::Label* add_label = codegen_->NewPcRelativeTypePatch(dex_file, type_index, adrp_label); + vixl::aarch64::Label* add_label = + codegen_->NewPcRelativeTypePatch(dex_file, type_index, adrp_label); { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(add_label); __ add(out.X(), out.X(), Operand(/* offset placeholder */ 0)); } @@ -4053,14 +4083,15 @@ void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) { // Add ADRP with its PC-relative DexCache access patch. const DexFile& dex_file = cls->GetDexFile(); uint32_t element_offset = cls->GetDexCacheElementOffset(); - vixl::Label* adrp_label = codegen_->NewPcRelativeDexCacheArrayPatch(dex_file, element_offset); + vixl::aarch64::Label* adrp_label = + codegen_->NewPcRelativeDexCacheArrayPatch(dex_file, element_offset); { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(adrp_label); __ adrp(out.X(), /* offset placeholder */ 0); } // Add LDR with its PC-relative DexCache access patch. - vixl::Label* ldr_label = + vixl::aarch64::Label* ldr_label = codegen_->NewPcRelativeDexCacheArrayPatch(dex_file, element_offset, adrp_label); // /* GcRoot<mirror::Class> */ out = *(base_address + offset) /* PC-relative */ GenerateGcRootFieldLoad(cls, out_loc, out.X(), /* offset placeholder */ 0, ldr_label); @@ -4099,7 +4130,7 @@ void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) { } static MemOperand GetExceptionTlsAddress() { - return MemOperand(tr, Thread::ExceptionOffset<kArm64WordSize>().Int32Value()); + return MemOperand(tr, Thread::ExceptionOffset<kArm64PointerSize>().Int32Value()); } void LocationsBuilderARM64::VisitLoadException(HLoadException* load) { @@ -4166,7 +4197,6 @@ void LocationsBuilderARM64::VisitLoadString(HLoadString* load) { } void InstructionCodeGeneratorARM64::VisitLoadString(HLoadString* load) { - Location out_loc = load->GetLocations()->Out(); Register out = OutputRegister(load); switch (load->GetLoadKind()) { @@ -4180,17 +4210,17 @@ void InstructionCodeGeneratorARM64::VisitLoadString(HLoadString* load) { // Add ADRP with its PC-relative String patch. const DexFile& dex_file = load->GetDexFile(); uint32_t string_index = load->GetStringIndex(); - vixl::Label* adrp_label = codegen_->NewPcRelativeStringPatch(dex_file, string_index); + vixl::aarch64::Label* adrp_label = codegen_->NewPcRelativeStringPatch(dex_file, string_index); { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(adrp_label); __ adrp(out.X(), /* offset placeholder */ 0); } // Add ADD with its PC-relative String patch. 
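The load-class paths above and the load-string path continuing below share the same two-instruction shape: ADRP forms a 4KiB page base (reachable within roughly +/-4GiB of the PC), then either ADD materializes the final boot-image address in the register or LDR loads a GC root out of the dex cache array at that address. A small arithmetic sketch of how a target address splits into the page part consumed by ADRP and the low 12 bits consumed by the trailing ADD/LDR (the encoding limits are architectural; the helper name is made up):

    // Sketch: splitting an address into ADRP page base plus a 12-bit low part.
    #include <cstdint>

    struct AdrpSplit {
      uint64_t page;   // what ADRP produces: target with the low 12 bits cleared
      uint32_t lo12;   // what the trailing ADD/LDR encodes as its immediate
    };

    AdrpSplit SplitForAdrp(uint64_t target) {
      return AdrpSplit{target & ~UINT64_C(0xFFF), static_cast<uint32_t>(target & 0xFFF)};
    }
    // Example: target 0x71234567 -> page 0x71234000, lo12 0x567.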
- vixl::Label* add_label = + vixl::aarch64::Label* add_label = codegen_->NewPcRelativeStringPatch(dex_file, string_index, adrp_label); { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(add_label); __ add(out.X(), out.X(), Operand(/* offset placeholder */ 0)); } @@ -4202,62 +4232,15 @@ void InstructionCodeGeneratorARM64::VisitLoadString(HLoadString* load) { __ Ldr(out.W(), codegen_->DeduplicateBootImageAddressLiteral(load->GetAddress())); return; // No dex cache slow path. } - case HLoadString::LoadKind::kDexCacheAddress: { - DCHECK_NE(load->GetAddress(), 0u); - // LDR immediate has a 12-bit offset multiplied by the size and for 32-bit loads - // that gives a 16KiB range. To try and reduce the number of literals if we load - // multiple strings, simply split the dex cache address to a 16KiB aligned base - // loaded from a literal and the remaining offset embedded in the load. - static_assert(sizeof(GcRoot<mirror::String>) == 4u, "Expected GC root to be 4 bytes."); - DCHECK_ALIGNED(load->GetAddress(), 4u); - constexpr size_t offset_bits = /* encoded bits */ 12 + /* scale */ 2; - uint64_t base_address = load->GetAddress() & ~MaxInt<uint64_t>(offset_bits); - uint32_t offset = load->GetAddress() & MaxInt<uint64_t>(offset_bits); - __ Ldr(out.X(), codegen_->DeduplicateDexCacheAddressLiteral(base_address)); - // /* GcRoot<mirror::String> */ out = *(base_address + offset) - GenerateGcRootFieldLoad(load, out_loc, out.X(), offset); - break; - } - case HLoadString::LoadKind::kDexCachePcRelative: { - // Add ADRP with its PC-relative DexCache access patch. - const DexFile& dex_file = load->GetDexFile(); - uint32_t element_offset = load->GetDexCacheElementOffset(); - vixl::Label* adrp_label = codegen_->NewPcRelativeDexCacheArrayPatch(dex_file, element_offset); - { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); - __ Bind(adrp_label); - __ adrp(out.X(), /* offset placeholder */ 0); - } - // Add LDR with its PC-relative DexCache access patch. - vixl::Label* ldr_label = - codegen_->NewPcRelativeDexCacheArrayPatch(dex_file, element_offset, adrp_label); - // /* GcRoot<mirror::String> */ out = *(base_address + offset) /* PC-relative */ - GenerateGcRootFieldLoad(load, out_loc, out.X(), /* offset placeholder */ 0, ldr_label); - break; - } - case HLoadString::LoadKind::kDexCacheViaMethod: { - Register current_method = InputRegisterAt(load, 0); - // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ - GenerateGcRootFieldLoad( - load, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value()); - // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_ - __ Ldr(out.X(), HeapOperand(out, mirror::Class::DexCacheStringsOffset().Uint32Value())); - // /* GcRoot<mirror::String> */ out = out[string_index] - GenerateGcRootFieldLoad( - load, out_loc, out.X(), CodeGenerator::GetCacheOffset(load->GetStringIndex())); - break; - } default: - LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind(); - UNREACHABLE(); + break; } - if (!load->IsInDexCache()) { - SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM64(load); - codegen_->AddSlowPath(slow_path); - __ Cbz(out, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); - } + // TODO: Re-add the compiler code to do string dex cache lookup again. 
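For reference, the deleted kDexCacheAddress path above leaned on the LDR (immediate, unsigned offset) form: 12 encoded bits scaled by the 4-byte access size give a 16KiB range, so an absolute dex cache address was split into a 16KiB-aligned base (one deduplicated literal, shareable between loads) and an offset folded into the load. A worked version of that split, simplified from the removed code:

    // Sketch of the removed base/offset split for 32-bit GC-root loads.
    #include <cstdint>

    constexpr unsigned kOffsetBits = 12 + 2;  // 12 encoded bits, scaled by a 4-byte element

    struct DexCacheAddressSplit {
      uint64_t base;    // 16KiB-aligned; loaded from a literal
      uint32_t offset;  // low 14 bits; folded into the LDR immediate
    };

    DexCacheAddressSplit Split(uint64_t address) {
      const uint64_t mask = (UINT64_C(1) << kOffsetBits) - 1;  // 0x3FFF
      return DexCacheAddressSplit{address & ~mask, static_cast<uint32_t>(address & mask)};
    }
    // Example: address 0x12346008 -> base 0x12344000, offset 0x2008.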
+ SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM64(load); + codegen_->AddSlowPath(slow_path); + __ B(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); } void LocationsBuilderARM64::VisitLongConstant(HLongConstant* constant) { @@ -4271,7 +4254,7 @@ void InstructionCodeGeneratorARM64::VisitLongConstant(HLongConstant* constant AT void LocationsBuilderARM64::VisitMonitorOperation(HMonitorOperation* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); } @@ -4369,7 +4352,7 @@ void InstructionCodeGeneratorARM64::VisitNeg(HNeg* neg) { void LocationsBuilderARM64::VisitNewArray(HNewArray* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(0))); locations->SetOut(LocationFrom(x0)); @@ -4394,7 +4377,7 @@ void InstructionCodeGeneratorARM64::VisitNewArray(HNewArray* instruction) { void LocationsBuilderARM64::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; if (instruction->IsStringAlloc()) { locations->AddTemp(LocationFrom(kArtMethodRegister)); @@ -4411,7 +4394,7 @@ void InstructionCodeGeneratorARM64::VisitNewInstance(HNewInstance* instruction) if (instruction->IsStringAlloc()) { // String is allocated through StringFactory. Call NewEmptyString entry point. Location temp = instruction->GetLocations()->GetTemp(0); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize); + MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize); __ Ldr(XRegisterFrom(temp), MemOperand(tr, QUICK_ENTRY_POINT(pNewEmptyString))); __ Ldr(lr, MemOperand(XRegisterFrom(temp), code_offset.Int32Value())); __ Blr(lr); @@ -4450,7 +4433,7 @@ void LocationsBuilderARM64::VisitBooleanNot(HBooleanNot* instruction) { } void InstructionCodeGeneratorARM64::VisitBooleanNot(HBooleanNot* instruction) { - __ Eor(OutputRegister(instruction), InputRegisterAt(instruction, 0), vixl::Operand(1)); + __ Eor(OutputRegister(instruction), InputRegisterAt(instruction, 0), vixl::aarch64::Operand(1)); } void LocationsBuilderARM64::VisitNullCheck(HNullCheck* instruction) { @@ -4547,7 +4530,8 @@ void InstructionCodeGeneratorARM64::VisitPhi(HPhi* instruction ATTRIBUTE_UNUSED) void LocationsBuilderARM64::VisitRem(HRem* rem) { Primitive::Type type = rem->GetResultType(); LocationSummary::CallKind call_kind = - Primitive::IsFloatingPointType(type) ? LocationSummary::kCall : LocationSummary::kNoCall; + Primitive::IsFloatingPointType(type) ? 
LocationSummary::kCallOnMainOnly + : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(rem, call_kind); switch (type) { @@ -4764,7 +4748,7 @@ void InstructionCodeGeneratorARM64::VisitSuspendCheck(HSuspendCheck* instruction void LocationsBuilderARM64::VisitThrow(HThrow* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); } @@ -4882,7 +4866,7 @@ void InstructionCodeGeneratorARM64::VisitPackedSwitch(HPackedSwitch* switch_inst HBasicBlock* default_block = switch_instr->GetDefaultBlock(); // Roughly set 16 as max average assemblies generated per HIR in a graph. - static constexpr int32_t kMaxExpectedSizePerHInstruction = 16 * vixl::kInstructionSize; + static constexpr int32_t kMaxExpectedSizePerHInstruction = 16 * kInstructionSize; // ADR has a limited range(+/-1MB), so we set a threshold for the number of HIRs in the graph to // make sure we don't emit it if the target may run out of range. // TODO: Instead of emitting all jump tables at the end of the code, we could keep track of ADR @@ -5027,9 +5011,9 @@ void InstructionCodeGeneratorARM64::GenerateReferenceLoadTwoRegisters(HInstructi void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instruction, Location root, - vixl::Register obj, + Register obj, uint32_t offset, - vixl::Label* fixup_label) { + vixl::aarch64::Label* fixup_label) { Register root_reg = RegisterFrom(root, Primitive::kPrimNot); if (kEmitCompilerReadBarrier) { if (kUseBakerReadBarrier) { @@ -5045,7 +5029,7 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instru if (fixup_label == nullptr) { __ Ldr(root_reg, MemOperand(obj, offset)); } else { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(fixup_label); __ ldr(root_reg, MemOperand(obj, offset)); } @@ -5059,14 +5043,14 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instru // Slow path used to mark the GC root `root`. 
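The root-marking code that follows only takes the slow path when the thread-local is-GC-marking flag is set: the root is loaded on the fast path, and a single conditional branch on Thread::Current()->GetIsGcMarking() decides whether ReadBarrier::Mark must run. A stand-in sketch of that shape (the types, flag, and slow-path function here are simplified placeholders, not the runtime's API):

    // Sketch: fast-path GC root load guarded by the per-thread marking flag.
    #include <atomic>

    struct ThreadStub { std::atomic<unsigned> is_gc_marking{0}; };

    void* MarkGcRootSlowPath(void* root) { return root; }  // placeholder; real code calls the runtime

    void* LoadGcRoot(ThreadStub* self, void** root_slot) {
      void* root = *root_slot;                                          // ldr root_reg, [obj, #offset]
      if (self->is_gc_marking.load(std::memory_order_relaxed) != 0u) {  // cbnz temp, slow_path
        root = MarkGcRootSlowPath(root);                                // may return the to-space reference
      }
      return root;
    }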
SlowPathCodeARM64* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, root, root); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, root); codegen_->AddSlowPath(slow_path); MacroAssembler* masm = GetVIXLAssembler(); UseScratchRegisterScope temps(masm); Register temp = temps.AcquireW(); // temp = Thread::Current()->GetIsGcMarking() - __ Ldr(temp, MemOperand(tr, Thread::IsGcMarkingOffset<kArm64WordSize>().Int32Value())); + __ Ldr(temp, MemOperand(tr, Thread::IsGcMarkingOffset<kArm64PointerSize>().Int32Value())); __ Cbnz(temp, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } else { @@ -5076,7 +5060,7 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instru if (fixup_label == nullptr) { __ Add(root_reg.X(), obj.X(), offset); } else { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(fixup_label); __ add(root_reg.X(), obj.X(), offset); } @@ -5089,7 +5073,7 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instru if (fixup_label == nullptr) { __ Ldr(root_reg, MemOperand(obj, offset)); } else { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(fixup_label); __ ldr(root_reg, MemOperand(obj, offset)); } @@ -5100,7 +5084,7 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instru void CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, - vixl::Register obj, + Register obj, uint32_t offset, Register temp, bool needs_null_check, @@ -5124,7 +5108,7 @@ void CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier(HInstruction* ins void CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, - vixl::Register obj, + Register obj, uint32_t data_offset, Location index, Register temp, @@ -5155,7 +5139,7 @@ void CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier(HInstruction* ins void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, - vixl::Register obj, + Register obj, uint32_t offset, Location index, size_t scale_factor, @@ -5204,23 +5188,13 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* // /* LockWord */ lock_word = LockWord(monitor) static_assert(sizeof(LockWord) == sizeof(int32_t), "art::LockWord and int32_t have different sizes."); - // /* uint32_t */ rb_state = lock_word.ReadBarrierState() - __ Lsr(temp, temp, LockWord::kReadBarrierStateShift); - __ And(temp, temp, Operand(LockWord::kReadBarrierStateMask)); - static_assert( - LockWord::kReadBarrierStateMask == ReadBarrier::rb_ptr_mask_, - "art::LockWord::kReadBarrierStateMask is not equal to art::ReadBarrier::rb_ptr_mask_."); - // Introduce a dependency on the high bits of rb_state, which shall - // be all zeroes, to prevent load-load reordering, and without using + // Introduce a dependency on the lock_word including rb_state, + // to prevent load-load reordering, and without using // a memory barrier (which would be more expensive). - // temp2 = rb_state & ~LockWord::kReadBarrierStateMask = 0 - Register temp2 = temps.AcquireW(); - __ Bic(temp2, temp, Operand(LockWord::kReadBarrierStateMask)); - // obj is unchanged by this operation, but its value now depends on - // temp2, which depends on temp. 
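The replacement emitted as this hunk continues below does the same job with a single temp: the lock word is loaded zero-extended into a W register, so its upper 32 bits viewed as an X register are zero, and adding lock_word >> 32 to obj leaves the address unchanged while making later loads from obj data-dependent on the lock-word load. The gray check then collapses to one TBNZ, since the static_asserts below pin white/gray to 0/1 and only the low read-barrier-state bit matters. A scalar sketch of the two tricks (the bit position is a stand-in, not necessarily the real LockWord layout):

    // Sketch: address dependency via a known-zero addend, plus a one-bit gray test.
    #include <cstdint>

    constexpr unsigned kRbStateShiftSketch = 28;  // stand-in bit position

    struct SketchResult { uintptr_t dependent_obj; bool is_gray; };

    SketchResult CheckLockWord(uintptr_t obj, uint64_t lock_word /* 32-bit value, zero-extended */) {
      // High half of the zero-extended lock word is 0, so this add is a no-op that still
      // orders the lock-word load before later loads from obj, without a memory barrier.
      uintptr_t dependent_obj = obj + static_cast<uintptr_t>(lock_word >> 32);
      // With white == 0 and gray == 1, testing a single bit is enough.
      bool is_gray = ((lock_word >> kRbStateShiftSketch) & 1u) != 0u;
      return SketchResult{dependent_obj, is_gray};
    }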
- __ Add(obj, obj, Operand(temp2)); - temps.Release(temp2); + // `obj` is unchanged by this operation, but its value now depends + // on `temp`. + __ Add(obj.X(), obj.X(), Operand(temp.X(), LSR, 32)); // The actual reference load. if (index.IsValid()) { @@ -5246,7 +5220,7 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* uint32_t computed_offset = offset + (Int64ConstantFrom(index) << scale_factor); Load(type, ref_reg, HeapOperand(obj, computed_offset)); } else { - temp2 = temps.AcquireW(); + Register temp2 = temps.AcquireW(); __ Add(temp2, obj, offset); Load(type, ref_reg, HeapOperand(temp2, XRegisterFrom(index), LSL, scale_factor)); temps.Release(temp2); @@ -5267,13 +5241,16 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* // Slow path used to mark the object `ref` when it is gray. SlowPathCodeARM64* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, ref, ref); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, ref); AddSlowPath(slow_path); // if (rb_state == ReadBarrier::gray_ptr_) // ref = ReadBarrier::Mark(ref); - __ Cmp(temp, ReadBarrier::gray_ptr_); - __ B(eq, slow_path->GetEntryLabel()); + // Given the numeric representation, it's enough to check the low bit of the rb_state. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ Tbnz(temp, LockWord::kReadBarrierStateShift, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } @@ -5348,16 +5325,19 @@ void LocationsBuilderARM64::VisitClassTableGet(HClassTableGet* instruction) { void InstructionCodeGeneratorARM64::VisitClassTableGet(HClassTableGet* instruction) { LocationSummary* locations = instruction->GetLocations(); - uint32_t method_offset = 0; if (instruction->GetTableKind() == HClassTableGet::TableKind::kVTable) { - method_offset = mirror::Class::EmbeddedVTableEntryOffset( + uint32_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( instruction->GetIndex(), kArm64PointerSize).SizeValue(); + __ Ldr(XRegisterFrom(locations->Out()), + MemOperand(XRegisterFrom(locations->InAt(0)), method_offset)); } else { - method_offset = mirror::Class::EmbeddedImTableEntryOffset( - instruction->GetIndex() % mirror::Class::kImtSize, kArm64PointerSize).Uint32Value(); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + instruction->GetIndex(), kArm64PointerSize)); + __ Ldr(XRegisterFrom(locations->Out()), MemOperand(XRegisterFrom(locations->InAt(0)), + mirror::Class::ImtPtrOffset(kArm64PointerSize).Uint32Value())); + __ Ldr(XRegisterFrom(locations->Out()), + MemOperand(XRegisterFrom(locations->Out()), method_offset)); } - __ Ldr(XRegisterFrom(locations->Out()), - MemOperand(XRegisterFrom(locations->InAt(0)), method_offset)); } diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index d4bf695602..921ce10aaa 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -20,15 +20,19 @@ #include "arch/arm64/quick_method_frame_info_arm64.h" #include "code_generator.h" #include "common_arm64.h" -#include "dex/compiler_enums.h" #include "driver/compiler_options.h" #include "nodes.h" #include "parallel_move_resolver.h" +#include "string_reference.h" #include "utils/arm64/assembler_arm64.h" -#include 
"utils/string_reference.h" #include "utils/type_reference.h" -#include "vixl/a64/disasm-a64.h" -#include "vixl/a64/macro-assembler-a64.h" + +// TODO(VIXL): Make VIXL compile with -Wshadow. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" +#include "aarch64/disasm-aarch64.h" +#include "aarch64/macro-assembler-aarch64.h" +#pragma GCC diagnostic pop namespace art { namespace arm64 { @@ -36,34 +40,49 @@ namespace arm64 { class CodeGeneratorARM64; // Use a local definition to prevent copying mistakes. -static constexpr size_t kArm64WordSize = kArm64PointerSize; - -static const vixl::Register kParameterCoreRegisters[] = { - vixl::x1, vixl::x2, vixl::x3, vixl::x4, vixl::x5, vixl::x6, vixl::x7 +static constexpr size_t kArm64WordSize = static_cast<size_t>(kArm64PointerSize); + +static const vixl::aarch64::Register kParameterCoreRegisters[] = { + vixl::aarch64::x1, + vixl::aarch64::x2, + vixl::aarch64::x3, + vixl::aarch64::x4, + vixl::aarch64::x5, + vixl::aarch64::x6, + vixl::aarch64::x7 }; static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters); -static const vixl::FPRegister kParameterFPRegisters[] = { - vixl::d0, vixl::d1, vixl::d2, vixl::d3, vixl::d4, vixl::d5, vixl::d6, vixl::d7 +static const vixl::aarch64::FPRegister kParameterFPRegisters[] = { + vixl::aarch64::d0, + vixl::aarch64::d1, + vixl::aarch64::d2, + vixl::aarch64::d3, + vixl::aarch64::d4, + vixl::aarch64::d5, + vixl::aarch64::d6, + vixl::aarch64::d7 }; static constexpr size_t kParameterFPRegistersLength = arraysize(kParameterFPRegisters); -const vixl::Register tr = vixl::x19; // Thread Register -static const vixl::Register kArtMethodRegister = vixl::x0; // Method register on invoke. - -const vixl::CPURegList vixl_reserved_core_registers(vixl::ip0, vixl::ip1); -const vixl::CPURegList vixl_reserved_fp_registers(vixl::d31); +// Thread Register +const vixl::aarch64::Register tr = vixl::aarch64::x19; +// Method register on invoke. 
+static const vixl::aarch64::Register kArtMethodRegister = vixl::aarch64::x0; +const vixl::aarch64::CPURegList vixl_reserved_core_registers(vixl::aarch64::ip0, + vixl::aarch64::ip1); +const vixl::aarch64::CPURegList vixl_reserved_fp_registers(vixl::aarch64::d31); -const vixl::CPURegList runtime_reserved_core_registers(tr, vixl::lr); +const vixl::aarch64::CPURegList runtime_reserved_core_registers(tr, vixl::aarch64::lr); // Callee-saved registers AAPCS64 (without x19 - Thread Register) -const vixl::CPURegList callee_saved_core_registers(vixl::CPURegister::kRegister, - vixl::kXRegSize, - vixl::x20.code(), - vixl::x30.code()); -const vixl::CPURegList callee_saved_fp_registers(vixl::CPURegister::kFPRegister, - vixl::kDRegSize, - vixl::d8.code(), - vixl::d15.code()); +const vixl::aarch64::CPURegList callee_saved_core_registers(vixl::aarch64::CPURegister::kRegister, + vixl::aarch64::kXRegSize, + vixl::aarch64::x20.GetCode(), + vixl::aarch64::x30.GetCode()); +const vixl::aarch64::CPURegList callee_saved_fp_registers(vixl::aarch64::CPURegister::kFPRegister, + vixl::aarch64::kDRegSize, + vixl::aarch64::d8.GetCode(), + vixl::aarch64::d15.GetCode()); Location ARM64ReturnLocation(Primitive::Type return_type); class SlowPathCodeARM64 : public SlowPathCode { @@ -71,15 +90,15 @@ class SlowPathCodeARM64 : public SlowPathCode { explicit SlowPathCodeARM64(HInstruction* instruction) : SlowPathCode(instruction), entry_label_(), exit_label_() {} - vixl::Label* GetEntryLabel() { return &entry_label_; } - vixl::Label* GetExitLabel() { return &exit_label_; } + vixl::aarch64::Label* GetEntryLabel() { return &entry_label_; } + vixl::aarch64::Label* GetExitLabel() { return &exit_label_; } void SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) OVERRIDE; void RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) OVERRIDE; private: - vixl::Label entry_label_; - vixl::Label exit_label_; + vixl::aarch64::Label entry_label_; + vixl::aarch64::Label exit_label_; DISALLOW_COPY_AND_ASSIGN(SlowPathCodeARM64); }; @@ -89,27 +108,42 @@ class JumpTableARM64 : public DeletableArenaObject<kArenaAllocSwitchTable> { explicit JumpTableARM64(HPackedSwitch* switch_instr) : switch_instr_(switch_instr), table_start_() {} - vixl::Label* GetTableStartLabel() { return &table_start_; } + vixl::aarch64::Label* GetTableStartLabel() { return &table_start_; } void EmitTable(CodeGeneratorARM64* codegen); private: HPackedSwitch* const switch_instr_; - vixl::Label table_start_; + vixl::aarch64::Label table_start_; DISALLOW_COPY_AND_ASSIGN(JumpTableARM64); }; -static const vixl::Register kRuntimeParameterCoreRegisters[] = - { vixl::x0, vixl::x1, vixl::x2, vixl::x3, vixl::x4, vixl::x5, vixl::x6, vixl::x7 }; +static const vixl::aarch64::Register kRuntimeParameterCoreRegisters[] = + { vixl::aarch64::x0, + vixl::aarch64::x1, + vixl::aarch64::x2, + vixl::aarch64::x3, + vixl::aarch64::x4, + vixl::aarch64::x5, + vixl::aarch64::x6, + vixl::aarch64::x7 }; static constexpr size_t kRuntimeParameterCoreRegistersLength = arraysize(kRuntimeParameterCoreRegisters); -static const vixl::FPRegister kRuntimeParameterFpuRegisters[] = - { vixl::d0, vixl::d1, vixl::d2, vixl::d3, vixl::d4, vixl::d5, vixl::d6, vixl::d7 }; +static const vixl::aarch64::FPRegister kRuntimeParameterFpuRegisters[] = + { vixl::aarch64::d0, + vixl::aarch64::d1, + vixl::aarch64::d2, + vixl::aarch64::d3, + vixl::aarch64::d4, + vixl::aarch64::d5, + vixl::aarch64::d6, + vixl::aarch64::d7 }; static constexpr size_t kRuntimeParameterFpuRegistersLength = 
arraysize(kRuntimeParameterCoreRegisters); -class InvokeRuntimeCallingConvention : public CallingConvention<vixl::Register, vixl::FPRegister> { +class InvokeRuntimeCallingConvention : public CallingConvention<vixl::aarch64::Register, + vixl::aarch64::FPRegister> { public: static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters); @@ -126,7 +160,8 @@ class InvokeRuntimeCallingConvention : public CallingConvention<vixl::Register, DISALLOW_COPY_AND_ASSIGN(InvokeRuntimeCallingConvention); }; -class InvokeDexCallingConvention : public CallingConvention<vixl::Register, vixl::FPRegister> { +class InvokeDexCallingConvention : public CallingConvention<vixl::aarch64::Register, + vixl::aarch64::FPRegister> { public: InvokeDexCallingConvention() : CallingConvention(kParameterCoreRegisters, @@ -166,23 +201,23 @@ class FieldAccessCallingConventionARM64 : public FieldAccessCallingConvention { FieldAccessCallingConventionARM64() {} Location GetObjectLocation() const OVERRIDE { - return helpers::LocationFrom(vixl::x1); + return helpers::LocationFrom(vixl::aarch64::x1); } Location GetFieldIndexLocation() const OVERRIDE { - return helpers::LocationFrom(vixl::x0); + return helpers::LocationFrom(vixl::aarch64::x0); } Location GetReturnLocation(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE { - return helpers::LocationFrom(vixl::x0); + return helpers::LocationFrom(vixl::aarch64::x0); } Location GetSetValueLocation(Primitive::Type type, bool is_instance) const OVERRIDE { return Primitive::Is64BitType(type) - ? helpers::LocationFrom(vixl::x2) + ? helpers::LocationFrom(vixl::aarch64::x2) : (is_instance - ? helpers::LocationFrom(vixl::x2) - : helpers::LocationFrom(vixl::x1)); + ? helpers::LocationFrom(vixl::aarch64::x2) + : helpers::LocationFrom(vixl::aarch64::x1)); } Location GetFpuLocation(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE { - return helpers::LocationFrom(vixl::d0); + return helpers::LocationFrom(vixl::aarch64::d0); } private: @@ -208,10 +243,11 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { } Arm64Assembler* GetAssembler() const { return assembler_; } - vixl::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->vixl_masm_; } + vixl::aarch64::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->GetVIXLAssembler(); } private: - void GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path, vixl::Register class_reg); + void GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path, + vixl::aarch64::Register class_reg); void GenerateSuspendCheck(HSuspendCheck* instruction, HBasicBlock* successor); void HandleBinaryOp(HBinaryOperation* instr); @@ -256,9 +292,9 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { // while honoring read barriers (if any). void GenerateGcRootFieldLoad(HInstruction* instruction, Location root, - vixl::Register obj, + vixl::aarch64::Register obj, uint32_t offset, - vixl::Label* fixup_label = nullptr); + vixl::aarch64::Label* fixup_label = nullptr); // Generate a floating-point comparison. 
void GenerateFcmp(HInstruction* instruction); @@ -266,8 +302,8 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { void HandleShift(HBinaryOperation* instr); void GenerateTestAndBranch(HInstruction* instruction, size_t condition_input_index, - vixl::Label* true_target, - vixl::Label* false_target); + vixl::aarch64::Label* true_target, + vixl::aarch64::Label* false_target); void DivRemOneOrMinusOne(HBinaryOperation* instruction); void DivRemByPowerOfTwo(HBinaryOperation* instruction); void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction); @@ -327,12 +363,12 @@ class ParallelMoveResolverARM64 : public ParallelMoveResolverNoSwap { private: Arm64Assembler* GetAssembler() const; - vixl::MacroAssembler* GetVIXLAssembler() const { - return GetAssembler()->vixl_masm_; + vixl::aarch64::MacroAssembler* GetVIXLAssembler() const { + return GetAssembler()->GetVIXLAssembler(); } CodeGeneratorARM64* const codegen_; - vixl::UseScratchRegisterScope vixl_temps_; + vixl::aarch64::UseScratchRegisterScope vixl_temps_; DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolverARM64); }; @@ -348,12 +384,12 @@ class CodeGeneratorARM64 : public CodeGenerator { void GenerateFrameEntry() OVERRIDE; void GenerateFrameExit() OVERRIDE; - vixl::CPURegList GetFramePreservedCoreRegisters() const; - vixl::CPURegList GetFramePreservedFPRegisters() const; + vixl::aarch64::CPURegList GetFramePreservedCoreRegisters() const; + vixl::aarch64::CPURegList GetFramePreservedFPRegisters() const; void Bind(HBasicBlock* block) OVERRIDE; - vixl::Label* GetLabelOf(HBasicBlock* block) { + vixl::aarch64::Label* GetLabelOf(HBasicBlock* block) { block = FirstNonEmptyBlock(block); return &(block_labels_[block->GetBlockId()]); } @@ -368,19 +404,21 @@ class CodeGeneratorARM64 : public CodeGenerator { } uintptr_t GetAddressOf(HBasicBlock* block) OVERRIDE { - vixl::Label* block_entry_label = GetLabelOf(block); + vixl::aarch64::Label* block_entry_label = GetLabelOf(block); DCHECK(block_entry_label->IsBound()); - return block_entry_label->location(); + return block_entry_label->GetLocation(); } HGraphVisitor* GetLocationBuilder() OVERRIDE { return &location_builder_; } HGraphVisitor* GetInstructionVisitor() OVERRIDE { return &instruction_visitor_; } Arm64Assembler* GetAssembler() OVERRIDE { return &assembler_; } const Arm64Assembler& GetAssembler() const OVERRIDE { return assembler_; } - vixl::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->vixl_masm_; } + vixl::aarch64::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->GetVIXLAssembler(); } // Emit a write barrier. - void MarkGCCard(vixl::Register object, vixl::Register value, bool value_can_be_null); + void MarkGCCard(vixl::aarch64::Register object, + vixl::aarch64::Register value, + bool value_can_be_null); void GenerateMemoryBarrier(MemBarrierKind kind); @@ -399,8 +437,8 @@ class CodeGeneratorARM64 : public CodeGenerator { // (xzr, wzr), or make for poor allocatable registers (sp alignment // requirements, etc.). This also facilitates our task as all other registers // can easily be mapped via to or from their type and index or code. 
- static const int kNumberOfAllocatableRegisters = vixl::kNumberOfRegisters - 1; - static const int kNumberOfAllocatableFPRegisters = vixl::kNumberOfFPRegisters; + static const int kNumberOfAllocatableRegisters = vixl::aarch64::kNumberOfRegisters - 1; + static const int kNumberOfAllocatableFPRegisters = vixl::aarch64::kNumberOfFPRegisters; static constexpr int kNumberOfAllocatableRegisterPairs = 0; void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE; @@ -418,6 +456,10 @@ class CodeGeneratorARM64 : public CodeGenerator { block_labels_.resize(GetGraph()->GetBlocks().size()); } + // We want to use the STP and LDP instructions to spill and restore registers for slow paths. + // These instructions can only encode offsets that are multiples of the register size accessed. + uint32_t GetPreferredSlotsAlignment() const OVERRIDE { return vixl::aarch64::kXRegSizeInBytes; } + JumpTableARM64* CreateJumpTable(HPackedSwitch* switch_instr) { jump_tables_.emplace_back(new (GetGraph()->GetArena()) JumpTableARM64(switch_instr)); return jump_tables_.back().get(); @@ -426,18 +468,24 @@ class CodeGeneratorARM64 : public CodeGenerator { void Finalize(CodeAllocator* allocator) OVERRIDE; // Code generation helpers. - void MoveConstant(vixl::CPURegister destination, HConstant* constant); + void MoveConstant(vixl::aarch64::CPURegister destination, HConstant* constant); void MoveConstant(Location destination, int32_t value) OVERRIDE; void MoveLocation(Location dst, Location src, Primitive::Type dst_type) OVERRIDE; void AddLocationAsTemp(Location location, LocationSummary* locations) OVERRIDE; - void Load(Primitive::Type type, vixl::CPURegister dst, const vixl::MemOperand& src); - void Store(Primitive::Type type, vixl::CPURegister src, const vixl::MemOperand& dst); + void Load(Primitive::Type type, + vixl::aarch64::CPURegister dst, + const vixl::aarch64::MemOperand& src); + void Store(Primitive::Type type, + vixl::aarch64::CPURegister src, + const vixl::aarch64::MemOperand& dst); void LoadAcquire(HInstruction* instruction, - vixl::CPURegister dst, - const vixl::MemOperand& src, + vixl::aarch64::CPURegister dst, + const vixl::aarch64::MemOperand& src, bool needs_null_check); - void StoreRelease(Primitive::Type type, vixl::CPURegister src, const vixl::MemOperand& dst); + void StoreRelease(Primitive::Type type, + vixl::aarch64::CPURegister src, + const vixl::aarch64::MemOperand& dst); // Generate code to invoke a runtime entry point. void InvokeRuntime(QuickEntrypointEnum entrypoint, @@ -450,6 +498,12 @@ class CodeGeneratorARM64 : public CodeGenerator { uint32_t dex_pc, SlowPathCode* slow_path); + // Generate code to invoke a runtime entry point, but do not record + // PC-related information in a stack map. + void InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path); + ParallelMoveResolverARM64* GetMoveResolver() OVERRIDE { return &move_resolver_; } bool NeedsTwoRegisters(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE { @@ -484,32 +538,33 @@ class CodeGeneratorARM64 : public CodeGenerator { // to be bound before the instruction. The instruction will be either the // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing // to the associated ADRP patch label). 
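The GetPreferredSlotsAlignment override above asks for 8-byte-aligned spill slots because the LDP/STP immediate is a signed 7-bit value scaled by the access size: only offsets that are multiples of the register size (and within the imm7 range) are encodable, so unaligned slots would force the slow single-register save/restore path. A quick check of which offsets a 64-bit STP can reach (the encoding limits are architectural facts; the helper is illustrative):

    // Sketch: encodable offsets for STP/LDP of two X registers (signed 7-bit, scaled by 8).
    #include <cstdint>

    bool IsEncodableStpOffsetX(int64_t byte_offset) {
      if (byte_offset % 8 != 0) return false;   // must be a multiple of the register size
      int64_t scaled = byte_offset / 8;
      return scaled >= -64 && scaled <= 63;     // imm7 range: -512..504 bytes
    }
    // e.g. IsEncodableStpOffsetX(16) == true, IsEncodableStpOffsetX(12) == false.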
- vixl::Label* NewPcRelativeStringPatch(const DexFile& dex_file, - uint32_t string_index, - vixl::Label* adrp_label = nullptr); + vixl::aarch64::Label* NewPcRelativeStringPatch(const DexFile& dex_file, + uint32_t string_index, + vixl::aarch64::Label* adrp_label = nullptr); // Add a new PC-relative type patch for an instruction and return the label // to be bound before the instruction. The instruction will be either the // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing // to the associated ADRP patch label). - vixl::Label* NewPcRelativeTypePatch(const DexFile& dex_file, - uint32_t type_index, - vixl::Label* adrp_label = nullptr); + vixl::aarch64::Label* NewPcRelativeTypePatch(const DexFile& dex_file, + uint32_t type_index, + vixl::aarch64::Label* adrp_label = nullptr); // Add a new PC-relative dex cache array patch for an instruction and return // the label to be bound before the instruction. The instruction will be // either the ADRP (pass `adrp_label = null`) or the LDR (pass `adrp_label` // pointing to the associated ADRP patch label). - vixl::Label* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, - uint32_t element_offset, - vixl::Label* adrp_label = nullptr); - - vixl::Literal<uint32_t>* DeduplicateBootImageStringLiteral(const DexFile& dex_file, - uint32_t string_index); - vixl::Literal<uint32_t>* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, - uint32_t type_index); - vixl::Literal<uint32_t>* DeduplicateBootImageAddressLiteral(uint64_t address); - vixl::Literal<uint64_t>* DeduplicateDexCacheAddressLiteral(uint64_t address); + vixl::aarch64::Label* NewPcRelativeDexCacheArrayPatch( + const DexFile& dex_file, + uint32_t element_offset, + vixl::aarch64::Label* adrp_label = nullptr); + + vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageStringLiteral(const DexFile& dex_file, + uint32_t string_index); + vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, + uint32_t type_index); + vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageAddressLiteral(uint64_t address); + vixl::aarch64::Literal<uint64_t>* DeduplicateDexCacheAddressLiteral(uint64_t address); void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE; @@ -517,29 +572,29 @@ class CodeGeneratorARM64 : public CodeGenerator { // reference field load when Baker's read barriers are used. void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, - vixl::Register obj, + vixl::aarch64::Register obj, uint32_t offset, - vixl::Register temp, + vixl::aarch64::Register temp, bool needs_null_check, bool use_load_acquire); // Fast path implementation of ReadBarrier::Barrier for a heap // reference array load when Baker's read barriers are used. void GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, - vixl::Register obj, + vixl::aarch64::Register obj, uint32_t data_offset, Location index, - vixl::Register temp, + vixl::aarch64::Register temp, bool needs_null_check); // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier // and GenerateArrayLoadWithBakerReadBarrier. 
void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, - vixl::Register obj, + vixl::aarch64::Register obj, uint32_t offset, Location index, size_t scale_factor, - vixl::Register temp, + vixl::aarch64::Register temp, bool needs_null_check, bool use_load_acquire); @@ -597,24 +652,25 @@ class CodeGeneratorARM64 : public CodeGenerator { void GenerateExplicitNullCheck(HNullCheck* instruction); private: - using Uint64ToLiteralMap = ArenaSafeMap<uint64_t, vixl::Literal<uint64_t>*>; - using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, vixl::Literal<uint32_t>*>; + using Uint64ToLiteralMap = ArenaSafeMap<uint64_t, vixl::aarch64::Literal<uint64_t>*>; + using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, vixl::aarch64::Literal<uint32_t>*>; using MethodToLiteralMap = ArenaSafeMap<MethodReference, - vixl::Literal<uint64_t>*, + vixl::aarch64::Literal<uint64_t>*, MethodReferenceComparator>; using BootStringToLiteralMap = ArenaSafeMap<StringReference, - vixl::Literal<uint32_t>*, + vixl::aarch64::Literal<uint32_t>*, StringReferenceValueComparator>; using BootTypeToLiteralMap = ArenaSafeMap<TypeReference, - vixl::Literal<uint32_t>*, + vixl::aarch64::Literal<uint32_t>*, TypeReferenceValueComparator>; - vixl::Literal<uint32_t>* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map); - vixl::Literal<uint64_t>* DeduplicateUint64Literal(uint64_t value); - vixl::Literal<uint64_t>* DeduplicateMethodLiteral(MethodReference target_method, - MethodToLiteralMap* map); - vixl::Literal<uint64_t>* DeduplicateMethodAddressLiteral(MethodReference target_method); - vixl::Literal<uint64_t>* DeduplicateMethodCodeLiteral(MethodReference target_method); + vixl::aarch64::Literal<uint32_t>* DeduplicateUint32Literal(uint32_t value, + Uint32ToLiteralMap* map); + vixl::aarch64::Literal<uint64_t>* DeduplicateUint64Literal(uint64_t value); + vixl::aarch64::Literal<uint64_t>* DeduplicateMethodLiteral(MethodReference target_method, + MethodToLiteralMap* map); + vixl::aarch64::Literal<uint64_t>* DeduplicateMethodAddressLiteral(MethodReference target_method); + vixl::aarch64::Literal<uint64_t>* DeduplicateMethodCodeLiteral(MethodReference target_method); // The PcRelativePatchInfo is used for PC-relative addressing of dex cache arrays // and boot image strings/types. The only difference is the interpretation of the @@ -626,21 +682,21 @@ class CodeGeneratorARM64 : public CodeGenerator { const DexFile& target_dex_file; // Either the dex cache array element offset or the string/type index. uint32_t offset_or_index; - vixl::Label label; - vixl::Label* pc_insn_label; + vixl::aarch64::Label label; + vixl::aarch64::Label* pc_insn_label; }; - vixl::Label* NewPcRelativePatch(const DexFile& dex_file, - uint32_t offset_or_index, - vixl::Label* adrp_label, - ArenaDeque<PcRelativePatchInfo>* patches); + vixl::aarch64::Label* NewPcRelativePatch(const DexFile& dex_file, + uint32_t offset_or_index, + vixl::aarch64::Label* adrp_label, + ArenaDeque<PcRelativePatchInfo>* patches); void EmitJumpTables(); // Labels for each block that will be compiled. - // We use a deque so that the `vixl::Label` objects do not move in memory. - ArenaDeque<vixl::Label> block_labels_; // Indexed by block id. - vixl::Label frame_entry_label_; + // We use a deque so that the `vixl::aarch64::Label` objects do not move in memory. + ArenaDeque<vixl::aarch64::Label> block_labels_; // Indexed by block id. 
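The block_labels_ comment above is about pointer stability: labels (and the patch-info entries) are referenced by address once handed to the assembler, so the container must never relocate its elements. A deque grows in chunks and keeps existing elements in place, whereas a vector may reallocate its whole buffer. A tiny illustration using the standard containers (standard-library behavior, not ART code):

    // Sketch: why a deque is safe to hand out element pointers from while it grows.
    #include <cassert>
    #include <deque>
    #include <vector>

    int main() {
      std::deque<int> labels;
      labels.push_back(1);
      int* stable = &labels.front();
      for (int i = 0; i < 10000; ++i) labels.push_back(i);  // existing elements never move
      assert(stable == &labels.front());                    // pointer still valid

      std::vector<int> vec;
      vec.push_back(1);
      // A pointer to vec.front() taken here could dangle once later push_backs reallocate.
      return 0;
    }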
+ vixl::aarch64::Label frame_entry_label_; ArenaVector<std::unique_ptr<JumpTableARM64>> jump_tables_; LocationsBuilderARM64 location_builder_; @@ -659,7 +715,7 @@ class CodeGeneratorARM64 : public CodeGenerator { MethodToLiteralMap call_patches_; // Relative call patch info. // Using ArenaDeque<> which retains element addresses on push/emplace_back(). - ArenaDeque<MethodPatchInfo<vixl::Label>> relative_call_patches_; + ArenaDeque<MethodPatchInfo<vixl::aarch64::Label>> relative_call_patches_; // PC-relative DexCache access info. ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_; // Deduplication map for boot string literals for kBootImageLinkTimeAddress. diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index 37f1c35c50..a7fbc84340 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -145,9 +145,9 @@ Location InvokeRuntimeCallingConvention::GetReturnLocation(Primitive::Type type) return MipsReturnLocation(type); } -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<CodeGeneratorMIPS*>(codegen)->GetAssembler()-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMipsWordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<CodeGeneratorMIPS*>(codegen)->GetAssembler()-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, x).Int32Value() class BoundsCheckSlowPathMIPS : public SlowPathCodeMIPS { public: @@ -351,14 +351,12 @@ class SuspendCheckSlowPathMIPS : public SlowPathCodeMIPS { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, instruction_->GetLocations()); mips_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this, IsDirectEntrypoint(kQuickTestSuspend)); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); - RestoreLiveRegisters(codegen, instruction_->GetLocations()); if (successor_ == nullptr) { __ B(GetReturnLabel()); } else { @@ -415,7 +413,7 @@ class TypeCheckSlowPathMIPS : public SlowPathCodeMIPS { this, IsDirectEntrypoint(kQuickInstanceofNonTrivial)); CheckEntrypointTypes< - kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>(); + kQuickInstanceofNonTrivial, size_t, const mirror::Class*, const mirror::Class*>(); Primitive::Type ret_type = instruction_->GetType(); Location ret_loc = calling_convention.GetReturnLocation(ret_type); mips_codegen->MoveLocation(locations->Out(), ret_loc, ret_type); @@ -482,19 +480,30 @@ CodeGeneratorMIPS::CodeGeneratorMIPS(HGraph* graph, move_resolver_(graph->GetArena(), this), assembler_(graph->GetArena(), &isa_features), isa_features_(isa_features), + uint32_literals_(std::less<uint32_t>(), + graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), method_patches_(MethodReferenceComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), call_patches_(MethodReferenceComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) { + pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + boot_image_string_patches_(StringReferenceValueComparator(), + graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + 
pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + boot_image_type_patches_(TypeReferenceValueComparator(), + graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + boot_image_address_patches_(std::less<uint32_t>(), + graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + clobbered_ra_(false) { // Save RA (containing the return address) to mimic Quick. AddAllocatedRegister(Location::RegisterLocation(RA)); } #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<MipsAssembler*>(GetAssembler())-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMipsWordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<MipsAssembler*>(GetAssembler())-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, x).Int32Value() void CodeGeneratorMIPS::Finalize(CodeAllocator* allocator) { // Ensure that we fix up branches. @@ -688,6 +697,16 @@ void CodeGeneratorMIPS::ComputeSpillMask() { if ((fpu_spill_mask_ != 0) && (POPCOUNT(core_spill_mask_) % 2 != 0)) { core_spill_mask_ |= (1 << ZERO); } + // If RA is clobbered by PC-relative operations on R2 and it's the only spilled register + // (this can happen in leaf methods), artificially spill the ZERO register in order to + // force explicit saving and restoring of RA. RA isn't saved/restored when it's the only + // spilled register. + // TODO: Can this be improved? It causes creation of a stack frame (while RA might be + // saved in an unused temporary register) and saving of RA and the current method pointer + // in the frame. + if (clobbered_ra_ && core_spill_mask_ == (1u << RA) && fpu_spill_mask_ == 0) { + core_spill_mask_ |= (1 << ZERO); + } } static dwarf::Reg DWARFReg(Register reg) { @@ -962,7 +981,12 @@ void CodeGeneratorMIPS::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patch size_t size = method_patches_.size() + call_patches_.size() + - pc_relative_dex_cache_patches_.size(); + pc_relative_dex_cache_patches_.size() + + pc_relative_string_patches_.size() + + pc_relative_type_patches_.size() + + boot_image_string_patches_.size() + + boot_image_type_patches_.size() + + boot_image_address_patches_.size(); linker_patches->reserve(size); for (const auto& entry : method_patches_) { const MethodReference& target_method = entry.first; @@ -994,6 +1018,71 @@ void CodeGeneratorMIPS::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patch pc_rel_offset, base_element_offset)); } + for (const PcRelativePatchInfo& info : pc_relative_string_patches_) { + const DexFile& dex_file = info.target_dex_file; + size_t string_index = info.offset_or_index; + DCHECK(info.high_label.IsBound()); + uint32_t high_offset = __ GetLabelLocation(&info.high_label); + // On R2 we use HMipsComputeBaseMethodAddress and patch relative to + // the assembler's base label used for PC-relative literals. + uint32_t pc_rel_offset = info.pc_rel_label.IsBound() + ? 
__ GetLabelLocation(&info.pc_rel_label) + : __ GetPcRelBaseLabelLocation(); + linker_patches->push_back(LinkerPatch::RelativeStringPatch(high_offset, + &dex_file, + pc_rel_offset, + string_index)); + } + for (const PcRelativePatchInfo& info : pc_relative_type_patches_) { + const DexFile& dex_file = info.target_dex_file; + size_t type_index = info.offset_or_index; + DCHECK(info.high_label.IsBound()); + uint32_t high_offset = __ GetLabelLocation(&info.high_label); + // On R2 we use HMipsComputeBaseMethodAddress and patch relative to + // the assembler's base label used for PC-relative literals. + uint32_t pc_rel_offset = info.pc_rel_label.IsBound() + ? __ GetLabelLocation(&info.pc_rel_label) + : __ GetPcRelBaseLabelLocation(); + linker_patches->push_back(LinkerPatch::RelativeTypePatch(high_offset, + &dex_file, + pc_rel_offset, + type_index)); + } + for (const auto& entry : boot_image_string_patches_) { + const StringReference& target_string = entry.first; + Literal* literal = entry.second; + DCHECK(literal->GetLabel()->IsBound()); + uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel()); + linker_patches->push_back(LinkerPatch::StringPatch(literal_offset, + target_string.dex_file, + target_string.string_index)); + } + for (const auto& entry : boot_image_type_patches_) { + const TypeReference& target_type = entry.first; + Literal* literal = entry.second; + DCHECK(literal->GetLabel()->IsBound()); + uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel()); + linker_patches->push_back(LinkerPatch::TypePatch(literal_offset, + target_type.dex_file, + target_type.type_index)); + } + for (const auto& entry : boot_image_address_patches_) { + DCHECK(GetCompilerOptions().GetIncludePatchInformation()); + Literal* literal = entry.second; + DCHECK(literal->GetLabel()->IsBound()); + uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel()); + linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset)); + } +} + +CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeStringPatch( + const DexFile& dex_file, uint32_t string_index) { + return NewPcRelativePatch(dex_file, string_index, &pc_relative_string_patches_); +} + +CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeTypePatch( + const DexFile& dex_file, uint32_t type_index) { + return NewPcRelativePatch(dex_file, type_index, &pc_relative_type_patches_); } CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeDexCacheArrayPatch( @@ -1007,6 +1096,12 @@ CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativePatch( return &patches->back(); } +Literal* CodeGeneratorMIPS::DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map) { + return map->GetOrCreate( + value, + [this, value]() { return __ NewLiteral<uint32_t>(value); }); +} + Literal* CodeGeneratorMIPS::DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map) { return map->GetOrCreate( @@ -1022,6 +1117,26 @@ Literal* CodeGeneratorMIPS::DeduplicateMethodCodeLiteral(MethodReference target_ return DeduplicateMethodLiteral(target_method, &call_patches_); } +Literal* CodeGeneratorMIPS::DeduplicateBootImageStringLiteral(const DexFile& dex_file, + uint32_t string_index) { + return boot_image_string_patches_.GetOrCreate( + StringReference(&dex_file, string_index), + [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); }); +} + +Literal* CodeGeneratorMIPS::DeduplicateBootImageTypeLiteral(const DexFile& dex_file, + uint32_t type_index) { + return 
boot_image_type_patches_.GetOrCreate( + TypeReference(&dex_file, type_index), + [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); }); +} + +Literal* CodeGeneratorMIPS::DeduplicateBootImageAddressLiteral(uint32_t address) { + bool needs_patch = GetCompilerOptions().GetIncludePatchInformation(); + Uint32ToLiteralMap* map = needs_patch ? &boot_image_address_patches_ : &uint32_literals_; + return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map); +} + void CodeGeneratorMIPS::MarkGCCard(Register object, Register value) { MipsLabel done; Register card = AT; @@ -1030,7 +1145,7 @@ void CodeGeneratorMIPS::MarkGCCard(Register object, Register value) { __ LoadFromOffset(kLoadWord, card, TR, - Thread::CardTableOffset<kMipsWordSize>().Int32Value()); + Thread::CardTableOffset<kMipsPointerSize>().Int32Value()); __ Srl(temp, object, gc::accounting::CardTable::kCardShift); __ Addu(temp, card, temp); __ Sb(card, temp, 0); @@ -1067,6 +1182,15 @@ void CodeGeneratorMIPS::SetupBlockedRegisters() const { blocked_fpu_registers_[i] = true; } + if (GetGraph()->IsDebuggable()) { + // Stubs do not save callee-save floating point registers. If the graph + // is debuggable, we need to deal with these registers differently. For + // now, just block them. + for (size_t i = 0; i < arraysize(kFpuCalleeSaves); ++i) { + blocked_fpu_registers_[kFpuCalleeSaves[i]] = true; + } + } + UpdateBlockedPairRegisters(); } @@ -1113,7 +1237,7 @@ void CodeGeneratorMIPS::InvokeRuntime(QuickEntrypointEnum entrypoint, HInstruction* instruction, uint32_t dex_pc, SlowPathCode* slow_path) { - InvokeRuntime(GetThreadOffset<kMipsWordSize>(entrypoint).Int32Value(), + InvokeRuntime(GetThreadOffset<kMipsPointerSize>(entrypoint).Int32Value(), instruction, dex_pc, slow_path, @@ -1164,7 +1288,7 @@ void InstructionCodeGeneratorMIPS::GenerateSuspendCheck(HSuspendCheck* instructi __ LoadFromOffset(kLoadUnsignedHalfword, TMP, TR, - Thread::ThreadFlagsOffset<kMipsWordSize>().Int32Value()); + Thread::ThreadFlagsOffset<kMipsPointerSize>().Int32Value()); if (successor == nullptr) { __ Bnez(TMP, slow_path->GetEntryLabel()); __ Bind(slow_path->GetReturnLabel()); @@ -1855,7 +1979,7 @@ void LocationsBuilderMIPS::VisitArraySet(HArraySet* instruction) { bool needs_runtime_call = instruction->NeedsTypeCheck(); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( instruction, - needs_runtime_call ? LocationSummary::kCall : LocationSummary::kNoCall); + needs_runtime_call ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall); if (needs_runtime_call) { InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -2467,7 +2591,7 @@ void InstructionCodeGeneratorMIPS::GenerateDivRemIntegral(HBinaryOperation* inst void LocationsBuilderMIPS::VisitDiv(HDiv* div) { Primitive::Type type = div->GetResultType(); LocationSummary::CallKind call_kind = (type == Primitive::kPrimLong) - ? LocationSummary::kCall + ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(div, call_kind); @@ -3430,7 +3554,7 @@ void LocationsBuilderMIPS::HandleFieldGet(HInstruction* instruction, const Field bool is_wide = (field_type == Primitive::kPrimLong) || (field_type == Primitive::kPrimDouble); bool generate_volatile = field_info.IsVolatile() && is_wide; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( - instruction, generate_volatile ? 
LocationSummary::kCall : LocationSummary::kNoCall); + instruction, generate_volatile ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); if (generate_volatile) { @@ -3440,7 +3564,8 @@ void LocationsBuilderMIPS::HandleFieldGet(HInstruction* instruction, const Field if (field_type == Primitive::kPrimLong) { locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimLong)); } else { - locations->SetOut(Location::RequiresFpuRegister()); + // Use Location::Any() to prevent situations when running out of available fp registers. + locations->SetOut(Location::Any()); // Need some temp core regs since FP results are returned in core registers Location reg = calling_convention.GetReturnLocation(Primitive::kPrimLong); locations->AddTemp(Location::RegisterLocation(reg.AsRegisterPairLow<Register>())); @@ -3505,11 +3630,23 @@ void InstructionCodeGeneratorMIPS::HandleFieldGet(HInstruction* instruction, IsDirectEntrypoint(kQuickA64Load)); CheckEntrypointTypes<kQuickA64Load, int64_t, volatile const int64_t*>(); if (type == Primitive::kPrimDouble) { - // Need to move to FP regs since FP results are returned in core registers. - __ Mtc1(locations->GetTemp(1).AsRegister<Register>(), - locations->Out().AsFpuRegister<FRegister>()); - __ MoveToFpuHigh(locations->GetTemp(2).AsRegister<Register>(), - locations->Out().AsFpuRegister<FRegister>()); + // FP results are returned in core registers. Need to move them. + Location out = locations->Out(); + if (out.IsFpuRegister()) { + __ Mtc1(locations->GetTemp(1).AsRegister<Register>(), out.AsFpuRegister<FRegister>()); + __ MoveToFpuHigh(locations->GetTemp(2).AsRegister<Register>(), + out.AsFpuRegister<FRegister>()); + } else { + DCHECK(out.IsDoubleStackSlot()); + __ StoreToOffset(kStoreWord, + locations->GetTemp(1).AsRegister<Register>(), + SP, + out.GetStackIndex()); + __ StoreToOffset(kStoreWord, + locations->GetTemp(2).AsRegister<Register>(), + SP, + out.GetStackIndex() + 4); + } } } else { if (!Primitive::IsFloatingPointType(type)) { @@ -3557,7 +3694,7 @@ void LocationsBuilderMIPS::HandleFieldSet(HInstruction* instruction, const Field bool is_wide = (field_type == Primitive::kPrimLong) || (field_type == Primitive::kPrimDouble); bool generate_volatile = field_info.IsVolatile() && is_wide; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( - instruction, generate_volatile ? LocationSummary::kCall : LocationSummary::kNoCall); + instruction, generate_volatile ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); if (generate_volatile) { @@ -3568,7 +3705,8 @@ void LocationsBuilderMIPS::HandleFieldSet(HInstruction* instruction, const Field locations->SetInAt(1, Location::RegisterPairLocation( calling_convention.GetRegisterAt(2), calling_convention.GetRegisterAt(3))); } else { - locations->SetInAt(1, Location::RequiresFpuRegister()); + // Use Location::Any() to prevent situations when running out of available fp registers. + locations->SetInAt(1, Location::Any()); // Pass FP parameters in core registers. locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(2))); locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(3))); @@ -3627,10 +3765,28 @@ void InstructionCodeGeneratorMIPS::HandleFieldSet(HInstruction* instruction, codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); if (type == Primitive::kPrimDouble) { // Pass FP parameters in core registers. 
- __ Mfc1(locations->GetTemp(1).AsRegister<Register>(), - locations->InAt(1).AsFpuRegister<FRegister>()); - __ MoveFromFpuHigh(locations->GetTemp(2).AsRegister<Register>(), - locations->InAt(1).AsFpuRegister<FRegister>()); + Location in = locations->InAt(1); + if (in.IsFpuRegister()) { + __ Mfc1(locations->GetTemp(1).AsRegister<Register>(), in.AsFpuRegister<FRegister>()); + __ MoveFromFpuHigh(locations->GetTemp(2).AsRegister<Register>(), + in.AsFpuRegister<FRegister>()); + } else if (in.IsDoubleStackSlot()) { + __ LoadFromOffset(kLoadWord, + locations->GetTemp(1).AsRegister<Register>(), + SP, + in.GetStackIndex()); + __ LoadFromOffset(kLoadWord, + locations->GetTemp(2).AsRegister<Register>(), + SP, + in.GetStackIndex() + 4); + } else { + DCHECK(in.IsConstant()); + DCHECK(in.GetConstant()->IsDoubleConstant()); + int64_t value = bit_cast<int64_t, double>(in.GetConstant()->AsDoubleConstant()->GetValue()); + __ LoadConst64(locations->GetTemp(2).AsRegister<Register>(), + locations->GetTemp(1).AsRegister<Register>(), + value); + } } codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pA64Store), instruction, @@ -3696,6 +3852,23 @@ void InstructionCodeGeneratorMIPS::VisitInstanceFieldSet(HInstanceFieldSet* inst HandleFieldSet(instruction, instruction->GetFieldInfo(), instruction->GetDexPc()); } +void InstructionCodeGeneratorMIPS::GenerateGcRootFieldLoad( + HInstruction* instruction ATTRIBUTE_UNUSED, + Location root, + Register obj, + uint32_t offset) { + Register root_reg = root.AsRegister<Register>(); + if (kEmitCompilerReadBarrier) { + UNIMPLEMENTED(FATAL) << "for read barrier"; + } else { + // Plain GC root load with no read barrier. + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + __ LoadFromOffset(kLoadWord, root_reg, obj, offset); + // Note that GC roots are not affected by heap poisoning, thus we + // do not have to unpoison `root_reg` here. + } +} + void LocationsBuilderMIPS::VisitInstanceOf(HInstanceOf* instruction) { LocationSummary::CallKind call_kind = instruction->IsExactCheck() ? LocationSummary::kNoCall : LocationSummary::kCallOnSlowPath; @@ -3772,11 +3945,9 @@ void LocationsBuilderMIPS::VisitInvokeInterface(HInvokeInterface* invoke) { void InstructionCodeGeneratorMIPS::VisitInvokeInterface(HInvokeInterface* invoke) { // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError. Register temp = invoke->GetLocations()->GetTemp(0).AsRegister<Register>(); - uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset( - invoke->GetImtIndex() % mirror::Class::kImtSize, kMipsPointerSize).Uint32Value(); Location receiver = invoke->GetLocations()->InAt(0); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); - Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMipsWordSize); + Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMipsPointerSize); // Set the hidden argument. 
__ LoadConst32(invoke->GetLocations()->GetTemp(1).AsRegister<Register>(), @@ -3790,6 +3961,10 @@ void InstructionCodeGeneratorMIPS::VisitInvokeInterface(HInvokeInterface* invoke __ LoadFromOffset(kLoadWord, temp, receiver.AsRegister<Register>(), class_offset); } codegen_->MaybeRecordImplicitNullCheck(invoke); + __ LoadFromOffset(kLoadWord, temp, temp, + mirror::Class::ImtPtrOffset(kMipsPointerSize).Uint32Value()); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + invoke->GetImtIndex(), kMipsPointerSize)); // temp = temp->GetImtEntryAt(method_offset); __ LoadFromOffset(kLoadWord, temp, temp, method_offset); // T9 = temp->GetEntryPoint(); @@ -3859,16 +4034,80 @@ static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorMIPS* codegen } HLoadString::LoadKind CodeGeneratorMIPS::GetSupportedLoadStringKind( - HLoadString::LoadKind desired_string_load_kind ATTRIBUTE_UNUSED) { - // TODO: Implement other kinds. - return HLoadString::LoadKind::kDexCacheViaMethod; + HLoadString::LoadKind desired_string_load_kind) { + if (kEmitCompilerReadBarrier) { + UNIMPLEMENTED(FATAL) << "for read barrier"; + } + // We disable PC-relative load when there is an irreducible loop, as the optimization + // is incompatible with it. + bool has_irreducible_loops = GetGraph()->HasIrreducibleLoops(); + bool fallback_load = has_irreducible_loops; + switch (desired_string_load_kind) { + case HLoadString::LoadKind::kBootImageLinkTimeAddress: + DCHECK(!GetCompilerOptions().GetCompilePic()); + break; + case HLoadString::LoadKind::kBootImageLinkTimePcRelative: + DCHECK(GetCompilerOptions().GetCompilePic()); + break; + case HLoadString::LoadKind::kBootImageAddress: + break; + case HLoadString::LoadKind::kDexCacheAddress: + DCHECK(Runtime::Current()->UseJitCompilation()); + fallback_load = false; + break; + case HLoadString::LoadKind::kDexCachePcRelative: + DCHECK(!Runtime::Current()->UseJitCompilation()); + // TODO: Create as many MipsDexCacheArraysBase instructions as needed for methods + // with irreducible loops. + break; + case HLoadString::LoadKind::kDexCacheViaMethod: + fallback_load = false; + break; + } + if (fallback_load) { + desired_string_load_kind = HLoadString::LoadKind::kDexCacheViaMethod; + } + return desired_string_load_kind; } HLoadClass::LoadKind CodeGeneratorMIPS::GetSupportedLoadClassKind( HLoadClass::LoadKind desired_class_load_kind) { - DCHECK_NE(desired_class_load_kind, HLoadClass::LoadKind::kReferrersClass); - // TODO: Implement other kinds. - return HLoadClass::LoadKind::kDexCacheViaMethod; + if (kEmitCompilerReadBarrier) { + UNIMPLEMENTED(FATAL) << "for read barrier"; + } + // We disable pc-relative load when there is an irreducible loop, as the optimization + // is incompatible with it. 
+ bool has_irreducible_loops = GetGraph()->HasIrreducibleLoops(); + bool fallback_load = has_irreducible_loops; + switch (desired_class_load_kind) { + case HLoadClass::LoadKind::kReferrersClass: + fallback_load = false; + break; + case HLoadClass::LoadKind::kBootImageLinkTimeAddress: + DCHECK(!GetCompilerOptions().GetCompilePic()); + break; + case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: + DCHECK(GetCompilerOptions().GetCompilePic()); + break; + case HLoadClass::LoadKind::kBootImageAddress: + break; + case HLoadClass::LoadKind::kDexCacheAddress: + DCHECK(Runtime::Current()->UseJitCompilation()); + fallback_load = false; + break; + case HLoadClass::LoadKind::kDexCachePcRelative: + DCHECK(!Runtime::Current()->UseJitCompilation()); + // TODO: Create as many MipsDexCacheArraysBase instructions as needed for methods + // with irreducible loops. + break; + case HLoadClass::LoadKind::kDexCacheViaMethod: + fallback_load = false; + break; + } + if (fallback_load) { + desired_class_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod; + } + return desired_class_load_kind; } Register CodeGeneratorMIPS::GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, @@ -4046,7 +4285,7 @@ void CodeGeneratorMIPS::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke T9, callee_method.AsRegister<Register>(), ArtMethod::EntryPointFromQuickCompiledCodeOffset( - kMipsWordSize).Int32Value()); + kMipsPointerSize).Int32Value()); // T9() __ Jalr(T9); __ Nop(); @@ -4079,7 +4318,7 @@ void CodeGeneratorMIPS::GenerateVirtualCall(HInvokeVirtual* invoke, Location tem size_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( invoke->GetVTableIndex(), kMipsPointerSize).SizeValue(); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); - Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMipsWordSize); + Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMipsPointerSize); // temp = object->GetClass(); DCHECK(receiver.IsRegister()); @@ -4105,11 +4344,40 @@ void InstructionCodeGeneratorMIPS::VisitInvokeVirtual(HInvokeVirtual* invoke) { } void LocationsBuilderMIPS::VisitLoadClass(HLoadClass* cls) { - InvokeRuntimeCallingConvention calling_convention; - CodeGenerator::CreateLoadClassLocationSummary( - cls, - Location::RegisterLocation(calling_convention.GetRegisterAt(0)), - Location::RegisterLocation(V0)); + if (cls->NeedsAccessCheck()) { + InvokeRuntimeCallingConvention calling_convention; + CodeGenerator::CreateLoadClassLocationSummary( + cls, + Location::RegisterLocation(calling_convention.GetRegisterAt(0)), + Location::RegisterLocation(V0), + /* code_generator_supports_read_barrier */ false); // TODO: revisit this bool. + return; + } + + LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || kEmitCompilerReadBarrier) + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall; + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cls, call_kind); + HLoadClass::LoadKind load_kind = cls->GetLoadKind(); + switch (load_kind) { + // We need an extra register for PC-relative literals on R2. + case HLoadClass::LoadKind::kBootImageLinkTimeAddress: + case HLoadClass::LoadKind::kBootImageAddress: + case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: + if (codegen_->GetInstructionSetFeatures().IsR6()) { + break; + } + FALLTHROUGH_INTENDED; + // We need an extra register for PC-relative dex cache accesses. 
+ case HLoadClass::LoadKind::kDexCachePcRelative: + case HLoadClass::LoadKind::kReferrersClass: + case HLoadClass::LoadKind::kDexCacheViaMethod: + locations->SetInAt(0, Location::RequiresRegister()); + break; + default: + break; + } + locations->SetOut(Location::RequiresRegister()); } void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) { @@ -4125,40 +4393,132 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) { return; } - Register out = locations->Out().AsRegister<Register>(); - Register current_method = locations->InAt(0).AsRegister<Register>(); - if (cls->IsReferrersClass()) { - DCHECK(!cls->CanCallRuntime()); - DCHECK(!cls->MustGenerateClinitCheck()); - __ LoadFromOffset(kLoadWord, out, current_method, - ArtMethod::DeclaringClassOffset().Int32Value()); - } else { - __ LoadFromOffset(kLoadWord, out, current_method, - ArtMethod::DexCacheResolvedTypesOffset(kMipsPointerSize).Int32Value()); - __ LoadFromOffset(kLoadWord, out, out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex())); - - if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) { - DCHECK(cls->CanCallRuntime()); - SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathMIPS( - cls, - cls, - cls->GetDexPc(), - cls->MustGenerateClinitCheck()); - codegen_->AddSlowPath(slow_path); - if (!cls->IsInDexCache()) { - __ Beqz(out, slow_path->GetEntryLabel()); - } - if (cls->MustGenerateClinitCheck()) { - GenerateClassInitializationCheck(slow_path, out); + HLoadClass::LoadKind load_kind = cls->GetLoadKind(); + Location out_loc = locations->Out(); + Register out = out_loc.AsRegister<Register>(); + Register base_or_current_method_reg; + bool isR6 = codegen_->GetInstructionSetFeatures().IsR6(); + switch (load_kind) { + // We need an extra register for PC-relative literals on R2. + case HLoadClass::LoadKind::kBootImageLinkTimeAddress: + case HLoadClass::LoadKind::kBootImageAddress: + case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: + base_or_current_method_reg = isR6 ? ZERO : locations->InAt(0).AsRegister<Register>(); + break; + // We need an extra register for PC-relative dex cache accesses. + case HLoadClass::LoadKind::kDexCachePcRelative: + case HLoadClass::LoadKind::kReferrersClass: + case HLoadClass::LoadKind::kDexCacheViaMethod: + base_or_current_method_reg = locations->InAt(0).AsRegister<Register>(); + break; + default: + base_or_current_method_reg = ZERO; + break; + } + + bool generate_null_check = false; + switch (load_kind) { + case HLoadClass::LoadKind::kReferrersClass: { + DCHECK(!cls->CanCallRuntime()); + DCHECK(!cls->MustGenerateClinitCheck()); + // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ + GenerateGcRootFieldLoad(cls, + out_loc, + base_or_current_method_reg, + ArtMethod::DeclaringClassOffset().Int32Value()); + break; + } + case HLoadClass::LoadKind::kBootImageLinkTimeAddress: + DCHECK(!kEmitCompilerReadBarrier); + __ LoadLiteral(out, + base_or_current_method_reg, + codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(), + cls->GetTypeIndex())); + break; + case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: { + DCHECK(!kEmitCompilerReadBarrier); + CodeGeneratorMIPS::PcRelativePatchInfo* info = + codegen_->NewPcRelativeTypePatch(cls->GetDexFile(), cls->GetTypeIndex()); + if (isR6) { + __ Bind(&info->high_label); + __ Bind(&info->pc_rel_label); + // Add a 32-bit offset to PC. 
+ __ Auipc(out, /* placeholder */ 0x1234); + __ Addiu(out, out, /* placeholder */ 0x5678); } else { - __ Bind(slow_path->GetExitLabel()); + __ Bind(&info->high_label); + __ Lui(out, /* placeholder */ 0x1234); + // We do not bind info->pc_rel_label here, we'll use the assembler's label + // for PC-relative literals and the base from HMipsComputeBaseMethodAddress. + __ Ori(out, out, /* placeholder */ 0x5678); + // Add a 32-bit offset to PC. + __ Addu(out, out, base_or_current_method_reg); } + break; + } + case HLoadClass::LoadKind::kBootImageAddress: { + DCHECK(!kEmitCompilerReadBarrier); + DCHECK_NE(cls->GetAddress(), 0u); + uint32_t address = dchecked_integral_cast<uint32_t>(cls->GetAddress()); + __ LoadLiteral(out, + base_or_current_method_reg, + codegen_->DeduplicateBootImageAddressLiteral(address)); + break; + } + case HLoadClass::LoadKind::kDexCacheAddress: { + DCHECK_NE(cls->GetAddress(), 0u); + uint32_t address = dchecked_integral_cast<uint32_t>(cls->GetAddress()); + static_assert(sizeof(GcRoot<mirror::Class>) == 4u, "Expected GC root to be 4 bytes."); + DCHECK_ALIGNED(cls->GetAddress(), 4u); + int16_t offset = Low16Bits(address); + uint32_t base_address = address - offset; // This accounts for offset sign extension. + __ Lui(out, High16Bits(base_address)); + // /* GcRoot<mirror::Class> */ out = *(base_address + offset) + GenerateGcRootFieldLoad(cls, out_loc, out, offset); + generate_null_check = !cls->IsInDexCache(); + break; + } + case HLoadClass::LoadKind::kDexCachePcRelative: { + HMipsDexCacheArraysBase* base = cls->InputAt(0)->AsMipsDexCacheArraysBase(); + int32_t offset = + cls->GetDexCacheElementOffset() - base->GetElementOffset() - kDexCacheArrayLwOffset; + // /* GcRoot<mirror::Class> */ out = *(dex_cache_arrays_base + offset) + GenerateGcRootFieldLoad(cls, out_loc, base_or_current_method_reg, offset); + generate_null_check = !cls->IsInDexCache(); + break; + } + case HLoadClass::LoadKind::kDexCacheViaMethod: { + // /* GcRoot<mirror::Class>[] */ out = + // current_method.ptr_sized_fields_->dex_cache_resolved_types_ + __ LoadFromOffset(kLoadWord, + out, + base_or_current_method_reg, + ArtMethod::DexCacheResolvedTypesOffset(kArmPointerSize).Int32Value()); + // /* GcRoot<mirror::Class> */ out = out[type_index] + size_t offset = CodeGenerator::GetCacheOffset(cls->GetTypeIndex()); + GenerateGcRootFieldLoad(cls, out_loc, out, offset); + generate_null_check = !cls->IsInDexCache(); + } + } + + if (generate_null_check || cls->MustGenerateClinitCheck()) { + DCHECK(cls->CanCallRuntime()); + SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathMIPS( + cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck()); + codegen_->AddSlowPath(slow_path); + if (generate_null_check) { + __ Beqz(out, slow_path->GetEntryLabel()); + } + if (cls->MustGenerateClinitCheck()) { + GenerateClassInitializationCheck(slow_path, out); + } else { + __ Bind(slow_path->GetExitLabel()); } } } static int32_t GetExceptionTlsOffset() { - return Thread::ExceptionOffset<kMipsWordSize>().Int32Value(); + return Thread::ExceptionOffset<kMipsPointerSize>().Int32Value(); } void LocationsBuilderMIPS::VisitLoadException(HLoadException* load) { @@ -4181,28 +4541,97 @@ void InstructionCodeGeneratorMIPS::VisitClearException(HClearException* clear AT } void LocationsBuilderMIPS::VisitLoadString(HLoadString* load) { - LocationSummary::CallKind call_kind = load->NeedsEnvironment() + LocationSummary::CallKind call_kind = (load->NeedsEnvironment() || kEmitCompilerReadBarrier) ? 
LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind); - locations->SetInAt(0, Location::RequiresRegister()); + HLoadString::LoadKind load_kind = load->GetLoadKind(); + switch (load_kind) { + // We need an extra register for PC-relative literals on R2. + case HLoadString::LoadKind::kBootImageLinkTimeAddress: + case HLoadString::LoadKind::kBootImageAddress: + case HLoadString::LoadKind::kBootImageLinkTimePcRelative: + if (codegen_->GetInstructionSetFeatures().IsR6()) { + break; + } + FALLTHROUGH_INTENDED; + // We need an extra register for PC-relative dex cache accesses. + case HLoadString::LoadKind::kDexCachePcRelative: + case HLoadString::LoadKind::kDexCacheViaMethod: + locations->SetInAt(0, Location::RequiresRegister()); + break; + default: + break; + } locations->SetOut(Location::RequiresRegister()); } void InstructionCodeGeneratorMIPS::VisitLoadString(HLoadString* load) { + HLoadString::LoadKind load_kind = load->GetLoadKind(); LocationSummary* locations = load->GetLocations(); - Register out = locations->Out().AsRegister<Register>(); - Register current_method = locations->InAt(0).AsRegister<Register>(); - __ LoadFromOffset(kLoadWord, out, current_method, ArtMethod::DeclaringClassOffset().Int32Value()); - __ LoadFromOffset(kLoadWord, out, out, mirror::Class::DexCacheStringsOffset().Int32Value()); - __ LoadFromOffset(kLoadWord, out, out, CodeGenerator::GetCacheOffset(load->GetStringIndex())); + Location out_loc = locations->Out(); + Register out = out_loc.AsRegister<Register>(); + Register base_or_current_method_reg; + bool isR6 = codegen_->GetInstructionSetFeatures().IsR6(); + switch (load_kind) { + // We need an extra register for PC-relative literals on R2. + case HLoadString::LoadKind::kBootImageLinkTimeAddress: + case HLoadString::LoadKind::kBootImageAddress: + case HLoadString::LoadKind::kBootImageLinkTimePcRelative: + base_or_current_method_reg = isR6 ? ZERO : locations->InAt(0).AsRegister<Register>(); + break; + default: + base_or_current_method_reg = ZERO; + break; + } - if (!load->IsInDexCache()) { - SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS(load); - codegen_->AddSlowPath(slow_path); - __ Beqz(out, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); + switch (load_kind) { + case HLoadString::LoadKind::kBootImageLinkTimeAddress: + DCHECK(!kEmitCompilerReadBarrier); + __ LoadLiteral(out, + base_or_current_method_reg, + codegen_->DeduplicateBootImageStringLiteral(load->GetDexFile(), + load->GetStringIndex())); + return; // No dex cache slow path. + case HLoadString::LoadKind::kBootImageLinkTimePcRelative: { + DCHECK(!kEmitCompilerReadBarrier); + CodeGeneratorMIPS::PcRelativePatchInfo* info = + codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex()); + if (isR6) { + __ Bind(&info->high_label); + __ Bind(&info->pc_rel_label); + // Add a 32-bit offset to PC. + __ Auipc(out, /* placeholder */ 0x1234); + __ Addiu(out, out, /* placeholder */ 0x5678); + } else { + __ Bind(&info->high_label); + __ Lui(out, /* placeholder */ 0x1234); + // We do not bind info->pc_rel_label here, we'll use the assembler's label + // for PC-relative literals and the base from HMipsComputeBaseMethodAddress. + __ Ori(out, out, /* placeholder */ 0x5678); + // Add a 32-bit offset to PC. + __ Addu(out, out, base_or_current_method_reg); + } + return; // No dex cache slow path. 
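Aside on the hi/lo placeholder scheme used in the R2 and R6 sequences above: Lui+Ori simply splits a 32-bit value into unsigned halves, while the kDexCacheAddress path in VisitLoadClass pairs Lui with a sign-extending 16-bit load offset, which is why it computes base_address = address - offset first. The following is a minimal standalone sketch, not ART code (the helper names are invented), of why that compensation round-trips; it assumes two's-complement int16_t conversion, which holds on all relevant targets.

// Hypothetical sketch, not ART code: models the hi/lo split used by the
// lui+ori pair (logical OR, no sign extension) and the lui + sign-extended
// 16-bit offset pair used for kDexCacheAddress-style accesses.
#include <cassert>
#include <cstdint>

namespace {

uint16_t HighHalf(uint32_t value) { return static_cast<uint16_t>(value >> 16); }
uint16_t LowHalf(uint32_t value) { return static_cast<uint16_t>(value & 0xffffu); }

// lui reg, hi; ori reg, reg, lo  -- ori zero-extends, so no compensation is needed.
uint32_t LuiOri(uint16_t hi, uint16_t lo) {
  return (static_cast<uint32_t>(hi) << 16) | lo;
}

// lui reg, hi; lw out, offset(reg)  -- the 16-bit offset is sign-extended by
// the hardware, so the effective address is (hi << 16) + sext(offset).
uint32_t LuiPlusSignExtendedOffset(uint16_t hi, int16_t offset) {
  return (static_cast<uint32_t>(hi) << 16) + static_cast<uint32_t>(static_cast<int32_t>(offset));
}

}  // namespace

int main() {
  const uint32_t address = 0x1234abcdu;  // low half has bit 15 set -> negative as int16_t

  // Plain lui+ori materialization.
  assert(LuiOri(HighHalf(address), LowHalf(address)) == address);

  // lui + sign-extended offset: subtract the sign-extended low half from the
  // address before taking the high half, exactly as the kDexCacheAddress case
  // does with base_address = address - offset.
  const int16_t offset = static_cast<int16_t>(LowHalf(address));
  const uint32_t base_address = address - static_cast<uint32_t>(static_cast<int32_t>(offset));
  assert(LuiPlusSignExtendedOffset(HighHalf(base_address), offset) == address);
  return 0;
}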
+ } + case HLoadString::LoadKind::kBootImageAddress: { + DCHECK(!kEmitCompilerReadBarrier); + DCHECK_NE(load->GetAddress(), 0u); + uint32_t address = dchecked_integral_cast<uint32_t>(load->GetAddress()); + __ LoadLiteral(out, + base_or_current_method_reg, + codegen_->DeduplicateBootImageAddressLiteral(address)); + return; // No dex cache slow path. + } + default: + break; } + + // TODO: Re-add the compiler code to do string dex cache lookup again. + SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS(load); + codegen_->AddSlowPath(slow_path); + __ B(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); } void LocationsBuilderMIPS::VisitLongConstant(HLongConstant* constant) { @@ -4216,7 +4645,7 @@ void InstructionCodeGeneratorMIPS::VisitLongConstant(HLongConstant* constant ATT void LocationsBuilderMIPS::VisitMonitorOperation(HMonitorOperation* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -4395,7 +4824,7 @@ void InstructionCodeGeneratorMIPS::VisitNeg(HNeg* instruction) { void LocationsBuilderMIPS::VisitNewArray(HNewArray* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(2))); @@ -4410,7 +4839,7 @@ void InstructionCodeGeneratorMIPS::VisitNewArray(HNewArray* instruction) { // Move an uint16_t value to a register. __ LoadConst32(calling_convention.GetRegisterAt(0), instruction->GetTypeIndex()); codegen_->InvokeRuntime( - GetThreadOffset<kMipsWordSize>(instruction->GetEntrypoint()).Int32Value(), + GetThreadOffset<kMipsPointerSize>(instruction->GetEntrypoint()).Int32Value(), instruction, instruction->GetDexPc(), nullptr, @@ -4421,7 +4850,7 @@ void InstructionCodeGeneratorMIPS::VisitNewArray(HNewArray* instruction) { void LocationsBuilderMIPS::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; if (instruction->IsStringAlloc()) { locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); @@ -4436,7 +4865,7 @@ void InstructionCodeGeneratorMIPS::VisitNewInstance(HNewInstance* instruction) { if (instruction->IsStringAlloc()) { // String is allocated through StringFactory. Call NewEmptyString entry point. 
Register temp = instruction->GetLocations()->GetTemp(0).AsRegister<Register>(); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMipsWordSize); + MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMipsPointerSize); __ LoadFromOffset(kLoadWord, temp, TR, QUICK_ENTRY_POINT(pNewEmptyString)); __ LoadFromOffset(kLoadWord, T9, temp, code_offset.Int32Value()); __ Jalr(T9); @@ -4444,7 +4873,7 @@ void InstructionCodeGeneratorMIPS::VisitNewInstance(HNewInstance* instruction) { codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); } else { codegen_->InvokeRuntime( - GetThreadOffset<kMipsWordSize>(instruction->GetEntrypoint()).Int32Value(), + GetThreadOffset<kMipsPointerSize>(instruction->GetEntrypoint()).Int32Value(), instruction, instruction->GetDexPc(), nullptr, @@ -4591,7 +5020,7 @@ void InstructionCodeGeneratorMIPS::VisitPhi(HPhi* instruction ATTRIBUTE_UNUSED) void LocationsBuilderMIPS::VisitRem(HRem* rem) { Primitive::Type type = rem->GetResultType(); LocationSummary::CallKind call_kind = - (type == Primitive::kPrimInt) ? LocationSummary::kNoCall : LocationSummary::kCall; + (type == Primitive::kPrimInt) ? LocationSummary::kNoCall : LocationSummary::kCallOnMainOnly; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(rem, call_kind); switch (type) { @@ -4828,7 +5257,7 @@ void InstructionCodeGeneratorMIPS::VisitSuspendCheck(HSuspendCheck* instruction) void LocationsBuilderMIPS::VisitThrow(HThrow* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -4857,7 +5286,7 @@ void LocationsBuilderMIPS::VisitTypeConversion(HTypeConversion* conversion) { if (!isR6 && ((Primitive::IsFloatingPointType(result_type) && input_type == Primitive::kPrimLong) || (result_type == Primitive::kPrimLong && Primitive::IsFloatingPointType(input_type)))) { - call_kind = LocationSummary::kCall; + call_kind = LocationSummary::kCallOnMainOnly; } LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(conversion, call_kind); @@ -5325,6 +5754,7 @@ void InstructionCodeGeneratorMIPS::VisitMipsComputeBaseMethodAddress( __ Nal(); // Grab the return address off RA. __ Move(reg, RA); + // TODO: Can we share this code with that of VisitMipsDexCacheArraysBase()? // Remember this offset (the obtained PC value) for later use with constant area. __ BindPcRelBaseLabel(); @@ -5355,6 +5785,7 @@ void InstructionCodeGeneratorMIPS::VisitMipsDexCacheArraysBase(HMipsDexCacheArra __ Ori(reg, reg, /* placeholder */ 0x5678); // Add a 32-bit offset to PC. __ Addu(reg, reg, RA); + // TODO: Can we share this code with that of VisitMipsComputeBaseMethodAddress()? 
} } @@ -5378,18 +5809,25 @@ void LocationsBuilderMIPS::VisitClassTableGet(HClassTableGet* instruction) { void InstructionCodeGeneratorMIPS::VisitClassTableGet(HClassTableGet* instruction) { LocationSummary* locations = instruction->GetLocations(); - uint32_t method_offset = 0; if (instruction->GetTableKind() == HClassTableGet::TableKind::kVTable) { - method_offset = mirror::Class::EmbeddedVTableEntryOffset( + uint32_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( instruction->GetIndex(), kMipsPointerSize).SizeValue(); + __ LoadFromOffset(kLoadWord, + locations->Out().AsRegister<Register>(), + locations->InAt(0).AsRegister<Register>(), + method_offset); } else { - method_offset = mirror::Class::EmbeddedImTableEntryOffset( - instruction->GetIndex() % mirror::Class::kImtSize, kMipsPointerSize).Uint32Value(); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + instruction->GetIndex(), kMipsPointerSize)); + __ LoadFromOffset(kLoadWord, + locations->Out().AsRegister<Register>(), + locations->InAt(0).AsRegister<Register>(), + mirror::Class::ImtPtrOffset(kMipsPointerSize).Uint32Value()); + __ LoadFromOffset(kLoadWord, + locations->Out().AsRegister<Register>(), + locations->Out().AsRegister<Register>(), + method_offset); } - __ LoadFromOffset(kLoadWord, - locations->Out().AsRegister<Register>(), - locations->InAt(0).AsRegister<Register>(), - method_offset); } #undef __ diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h index 08f74c04d1..63a0345c1c 100644 --- a/compiler/optimizing/code_generator_mips.h +++ b/compiler/optimizing/code_generator_mips.h @@ -18,11 +18,12 @@ #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_MIPS_H_ #include "code_generator.h" -#include "dex/compiler_enums.h" #include "driver/compiler_options.h" #include "nodes.h" #include "parallel_move_resolver.h" +#include "string_reference.h" #include "utils/mips/assembler_mips.h" +#include "utils/type_reference.h" namespace art { namespace mips { @@ -226,6 +227,15 @@ class InstructionCodeGeneratorMIPS : public InstructionCodeGenerator { void HandleShift(HBinaryOperation* operation); void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info, uint32_t dex_pc); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info, uint32_t dex_pc); + // Generate a GC root reference load: + // + // root <- *(obj + offset) + // + // while honoring read barriers (if any). + void GenerateGcRootFieldLoad(HInstruction* instruction, + Location root, + Register obj, + uint32_t offset); void GenerateIntCompare(IfCondition cond, LocationSummary* locations); void GenerateIntCompareAndBranch(IfCondition cond, LocationSummary* locations, @@ -298,6 +308,9 @@ class CodeGeneratorMIPS : public CodeGenerator { size_t RestoreCoreRegister(size_t stack_index, uint32_t reg_id); size_t SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id); size_t RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id); + void ClobberRA() { + clobbered_ra_ = true; + } void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE; void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE; @@ -383,7 +396,7 @@ class CodeGeneratorMIPS : public CodeGenerator { PcRelativePatchInfo(PcRelativePatchInfo&& other) = default; const DexFile& target_dex_file; - // Either the dex cache array element offset or the string index. + // Either the dex cache array element offset or the string/type index. 
uint32_t offset_or_index; // Label for the instruction loading the most significant half of the offset that's added to PC // to form the base address (the least significant half is loaded with the instruction that @@ -393,14 +406,27 @@ class CodeGeneratorMIPS : public CodeGenerator { MipsLabel pc_rel_label; }; + PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file, uint32_t string_index); + PcRelativePatchInfo* NewPcRelativeTypePatch(const DexFile& dex_file, uint32_t type_index); PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, uint32_t element_offset); + Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file, uint32_t string_index); + Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, uint32_t type_index); + Literal* DeduplicateBootImageAddressLiteral(uint32_t address); private: Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, Register temp); + using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, Literal*>; using MethodToLiteralMap = ArenaSafeMap<MethodReference, Literal*, MethodReferenceComparator>; - + using BootStringToLiteralMap = ArenaSafeMap<StringReference, + Literal*, + StringReferenceValueComparator>; + using BootTypeToLiteralMap = ArenaSafeMap<TypeReference, + Literal*, + TypeReferenceValueComparator>; + + Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map); Literal* DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map); Literal* DeduplicateMethodAddressLiteral(MethodReference target_method); Literal* DeduplicateMethodCodeLiteral(MethodReference target_method); @@ -417,11 +443,27 @@ class CodeGeneratorMIPS : public CodeGenerator { MipsAssembler assembler_; const MipsInstructionSetFeatures& isa_features_; + // Deduplication map for 32-bit literals, used for non-patchable boot image addresses. + Uint32ToLiteralMap uint32_literals_; // Method patch info, map MethodReference to a literal for method address and method code. MethodToLiteralMap method_patches_; MethodToLiteralMap call_patches_; // PC-relative patch info for each HMipsDexCacheArraysBase. ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_; + // Deduplication map for boot string literals for kBootImageLinkTimeAddress. + BootStringToLiteralMap boot_image_string_patches_; + // PC-relative String patch info. + ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_; + // Deduplication map for boot type literals for kBootImageLinkTimeAddress. + BootTypeToLiteralMap boot_image_type_patches_; + // PC-relative type patch info. + ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; + // Deduplication map for patchable boot image addresses. + Uint32ToLiteralMap boot_image_address_patches_; + + // PC-relative loads on R2 clobber RA, which may need to be preserved explicitly in leaf methods. + // This is a flag set by pc_relative_fixups_mips and dex_cache_array_fixups_mips optimizations. + bool clobbered_ra_; DISALLOW_COPY_AND_ASSIGN(CodeGeneratorMIPS); }; diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index 2e78884daf..4a5755c925 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -102,9 +102,9 @@ Location InvokeRuntimeCallingConvention::GetReturnLocation(Primitive::Type type) return Mips64ReturnLocation(type); } -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. 
-#define __ down_cast<CodeGeneratorMIPS64*>(codegen)->GetAssembler()-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMips64DoublewordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<CodeGeneratorMIPS64*>(codegen)->GetAssembler()-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, x).Int32Value() class BoundsCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { public: @@ -300,13 +300,11 @@ class SuspendCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, instruction_->GetLocations()); mips64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); - RestoreLiveRegisters(codegen, instruction_->GetLocations()); if (successor_ == nullptr) { __ Bc(GetReturnLabel()); } else { @@ -362,7 +360,7 @@ class TypeCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { dex_pc, this); CheckEntrypointTypes< - kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>(); + kQuickInstanceofNonTrivial, size_t, const mirror::Class*, const mirror::Class*>(); Primitive::Type ret_type = instruction_->GetType(); Location ret_loc = calling_convention.GetReturnLocation(ret_type); mips64_codegen->MoveLocation(locations->Out(), ret_loc, ret_type); @@ -429,9 +427,9 @@ CodeGeneratorMIPS64::CodeGeneratorMIPS64(HGraph* graph, } #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<Mips64Assembler*>(GetAssembler())-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMips64DoublewordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<Mips64Assembler*>(GetAssembler())-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, x).Int32Value() void CodeGeneratorMIPS64::Finalize(CodeAllocator* allocator) { // Ensure that we fix up branches. 
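For readers unfamiliar with the "__" shorthand these NOLINT comments keep being attached to: the macro deliberately expands to an expression ending in "->", so clang-tidy's misc-macro-parentheses fix-it (wrapping the replacement text in parentheses) would produce code that does not compile, hence the suppression. Below is a toy, self-contained illustration of the pattern; the class and function names are invented and this is not ART code.

// Toy illustration of the codegen "__" macro idiom; names are invented.
#include <iostream>

class ToyAssembler {
 public:
  void Addu(const char* rd, const char* rs, const char* rt) {
    std::cout << "addu " << rd << ", " << rs << ", " << rt << "\n";
  }
};

class ToyCodeGenerator {
 public:
  ToyAssembler* GetAssembler() { return &assembler_; }

 private:
  ToyAssembler assembler_;
};

// Ends in "->" on purpose: "__ Addu(...)" expands to
// "codegen->GetAssembler()->Addu(...)". Parenthesizing the body, as the
// misc-macro-parentheses fix-it would do, cannot work here.
#define __ codegen->GetAssembler()->  // NOLINT

void EmitExample(ToyCodeGenerator* codegen) {
  __ Addu("v0", "a0", "a1");  // emits via the shared assembler
}

#undef __

int main() {
  ToyCodeGenerator codegen;
  EmitExample(&codegen);
  return 0;
}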
@@ -888,7 +886,7 @@ void CodeGeneratorMIPS64::MarkGCCard(GpuRegister object, __ LoadFromOffset(kLoadDoubleword, card, TR, - Thread::CardTableOffset<kMips64DoublewordSize>().Int32Value()); + Thread::CardTableOffset<kMips64PointerSize>().Int32Value()); __ Dsrl(temp, object, gc::accounting::CardTable::kCardShift); __ Daddu(temp, card, temp); __ Sb(card, temp, 0); @@ -964,7 +962,7 @@ void CodeGeneratorMIPS64::InvokeRuntime(QuickEntrypointEnum entrypoint, HInstruction* instruction, uint32_t dex_pc, SlowPathCode* slow_path) { - InvokeRuntime(GetThreadOffset<kMips64DoublewordSize>(entrypoint).Int32Value(), + InvokeRuntime(GetThreadOffset<kMips64PointerSize>(entrypoint).Int32Value(), instruction, dex_pc, slow_path); @@ -1004,7 +1002,7 @@ void InstructionCodeGeneratorMIPS64::GenerateSuspendCheck(HSuspendCheck* instruc __ LoadFromOffset(kLoadUnsignedHalfword, TMP, TR, - Thread::ThreadFlagsOffset<kMips64DoublewordSize>().Int32Value()); + Thread::ThreadFlagsOffset<kMips64PointerSize>().Int32Value()); if (successor == nullptr) { __ Bnezc(TMP, slow_path->GetEntryLabel()); __ Bind(slow_path->GetReturnLabel()); @@ -1436,7 +1434,7 @@ void LocationsBuilderMIPS64::VisitArraySet(HArraySet* instruction) { bool needs_runtime_call = instruction->NeedsTypeCheck(); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( instruction, - needs_runtime_call ? LocationSummary::kCall : LocationSummary::kNoCall); + needs_runtime_call ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall); if (needs_runtime_call) { InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -2932,11 +2930,9 @@ void LocationsBuilderMIPS64::VisitInvokeInterface(HInvokeInterface* invoke) { void InstructionCodeGeneratorMIPS64::VisitInvokeInterface(HInvokeInterface* invoke) { // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError. GpuRegister temp = invoke->GetLocations()->GetTemp(0).AsRegister<GpuRegister>(); - uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset( - invoke->GetImtIndex() % mirror::Class::kImtSize, kMips64PointerSize).Uint32Value(); Location receiver = invoke->GetLocations()->InAt(0); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); - Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMips64DoublewordSize); + Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMips64PointerSize); // Set the hidden argument. 
__ LoadConst32(invoke->GetLocations()->GetTemp(1).AsRegister<GpuRegister>(), @@ -2950,6 +2946,10 @@ void InstructionCodeGeneratorMIPS64::VisitInvokeInterface(HInvokeInterface* invo __ LoadFromOffset(kLoadUnsignedWord, temp, receiver.AsRegister<GpuRegister>(), class_offset); } codegen_->MaybeRecordImplicitNullCheck(invoke); + __ LoadFromOffset(kLoadDoubleword, temp, temp, + mirror::Class::ImtPtrOffset(kMips64PointerSize).Uint32Value()); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + invoke->GetImtIndex(), kMips64PointerSize)); // temp = temp->GetImtEntryAt(method_offset); __ LoadFromOffset(kLoadDoubleword, temp, temp, method_offset); // T9 = temp->GetEntryPoint(); @@ -3113,7 +3113,7 @@ void CodeGeneratorMIPS64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invo T9, callee_method.AsRegister<GpuRegister>(), ArtMethod::EntryPointFromQuickCompiledCodeOffset( - kMips64DoublewordSize).Int32Value()); + kMips64PointerSize).Int32Value()); // T9() __ Jalr(T9); __ Nop(); @@ -3151,7 +3151,7 @@ void CodeGeneratorMIPS64::GenerateVirtualCall(HInvokeVirtual* invoke, Location t size_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( invoke->GetVTableIndex(), kMips64PointerSize).SizeValue(); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); - Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMips64DoublewordSize); + Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMips64PointerSize); // temp = object->GetClass(); __ LoadFromOffset(kLoadUnsignedWord, temp, receiver, class_offset); @@ -3229,7 +3229,7 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) { } static int32_t GetExceptionTlsOffset() { - return Thread::ExceptionOffset<kMips64DoublewordSize>().Int32Value(); + return Thread::ExceptionOffset<kMips64PointerSize>().Int32Value(); } void LocationsBuilderMIPS64::VisitLoadException(HLoadException* load) { @@ -3261,22 +3261,11 @@ void LocationsBuilderMIPS64::VisitLoadString(HLoadString* load) { } void InstructionCodeGeneratorMIPS64::VisitLoadString(HLoadString* load) { - LocationSummary* locations = load->GetLocations(); - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); - GpuRegister current_method = locations->InAt(0).AsRegister<GpuRegister>(); - __ LoadFromOffset(kLoadUnsignedWord, out, current_method, - ArtMethod::DeclaringClassOffset().Int32Value()); - __ LoadFromOffset(kLoadDoubleword, out, out, mirror::Class::DexCacheStringsOffset().Int32Value()); - __ LoadFromOffset( - kLoadUnsignedWord, out, out, CodeGenerator::GetCacheOffset(load->GetStringIndex())); - // TODO: We will need a read barrier here. - - if (!load->IsInDexCache()) { - SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS64(load); - codegen_->AddSlowPath(slow_path); - __ Beqzc(out, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); - } + // TODO: Re-add the compiler code to do string dex cache lookup again. 
+ SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS64(load); + codegen_->AddSlowPath(slow_path); + __ Bc(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); } void LocationsBuilderMIPS64::VisitLongConstant(HLongConstant* constant) { @@ -3290,7 +3279,7 @@ void InstructionCodeGeneratorMIPS64::VisitLongConstant(HLongConstant* constant A void LocationsBuilderMIPS64::VisitMonitorOperation(HMonitorOperation* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -3417,7 +3406,7 @@ void InstructionCodeGeneratorMIPS64::VisitNeg(HNeg* instruction) { void LocationsBuilderMIPS64::VisitNewArray(HNewArray* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimNot)); @@ -3438,7 +3427,7 @@ void InstructionCodeGeneratorMIPS64::VisitNewArray(HNewArray* instruction) { void LocationsBuilderMIPS64::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; if (instruction->IsStringAlloc()) { locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); @@ -3454,7 +3443,7 @@ void InstructionCodeGeneratorMIPS64::VisitNewInstance(HNewInstance* instruction) // String is allocated through StringFactory. Call NewEmptyString entry point. GpuRegister temp = instruction->GetLocations()->GetTemp(0).AsRegister<GpuRegister>(); MemberOffset code_offset = - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMips64DoublewordSize); + ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMips64PointerSize); __ LoadFromOffset(kLoadDoubleword, temp, TR, QUICK_ENTRY_POINT(pNewEmptyString)); __ LoadFromOffset(kLoadDoubleword, T9, temp, code_offset.Int32Value()); __ Jalr(T9); @@ -3598,7 +3587,8 @@ void InstructionCodeGeneratorMIPS64::VisitPhi(HPhi* instruction ATTRIBUTE_UNUSED void LocationsBuilderMIPS64::VisitRem(HRem* rem) { Primitive::Type type = rem->GetResultType(); LocationSummary::CallKind call_kind = - Primitive::IsFloatingPointType(type) ? LocationSummary::kCall : LocationSummary::kNoCall; + Primitive::IsFloatingPointType(type) ? 
LocationSummary::kCallOnMainOnly + : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(rem, call_kind); switch (type) { @@ -3811,7 +3801,7 @@ void InstructionCodeGeneratorMIPS64::VisitSuspendCheck(HSuspendCheck* instructio void LocationsBuilderMIPS64::VisitThrow(HThrow* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h index 4b462cc800..197f86b22b 100644 --- a/compiler/optimizing/code_generator_mips64.h +++ b/compiler/optimizing/code_generator_mips64.h @@ -18,7 +18,6 @@ #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_MIPS64_H_ #include "code_generator.h" -#include "dex/compiler_enums.h" #include "driver/compiler_options.h" #include "nodes.h" #include "parallel_move_resolver.h" diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 1261619536..7aca16f867 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -47,9 +47,9 @@ static constexpr int kC2ConditionMask = 0x400; static constexpr int kFakeReturnRegister = Register(8); -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<X86Assembler*>(codegen->GetAssembler())-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kX86WordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86Assembler*>(codegen->GetAssembler())-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kX86PointerSize, x).Int32Value() class NullCheckSlowPathX86 : public SlowPathCode { public: @@ -140,12 +140,29 @@ class BoundsCheckSlowPathX86 : public SlowPathCode { // Live registers will be restored in the catch block if caught. SaveLiveRegisters(codegen, instruction_->GetLocations()); } + + // Are we using an array length from memory? + HInstruction* array_length = instruction_->InputAt(1); + Location length_loc = locations->InAt(1); InvokeRuntimeCallingConvention calling_convention; + if (array_length->IsArrayLength() && array_length->IsEmittedAtUseSite()) { + // Load the array length into our temporary. + uint32_t len_offset = CodeGenerator::GetArrayLengthOffset(array_length->AsArrayLength()); + Location array_loc = array_length->GetLocations()->InAt(0); + Address array_len(array_loc.AsRegister<Register>(), len_offset); + length_loc = Location::RegisterLocation(calling_convention.GetRegisterAt(1)); + // Check for conflicts with index. + if (length_loc.Equals(locations->InAt(0))) { + // We know we aren't using parameter 2. 
+ length_loc = Location::RegisterLocation(calling_convention.GetRegisterAt(2)); + } + __ movl(length_loc.AsRegister<Register>(), array_len); + } x86_codegen->EmitParallelMoves( locations->InAt(0), Location::RegisterLocation(calling_convention.GetRegisterAt(0)), Primitive::kPrimInt, - locations->InAt(1), + length_loc, Location::RegisterLocation(calling_convention.GetRegisterAt(1)), Primitive::kPrimInt); uint32_t entry_point_offset = instruction_->AsBoundsCheck()->IsStringCharAt() @@ -175,13 +192,11 @@ class SuspendCheckSlowPathX86 : public SlowPathCode { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, instruction_->GetLocations()); x86_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); - RestoreLiveRegisters(codegen, instruction_->GetLocations()); if (successor_ == nullptr) { __ jmp(GetReturnLabel()); } else { @@ -332,7 +347,7 @@ class TypeCheckSlowPathX86 : public SlowPathCode { instruction_->GetDexPc(), this); CheckEntrypointTypes< - kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>(); + kQuickInstanceofNonTrivial, size_t, const mirror::Class*, const mirror::Class*>(); } else { DCHECK(instruction_->IsCheckCast()); x86_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast), @@ -430,8 +445,8 @@ class ArraySetSlowPathX86 : public SlowPathCode { // Slow path marking an object during a read barrier. class ReadBarrierMarkSlowPathX86 : public SlowPathCode { public: - ReadBarrierMarkSlowPathX86(HInstruction* instruction, Location out, Location obj) - : SlowPathCode(instruction), out_(out), obj_(obj) { + ReadBarrierMarkSlowPathX86(HInstruction* instruction, Location obj) + : SlowPathCode(instruction), obj_(obj) { DCHECK(kEmitCompilerReadBarrier); } @@ -439,9 +454,9 @@ class ReadBarrierMarkSlowPathX86 : public SlowPathCode { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); - Register reg_out = out_.AsRegister<Register>(); + Register reg = obj_.AsRegister<Register>(); DCHECK(locations->CanCall()); - DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg)); DCHECK(instruction_->IsInstanceFieldGet() || instruction_->IsStaticFieldGet() || instruction_->IsArrayGet() || @@ -449,30 +464,40 @@ class ReadBarrierMarkSlowPathX86 : public SlowPathCode { instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking slow path: " << instruction_->DebugName(); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, locations); - - InvokeRuntimeCallingConvention calling_convention; + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. 
CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen); - x86_codegen->Move32(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), obj_); - x86_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierMark), - instruction_, - instruction_->GetDexPc(), - this); - CheckEntrypointTypes<kQuickReadBarrierMark, mirror::Object*, mirror::Object*>(); - x86_codegen->Move32(out_, Location::RegisterLocation(EAX)); - - RestoreLiveRegisters(codegen, locations); + DCHECK_NE(reg, ESP); + DCHECK(0 <= reg && reg < kNumberOfCpuRegisters) << reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in EAX): + // + // EAX <- obj + // EAX <- ReadBarrierMark(EAX) + // obj <- EAX + // + // we just use rX (the register holding `obj`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(reg); + // This runtime call does not require a stack map. + x86_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); __ jmp(GetExitLabel()); } private: - const Location out_; const Location obj_; DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathX86); @@ -518,8 +543,7 @@ class ReadBarrierForHeapReferenceSlowPathX86 : public SlowPathCode { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); @@ -706,8 +730,8 @@ class ReadBarrierForRootSlowPathX86 : public SlowPathCode { }; #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<X86Assembler*>(GetAssembler())-> /* NOLINT */ +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
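[Aside: the ReadBarrierMarkSlowPathX86 hunk above replaces the generic pReadBarrierMark call with a per-register marking entrypoint. A minimal sketch of that idea follows, with invented names (ThreadModel, read_barrier_mark_reg); in the real code the offset comes from CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(reg).]

    #include <cstddef>

    // Illustration only, not ART code. The thread is assumed to expose one marking
    // entrypoint per core register, laid out consecutively, so the register holding
    // `obj` is both input and output and no moves through EAX are required.
    using ReadBarrierMarkFn = void* (*)(void* obj);

    struct ThreadModel {                           // invented stand-in for art::Thread
      ReadBarrierMarkFn read_barrier_mark_reg[8];  // one entry per x86 core register
    };

    // The generated call goes through base_offset + reg * pointer_size, without
    // saving live registers and without recording a stack map.
    constexpr size_t MarkEntryPointOffset(size_t base_offset, size_t reg, size_t pointer_size) {
      return base_offset + reg * pointer_size;
    }
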
+#define __ down_cast<X86Assembler*>(GetAssembler())-> // NOLINT inline Condition X86Condition(IfCondition cond) { switch (cond) { @@ -778,7 +802,7 @@ void CodeGeneratorX86::InvokeRuntime(QuickEntrypointEnum entrypoint, HInstruction* instruction, uint32_t dex_pc, SlowPathCode* slow_path) { - InvokeRuntime(GetThreadOffset<kX86WordSize>(entrypoint).Int32Value(), + InvokeRuntime(GetThreadOffset<kX86PointerSize>(entrypoint).Int32Value(), instruction, dex_pc, slow_path); @@ -793,6 +817,13 @@ void CodeGeneratorX86::InvokeRuntime(int32_t entry_point_offset, RecordPcInfo(instruction, dex_pc, slow_path); } +void CodeGeneratorX86::InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path) { + ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction, slow_path); + __ fs()->call(Address::Absolute(entry_point_offset)); +} + CodeGeneratorX86::CodeGeneratorX86(HGraph* graph, const X86InstructionSetFeatures& isa_features, const CompilerOptions& compiler_options, @@ -1548,15 +1579,15 @@ void LocationsBuilderX86::VisitSelect(HSelect* select) { locations->SetOut(Location::SameAsFirstInput()); } -void InstructionCodeGeneratorX86::GenerateIntCompare(Location lhs, Location rhs) { +void CodeGeneratorX86::GenerateIntCompare(Location lhs, Location rhs) { Register lhs_reg = lhs.AsRegister<Register>(); if (rhs.IsConstant()) { int32_t value = CodeGenerator::GetInt32ValueOf(rhs.GetConstant()); - codegen_->Compare32BitValue(lhs_reg, value); + Compare32BitValue(lhs_reg, value); } else if (rhs.IsStackSlot()) { - __ cmpl(lhs_reg, Address(ESP, rhs.GetStackIndex())); + assembler_.cmpl(lhs_reg, Address(ESP, rhs.GetStackIndex())); } else { - __ cmpl(lhs_reg, rhs.AsRegister<Register>()); + assembler_.cmpl(lhs_reg, rhs.AsRegister<Register>()); } } @@ -1589,7 +1620,7 @@ void InstructionCodeGeneratorX86::VisitSelect(HSelect* select) { DCHECK_NE(condition->InputAt(0)->GetType(), Primitive::kPrimLong); DCHECK(!Primitive::IsFloatingPointType(condition->InputAt(0)->GetType())); LocationSummary* cond_locations = condition->GetLocations(); - GenerateIntCompare(cond_locations->InAt(0), cond_locations->InAt(1)); + codegen_->GenerateIntCompare(cond_locations->InAt(0), cond_locations->InAt(1)); cond = X86Condition(condition->GetCondition()); } } else { @@ -1698,7 +1729,7 @@ void InstructionCodeGeneratorX86::HandleCondition(HCondition* cond) { // Clear output register: setb only sets the low byte. __ xorl(reg, reg); - GenerateIntCompare(lhs, rhs); + codegen_->GenerateIntCompare(lhs, rhs); __ setb(X86Condition(cond->GetCondition()), reg); return; } @@ -2027,8 +2058,6 @@ void InstructionCodeGeneratorX86::VisitInvokeInterface(HInvokeInterface* invoke) LocationSummary* locations = invoke->GetLocations(); Register temp = locations->GetTemp(0).AsRegister<Register>(); XmmRegister hidden_reg = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); - uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset( - invoke->GetImtIndex() % mirror::Class::kImtSize, kX86PointerSize).Uint32Value(); Location receiver = locations->InAt(0); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); @@ -2055,11 +2084,16 @@ void InstructionCodeGeneratorX86::VisitInvokeInterface(HInvokeInterface* invoke) // intact/accessible until the end of the marking phase (the // concurrent copying collector may not in the future). 
__ MaybeUnpoisonHeapReference(temp); + // temp = temp->GetAddressOfIMT() + __ movl(temp, + Address(temp, mirror::Class::ImtPtrOffset(kX86PointerSize).Uint32Value())); // temp = temp->GetImtEntryAt(method_offset); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + invoke->GetImtIndex(), kX86PointerSize)); __ movl(temp, Address(temp, method_offset)); // call temp->GetEntryPoint(); __ call(Address(temp, - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value())); + ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86PointerSize).Int32Value())); DCHECK(!codegen_->IsLeafMethod()); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -2182,7 +2216,7 @@ void LocationsBuilderX86::VisitTypeConversion(HTypeConversion* conversion) { LocationSummary::CallKind call_kind = ((input_type == Primitive::kPrimFloat || input_type == Primitive::kPrimDouble) && result_type == Primitive::kPrimLong) - ? LocationSummary::kCall + ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(conversion, call_kind); @@ -3437,7 +3471,7 @@ void InstructionCodeGeneratorX86::GenerateDivRemIntegral(HBinaryOperation* instr void LocationsBuilderX86::VisitDiv(HDiv* div) { LocationSummary::CallKind call_kind = (div->GetResultType() == Primitive::kPrimLong) - ? LocationSummary::kCall + ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(div, call_kind); @@ -3540,7 +3574,7 @@ void LocationsBuilderX86::VisitRem(HRem* rem) { Primitive::Type type = rem->GetResultType(); LocationSummary::CallKind call_kind = (rem->GetResultType() == Primitive::kPrimLong) - ? LocationSummary::kCall + ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(rem, call_kind); @@ -3982,7 +4016,7 @@ void InstructionCodeGeneratorX86::VisitUShr(HUShr* ushr) { void LocationsBuilderX86::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); locations->SetOut(Location::RegisterLocation(EAX)); if (instruction->IsStringAlloc()) { locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); @@ -3999,7 +4033,7 @@ void InstructionCodeGeneratorX86::VisitNewInstance(HNewInstance* instruction) { if (instruction->IsStringAlloc()) { // String is allocated through StringFactory. Call NewEmptyString entry point. 
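[Aside: the VisitInvokeInterface hunk above swaps the embedded IMT entry load (and the `% kImtSize` masking) for an extra indirection through the class's ImTable pointer. A hedged sketch of the resulting two-load lookup, using invented stub types:]

    #include <cstddef>

    // Illustration only, not ART code; all names here are invented stand-ins.
    struct ArtMethodStub {};
    struct ImTableStub {
      ArtMethodStub* entries_[64];               // table size is an assumption
    };
    struct ClassStub {
      // The IMT entries used to be embedded in the class object itself; after this
      // change the class only holds a pointer to a (possibly shared) ImTable.
      ImTableStub* imt_;
    };

    ArtMethodStub* LookupImtEntry(ClassStub* klass, size_t imt_index) {
      ImTableStub* imt = klass->imt_;            // temp = temp->GetAddressOfIMT()
      return imt->entries_[imt_index];           // temp = temp->GetImtEntryAt(method_offset)
    }
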
Register temp = instruction->GetLocations()->GetTemp(0).AsRegister<Register>(); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize); + MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86PointerSize); __ fs()->movl(temp, Address::Absolute(QUICK_ENTRY_POINT(pNewEmptyString))); __ call(Address(temp, code_offset.Int32Value())); codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); @@ -4015,7 +4049,7 @@ void InstructionCodeGeneratorX86::VisitNewInstance(HNewInstance* instruction) { void LocationsBuilderX86::VisitNewArray(HNewArray* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); locations->SetOut(Location::RegisterLocation(EAX)); InvokeRuntimeCallingConvention calling_convention; locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -4070,16 +4104,21 @@ void LocationsBuilderX86::VisitClassTableGet(HClassTableGet* instruction) { void InstructionCodeGeneratorX86::VisitClassTableGet(HClassTableGet* instruction) { LocationSummary* locations = instruction->GetLocations(); - uint32_t method_offset = 0; if (instruction->GetTableKind() == HClassTableGet::TableKind::kVTable) { - method_offset = mirror::Class::EmbeddedVTableEntryOffset( + uint32_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( instruction->GetIndex(), kX86PointerSize).SizeValue(); + __ movl(locations->Out().AsRegister<Register>(), + Address(locations->InAt(0).AsRegister<Register>(), method_offset)); } else { - method_offset = mirror::Class::EmbeddedImTableEntryOffset( - instruction->GetIndex() % mirror::Class::kImtSize, kX86PointerSize).Uint32Value(); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + instruction->GetIndex(), kX86PointerSize)); + __ movl(locations->Out().AsRegister<Register>(), + Address(locations->InAt(0).AsRegister<Register>(), + mirror::Class::ImtPtrOffset(kX86PointerSize).Uint32Value())); + // temp = temp->GetImtEntryAt(method_offset); + __ movl(locations->Out().AsRegister<Register>(), + Address(locations->Out().AsRegister<Register>(), method_offset)); } - __ movl(locations->Out().AsRegister<Register>(), - Address(locations->InAt(0).AsRegister<Register>(), method_offset)); } void LocationsBuilderX86::VisitNot(HNot* not_) { @@ -4172,7 +4211,7 @@ void InstructionCodeGeneratorX86::VisitCompare(HCompare* compare) { case Primitive::kPrimShort: case Primitive::kPrimChar: case Primitive::kPrimInt: { - GenerateIntCompare(left, right); + codegen_->GenerateIntCompare(left, right); break; } case Primitive::kPrimLong: { @@ -4411,7 +4450,7 @@ void CodeGeneratorX86::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, // (callee_method + offset_of_quick_compiled_code)() __ call(Address(callee_method.AsRegister<Register>(), ArtMethod::EntryPointFromQuickCompiledCodeOffset( - kX86WordSize).Int32Value())); + kX86PointerSize).Int32Value())); break; } @@ -4445,7 +4484,7 @@ void CodeGeneratorX86::GenerateVirtualCall(HInvokeVirtual* invoke, Location temp __ movl(temp, Address(temp, method_offset)); // call temp->GetEntryPoint(); __ call(Address( - temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value())); + temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86PointerSize).Int32Value())); } void CodeGeneratorX86::RecordSimplePatch() { @@ -4549,7 +4588,7 @@ void 
CodeGeneratorX86::MarkGCCard(Register temp, __ testl(value, value); __ j(kEqual, &is_null); } - __ fs()->movl(card, Address::Absolute(Thread::CardTableOffset<kX86WordSize>().Int32Value())); + __ fs()->movl(card, Address::Absolute(Thread::CardTableOffset<kX86PointerSize>().Int32Value())); __ movl(temp, object); __ shrl(temp, Immediate(gc::accounting::CardTable::kCardShift)); __ movb(Address(temp, card, TIMES_1, 0), @@ -5510,10 +5549,16 @@ void InstructionCodeGeneratorX86::VisitArraySet(HArraySet* instruction) { void LocationsBuilderX86::VisitArrayLength(HArrayLength* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); locations->SetInAt(0, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + if (!instruction->IsEmittedAtUseSite()) { + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + } } void InstructionCodeGeneratorX86::VisitArrayLength(HArrayLength* instruction) { + if (instruction->IsEmittedAtUseSite()) { + return; + } + LocationSummary* locations = instruction->GetLocations(); uint32_t offset = CodeGenerator::GetArrayLengthOffset(instruction); Register obj = locations->InAt(0).AsRegister<Register>(); @@ -5528,7 +5573,10 @@ void LocationsBuilderX86::VisitBoundsCheck(HBoundsCheck* instruction) { : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind); locations->SetInAt(0, Location::RegisterOrConstant(instruction->InputAt(0))); - locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + HInstruction* length = instruction->InputAt(1); + if (!length->IsEmittedAtUseSite()) { + locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + } if (instruction->HasUses()) { locations->SetOut(Location::SameAsFirstInput()); } @@ -5562,12 +5610,28 @@ void InstructionCodeGeneratorX86::VisitBoundsCheck(HBoundsCheck* instruction) { codegen_->AddSlowPath(slow_path); __ j(kAboveEqual, slow_path->GetEntryLabel()); } else { - Register length = length_loc.AsRegister<Register>(); - if (index_loc.IsConstant()) { - int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant()); - __ cmpl(length, Immediate(value)); + HInstruction* array_length = instruction->InputAt(1); + if (array_length->IsEmittedAtUseSite()) { + // Address the length field in the array. 
+ DCHECK(array_length->IsArrayLength()); + uint32_t len_offset = CodeGenerator::GetArrayLengthOffset(array_length->AsArrayLength()); + Location array_loc = array_length->GetLocations()->InAt(0); + Address array_len(array_loc.AsRegister<Register>(), len_offset); + if (index_loc.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant()); + __ cmpl(array_len, Immediate(value)); + } else { + __ cmpl(array_len, index_loc.AsRegister<Register>()); + } + codegen_->MaybeRecordImplicitNullCheck(array_length); } else { - __ cmpl(length, index_loc.AsRegister<Register>()); + Register length = length_loc.AsRegister<Register>(); + if (index_loc.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant()); + __ cmpl(length, Immediate(value)); + } else { + __ cmpl(length, index_loc.AsRegister<Register>()); + } } codegen_->AddSlowPath(slow_path); __ j(kBelowEqual, slow_path->GetEntryLabel()); @@ -5616,7 +5680,7 @@ void InstructionCodeGeneratorX86::GenerateSuspendCheck(HSuspendCheck* instructio DCHECK_EQ(slow_path->GetSuccessor(), successor); } - __ fs()->cmpw(Address::Absolute(Thread::ThreadFlagsOffset<kX86WordSize>().Int32Value()), + __ fs()->cmpw(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>().Int32Value()), Immediate(0)); if (successor == nullptr) { __ j(kNotEqual, slow_path->GetEntryLabel()); @@ -6167,52 +6231,19 @@ void InstructionCodeGeneratorX86::VisitLoadString(HLoadString* load) { codegen_->RecordSimplePatch(); return; // No dex cache slow path. } - case HLoadString::LoadKind::kDexCacheAddress: { - DCHECK_NE(load->GetAddress(), 0u); - uint32_t address = dchecked_integral_cast<uint32_t>(load->GetAddress()); - // /* GcRoot<mirror::String> */ out = *address - GenerateGcRootFieldLoad(load, out_loc, Address::Absolute(address)); - break; - } - case HLoadString::LoadKind::kDexCachePcRelative: { - Register base_reg = locations->InAt(0).AsRegister<Register>(); - uint32_t offset = load->GetDexCacheElementOffset(); - Label* fixup_label = codegen_->NewPcRelativeDexCacheArrayPatch(load->GetDexFile(), offset); - // /* GcRoot<mirror::String> */ out = *(base + offset) /* PC-relative */ - GenerateGcRootFieldLoad( - load, out_loc, Address(base_reg, CodeGeneratorX86::kDummy32BitOffset), fixup_label); - break; - } - case HLoadString::LoadKind::kDexCacheViaMethod: { - Register current_method = locations->InAt(0).AsRegister<Register>(); - - // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ - GenerateGcRootFieldLoad( - load, out_loc, Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value())); - - // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_ - __ movl(out, Address(out, mirror::Class::DexCacheStringsOffset().Int32Value())); - // /* GcRoot<mirror::String> */ out = out[string_index] - GenerateGcRootFieldLoad( - load, out_loc, Address(out, CodeGenerator::GetCacheOffset(load->GetStringIndex()))); - break; - } default: - LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind(); - UNREACHABLE(); + break; } - if (!load->IsInDexCache()) { - SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86(load); - codegen_->AddSlowPath(slow_path); - __ testl(out, out); - __ j(kEqual, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); - } + // TODO: Re-add the compiler code to do string dex cache lookup again. 
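[Aside on the VisitBoundsCheck changes above: when the HArrayLength is emitted at its use site, the length is compared straight from memory against the index, and that memory access also serves as the implicit null check. A small sketch of the predicate the kBelowEqual branch implements, in plain C++ rather than ART code:]

    #include <cstdint>

    // cmpl(array_len, index) followed by j(kBelowEqual) branches to the slow path
    // when length <= index as an unsigned comparison, i.e. exactly when the index is
    // out of bounds; a negative index becomes a huge unsigned value and is caught too.
    bool IndexOutOfBounds(int32_t index, int32_t length) {
      return static_cast<uint32_t>(index) >= static_cast<uint32_t>(length);
    }
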
+ SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86(load); + codegen_->AddSlowPath(slow_path); + __ jmp(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); } static Address GetExceptionTlsAddress() { - return Address::Absolute(Thread::ExceptionOffset<kX86WordSize>().Int32Value()); + return Address::Absolute(Thread::ExceptionOffset<kX86PointerSize>().Int32Value()); } void LocationsBuilderX86::VisitLoadException(HLoadException* load) { @@ -6235,7 +6266,7 @@ void InstructionCodeGeneratorX86::VisitClearException(HClearException* clear ATT void LocationsBuilderX86::VisitThrow(HThrow* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -6687,7 +6718,7 @@ void InstructionCodeGeneratorX86::VisitCheckCast(HCheckCast* instruction) { void LocationsBuilderX86::VisitMonitorOperation(HMonitorOperation* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -6926,10 +6957,10 @@ void InstructionCodeGeneratorX86::GenerateGcRootFieldLoad(HInstruction* instruct // Slow path used to mark the GC root `root`. SlowPathCode* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, root, root); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, root); codegen_->AddSlowPath(slow_path); - __ fs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86WordSize>().Int32Value()), + __ fs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86PointerSize>().Int32Value()), Immediate(0)); __ j(kNotEqual, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); @@ -7036,12 +7067,6 @@ void CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // /* LockWord */ lock_word = LockWord(monitor) static_assert(sizeof(LockWord) == sizeof(int32_t), "art::LockWord and int32_t have different sizes."); - // /* uint32_t */ rb_state = lock_word.ReadBarrierState() - __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift)); - __ andl(temp_reg, Immediate(LockWord::kReadBarrierStateMask)); - static_assert( - LockWord::kReadBarrierStateMask == ReadBarrier::rb_ptr_mask_, - "art::LockWord::kReadBarrierStateMask is not equal to art::ReadBarrier::rb_ptr_mask_."); // Load fence to prevent load-load reordering. // Note that this is a no-op, thanks to the x86 memory model. @@ -7056,13 +7081,18 @@ void CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // Slow path used to mark the object `ref` when it is gray. SlowPathCode* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, ref, ref); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, ref); AddSlowPath(slow_path); // if (rb_state == ReadBarrier::gray_ptr_) // ref = ReadBarrier::Mark(ref); - __ cmpl(temp_reg, Immediate(ReadBarrier::gray_ptr_)); - __ j(kEqual, slow_path->GetEntryLabel()); + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. 
We do that by shifting the bit out of the lock word with SHR. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift + 1)); + __ j(kCarrySet, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index 2a9fb80995..894f2e8f40 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -18,8 +18,8 @@ #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_H_ #include "arch/x86/instruction_set_features_x86.h" +#include "base/enums.h" #include "code_generator.h" -#include "dex/compiler_enums.h" #include "driver/compiler_options.h" #include "nodes.h" #include "parallel_move_resolver.h" @@ -29,7 +29,7 @@ namespace art { namespace x86 { // Use a local definition to prevent copying mistakes. -static constexpr size_t kX86WordSize = kX86PointerSize; +static constexpr size_t kX86WordSize = static_cast<size_t>(kX86PointerSize); class CodeGeneratorX86; @@ -295,7 +295,6 @@ class InstructionCodeGeneratorX86 : public InstructionCodeGenerator { HBasicBlock* default_block); void GenerateFPCompare(Location lhs, Location rhs, HInstruction* insn, bool is_double); - void GenerateIntCompare(Location lhs, Location rhs); X86Assembler* const assembler_; CodeGeneratorX86* const codegen_; @@ -336,6 +335,12 @@ class CodeGeneratorX86 : public CodeGenerator { uint32_t dex_pc, SlowPathCode* slow_path); + // Generate code to invoke a runtime entry point, but do not record + // PC-related information in a stack map. + void InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path); + size_t GetWordSize() const OVERRIDE { return kX86WordSize; } @@ -425,6 +430,8 @@ class CodeGeneratorX86 : public CodeGenerator { Register value, bool value_can_be_null); + void GenerateIntCompare(Location lhs, Location rhs); + void GenerateMemoryBarrier(MemBarrierKind kind); Label* GetLabelOf(HBasicBlock* block) const { diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index 5e30203b38..0c55ae44de 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -51,9 +51,9 @@ static constexpr FloatRegister kFpuCalleeSaves[] = { XMM12, XMM13, XMM14, XMM15 static constexpr int kC2ConditionMask = 0x400; -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
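[Aside on the Baker read barrier fast path above: with white/gray/black pinned to 0/1/2 by the static_asserts, "is gray" is just the low bit of the read barrier state in the lock word, which is what the single SHR plus j(kCarrySet) tests. A portable sketch of the same predicate; the shift amount below is an assumed value for illustration:]

    #include <cstdint>

    constexpr uint32_t kAssumedReadBarrierStateShift = 28;  // illustration only

    // SHR by (shift + 1) pushes bit `shift` out last, so it ends up in the x86 carry
    // flag and j(kCarrySet) branches to the marking slow path. In C++ the equivalent
    // predicate is a plain bit test on the lock word:
    bool IsGray(uint32_t lock_word) {
      return ((lock_word >> kAssumedReadBarrierStateShift) & 1u) != 0;
    }
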
+#define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kX86_64PointerSize, x).Int32Value() class NullCheckSlowPathX86_64 : public SlowPathCode { public: @@ -149,13 +149,11 @@ class SuspendCheckSlowPathX86_64 : public SlowPathCode { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, instruction_->GetLocations()); x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); - RestoreLiveRegisters(codegen, instruction_->GetLocations()); if (successor_ == nullptr) { __ jmp(GetReturnLabel()); } else { @@ -194,14 +192,31 @@ class BoundsCheckSlowPathX86_64 : public SlowPathCode { // Live registers will be restored in the catch block if caught. SaveLiveRegisters(codegen, instruction_->GetLocations()); } + // Are we using an array length from memory? + HInstruction* array_length = instruction_->InputAt(1); + Location length_loc = locations->InAt(1); + InvokeRuntimeCallingConvention calling_convention; + if (array_length->IsArrayLength() && array_length->IsEmittedAtUseSite()) { + // Load the array length into our temporary. + uint32_t len_offset = CodeGenerator::GetArrayLengthOffset(array_length->AsArrayLength()); + Location array_loc = array_length->GetLocations()->InAt(0); + Address array_len(array_loc.AsRegister<CpuRegister>(), len_offset); + length_loc = Location::RegisterLocation(calling_convention.GetRegisterAt(1)); + // Check for conflicts with index. + if (length_loc.Equals(locations->InAt(0))) { + // We know we aren't using parameter 2. + length_loc = Location::RegisterLocation(calling_convention.GetRegisterAt(2)); + } + __ movl(length_loc.AsRegister<CpuRegister>(), array_len); + } + // We're moving two locations to locations that could overlap, so we need a parallel // move resolver. - InvokeRuntimeCallingConvention calling_convention; codegen->EmitParallelMoves( locations->InAt(0), Location::RegisterLocation(calling_convention.GetRegisterAt(0)), Primitive::kPrimInt, - locations->InAt(1), + length_loc, Location::RegisterLocation(calling_convention.GetRegisterAt(1)), Primitive::kPrimInt); uint32_t entry_point_offset = instruction_->AsBoundsCheck()->IsStringCharAt() @@ -352,7 +367,7 @@ class TypeCheckSlowPathX86_64 : public SlowPathCode { dex_pc, this); CheckEntrypointTypes< - kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>(); + kQuickInstanceofNonTrivial, size_t, const mirror::Class*, const mirror::Class*>(); } else { DCHECK(instruction_->IsCheckCast()); x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast), @@ -451,8 +466,8 @@ class ArraySetSlowPathX86_64 : public SlowPathCode { // Slow path marking an object during a read barrier. 
class ReadBarrierMarkSlowPathX86_64 : public SlowPathCode { public: - ReadBarrierMarkSlowPathX86_64(HInstruction* instruction, Location out, Location obj) - : SlowPathCode(instruction), out_(out), obj_(obj) { + ReadBarrierMarkSlowPathX86_64(HInstruction* instruction, Location obj) + : SlowPathCode(instruction), obj_(obj) { DCHECK(kEmitCompilerReadBarrier); } @@ -460,9 +475,9 @@ class ReadBarrierMarkSlowPathX86_64 : public SlowPathCode { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); - Register reg_out = out_.AsRegister<Register>(); + Register reg = obj_.AsRegister<Register>(); DCHECK(locations->CanCall()); - DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg)); DCHECK(instruction_->IsInstanceFieldGet() || instruction_->IsStaticFieldGet() || instruction_->IsArrayGet() || @@ -470,30 +485,40 @@ class ReadBarrierMarkSlowPathX86_64 : public SlowPathCode { instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking slow path: " << instruction_->DebugName(); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, locations); - - InvokeRuntimeCallingConvention calling_convention; + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen); - x86_64_codegen->Move(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), obj_); - x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierMark), - instruction_, - instruction_->GetDexPc(), - this); - CheckEntrypointTypes<kQuickReadBarrierMark, mirror::Object*, mirror::Object*>(); - x86_64_codegen->Move(out_, Location::RegisterLocation(RAX)); - - RestoreLiveRegisters(codegen, locations); + DCHECK_NE(reg, RSP); + DCHECK(0 <= reg && reg < kNumberOfCpuRegisters) << reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in R0): + // + // RDI <- obj + // RAX <- ReadBarrierMark(RDI) + // obj <- RAX + // + // we just use rX (the register holding `obj`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(reg); + // This runtime call does not require a stack map. 
+ x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); __ jmp(GetExitLabel()); } private: - const Location out_; const Location obj_; DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathX86_64); @@ -539,8 +564,7 @@ class ReadBarrierForHeapReferenceSlowPathX86_64 : public SlowPathCode { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); @@ -725,8 +749,8 @@ class ReadBarrierForRootSlowPathX86_64 : public SlowPathCode { }; #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<X86_64Assembler*>(GetAssembler())-> // NOLINT +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86_64Assembler*>(GetAssembler())-> // NOLINT inline Condition X86_64IntegerCondition(IfCondition cond) { switch (cond) { @@ -858,7 +882,7 @@ void CodeGeneratorX86_64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invo // (callee_method + offset_of_quick_compiled_code)() __ call(Address(callee_method.AsRegister<CpuRegister>(), ArtMethod::EntryPointFromQuickCompiledCodeOffset( - kX86_64WordSize).SizeValue())); + kX86_64PointerSize).SizeValue())); break; } @@ -893,7 +917,7 @@ void CodeGeneratorX86_64::GenerateVirtualCall(HInvokeVirtual* invoke, Location t __ movq(temp, Address(temp, method_offset)); // call temp->GetEntryPoint(); __ call(Address(temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset( - kX86_64WordSize).SizeValue())); + kX86_64PointerSize).SizeValue())); } void CodeGeneratorX86_64::RecordSimplePatch() { @@ -1006,7 +1030,7 @@ void CodeGeneratorX86_64::InvokeRuntime(QuickEntrypointEnum entrypoint, HInstruction* instruction, uint32_t dex_pc, SlowPathCode* slow_path) { - InvokeRuntime(GetThreadOffset<kX86_64WordSize>(entrypoint).Int32Value(), + InvokeRuntime(GetThreadOffset<kX86_64PointerSize>(entrypoint).Int32Value(), instruction, dex_pc, slow_path); @@ -1021,6 +1045,13 @@ void CodeGeneratorX86_64::InvokeRuntime(int32_t entry_point_offset, RecordPcInfo(instruction, dex_pc, slow_path); } +void CodeGeneratorX86_64::InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path) { + ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction, slow_path); + __ gs()->call(Address::Absolute(entry_point_offset, /* no_rip */ true)); +} + static constexpr int kNumberOfCpuRegisterPairs = 0; // Use a fake return address register to mimic Quick. 
static constexpr Register kFakeReturnRegister = Register(kLastCpuRegister + 1); @@ -2257,8 +2288,6 @@ void InstructionCodeGeneratorX86_64::VisitInvokeInterface(HInvokeInterface* invo LocationSummary* locations = invoke->GetLocations(); CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>(); CpuRegister hidden_reg = locations->GetTemp(1).AsRegister<CpuRegister>(); - uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset( - invoke->GetImtIndex() % mirror::Class::kImtSize, kX86_64PointerSize).Uint32Value(); Location receiver = locations->InAt(0); size_t class_offset = mirror::Object::ClassOffset().SizeValue(); @@ -2284,11 +2313,17 @@ void InstructionCodeGeneratorX86_64::VisitInvokeInterface(HInvokeInterface* invo // intact/accessible until the end of the marking phase (the // concurrent copying collector may not in the future). __ MaybeUnpoisonHeapReference(temp); + // temp = temp->GetAddressOfIMT() + __ movq(temp, + Address(temp, mirror::Class::ImtPtrOffset(kX86_64PointerSize).Uint32Value())); + // temp = temp->GetImtEntryAt(method_offset); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + invoke->GetImtIndex(), kX86_64PointerSize)); // temp = temp->GetImtEntryAt(method_offset); __ movq(temp, Address(temp, method_offset)); // call temp->GetEntryPoint(); - __ call(Address(temp, - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86_64WordSize).SizeValue())); + __ call(Address( + temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86_64PointerSize).SizeValue())); DCHECK(!codegen_->IsLeafMethod()); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -3909,7 +3944,7 @@ void InstructionCodeGeneratorX86_64::VisitUShr(HUShr* ushr) { void LocationsBuilderX86_64::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; if (instruction->IsStringAlloc()) { locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); @@ -3926,7 +3961,7 @@ void InstructionCodeGeneratorX86_64::VisitNewInstance(HNewInstance* instruction) if (instruction->IsStringAlloc()) { // String is allocated through StringFactory. Call NewEmptyString entry point. 
CpuRegister temp = instruction->GetLocations()->GetTemp(0).AsRegister<CpuRegister>(); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86_64WordSize); + MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86_64PointerSize); __ gs()->movq(temp, Address::Absolute(QUICK_ENTRY_POINT(pNewEmptyString), /* no_rip */ true)); __ call(Address(temp, code_offset.SizeValue())); codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); @@ -3942,7 +3977,7 @@ void InstructionCodeGeneratorX86_64::VisitNewInstance(HNewInstance* instruction) void LocationsBuilderX86_64::VisitNewArray(HNewArray* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); locations->SetOut(Location::RegisterLocation(RAX)); @@ -4002,16 +4037,20 @@ void LocationsBuilderX86_64::VisitClassTableGet(HClassTableGet* instruction) { void InstructionCodeGeneratorX86_64::VisitClassTableGet(HClassTableGet* instruction) { LocationSummary* locations = instruction->GetLocations(); - uint32_t method_offset = 0; if (instruction->GetTableKind() == HClassTableGet::TableKind::kVTable) { - method_offset = mirror::Class::EmbeddedVTableEntryOffset( + uint32_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( instruction->GetIndex(), kX86_64PointerSize).SizeValue(); + __ movq(locations->Out().AsRegister<CpuRegister>(), + Address(locations->InAt(0).AsRegister<CpuRegister>(), method_offset)); } else { - method_offset = mirror::Class::EmbeddedImTableEntryOffset( - instruction->GetIndex() % mirror::Class::kImtSize, kX86_64PointerSize).Uint32Value(); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + instruction->GetIndex(), kX86_64PointerSize)); + __ movq(locations->Out().AsRegister<CpuRegister>(), + Address(locations->InAt(0).AsRegister<CpuRegister>(), + mirror::Class::ImtPtrOffset(kX86_64PointerSize).Uint32Value())); + __ movq(locations->Out().AsRegister<CpuRegister>(), + Address(locations->Out().AsRegister<CpuRegister>(), method_offset)); } - __ movq(locations->Out().AsRegister<CpuRegister>(), - Address(locations->InAt(0).AsRegister<CpuRegister>(), method_offset)); } void LocationsBuilderX86_64::VisitNot(HNot* not_) { @@ -4980,10 +5019,16 @@ void LocationsBuilderX86_64::VisitArrayLength(HArrayLength* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + if (!instruction->IsEmittedAtUseSite()) { + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + } } void InstructionCodeGeneratorX86_64::VisitArrayLength(HArrayLength* instruction) { + if (instruction->IsEmittedAtUseSite()) { + return; + } + LocationSummary* locations = instruction->GetLocations(); uint32_t offset = CodeGenerator::GetArrayLengthOffset(instruction); CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>(); @@ -4998,7 +5043,10 @@ void LocationsBuilderX86_64::VisitBoundsCheck(HBoundsCheck* instruction) { : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind); locations->SetInAt(0, 
Location::RegisterOrConstant(instruction->InputAt(0))); - locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + HInstruction* length = instruction->InputAt(1); + if (!length->IsEmittedAtUseSite()) { + locations->SetInAt(1, Location::RegisterOrConstant(length)); + } if (instruction->HasUses()) { locations->SetOut(Location::SameAsFirstInput()); } @@ -5008,8 +5056,7 @@ void InstructionCodeGeneratorX86_64::VisitBoundsCheck(HBoundsCheck* instruction) LocationSummary* locations = instruction->GetLocations(); Location index_loc = locations->InAt(0); Location length_loc = locations->InAt(1); - SlowPathCode* slow_path = - new (GetGraph()->GetArena()) BoundsCheckSlowPathX86_64(instruction); + SlowPathCode* slow_path = new (GetGraph()->GetArena()) BoundsCheckSlowPathX86_64(instruction); if (length_loc.IsConstant()) { int32_t length = CodeGenerator::GetInt32ValueOf(length_loc.GetConstant()); @@ -5032,12 +5079,28 @@ void InstructionCodeGeneratorX86_64::VisitBoundsCheck(HBoundsCheck* instruction) codegen_->AddSlowPath(slow_path); __ j(kAboveEqual, slow_path->GetEntryLabel()); } else { - CpuRegister length = length_loc.AsRegister<CpuRegister>(); - if (index_loc.IsConstant()) { - int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant()); - __ cmpl(length, Immediate(value)); + HInstruction* array_length = instruction->InputAt(1); + if (array_length->IsEmittedAtUseSite()) { + // Address the length field in the array. + DCHECK(array_length->IsArrayLength()); + uint32_t len_offset = CodeGenerator::GetArrayLengthOffset(array_length->AsArrayLength()); + Location array_loc = array_length->GetLocations()->InAt(0); + Address array_len(array_loc.AsRegister<CpuRegister>(), len_offset); + if (index_loc.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant()); + __ cmpl(array_len, Immediate(value)); + } else { + __ cmpl(array_len, index_loc.AsRegister<CpuRegister>()); + } + codegen_->MaybeRecordImplicitNullCheck(array_length); } else { - __ cmpl(length, index_loc.AsRegister<CpuRegister>()); + CpuRegister length = length_loc.AsRegister<CpuRegister>(); + if (index_loc.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant()); + __ cmpl(length, Immediate(value)); + } else { + __ cmpl(length, index_loc.AsRegister<CpuRegister>()); + } } codegen_->AddSlowPath(slow_path); __ j(kBelowEqual, slow_path->GetEntryLabel()); @@ -5054,7 +5117,7 @@ void CodeGeneratorX86_64::MarkGCCard(CpuRegister temp, __ testl(value, value); __ j(kEqual, &is_null); } - __ gs()->movq(card, Address::Absolute(Thread::CardTableOffset<kX86_64WordSize>().Int32Value(), + __ gs()->movq(card, Address::Absolute(Thread::CardTableOffset<kX86_64PointerSize>().Int32Value(), /* no_rip */ true)); __ movq(temp, object); __ shrq(temp, Immediate(gc::accounting::CardTable::kCardShift)); @@ -5106,7 +5169,7 @@ void InstructionCodeGeneratorX86_64::GenerateSuspendCheck(HSuspendCheck* instruc DCHECK_EQ(slow_path->GetSuccessor(), successor); } - __ gs()->cmpw(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64WordSize>().Int32Value(), + __ gs()->cmpw(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64PointerSize>().Int32Value(), /* no_rip */ true), Immediate(0)); if (successor == nullptr) { @@ -5573,57 +5636,19 @@ void InstructionCodeGeneratorX86_64::VisitLoadString(HLoadString* load) { codegen_->RecordSimplePatch(); return; // No dex cache slow path. 
} - case HLoadString::LoadKind::kDexCacheAddress: { - DCHECK_NE(load->GetAddress(), 0u); - // /* GcRoot<mirror::String> */ out = *address - if (IsUint<32>(load->GetAddress())) { - Address address = Address::Absolute(load->GetAddress(), /* no_rip */ true); - GenerateGcRootFieldLoad(load, out_loc, address); - } else { - // TODO: Consider using opcode A1, i.e. movl eax, moff32 (with 64-bit address). - __ movq(out, Immediate(load->GetAddress())); - GenerateGcRootFieldLoad(load, out_loc, Address(out, 0)); - } - break; - } - case HLoadString::LoadKind::kDexCachePcRelative: { - uint32_t offset = load->GetDexCacheElementOffset(); - Label* fixup_label = codegen_->NewPcRelativeDexCacheArrayPatch(load->GetDexFile(), offset); - Address address = Address::Absolute(CodeGeneratorX86_64::kDummy32BitOffset, - /* no_rip */ false); - // /* GcRoot<mirror::String> */ out = *address /* PC-relative */ - GenerateGcRootFieldLoad(load, out_loc, address, fixup_label); - break; - } - case HLoadString::LoadKind::kDexCacheViaMethod: { - CpuRegister current_method = locations->InAt(0).AsRegister<CpuRegister>(); - - // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ - GenerateGcRootFieldLoad( - load, out_loc, Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value())); - // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_ - __ movq(out, Address(out, mirror::Class::DexCacheStringsOffset().Uint32Value())); - // /* GcRoot<mirror::String> */ out = out[string_index] - GenerateGcRootFieldLoad( - load, out_loc, Address(out, CodeGenerator::GetCacheOffset(load->GetStringIndex()))); - break; - } default: - LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind(); - UNREACHABLE(); + break; } - if (!load->IsInDexCache()) { - SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86_64(load); - codegen_->AddSlowPath(slow_path); - __ testl(out, out); - __ j(kEqual, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); - } + // TODO: Re-add the compiler code to do string dex cache lookup again. 
+ SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86_64(load); + codegen_->AddSlowPath(slow_path); + __ jmp(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); } static Address GetExceptionTlsAddress() { - return Address::Absolute(Thread::ExceptionOffset<kX86_64WordSize>().Int32Value(), + return Address::Absolute(Thread::ExceptionOffset<kX86_64PointerSize>().Int32Value(), /* no_rip */ true); } @@ -5647,7 +5672,7 @@ void InstructionCodeGeneratorX86_64::VisitClearException(HClearException* clear void LocationsBuilderX86_64::VisitThrow(HThrow* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -6157,7 +6182,7 @@ void InstructionCodeGeneratorX86_64::VisitCheckCast(HCheckCast* instruction) { void LocationsBuilderX86_64::VisitMonitorOperation(HMonitorOperation* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -6378,10 +6403,10 @@ void InstructionCodeGeneratorX86_64::GenerateGcRootFieldLoad(HInstruction* instr // Slow path used to mark the GC root `root`. SlowPathCode* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, root, root); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, root); codegen_->AddSlowPath(slow_path); - __ gs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86_64WordSize>().Int32Value(), + __ gs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86_64PointerSize>().Int32Value(), /* no_rip */ true), Immediate(0)); __ j(kNotEqual, slow_path->GetEntryLabel()); @@ -6489,12 +6514,6 @@ void CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction // /* LockWord */ lock_word = LockWord(monitor) static_assert(sizeof(LockWord) == sizeof(int32_t), "art::LockWord and int32_t have different sizes."); - // /* uint32_t */ rb_state = lock_word.ReadBarrierState() - __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift)); - __ andl(temp_reg, Immediate(LockWord::kReadBarrierStateMask)); - static_assert( - LockWord::kReadBarrierStateMask == ReadBarrier::rb_ptr_mask_, - "art::LockWord::kReadBarrierStateMask is not equal to art::ReadBarrier::rb_ptr_mask_."); // Load fence to prevent load-load reordering. // Note that this is a no-op, thanks to the x86-64 memory model. @@ -6509,13 +6528,18 @@ void CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction // Slow path used to mark the object `ref` when it is gray. SlowPathCode* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, ref, ref); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, ref); AddSlowPath(slow_path); // if (rb_state == ReadBarrier::gray_ptr_) // ref = ReadBarrier::Mark(ref); - __ cmpl(temp_reg, Immediate(ReadBarrier::gray_ptr_)); - __ j(kEqual, slow_path->GetEntryLabel()); + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. 
We do that by shifting the bit out of the lock word with SHR. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift + 1)); + __ j(kCarrySet, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index d7cfd37c33..4e0e34ce38 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -19,7 +19,6 @@ #include "arch/x86_64/instruction_set_features_x86_64.h" #include "code_generator.h" -#include "dex/compiler_enums.h" #include "driver/compiler_options.h" #include "nodes.h" #include "parallel_move_resolver.h" @@ -29,7 +28,7 @@ namespace art { namespace x86_64 { // Use a local definition to prevent copying mistakes. -static constexpr size_t kX86_64WordSize = kX86_64PointerSize; +static constexpr size_t kX86_64WordSize = static_cast<size_t>(kX86_64PointerSize); // Some x86_64 instructions require a register to be available as temp. static constexpr Register TMP = R11; @@ -318,6 +317,12 @@ class CodeGeneratorX86_64 : public CodeGenerator { uint32_t dex_pc, SlowPathCode* slow_path); + // Generate code to invoke a runtime entry point, but do not record + // PC-related information in a stack map. + void InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path); + size_t GetWordSize() const OVERRIDE { return kX86_64WordSize; } diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc index 6be79fa75c..fe6c0a305e 100644 --- a/compiler/optimizing/codegen_test.cc +++ b/compiler/optimizing/codegen_test.cc @@ -29,12 +29,6 @@ #include "arch/x86_64/instruction_set_features_x86_64.h" #include "base/macros.h" #include "builder.h" -#include "code_generator_arm.h" -#include "code_generator_arm64.h" -#include "code_generator_mips.h" -#include "code_generator_mips64.h" -#include "code_generator_x86.h" -#include "code_generator_x86_64.h" #include "code_simulator_container.h" #include "common_compiler_test.h" #include "dex_file.h" @@ -44,7 +38,7 @@ #include "nodes.h" #include "optimizing_unit_test.h" #include "prepare_for_register_allocation.h" -#include "register_allocator.h" +#include "register_allocator_linear_scan.h" #include "ssa_liveness_analysis.h" #include "utils.h" #include "utils/arm/managed_register_arm.h" @@ -52,10 +46,35 @@ #include "utils/mips64/managed_register_mips64.h" #include "utils/x86/managed_register_x86.h" +#ifdef ART_ENABLE_CODEGEN_arm +#include "code_generator_arm.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_arm64 +#include "code_generator_arm64.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_x86 +#include "code_generator_x86.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_x86_64 +#include "code_generator_x86_64.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_mips +#include "code_generator_mips.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_mips64 +#include "code_generator_mips64.h" +#endif + #include "gtest/gtest.h" namespace art { +#ifdef ART_ENABLE_CODEGEN_arm // Provide our own codegen, that ensures the C calling conventions // are preserved. Currently, ART and C do not match as R4 is caller-save // in ART, and callee-save in C. 
Alternatively, we could use or write @@ -80,7 +99,9 @@ class TestCodeGeneratorARM : public arm::CodeGeneratorARM { blocked_register_pairs_[arm::R6_R7] = false; } }; +#endif +#ifdef ART_ENABLE_CODEGEN_x86 class TestCodeGeneratorX86 : public x86::CodeGeneratorX86 { public: TestCodeGeneratorX86(HGraph* graph, @@ -105,6 +126,7 @@ class TestCodeGeneratorX86 : public x86::CodeGeneratorX86 { blocked_register_pairs_[x86::ECX_EDI] = false; } }; +#endif class InternalCodeAllocator : public CodeAllocator { public: @@ -219,7 +241,7 @@ static void RunCode(CodeGenerator* codegen, PrepareForRegisterAllocation(graph).Run(); liveness.Analyze(); - RegisterAllocator(graph->GetArena(), codegen, liveness).AllocateRegisters(); + RegisterAllocator::Create(graph->GetArena(), codegen, liveness)->AllocateRegisters(); hook_before_codegen(graph); InternalCodeAllocator allocator; @@ -234,37 +256,54 @@ static void RunCode(InstructionSet target_isa, bool has_result, Expected expected) { CompilerOptions compiler_options; +#ifdef ART_ENABLE_CODEGEN_arm if (target_isa == kArm || target_isa == kThumb2) { std::unique_ptr<const ArmInstructionSetFeatures> features_arm( ArmInstructionSetFeatures::FromCppDefines()); TestCodeGeneratorARM codegenARM(graph, *features_arm.get(), compiler_options); RunCode(&codegenARM, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kArm64) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_arm64 + if (target_isa == kArm64) { std::unique_ptr<const Arm64InstructionSetFeatures> features_arm64( Arm64InstructionSetFeatures::FromCppDefines()); arm64::CodeGeneratorARM64 codegenARM64(graph, *features_arm64.get(), compiler_options); RunCode(&codegenARM64, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kX86) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_x86 + if (target_isa == kX86) { std::unique_ptr<const X86InstructionSetFeatures> features_x86( X86InstructionSetFeatures::FromCppDefines()); - x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options); + TestCodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options); RunCode(&codegenX86, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kX86_64) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_x86_64 + if (target_isa == kX86_64) { std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64( X86_64InstructionSetFeatures::FromCppDefines()); x86_64::CodeGeneratorX86_64 codegenX86_64(graph, *features_x86_64.get(), compiler_options); RunCode(&codegenX86_64, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kMips) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_mips + if (target_isa == kMips) { std::unique_ptr<const MipsInstructionSetFeatures> features_mips( MipsInstructionSetFeatures::FromCppDefines()); mips::CodeGeneratorMIPS codegenMIPS(graph, *features_mips.get(), compiler_options); RunCode(&codegenMIPS, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kMips64) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_mips64 + if (target_isa == kMips64) { std::unique_ptr<const Mips64InstructionSetFeatures> features_mips64( Mips64InstructionSetFeatures::FromCppDefines()); mips64::CodeGeneratorMIPS64 codegenMIPS64(graph, *features_mips64.get(), compiler_options); RunCode(&codegenMIPS64, graph, hook_before_codegen, has_result, expected); } +#endif } static ::std::vector<InstructionSet> GetTargetISAs() { diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h index a849448cf9..cc949c5275 100644 
--- a/compiler/optimizing/common_arm64.h +++ b/compiler/optimizing/common_arm64.h @@ -21,8 +21,14 @@ #include "locations.h" #include "nodes.h" #include "utils/arm64/assembler_arm64.h" -#include "vixl/a64/disasm-a64.h" -#include "vixl/a64/macro-assembler-a64.h" + +// TODO(VIXL): Make VIXL compile with -Wshadow. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" +#include "aarch64/disasm-aarch64.h" +#include "aarch64/macro-assembler-aarch64.h" +#include "aarch64/simulator-aarch64.h" +#pragma GCC diagnostic pop namespace art { namespace arm64 { @@ -34,87 +40,88 @@ static_assert((SP == 31) && (WSP == 31) && (XZR == 32) && (WZR == 32), static inline int VIXLRegCodeFromART(int code) { if (code == SP) { - return vixl::kSPRegInternalCode; + return vixl::aarch64::kSPRegInternalCode; } if (code == XZR) { - return vixl::kZeroRegCode; + return vixl::aarch64::kZeroRegCode; } return code; } static inline int ARTRegCodeFromVIXL(int code) { - if (code == vixl::kSPRegInternalCode) { + if (code == vixl::aarch64::kSPRegInternalCode) { return SP; } - if (code == vixl::kZeroRegCode) { + if (code == vixl::aarch64::kZeroRegCode) { return XZR; } return code; } -static inline vixl::Register XRegisterFrom(Location location) { +static inline vixl::aarch64::Register XRegisterFrom(Location location) { DCHECK(location.IsRegister()) << location; - return vixl::Register::XRegFromCode(VIXLRegCodeFromART(location.reg())); + return vixl::aarch64::Register::GetXRegFromCode(VIXLRegCodeFromART(location.reg())); } -static inline vixl::Register WRegisterFrom(Location location) { +static inline vixl::aarch64::Register WRegisterFrom(Location location) { DCHECK(location.IsRegister()) << location; - return vixl::Register::WRegFromCode(VIXLRegCodeFromART(location.reg())); + return vixl::aarch64::Register::GetWRegFromCode(VIXLRegCodeFromART(location.reg())); } -static inline vixl::Register RegisterFrom(Location location, Primitive::Type type) { +static inline vixl::aarch64::Register RegisterFrom(Location location, Primitive::Type type) { DCHECK(type != Primitive::kPrimVoid && !Primitive::IsFloatingPointType(type)) << type; return type == Primitive::kPrimLong ? 
XRegisterFrom(location) : WRegisterFrom(location); } -static inline vixl::Register OutputRegister(HInstruction* instr) { +static inline vixl::aarch64::Register OutputRegister(HInstruction* instr) { return RegisterFrom(instr->GetLocations()->Out(), instr->GetType()); } -static inline vixl::Register InputRegisterAt(HInstruction* instr, int input_index) { +static inline vixl::aarch64::Register InputRegisterAt(HInstruction* instr, int input_index) { return RegisterFrom(instr->GetLocations()->InAt(input_index), instr->InputAt(input_index)->GetType()); } -static inline vixl::FPRegister DRegisterFrom(Location location) { +static inline vixl::aarch64::FPRegister DRegisterFrom(Location location) { DCHECK(location.IsFpuRegister()) << location; - return vixl::FPRegister::DRegFromCode(location.reg()); + return vixl::aarch64::FPRegister::GetDRegFromCode(location.reg()); } -static inline vixl::FPRegister SRegisterFrom(Location location) { +static inline vixl::aarch64::FPRegister SRegisterFrom(Location location) { DCHECK(location.IsFpuRegister()) << location; - return vixl::FPRegister::SRegFromCode(location.reg()); + return vixl::aarch64::FPRegister::GetSRegFromCode(location.reg()); } -static inline vixl::FPRegister FPRegisterFrom(Location location, Primitive::Type type) { +static inline vixl::aarch64::FPRegister FPRegisterFrom(Location location, Primitive::Type type) { DCHECK(Primitive::IsFloatingPointType(type)) << type; return type == Primitive::kPrimDouble ? DRegisterFrom(location) : SRegisterFrom(location); } -static inline vixl::FPRegister OutputFPRegister(HInstruction* instr) { +static inline vixl::aarch64::FPRegister OutputFPRegister(HInstruction* instr) { return FPRegisterFrom(instr->GetLocations()->Out(), instr->GetType()); } -static inline vixl::FPRegister InputFPRegisterAt(HInstruction* instr, int input_index) { +static inline vixl::aarch64::FPRegister InputFPRegisterAt(HInstruction* instr, int input_index) { return FPRegisterFrom(instr->GetLocations()->InAt(input_index), instr->InputAt(input_index)->GetType()); } -static inline vixl::CPURegister CPURegisterFrom(Location location, Primitive::Type type) { - return Primitive::IsFloatingPointType(type) ? vixl::CPURegister(FPRegisterFrom(location, type)) - : vixl::CPURegister(RegisterFrom(location, type)); +static inline vixl::aarch64::CPURegister CPURegisterFrom(Location location, Primitive::Type type) { + return Primitive::IsFloatingPointType(type) + ? vixl::aarch64::CPURegister(FPRegisterFrom(location, type)) + : vixl::aarch64::CPURegister(RegisterFrom(location, type)); } -static inline vixl::CPURegister OutputCPURegister(HInstruction* instr) { +static inline vixl::aarch64::CPURegister OutputCPURegister(HInstruction* instr) { return Primitive::IsFloatingPointType(instr->GetType()) - ? static_cast<vixl::CPURegister>(OutputFPRegister(instr)) - : static_cast<vixl::CPURegister>(OutputRegister(instr)); + ? static_cast<vixl::aarch64::CPURegister>(OutputFPRegister(instr)) + : static_cast<vixl::aarch64::CPURegister>(OutputRegister(instr)); } -static inline vixl::CPURegister InputCPURegisterAt(HInstruction* instr, int index) { +static inline vixl::aarch64::CPURegister InputCPURegisterAt(HInstruction* instr, int index) { return Primitive::IsFloatingPointType(instr->InputAt(index)->GetType()) - ? static_cast<vixl::CPURegister>(InputFPRegisterAt(instr, index)) - : static_cast<vixl::CPURegister>(InputRegisterAt(instr, index)); + ? 
static_cast<vixl::aarch64::CPURegister>(InputFPRegisterAt(instr, index)) + : static_cast<vixl::aarch64::CPURegister>(InputRegisterAt(instr, index)); } static inline int64_t Int64ConstantFrom(Location location) { @@ -129,63 +136,70 @@ static inline int64_t Int64ConstantFrom(Location location) { } } -static inline vixl::Operand OperandFrom(Location location, Primitive::Type type) { +static inline vixl::aarch64::Operand OperandFrom(Location location, Primitive::Type type) { if (location.IsRegister()) { - return vixl::Operand(RegisterFrom(location, type)); + return vixl::aarch64::Operand(RegisterFrom(location, type)); } else { - return vixl::Operand(Int64ConstantFrom(location)); + return vixl::aarch64::Operand(Int64ConstantFrom(location)); } } -static inline vixl::Operand InputOperandAt(HInstruction* instr, int input_index) { +static inline vixl::aarch64::Operand InputOperandAt(HInstruction* instr, int input_index) { return OperandFrom(instr->GetLocations()->InAt(input_index), instr->InputAt(input_index)->GetType()); } -static inline vixl::MemOperand StackOperandFrom(Location location) { - return vixl::MemOperand(vixl::sp, location.GetStackIndex()); +static inline vixl::aarch64::MemOperand StackOperandFrom(Location location) { + return vixl::aarch64::MemOperand(vixl::aarch64::sp, location.GetStackIndex()); } -static inline vixl::MemOperand HeapOperand(const vixl::Register& base, size_t offset = 0) { +static inline vixl::aarch64::MemOperand HeapOperand(const vixl::aarch64::Register& base, + size_t offset = 0) { // A heap reference must be 32bit, so fit in a W register. DCHECK(base.IsW()); - return vixl::MemOperand(base.X(), offset); + return vixl::aarch64::MemOperand(base.X(), offset); } -static inline vixl::MemOperand HeapOperand(const vixl::Register& base, - const vixl::Register& regoffset, - vixl::Shift shift = vixl::LSL, - unsigned shift_amount = 0) { +static inline vixl::aarch64::MemOperand HeapOperand(const vixl::aarch64::Register& base, + const vixl::aarch64::Register& regoffset, + vixl::aarch64::Shift shift = vixl::aarch64::LSL, + unsigned shift_amount = 0) { // A heap reference must be 32bit, so fit in a W register. 
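Editorial note on the HeapOperand helpers above (illustrative usage, not part of this patch): heap references are 32-bit, so callers pass a W register and the helper widens it to X for the actual address computation.

// Hypothetical caller: `w0` holds a heap reference.
vixl::aarch64::MemOperand field = HeapOperand(vixl::aarch64::w0, /* offset= */ 8);
// Addresses [x0, #8], since the helper returns MemOperand(base.X(), offset).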
DCHECK(base.IsW()); - return vixl::MemOperand(base.X(), regoffset, shift, shift_amount); + return vixl::aarch64::MemOperand(base.X(), regoffset, shift, shift_amount); } -static inline vixl::MemOperand HeapOperand(const vixl::Register& base, Offset offset) { +static inline vixl::aarch64::MemOperand HeapOperand(const vixl::aarch64::Register& base, + Offset offset) { return HeapOperand(base, offset.SizeValue()); } -static inline vixl::MemOperand HeapOperandFrom(Location location, Offset offset) { +static inline vixl::aarch64::MemOperand HeapOperandFrom(Location location, Offset offset) { return HeapOperand(RegisterFrom(location, Primitive::kPrimNot), offset); } -static inline Location LocationFrom(const vixl::Register& reg) { - return Location::RegisterLocation(ARTRegCodeFromVIXL(reg.code())); +static inline Location LocationFrom(const vixl::aarch64::Register& reg) { + return Location::RegisterLocation(ARTRegCodeFromVIXL(reg.GetCode())); } -static inline Location LocationFrom(const vixl::FPRegister& fpreg) { - return Location::FpuRegisterLocation(fpreg.code()); +static inline Location LocationFrom(const vixl::aarch64::FPRegister& fpreg) { + return Location::FpuRegisterLocation(fpreg.GetCode()); } -static inline vixl::Operand OperandFromMemOperand(const vixl::MemOperand& mem_op) { +static inline vixl::aarch64::Operand OperandFromMemOperand( + const vixl::aarch64::MemOperand& mem_op) { if (mem_op.IsImmediateOffset()) { - return vixl::Operand(mem_op.offset()); + return vixl::aarch64::Operand(mem_op.GetOffset()); } else { DCHECK(mem_op.IsRegisterOffset()); - if (mem_op.extend() != vixl::NO_EXTEND) { - return vixl::Operand(mem_op.regoffset(), mem_op.extend(), mem_op.shift_amount()); - } else if (mem_op.shift() != vixl::NO_SHIFT) { - return vixl::Operand(mem_op.regoffset(), mem_op.shift(), mem_op.shift_amount()); + if (mem_op.GetExtend() != vixl::aarch64::NO_EXTEND) { + return vixl::aarch64::Operand(mem_op.GetRegisterOffset(), + mem_op.GetExtend(), + mem_op.GetShiftAmount()); + } else if (mem_op.GetShift() != vixl::aarch64::NO_SHIFT) { + return vixl::aarch64::Operand(mem_op.GetRegisterOffset(), + mem_op.GetShift(), + mem_op.GetShiftAmount()); } else { LOG(FATAL) << "Should not reach here"; UNREACHABLE(); @@ -212,13 +226,13 @@ static bool CanEncodeConstantAsImmediate(HConstant* constant, HInstruction* inst if (instr->IsAnd() || instr->IsOr() || instr->IsXor()) { // Uses logical operations. - return vixl::Assembler::IsImmLogical(value, vixl::kXRegSize); + return vixl::aarch64::Assembler::IsImmLogical(value, vixl::aarch64::kXRegSize); } else if (instr->IsNeg()) { // Uses mov -immediate. - return vixl::Assembler::IsImmMovn(value, vixl::kXRegSize); + return vixl::aarch64::Assembler::IsImmMovn(value, vixl::aarch64::kXRegSize); } else { DCHECK(instr->IsAdd() || - instr->IsArm64IntermediateAddress() || + instr->IsIntermediateAddress() || instr->IsBoundsCheck() || instr->IsCompare() || instr->IsCondition() || @@ -227,7 +241,8 @@ static bool CanEncodeConstantAsImmediate(HConstant* constant, HInstruction* inst // Uses aliases of ADD/SUB instructions. // If `value` does not fit but `-value` does, VIXL will automatically use // the 'opposite' instruction. 
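As an editorial sketch of why both `value` and `-value` are tested below (the real check is VIXL's IsImmAddSub): an AArch64 add/sub immediate is a 12-bit unsigned field, optionally shifted left by 12, and an ADD whose negative constant does not fit can instead be emitted as a SUB of the negated constant.

#include <cstdint>

// Standalone model of the add/sub immediate range: imm12, optionally LSL #12.
static bool IsAddSubImmediateSketch(uint64_t imm) {
  return (imm >> 12) == 0 || ((imm & 0xfffu) == 0 && (imm >> 24) == 0);
}
// For example `add x0, x1, #-4` can be materialized as `sub x0, x1, #4`, so
// accepting either sign of the constant is enough.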
- return vixl::Assembler::IsImmAddSub(value) || vixl::Assembler::IsImmAddSub(-value); + return vixl::aarch64::Assembler::IsImmAddSub(value) + || vixl::aarch64::Assembler::IsImmAddSub(-value); } } @@ -263,30 +278,30 @@ static inline bool ArtVixlRegCodeCoherentForRegSet(uint32_t art_core_registers, return true; } -static inline vixl::Shift ShiftFromOpKind(HArm64DataProcWithShifterOp::OpKind op_kind) { +static inline vixl::aarch64::Shift ShiftFromOpKind(HArm64DataProcWithShifterOp::OpKind op_kind) { switch (op_kind) { - case HArm64DataProcWithShifterOp::kASR: return vixl::ASR; - case HArm64DataProcWithShifterOp::kLSL: return vixl::LSL; - case HArm64DataProcWithShifterOp::kLSR: return vixl::LSR; + case HArm64DataProcWithShifterOp::kASR: return vixl::aarch64::ASR; + case HArm64DataProcWithShifterOp::kLSL: return vixl::aarch64::LSL; + case HArm64DataProcWithShifterOp::kLSR: return vixl::aarch64::LSR; default: LOG(FATAL) << "Unexpected op kind " << op_kind; UNREACHABLE(); - return vixl::NO_SHIFT; + return vixl::aarch64::NO_SHIFT; } } -static inline vixl::Extend ExtendFromOpKind(HArm64DataProcWithShifterOp::OpKind op_kind) { +static inline vixl::aarch64::Extend ExtendFromOpKind(HArm64DataProcWithShifterOp::OpKind op_kind) { switch (op_kind) { - case HArm64DataProcWithShifterOp::kUXTB: return vixl::UXTB; - case HArm64DataProcWithShifterOp::kUXTH: return vixl::UXTH; - case HArm64DataProcWithShifterOp::kUXTW: return vixl::UXTW; - case HArm64DataProcWithShifterOp::kSXTB: return vixl::SXTB; - case HArm64DataProcWithShifterOp::kSXTH: return vixl::SXTH; - case HArm64DataProcWithShifterOp::kSXTW: return vixl::SXTW; + case HArm64DataProcWithShifterOp::kUXTB: return vixl::aarch64::UXTB; + case HArm64DataProcWithShifterOp::kUXTH: return vixl::aarch64::UXTH; + case HArm64DataProcWithShifterOp::kUXTW: return vixl::aarch64::UXTW; + case HArm64DataProcWithShifterOp::kSXTB: return vixl::aarch64::SXTB; + case HArm64DataProcWithShifterOp::kSXTH: return vixl::aarch64::SXTH; + case HArm64DataProcWithShifterOp::kSXTW: return vixl::aarch64::SXTW; default: LOG(FATAL) << "Unexpected op kind " << op_kind; UNREACHABLE(); - return vixl::NO_EXTEND; + return vixl::aarch64::NO_EXTEND; } } diff --git a/compiler/optimizing/dead_code_elimination.cc b/compiler/optimizing/dead_code_elimination.cc index 49cfff46d8..e1bde7c737 100644 --- a/compiler/optimizing/dead_code_elimination.cc +++ b/compiler/optimizing/dead_code_elimination.cc @@ -88,13 +88,207 @@ void HDeadCodeElimination::MaybeRecordDeadBlock(HBasicBlock* block) { } } -void HDeadCodeElimination::RemoveDeadBlocks() { - if (graph_->HasIrreducibleLoops()) { - // Do not eliminate dead blocks if the graph has irreducible loops. We could - // support it, but that would require changes in our loop representation to handle - // multiple entry points. We decided it was not worth the complexity. 
- return; +void HDeadCodeElimination::MaybeRecordSimplifyIf() { + if (stats_ != nullptr) { + stats_->RecordStat(MethodCompilationStat::kSimplifyIf); + } +} + +static bool HasInput(HCondition* instruction, HInstruction* input) { + return (instruction->InputAt(0) == input) || + (instruction->InputAt(1) == input); +} + +static bool HasEquality(IfCondition condition) { + switch (condition) { + case kCondEQ: + case kCondLE: + case kCondGE: + case kCondBE: + case kCondAE: + return true; + case kCondNE: + case kCondLT: + case kCondGT: + case kCondB: + case kCondA: + return false; + } +} + +static HConstant* Evaluate(HCondition* condition, HInstruction* left, HInstruction* right) { + if (left == right && !Primitive::IsFloatingPointType(left->GetType())) { + return condition->GetBlock()->GetGraph()->GetIntConstant( + HasEquality(condition->GetCondition()) ? 1 : 0); + } + + if (!left->IsConstant() || !right->IsConstant()) { + return nullptr; + } + + if (left->IsIntConstant()) { + return condition->Evaluate(left->AsIntConstant(), right->AsIntConstant()); + } else if (left->IsNullConstant()) { + return condition->Evaluate(left->AsNullConstant(), right->AsNullConstant()); + } else if (left->IsLongConstant()) { + return condition->Evaluate(left->AsLongConstant(), right->AsLongConstant()); + } else if (left->IsFloatConstant()) { + return condition->Evaluate(left->AsFloatConstant(), right->AsFloatConstant()); + } else { + DCHECK(left->IsDoubleConstant()); + return condition->Evaluate(left->AsDoubleConstant(), right->AsDoubleConstant()); + } +} + +// Simplify the pattern: +// +// B1 B2 ... +// goto goto goto +// \ | / +// \ | / +// B3 +// i1 = phi(input, input) +// (i2 = condition on i1) +// if i1 (or i2) +// / \ +// / \ +// B4 B5 +// +// Into: +// +// B1 B2 ... +// | | | +// B4 B5 B? +// +// This simplification cannot be applied for loop headers, as they +// contain a suspend check. +// +// Note that we rely on the dead code elimination to get rid of B3. +bool HDeadCodeElimination::SimplifyIfs() { + bool simplified_one_or_more_ifs = false; + bool rerun_dominance_and_loop_analysis = false; + + for (HReversePostOrderIterator it(*graph_); !it.Done(); it.Advance()) { + HBasicBlock* block = it.Current(); + HInstruction* last = block->GetLastInstruction(); + HInstruction* first = block->GetFirstInstruction(); + if (last->IsIf() && + block->HasSinglePhi() && + block->GetFirstPhi()->HasOnlyOneNonEnvironmentUse()) { + bool has_only_phi_and_if = (last == first) && (last->InputAt(0) == block->GetFirstPhi()); + bool has_only_phi_condition_and_if = + !has_only_phi_and_if && + first->IsCondition() && + HasInput(first->AsCondition(), block->GetFirstPhi()) && + (first->GetNext() == last) && + (last->InputAt(0) == first) && + first->HasOnlyOneNonEnvironmentUse(); + + if (has_only_phi_and_if || has_only_phi_condition_and_if) { + DCHECK(!block->IsLoopHeader()); + HPhi* phi = block->GetFirstPhi()->AsPhi(); + bool phi_input_is_left = (first->InputAt(0) == phi); + + // Walk over all inputs of the phis and update the control flow of + // predecessors feeding constants to the phi. + // Note that phi->InputCount() may change inside the loop. 
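The loop below folds each phi input through the `Evaluate` helper above. A quick standalone reminder (editorial, not part of the patch) of why that helper refuses to fold `x OP x` for floating-point inputs: NaN compares unequal to itself, so the identity used for integral values does not hold.

#include <cassert>
#include <cmath>

int main() {
  double nan = std::nan("");
  assert(!(nan == nan));  // x == x must not fold to true for floating point
  assert(nan != nan);     // and x != x is actually true here
  return 0;
}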
+ for (size_t i = 0; i < phi->InputCount();) { + HInstruction* input = phi->InputAt(i); + HInstruction* value_to_check = nullptr; + if (has_only_phi_and_if) { + if (input->IsIntConstant()) { + value_to_check = input; + } + } else { + DCHECK(has_only_phi_condition_and_if); + if (phi_input_is_left) { + value_to_check = Evaluate(first->AsCondition(), input, first->InputAt(1)); + } else { + value_to_check = Evaluate(first->AsCondition(), first->InputAt(0), input); + } + } + if (value_to_check == nullptr) { + // Could not evaluate to a constant, continue iterating over the inputs. + ++i; + } else { + HBasicBlock* predecessor_to_update = block->GetPredecessors()[i]; + HBasicBlock* successor_to_update = nullptr; + if (value_to_check->AsIntConstant()->IsTrue()) { + successor_to_update = last->AsIf()->IfTrueSuccessor(); + } else { + DCHECK(value_to_check->AsIntConstant()->IsFalse()) + << value_to_check->AsIntConstant()->GetValue(); + successor_to_update = last->AsIf()->IfFalseSuccessor(); + } + predecessor_to_update->ReplaceSuccessor(block, successor_to_update); + phi->RemoveInputAt(i); + simplified_one_or_more_ifs = true; + if (block->IsInLoop()) { + rerun_dominance_and_loop_analysis = true; + } + // For simplicity, don't create a dead block, let the dead code elimination + // pass deal with it. + if (phi->InputCount() == 1) { + break; + } + } + } + if (block->GetPredecessors().size() == 1) { + phi->ReplaceWith(phi->InputAt(0)); + block->RemovePhi(phi); + if (has_only_phi_condition_and_if) { + // Evaluate here (and not wait for a constant folding pass) to open + // more opportunities for DCE. + HInstruction* result = first->AsCondition()->TryStaticEvaluation(); + if (result != nullptr) { + first->ReplaceWith(result); + block->RemoveInstruction(first); + } + } + } + if (simplified_one_or_more_ifs) { + MaybeRecordSimplifyIf(); + } + } + } + } + // We need to re-analyze the graph in order to run DCE afterwards. + if (simplified_one_or_more_ifs) { + if (rerun_dominance_and_loop_analysis) { + graph_->ClearLoopInformation(); + graph_->ClearDominanceInformation(); + graph_->BuildDominatorTree(); + } else { + graph_->ClearDominanceInformation(); + // We have introduced critical edges, remove them. + graph_->SimplifyCFG(); + graph_->ComputeDominanceInformation(); + graph_->ComputeTryBlockInformation(); + } + } + + return simplified_one_or_more_ifs; +} + +void HDeadCodeElimination::ConnectSuccessiveBlocks() { + // Order does not matter. + for (HReversePostOrderIterator it(*graph_); !it.Done();) { + HBasicBlock* block = it.Current(); + if (block->IsEntryBlock() || !block->GetLastInstruction()->IsGoto()) { + it.Advance(); + continue; + } + HBasicBlock* successor = block->GetSingleSuccessor(); + if (successor->IsExitBlock() || successor->GetPredecessors().size() != 1u) { + it.Advance(); + continue; + } + block->MergeWith(successor); + // Reiterate on this block in case it can be merged with its new successor. } +} + +bool HDeadCodeElimination::RemoveDeadBlocks() { // Classify blocks as reachable/unreachable. ArenaAllocator* allocator = graph_->GetArena(); ArenaBitVector live_blocks(allocator, graph_->GetBlocks().size(), false, kArenaAllocDCE); @@ -132,23 +326,7 @@ void HDeadCodeElimination::RemoveDeadBlocks() { graph_->ComputeTryBlockInformation(); } } - - // Connect successive blocks created by dead branches. Order does not matter. 
- for (HReversePostOrderIterator it(*graph_); !it.Done();) { - HBasicBlock* block = it.Current(); - if (block->IsEntryBlock() || !block->GetLastInstruction()->IsGoto()) { - it.Advance(); - continue; - } - HBasicBlock* successor = block->GetSingleSuccessor(); - if (successor->IsExitBlock() || successor->GetPredecessors().size() != 1u) { - it.Advance(); - continue; - } - block->MergeWith(successor); - - // Reiterate on this block in case it can be merged with its new successor. - } + return removed_one_or_more_blocks; } void HDeadCodeElimination::RemoveDeadInstructions() { @@ -181,7 +359,20 @@ void HDeadCodeElimination::RemoveDeadInstructions() { } void HDeadCodeElimination::Run() { - RemoveDeadBlocks(); + // Do not eliminate dead blocks if the graph has irreducible loops. We could + // support it, but that would require changes in our loop representation to handle + // multiple entry points. We decided it was not worth the complexity. + if (!graph_->HasIrreducibleLoops()) { + // Simplify graph to generate more dead block patterns. + ConnectSuccessiveBlocks(); + bool did_any_simplification = false; + did_any_simplification |= SimplifyIfs(); + did_any_simplification |= RemoveDeadBlocks(); + if (did_any_simplification) { + // Connect successive blocks created by dead branches. + ConnectSuccessiveBlocks(); + } + } SsaRedundantPhiElimination(graph_).Run(); RemoveDeadInstructions(); } diff --git a/compiler/optimizing/dead_code_elimination.h b/compiler/optimizing/dead_code_elimination.h index 8d6008b845..58e700deba 100644 --- a/compiler/optimizing/dead_code_elimination.h +++ b/compiler/optimizing/dead_code_elimination.h @@ -31,18 +31,19 @@ class HDeadCodeElimination : public HOptimization { public: HDeadCodeElimination(HGraph* graph, OptimizingCompilerStats* stats = nullptr, - const char* name = kInitialDeadCodeEliminationPassName) + const char* name = kDeadCodeEliminationPassName) : HOptimization(graph, name, stats) {} void Run() OVERRIDE; - - static constexpr const char* kInitialDeadCodeEliminationPassName = "dead_code_elimination"; - static constexpr const char* kFinalDeadCodeEliminationPassName = "dead_code_elimination_final"; + static constexpr const char* kDeadCodeEliminationPassName = "dead_code_elimination"; private: void MaybeRecordDeadBlock(HBasicBlock* block); - void RemoveDeadBlocks(); + void MaybeRecordSimplifyIf(); + bool RemoveDeadBlocks(); void RemoveDeadInstructions(); + bool SimplifyIfs(); + void ConnectSuccessiveBlocks(); DISALLOW_COPY_AND_ASSIGN(HDeadCodeElimination); }; diff --git a/compiler/optimizing/dex_cache_array_fixups_arm.h b/compiler/optimizing/dex_cache_array_fixups_arm.h index 015f910328..9142e29eff 100644 --- a/compiler/optimizing/dex_cache_array_fixups_arm.h +++ b/compiler/optimizing/dex_cache_array_fixups_arm.h @@ -26,7 +26,9 @@ namespace arm { class DexCacheArrayFixups : public HOptimization { public: DexCacheArrayFixups(HGraph* graph, OptimizingCompilerStats* stats) - : HOptimization(graph, "dex_cache_array_fixups_arm", stats) {} + : HOptimization(graph, kDexCacheArrayFixupsArmPassName, stats) {} + + static constexpr const char* kDexCacheArrayFixupsArmPassName = "dex_cache_array_fixups_arm"; void Run() OVERRIDE; }; diff --git a/compiler/optimizing/dex_cache_array_fixups_mips.cc b/compiler/optimizing/dex_cache_array_fixups_mips.cc index 0f42d9ce0f..19bab08eb4 100644 --- a/compiler/optimizing/dex_cache_array_fixups_mips.cc +++ b/compiler/optimizing/dex_cache_array_fixups_mips.cc @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "code_generator_mips.h" #include "dex_cache_array_fixups_mips.h" #include "base/arena_containers.h" @@ -27,8 +28,9 @@ namespace mips { */ class DexCacheArrayFixupsVisitor : public HGraphVisitor { public: - explicit DexCacheArrayFixupsVisitor(HGraph* graph) + explicit DexCacheArrayFixupsVisitor(HGraph* graph, CodeGenerator* codegen) : HGraphVisitor(graph), + codegen_(down_cast<CodeGeneratorMIPS*>(codegen)), dex_cache_array_bases_(std::less<const DexFile*>(), // Attribute memory use to code generator. graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {} @@ -41,9 +43,45 @@ class DexCacheArrayFixupsVisitor : public HGraphVisitor { HMipsDexCacheArraysBase* base = entry.second; base->MoveBeforeFirstUserAndOutOfLoops(); } + // Computing the dex cache base for PC-relative accesses will clobber RA with + // the NAL instruction on R2. Take a note of this before generating the method + // entry. + if (!dex_cache_array_bases_.empty() && !codegen_->GetInstructionSetFeatures().IsR6()) { + codegen_->ClobberRA(); + } } private: + void VisitLoadClass(HLoadClass* load_class) OVERRIDE { + // If this is a load with PC-relative access to the dex cache types array, + // we need to add the dex cache arrays base as the special input. + if (load_class->GetLoadKind() == HLoadClass::LoadKind::kDexCachePcRelative) { + // Initialize base for target dex file if needed. + const DexFile& dex_file = load_class->GetDexFile(); + HMipsDexCacheArraysBase* base = GetOrCreateDexCacheArrayBase(dex_file); + // Update the element offset in base. + DexCacheArraysLayout layout(kMipsPointerSize, &dex_file); + base->UpdateElementOffset(layout.TypeOffset(load_class->GetTypeIndex())); + // Add the special argument base to the load. + load_class->AddSpecialInput(base); + } + } + + void VisitLoadString(HLoadString* load_string) OVERRIDE { + // If this is a load with PC-relative access to the dex cache strings array, + // we need to add the dex cache arrays base as the special input. + if (load_string->GetLoadKind() == HLoadString::LoadKind::kDexCachePcRelative) { + // Initialize base for target dex file if needed. + const DexFile& dex_file = load_string->GetDexFile(); + HMipsDexCacheArraysBase* base = GetOrCreateDexCacheArrayBase(dex_file); + // Update the element offset in base. + DexCacheArraysLayout layout(kMipsPointerSize, &dex_file); + base->UpdateElementOffset(layout.StringOffset(load_string->GetStringIndex())); + // Add the special argument base to the load. + load_string->AddSpecialInput(base); + } + } + void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) OVERRIDE { // If this is an invoke with PC-relative access to the dex cache methods array, // we need to add the dex cache arrays base as the special input. @@ -74,6 +112,8 @@ class DexCacheArrayFixupsVisitor : public HGraphVisitor { }); } + CodeGeneratorMIPS* codegen_; + using DexCacheArraysBaseMap = ArenaSafeMap<const DexFile*, HMipsDexCacheArraysBase*, std::less<const DexFile*>>; DexCacheArraysBaseMap dex_cache_array_bases_; @@ -85,7 +125,7 @@ void DexCacheArrayFixups::Run() { // that can be live-in at the irreducible loop header. 
return; } - DexCacheArrayFixupsVisitor visitor(graph_); + DexCacheArrayFixupsVisitor visitor(graph_, codegen_); visitor.VisitInsertionOrder(); visitor.MoveBasesIfNeeded(); } diff --git a/compiler/optimizing/dex_cache_array_fixups_mips.h b/compiler/optimizing/dex_cache_array_fixups_mips.h index c8def2842e..861a199d6c 100644 --- a/compiler/optimizing/dex_cache_array_fixups_mips.h +++ b/compiler/optimizing/dex_cache_array_fixups_mips.h @@ -21,14 +21,23 @@ #include "optimization.h" namespace art { + +class CodeGenerator; + namespace mips { class DexCacheArrayFixups : public HOptimization { public: - DexCacheArrayFixups(HGraph* graph, OptimizingCompilerStats* stats) - : HOptimization(graph, "dex_cache_array_fixups_mips", stats) {} + DexCacheArrayFixups(HGraph* graph, CodeGenerator* codegen, OptimizingCompilerStats* stats) + : HOptimization(graph, kDexCacheArrayFixupsMipsPassName, stats), + codegen_(codegen) {} + + static constexpr const char* kDexCacheArrayFixupsMipsPassName = "dex_cache_array_fixups_mips"; void Run() OVERRIDE; + + private: + CodeGenerator* codegen_; }; } // namespace mips diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index 9d67373321..b3d5341de0 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -31,7 +31,7 @@ #include "nodes.h" #include "optimization.h" #include "reference_type_propagation.h" -#include "register_allocator.h" +#include "register_allocator_linear_scan.h" #include "ssa_liveness_analysis.h" #include "utils/assembler.h" @@ -122,7 +122,10 @@ class HGraphVisualizerDisassembler { new DisassemblerOptions(/* absolute_addresses */ false, base_address, end_address, - /* can_read_literals */ true))); + /* can_read_literals */ true, + Is64BitInstructionSet(instruction_set) + ? 
&Thread::DumpThreadOffset<PointerSize::k64> + : &Thread::DumpThreadOffset<PointerSize::k32>))); } ~HGraphVisualizerDisassembler() { @@ -298,6 +301,12 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { stream << constant->AsIntConstant()->GetValue(); } else if (constant->IsLongConstant()) { stream << constant->AsLongConstant()->GetValue(); + } else if (constant->IsFloatConstant()) { + stream << constant->AsFloatConstant()->GetValue(); + } else if (constant->IsDoubleConstant()) { + stream << constant->AsDoubleConstant()->GetValue(); + } else if (constant->IsNullConstant()) { + stream << "null"; } } else if (location.IsInvalid()) { stream << "invalid"; @@ -401,6 +410,9 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { void VisitArrayLength(HArrayLength* array_length) OVERRIDE { StartAttributeStream("is_string_length") << std::boolalpha << array_length->IsStringLength() << std::noboolalpha; + if (array_length->IsEmittedAtUseSite()) { + StartAttributeStream("emitted_at_use") << "true"; + } } void VisitBoundsCheck(HBoundsCheck* bounds_check) OVERRIDE { diff --git a/compiler/optimizing/induction_var_analysis.h b/compiler/optimizing/induction_var_analysis.h index 7c74816c26..cd4c830645 100644 --- a/compiler/optimizing/induction_var_analysis.h +++ b/compiler/optimizing/induction_var_analysis.h @@ -39,9 +39,9 @@ class HInductionVarAnalysis : public HOptimization { void Run() OVERRIDE; - private: static constexpr const char* kInductionPassName = "induction_var_analysis"; + private: struct NodeInfo { explicit NodeInfo(uint32_t d) : depth(d), done(false) {} uint32_t depth; diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc index f9e78b0a8f..451aa38033 100644 --- a/compiler/optimizing/inliner.cc +++ b/compiler/optimizing/inliner.cc @@ -17,6 +17,7 @@ #include "inliner.h" #include "art_method-inl.h" +#include "base/enums.h" #include "builder.h" #include "class_linker.h" #include "constant_folding.h" @@ -35,7 +36,7 @@ #include "nodes.h" #include "optimizing_compiler.h" #include "reference_type_propagation.h" -#include "register_allocator.h" +#include "register_allocator_linear_scan.h" #include "quick/inline_method_analyser.h" #include "sharpening.h" #include "ssa_builder.h" @@ -151,7 +152,7 @@ static ArtMethod* FindVirtualOrInterfaceTarget(HInvoke* invoke, ArtMethod* resol } ClassLinker* cl = Runtime::Current()->GetClassLinker(); - size_t pointer_size = cl->GetImagePointerSize(); + PointerSize pointer_size = cl->GetImagePointerSize(); if (invoke->IsInvokeInterface()) { resolved_method = info.GetTypeHandle()->FindVirtualMethodForInterface( resolved_method, pointer_size); @@ -208,12 +209,8 @@ static uint32_t FindClassIndexIn(mirror::Class* cls, DCHECK(cls->IsProxyClass()) << PrettyClass(cls); // TODO: deal with proxy classes. } else if (IsSameDexFile(cls->GetDexFile(), dex_file)) { + DCHECK_EQ(cls->GetDexCache(), dex_cache.Get()); index = cls->GetDexTypeIndex(); - } else { - index = cls->FindTypeIndexInOtherDexFile(dex_file); - } - - if (index != DexFile::kDexNoIndex) { // Update the dex cache to ensure the class is in. The generated code will // consider it is. 
We make it safe by updating the dex cache, as other // dex files might also load the class, and there is no guarantee the dex @@ -221,6 +218,14 @@ static uint32_t FindClassIndexIn(mirror::Class* cls, if (dex_cache->GetResolvedType(index) == nullptr) { dex_cache->SetResolvedType(index, cls); } + } else { + index = cls->FindTypeIndexInOtherDexFile(dex_file); + // We cannot guarantee the entry in the dex cache will resolve to the same class, + // as there may be different class loaders. So only return the index if it's + // the right class in the dex cache already. + if (index != DexFile::kDexNoIndex && dex_cache->GetResolvedType(index) != cls) { + index = DexFile::kDexNoIndex; + } } return index; @@ -239,7 +244,7 @@ class ScopedProfilingInfoInlineUse { ~ScopedProfilingInfoInlineUse() { if (profiling_info_ != nullptr) { - size_t pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize(); + PointerSize pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize(); DCHECK_EQ(profiling_info_, method_->GetProfilingInfo(pointer_size)); Runtime::Current()->GetJit()->GetCodeCache()->DoneCompilerUse(method_, self_); } @@ -273,7 +278,7 @@ bool HInliner::TryInline(HInvoke* invoke_instruction) { return false; } MethodReference ref = invoke_instruction->AsInvokeStaticOrDirect()->GetTargetMethod(); - mirror::DexCache* const dex_cache = (&caller_dex_file == ref.dex_file) + mirror::DexCache* const dex_cache = IsSameDexFile(caller_dex_file, *ref.dex_file) ? caller_compilation_unit_.GetDexCache().Get() : class_linker->FindDexCache(soa.Self(), *ref.dex_file); resolved_method = dex_cache->GetResolvedMethod( @@ -386,7 +391,7 @@ bool HInliner::TryInlineMonomorphicCall(HInvoke* invoke_instruction, } ClassLinker* class_linker = caller_compilation_unit_.GetClassLinker(); - size_t pointer_size = class_linker->GetImagePointerSize(); + PointerSize pointer_size = class_linker->GetImagePointerSize(); if (invoke_instruction->IsInvokeInterface()) { resolved_method = ic.GetMonomorphicType()->FindVirtualMethodForInterface( resolved_method, pointer_size); @@ -478,7 +483,7 @@ bool HInliner::TryInlinePolymorphicCall(HInvoke* invoke_instruction, } ClassLinker* class_linker = caller_compilation_unit_.GetClassLinker(); - size_t pointer_size = class_linker->GetImagePointerSize(); + PointerSize pointer_size = class_linker->GetImagePointerSize(); const DexFile& caller_dex_file = *caller_compilation_unit_.GetDexFile(); bool all_targets_inlined = true; @@ -640,7 +645,7 @@ bool HInliner::TryInlinePolymorphicCallToSameTarget(HInvoke* invoke_instruction, return false; } ClassLinker* class_linker = caller_compilation_unit_.GetClassLinker(); - size_t pointer_size = class_linker->GetImagePointerSize(); + PointerSize pointer_size = class_linker->GetImagePointerSize(); DCHECK(resolved_method != nullptr); ArtMethod* actual_method = nullptr; @@ -656,8 +661,8 @@ bool HInliner::TryInlinePolymorphicCallToSameTarget(HInvoke* invoke_instruction, } ArtMethod* new_method = nullptr; if (invoke_instruction->IsInvokeInterface()) { - new_method = ic.GetTypeAt(i)->GetEmbeddedImTableEntry( - method_index % mirror::Class::kImtSize, pointer_size); + new_method = ic.GetTypeAt(i)->GetImt(pointer_size)->Get( + method_index, pointer_size); if (new_method->IsRuntimeMethod()) { // Bail out as soon as we see a conflict trampoline in one of the target's // interface table. 
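As an editorial sketch of the IMT lookup performed by the new `GetImt(...)->Get(...)` call above (the types and the modulo below are illustrative, not ART's real classes): a table slot shared by several interface methods holds a conflict-resolution stub, and the inliner must bail out when it sees one.

#include <array>
#include <cstdint>

struct Method {
  bool is_runtime_conflict_stub;  // true when the slot holds the conflict trampoline
};

template <size_t kImtSize>
const Method* LookupForInlining(const std::array<const Method*, kImtSize>& imt,
                                uint32_t imt_index) {
  const Method* m = imt[imt_index % kImtSize];
  return m->is_runtime_conflict_stub ? nullptr : m;  // nullptr: cannot devirtualize
}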
@@ -804,8 +809,6 @@ bool HInliner::TryInlineAndReplace(HInvoke* invoke_instruction, ArtMethod* metho bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, ArtMethod* method, HInstruction** return_replacement) { - const DexFile& caller_dex_file = *caller_compilation_unit_.GetDexFile(); - if (method->IsProxyMethod()) { VLOG(compiler) << "Method " << PrettyMethod(method) << " is not inlined because of unimplemented inline support for proxy methods."; @@ -828,15 +831,6 @@ bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, return false; } - uint32_t method_index = FindMethodIndexIn( - method, caller_dex_file, invoke_instruction->GetDexMethodIndex()); - if (method_index == DexFile::kDexNoIndex) { - VLOG(compiler) << "Call to " - << PrettyMethod(method) - << " cannot be inlined because unaccessible to caller"; - return false; - } - bool same_dex_file = IsSameDexFile(*outer_compilation_unit_.GetDexFile(), *method->GetDexFile()); const DexFile::CodeItem* code_item = method->GetCodeItem(); @@ -873,7 +867,7 @@ bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, if (Runtime::Current()->UseJitCompilation() || !compiler_driver_->IsMethodVerifiedWithoutFailures( method->GetDexMethodIndex(), class_def_idx, *method->GetDexFile())) { - VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file) + VLOG(compiler) << "Method " << PrettyMethod(method) << " couldn't be verified, so it cannot be inlined"; return false; } @@ -883,7 +877,7 @@ bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, invoke_instruction->AsInvokeStaticOrDirect()->IsStaticWithImplicitClinitCheck()) { // Case of a static method that cannot be inlined because it implicitly // requires an initialization check of its declaring class. - VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file) + VLOG(compiler) << "Method " << PrettyMethod(method) << " is not inlined because it is static and requires a clinit" << " check that cannot be emitted due to Dex cache limitations"; return false; @@ -893,7 +887,7 @@ bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, return false; } - VLOG(compiler) << "Successfully inlined " << PrettyMethod(method_index, caller_dex_file); + VLOG(compiler) << "Successfully inlined " << PrettyMethod(method); MaybeRecordStat(kInlinedInvoke); return true; } @@ -1011,7 +1005,7 @@ bool HInliner::TryPatternSubstitution(HInvoke* invoke_instruction, invoke_instruction->GetBlock()->InsertInstructionBefore(iput, invoke_instruction); // Check whether the field is final. If it is, we need to add a barrier. 
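The barrier mentioned in the comment above exists because final fields need publication ordering: the store to the final field must become visible before the object reference escapes. A plain C++ analogue of such a constructor fence (editorial sketch, names hypothetical):

#include <atomic>

void PublishAfterFinalStore(int* final_slot, int value, std::atomic<int*>& published) {
  *final_slot = value;                                      // final field store
  std::atomic_thread_fence(std::memory_order_release);      // constructor fence
  published.store(final_slot, std::memory_order_relaxed);   // publish the object
}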
- size_t pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); + PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); ArtField* resolved_field = dex_cache->GetResolvedField(field_index, pointer_size); DCHECK(resolved_field != nullptr); if (resolved_field->IsFinal()) { @@ -1037,7 +1031,7 @@ HInstanceFieldGet* HInliner::CreateInstanceFieldGet(Handle<mirror::DexCache> dex uint32_t field_index, HInstruction* obj) SHARED_REQUIRES(Locks::mutator_lock_) { - size_t pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); + PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); ArtField* resolved_field = dex_cache->GetResolvedField(field_index, pointer_size); DCHECK(resolved_field != nullptr); HInstanceFieldGet* iget = new (graph_->GetArena()) HInstanceFieldGet( @@ -1065,7 +1059,7 @@ HInstanceFieldSet* HInliner::CreateInstanceFieldSet(Handle<mirror::DexCache> dex HInstruction* obj, HInstruction* value) SHARED_REQUIRES(Locks::mutator_lock_) { - size_t pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); + PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); ArtField* resolved_field = dex_cache->GetResolvedField(field_index, pointer_size); DCHECK(resolved_field != nullptr); HInstanceFieldSet* iput = new (graph_->GetArena()) HInstanceFieldSet( @@ -1094,8 +1088,11 @@ bool HInliner::TryBuildAndInlineHelper(HInvoke* invoke_instruction, uint32_t method_index = resolved_method->GetDexMethodIndex(); ClassLinker* class_linker = caller_compilation_unit_.GetClassLinker(); Handle<mirror::DexCache> dex_cache(handles_->NewHandle(resolved_method->GetDexCache())); + Handle<mirror::ClassLoader> class_loader(handles_->NewHandle( + resolved_method->GetDeclaringClass()->GetClassLoader())); + DexCompilationUnit dex_compilation_unit( - caller_compilation_unit_.GetClassLoader(), + class_loader.ToJObject(), class_linker, callee_dex_file, code_item, @@ -1404,7 +1401,7 @@ bool HInliner::ArgumentTypesMoreSpecific(HInvoke* invoke_instruction, ArtMethod* } } - size_t pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize(); + PointerSize pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize(); // Iterate over the list of parameter types and test whether any of the // actual inputs has a more specific reference type than the type declared in @@ -1461,7 +1458,7 @@ void HInliner::FixUpReturnReferenceType(ArtMethod* resolved_method, // TODO: we could be more precise by merging the phi inputs but that requires // some functionality from the reference type propagation. 
DCHECK(return_replacement->IsPhi()); - size_t pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize(); + PointerSize pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize(); mirror::Class* cls = resolved_method->GetReturnType(false /* resolve */, pointer_size); return_replacement->SetReferenceTypeInfo(GetClassRTI(cls)); } diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc index b4125299ea..e5dab569fd 100644 --- a/compiler/optimizing/instruction_builder.cc +++ b/compiler/optimizing/instruction_builder.cc @@ -16,8 +16,10 @@ #include "instruction_builder.h" +#include "art_method-inl.h" #include "bytecode_utils.h" #include "class_linker.h" +#include "dex_instruction-inl.h" #include "driver/compiler_options.h" #include "scoped_thread_state_change.h" @@ -890,7 +892,7 @@ bool HInstructionBuilder::BuildInvoke(const Instruction& instruction, return_type, dex_pc, method_idx, - resolved_method->GetDexMethodIndex()); + resolved_method->GetImtIndex()); } return HandleInvoke(invoke, diff --git a/compiler/optimizing/instruction_builder.h b/compiler/optimizing/instruction_builder.h index 9cfc065da6..517cf76831 100644 --- a/compiler/optimizing/instruction_builder.h +++ b/compiler/optimizing/instruction_builder.h @@ -30,6 +30,8 @@ namespace art { +class Instruction; + class HInstructionBuilder : public ValueObject { public: HInstructionBuilder(HGraph* graph, diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc index e0410dcdb2..4ca0600dba 100644 --- a/compiler/optimizing/instruction_simplifier.cc +++ b/compiler/optimizing/instruction_simplifier.cc @@ -920,6 +920,7 @@ void InstructionSimplifierVisitor::VisitTypeConversion(HTypeConversion* instruct void InstructionSimplifierVisitor::VisitAdd(HAdd* instruction) { HConstant* input_cst = instruction->GetConstantRight(); HInstruction* input_other = instruction->GetLeastConstantLeft(); + bool integral_type = Primitive::IsIntegralType(instruction->GetType()); if ((input_cst != nullptr) && input_cst->IsArithmeticZero()) { // Replace code looking like // ADD dst, src, 0 @@ -928,7 +929,7 @@ void InstructionSimplifierVisitor::VisitAdd(HAdd* instruction) { // Note that we cannot optimize `x + 0.0` to `x` for floating-point. When // `x` is `-0.0`, the former expression yields `0.0`, while the later // yields `-0.0`. - if (Primitive::IsIntegralType(instruction->GetType())) { + if (integral_type) { instruction->ReplaceWith(input_other); instruction->GetBlock()->RemoveInstruction(instruction); RecordSimplification(); @@ -974,10 +975,31 @@ void InstructionSimplifierVisitor::VisitAdd(HAdd* instruction) { // so no need to return. TryHandleAssociativeAndCommutativeOperation(instruction); - if ((instruction->GetLeft()->IsSub() || instruction->GetRight()->IsSub()) && + if ((left->IsSub() || right->IsSub()) && TrySubtractionChainSimplification(instruction)) { return; } + + if (integral_type) { + // Replace code patterns looking like + // SUB dst1, x, y SUB dst1, x, y + // ADD dst2, dst1, y ADD dst2, y, dst1 + // with + // SUB dst1, x, y + // ADD instruction is not needed in this case, we may use + // one of inputs of SUB instead. 
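The `integral_type` guard matters because `(x - y) + y == x` holds under two's-complement wrap-around but not for floating point, where rounding can lose `x` entirely (standalone illustration, editorial):

#include <cassert>

int main() {
  unsigned x = 5u, y = 7u;
  assert((x - y) + y == x);   // wrap-around preserves the identity
  float a = 1.0f, b = 1e20f;
  assert((a - b) + b != a);   // 1.0f is absorbed by 1e20f, so the result is 0.0f
  return 0;
}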
+ if (left->IsSub() && left->InputAt(1) == right) { + instruction->ReplaceWith(left->InputAt(0)); + RecordSimplification(); + instruction->GetBlock()->RemoveInstruction(instruction); + return; + } else if (right->IsSub() && right->InputAt(1) == left) { + instruction->ReplaceWith(right->InputAt(0)); + RecordSimplification(); + instruction->GetBlock()->RemoveInstruction(instruction); + return; + } + } } void InstructionSimplifierVisitor::VisitAnd(HAnd* instruction) { @@ -1511,6 +1533,29 @@ void InstructionSimplifierVisitor::VisitSub(HSub* instruction) { if (TrySubtractionChainSimplification(instruction)) { return; } + + if (left->IsAdd()) { + // Replace code patterns looking like + // ADD dst1, x, y ADD dst1, x, y + // SUB dst2, dst1, y SUB dst2, dst1, x + // with + // ADD dst1, x, y + // SUB instruction is not needed in this case, we may use + // one of inputs of ADD instead. + // It is applicable to integral types only. + DCHECK(Primitive::IsIntegralType(type)); + if (left->InputAt(1) == right) { + instruction->ReplaceWith(left->InputAt(0)); + RecordSimplification(); + instruction->GetBlock()->RemoveInstruction(instruction); + return; + } else if (left->InputAt(0) == right) { + instruction->ReplaceWith(left->InputAt(1)); + RecordSimplification(); + instruction->GetBlock()->RemoveInstruction(instruction); + return; + } + } } void InstructionSimplifierVisitor::VisitUShr(HUShr* instruction) { diff --git a/compiler/optimizing/instruction_simplifier_arm.cc b/compiler/optimizing/instruction_simplifier_arm.cc index cd026b8770..495f3fd232 100644 --- a/compiler/optimizing/instruction_simplifier_arm.cc +++ b/compiler/optimizing/instruction_simplifier_arm.cc @@ -14,8 +14,10 @@ * limitations under the License. */ +#include "code_generator.h" #include "instruction_simplifier_arm.h" #include "instruction_simplifier_shared.h" +#include "mirror/array-inl.h" namespace art { namespace arm { @@ -38,6 +40,46 @@ void InstructionSimplifierArmVisitor::VisitAnd(HAnd* instruction) { } } +void InstructionSimplifierArmVisitor::VisitArrayGet(HArrayGet* instruction) { + size_t data_offset = CodeGenerator::GetArrayDataOffset(instruction); + Primitive::Type type = instruction->GetType(); + + if (type == Primitive::kPrimLong + || type == Primitive::kPrimFloat + || type == Primitive::kPrimDouble) { + // T32 doesn't support ShiftedRegOffset mem address mode for these types + // to enable optimization. + return; + } + + if (TryExtractArrayAccessAddress(instruction, + instruction->GetArray(), + instruction->GetIndex(), + data_offset)) { + RecordSimplification(); + } +} + +void InstructionSimplifierArmVisitor::VisitArraySet(HArraySet* instruction) { + size_t access_size = Primitive::ComponentSize(instruction->GetComponentType()); + size_t data_offset = mirror::Array::DataOffset(access_size).Uint32Value(); + Primitive::Type type = instruction->GetComponentType(); + + if (type == Primitive::kPrimLong + || type == Primitive::kPrimFloat + || type == Primitive::kPrimDouble) { + // T32 doesn't support ShiftedRegOffset mem address mode for these types + // to enable optimization. 
+ return; + } + + if (TryExtractArrayAccessAddress(instruction, + instruction->GetArray(), + instruction->GetIndex(), + data_offset)) { + RecordSimplification(); + } +} } // namespace arm } // namespace art diff --git a/compiler/optimizing/instruction_simplifier_arm.h b/compiler/optimizing/instruction_simplifier_arm.h index 14c940eb21..782110c40a 100644 --- a/compiler/optimizing/instruction_simplifier_arm.h +++ b/compiler/optimizing/instruction_simplifier_arm.h @@ -38,6 +38,8 @@ class InstructionSimplifierArmVisitor : public HGraphVisitor { void VisitMul(HMul* instruction) OVERRIDE; void VisitOr(HOr* instruction) OVERRIDE; void VisitAnd(HAnd* instruction) OVERRIDE; + void VisitArrayGet(HArrayGet* instruction) OVERRIDE; + void VisitArraySet(HArraySet* instruction) OVERRIDE; OptimizingCompilerStats* stats_; }; @@ -46,7 +48,9 @@ class InstructionSimplifierArmVisitor : public HGraphVisitor { class InstructionSimplifierArm : public HOptimization { public: InstructionSimplifierArm(HGraph* graph, OptimizingCompilerStats* stats) - : HOptimization(graph, "instruction_simplifier_arm", stats) {} + : HOptimization(graph, kInstructionSimplifierArmPassName, stats) {} + + static constexpr const char* kInstructionSimplifierArmPassName = "instruction_simplifier_arm"; void Run() OVERRIDE { InstructionSimplifierArmVisitor visitor(graph_, stats_); diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc index 983d31d168..6d107d571f 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.cc +++ b/compiler/optimizing/instruction_simplifier_arm64.cc @@ -28,56 +28,6 @@ using helpers::CanFitInShifterOperand; using helpers::HasShifterOperand; using helpers::ShifterOperandSupportsExtension; -void InstructionSimplifierArm64Visitor::TryExtractArrayAccessAddress(HInstruction* access, - HInstruction* array, - HInstruction* index, - size_t data_offset) { - if (kEmitCompilerReadBarrier) { - // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. - // - // TODO: Handle this case properly in the ARM64 code generator and - // re-enable this optimization; otherwise, remove this TODO. - // b/26601270 - return; - } - if (index->IsConstant() || - (index->IsBoundsCheck() && index->AsBoundsCheck()->GetIndex()->IsConstant())) { - // When the index is a constant all the addressing can be fitted in the - // memory access instruction, so do not split the access. - return; - } - if (access->IsArraySet() && - access->AsArraySet()->GetValue()->GetType() == Primitive::kPrimNot) { - // The access may require a runtime call or the original array pointer. - return; - } - - // Proceed to extract the base address computation. - ArenaAllocator* arena = GetGraph()->GetArena(); - - HIntConstant* offset = GetGraph()->GetIntConstant(data_offset); - HArm64IntermediateAddress* address = - new (arena) HArm64IntermediateAddress(array, offset, kNoDexPc); - address->SetReferenceTypeInfo(array->GetReferenceTypeInfo()); - access->GetBlock()->InsertInstructionBefore(address, access); - access->ReplaceInput(address, 0); - // Both instructions must depend on GC to prevent any instruction that can - // trigger GC to be inserted between the two. 
- access->AddSideEffects(SideEffects::DependsOnGC()); - DCHECK(address->GetSideEffects().Includes(SideEffects::DependsOnGC())); - DCHECK(access->GetSideEffects().Includes(SideEffects::DependsOnGC())); - // TODO: Code generation for HArrayGet and HArraySet will check whether the input address - // is an HArm64IntermediateAddress and generate appropriate code. - // We would like to replace the `HArrayGet` and `HArraySet` with custom instructions (maybe - // `HArm64Load` and `HArm64Store`). We defer these changes because these new instructions would - // not bring any advantages yet. - // Also see the comments in - // `InstructionCodeGeneratorARM64::VisitArrayGet()` and - // `InstructionCodeGeneratorARM64::VisitArraySet()`. - RecordSimplification(); -} - bool InstructionSimplifierArm64Visitor::TryMergeIntoShifterOperand(HInstruction* use, HInstruction* bitfield_op, bool do_merge) { @@ -190,19 +140,23 @@ void InstructionSimplifierArm64Visitor::VisitAnd(HAnd* instruction) { void InstructionSimplifierArm64Visitor::VisitArrayGet(HArrayGet* instruction) { size_t data_offset = CodeGenerator::GetArrayDataOffset(instruction); - TryExtractArrayAccessAddress(instruction, - instruction->GetArray(), - instruction->GetIndex(), - data_offset); + if (TryExtractArrayAccessAddress(instruction, + instruction->GetArray(), + instruction->GetIndex(), + data_offset)) { + RecordSimplification(); + } } void InstructionSimplifierArm64Visitor::VisitArraySet(HArraySet* instruction) { size_t access_size = Primitive::ComponentSize(instruction->GetComponentType()); size_t data_offset = mirror::Array::DataOffset(access_size).Uint32Value(); - TryExtractArrayAccessAddress(instruction, - instruction->GetArray(), - instruction->GetIndex(), - data_offset); + if (TryExtractArrayAccessAddress(instruction, + instruction->GetArray(), + instruction->GetIndex(), + data_offset)) { + RecordSimplification(); + } } void InstructionSimplifierArm64Visitor::VisitMul(HMul* instruction) { diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h index 4735f85ab0..f71684efe9 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.h +++ b/compiler/optimizing/instruction_simplifier_arm64.h @@ -35,10 +35,6 @@ class InstructionSimplifierArm64Visitor : public HGraphVisitor { } } - void TryExtractArrayAccessAddress(HInstruction* access, - HInstruction* array, - HInstruction* index, - size_t data_offset); bool TryMergeIntoUsersShifterOperand(HInstruction* instruction); bool TryMergeIntoShifterOperand(HInstruction* use, HInstruction* bitfield_op, @@ -86,8 +82,9 @@ class InstructionSimplifierArm64Visitor : public HGraphVisitor { class InstructionSimplifierArm64 : public HOptimization { public: InstructionSimplifierArm64(HGraph* graph, OptimizingCompilerStats* stats) - : HOptimization(graph, "instruction_simplifier_arm64", stats) {} - + : HOptimization(graph, kInstructionSimplifierArm64PassName, stats) {} + static constexpr const char* kInstructionSimplifierArm64PassName + = "instruction_simplifier_arm64"; void Run() OVERRIDE { InstructionSimplifierArm64Visitor visitor(graph_, stats_); visitor.VisitReversePostOrder(); diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc index dab1ebc16d..8f7778fe68 100644 --- a/compiler/optimizing/instruction_simplifier_shared.cc +++ b/compiler/optimizing/instruction_simplifier_shared.cc @@ -226,4 +226,59 @@ bool TryMergeNegatedInput(HBinaryOperation* op) { return false; } + +bool 
TryExtractArrayAccessAddress(HInstruction* access, + HInstruction* array, + HInstruction* index, + size_t data_offset) { + if (kEmitCompilerReadBarrier) { + // The read barrier instrumentation does not support the + // HIntermediateAddress instruction yet. + // + // TODO: Handle this case properly in the ARM64 and ARM code generator and + // re-enable this optimization; otherwise, remove this TODO. + // b/26601270 + return false; + } + if (index->IsConstant() || + (index->IsBoundsCheck() && index->AsBoundsCheck()->GetIndex()->IsConstant())) { + // When the index is a constant all the addressing can be fitted in the + // memory access instruction, so do not split the access. + return false; + } + if (access->IsArraySet() && + access->AsArraySet()->GetValue()->GetType() == Primitive::kPrimNot) { + // The access may require a runtime call or the original array pointer. + return false; + } + + // Proceed to extract the base address computation. + HGraph* graph = access->GetBlock()->GetGraph(); + ArenaAllocator* arena = graph->GetArena(); + + HIntConstant* offset = graph->GetIntConstant(data_offset); + HIntermediateAddress* address = + new (arena) HIntermediateAddress(array, offset, kNoDexPc); + address->SetReferenceTypeInfo(array->GetReferenceTypeInfo()); + access->GetBlock()->InsertInstructionBefore(address, access); + access->ReplaceInput(address, 0); + // Both instructions must depend on GC to prevent any instruction that can + // trigger GC to be inserted between the two. + access->AddSideEffects(SideEffects::DependsOnGC()); + DCHECK(address->GetSideEffects().Includes(SideEffects::DependsOnGC())); + DCHECK(access->GetSideEffects().Includes(SideEffects::DependsOnGC())); + // TODO: Code generation for HArrayGet and HArraySet will check whether the input address + // is an HIntermediateAddress and generate appropriate code. + // We would like to replace the `HArrayGet` and `HArraySet` with custom instructions (maybe + // `HArm64Load` and `HArm64Store`,`HArmLoad` and `HArmStore`). We defer these changes + // because these new instructions would not bring any advantages yet. + // Also see the comments in + // `InstructionCodeGeneratorARM::VisitArrayGet()` + // `InstructionCodeGeneratorARM::VisitArraySet()` + // `InstructionCodeGeneratorARM64::VisitArrayGet()` + // `InstructionCodeGeneratorARM64::VisitArraySet()`. + return true; +} + + } // namespace art diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h index b1fe8f4756..56804f5e90 100644 --- a/compiler/optimizing/instruction_simplifier_shared.h +++ b/compiler/optimizing/instruction_simplifier_shared.h @@ -26,6 +26,11 @@ bool TryCombineMultiplyAccumulate(HMul* mul, InstructionSet isa); // a negated bitwise instruction. bool TryMergeNegatedInput(HBinaryOperation* op); +bool TryExtractArrayAccessAddress(HInstruction* access, + HInstruction* array, + HInstruction* index, + size_t data_offset); + } // namespace art #endif // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_SHARED_H_ diff --git a/compiler/optimizing/intrinsics.h b/compiler/optimizing/intrinsics.h index 3429a8fdbb..1a8eb58857 100644 --- a/compiler/optimizing/intrinsics.h +++ b/compiler/optimizing/intrinsics.h @@ -27,9 +27,6 @@ namespace art { class CompilerDriver; class DexFile; -// Temporary measure until we have caught up with the Java 7 definition of Math.round. b/26327751 -static constexpr bool kRoundIsPlusPointFive = false; - // Positive floating-point infinities. 
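A quick standalone check of the constants declared below (editorial sketch): positive infinity is the bit pattern with an all-ones exponent and a zero mantissa.

#include <cstdint>
#include <cstring>
#include <limits>

static bool IsPositiveInfinityBits(uint32_t bits) {
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f == std::numeric_limits<float>::infinity();
}
// IsPositiveInfinityBits(0x7f800000U) is true; 0x7ff0000000000000 plays the
// same role for double.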
static constexpr uint32_t kPositiveInfinityFloat = 0x7f800000U; static constexpr uint64_t kPositiveInfinityDouble = UINT64_C(0x7ff0000000000000); diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc index 579fb9d3bb..0bbc0e54bc 100644 --- a/compiler/optimizing/intrinsics_arm.cc +++ b/compiler/optimizing/intrinsics_arm.cc @@ -41,6 +41,92 @@ ArenaAllocator* IntrinsicCodeGeneratorARM::GetAllocator() { using IntrinsicSlowPathARM = IntrinsicSlowPath<InvokeDexCallingConventionVisitorARM>; +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<ArmAssembler*>(codegen->GetAssembler())-> // NOLINT + +// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers. +class ReadBarrierSystemArrayCopySlowPathARM : public SlowPathCode { + public: + explicit ReadBarrierSystemArrayCopySlowPathARM(HInstruction* instruction) + : SlowPathCode(instruction) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(instruction_->IsInvokeStaticOrDirect()) + << "Unexpected instruction in read barrier arraycopy slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy); + + int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + uint32_t element_size_shift = Primitive::ComponentSizeShift(Primitive::kPrimNot); + uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); + + Register dest = locations->InAt(2).AsRegister<Register>(); + Location dest_pos = locations->InAt(3); + Register src_curr_addr = locations->GetTemp(0).AsRegister<Register>(); + Register dst_curr_addr = locations->GetTemp(1).AsRegister<Register>(); + Register src_stop_addr = locations->GetTemp(2).AsRegister<Register>(); + Register tmp = locations->GetTemp(3).AsRegister<Register>(); + + __ Bind(GetEntryLabel()); + // Compute the base destination address in `dst_curr_addr`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ AddConstant(dst_curr_addr, dest, element_size * constant + offset); + } else { + __ add(dst_curr_addr, + dest, + ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift)); + __ AddConstant(dst_curr_addr, offset); + } + + Label loop; + __ Bind(&loop); + __ ldr(tmp, Address(src_curr_addr, element_size, Address::PostIndex)); + __ MaybeUnpoisonHeapReference(tmp); + // TODO: Inline the mark bit check before calling the runtime? + // tmp = ReadBarrier::Mark(tmp); + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + // (See ReadBarrierMarkSlowPathARM::EmitNativeCode for more + // explanations.) + DCHECK_NE(tmp, SP); + DCHECK_NE(tmp, LR); + DCHECK_NE(tmp, PC); + // IP is used internally by the ReadBarrierMarkRegX entry point + // as a temporary (and not preserved). It thus cannot be used by + // any live register in this slow path. 
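A plain C++ model of the copy loop this slow path emits (editorial sketch: heap reference poisoning and the real entrypoint mechanics are omitted, and `ReadBarrierMark` is a stand-in name):

// Marks each reference through the read barrier before storing it.
static void* ReadBarrierMark(void* ref) { return ref; }  // identity stand-in

void CopyLoopModel(void** src_curr, void** src_stop, void** dst_curr) {
  do {
    void* ref = *src_curr++;      // ldr tmp, [src_curr_addr], #element_size
    ref = ReadBarrierMark(ref);   // runtime call; no stack map is required
    *dst_curr++ = ref;            // str tmp, [dst_curr_addr], #element_size
  } while (src_curr != src_stop); // cmp + conditional branch back to the loop head
}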
+ DCHECK_NE(src_curr_addr, IP); + DCHECK_NE(dst_curr_addr, IP); + DCHECK_NE(src_stop_addr, IP); + DCHECK_NE(tmp, IP); + DCHECK(0 <= tmp && tmp < kNumberOfCoreRegisters) << tmp; + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp); + // This runtime call does not require a stack map. + arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + __ MaybePoisonHeapReference(tmp); + __ str(tmp, Address(dst_curr_addr, element_size, Address::PostIndex)); + __ cmp(src_curr_addr, ShifterOperand(src_stop_addr)); + __ b(&loop, NE); + __ b(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathARM"; } + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathARM); +}; + +#undef __ + bool IntrinsicLocationsBuilderARM::TryDispatch(HInvoke* invoke) { Dispatch(invoke); LocationSummary* res = invoke->GetLocations(); @@ -1201,7 +1287,7 @@ static void GenerateVisitStringIndexOf(HInvoke* invoke, } __ LoadFromOffset(kLoadWord, LR, TR, - QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pIndexOf).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, pIndexOf).Int32Value()); CheckEntrypointTypes<kQuickIndexOf, int32_t, void*, uint32_t, uint32_t>(); __ blx(LR); @@ -1212,7 +1298,7 @@ static void GenerateVisitStringIndexOf(HInvoke* invoke, void IntrinsicLocationsBuilderARM::VisitStringIndexOf(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's // best to align the inputs accordingly. @@ -1232,7 +1318,7 @@ void IntrinsicCodeGeneratorARM::VisitStringIndexOf(HInvoke* invoke) { void IntrinsicLocationsBuilderARM::VisitStringIndexOfAfter(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's // best to align the inputs accordingly. 
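The kCall to kCallOnMainOnly / kCallOnMainAndSlowPath changes throughout this file split the old single "calls the runtime" kind by where the call can happen, which drives register-saving policy around the instruction. A sketch of the assumed distinction (not the real declaration):

enum class CallKindSketch {
  kNoCall,                 // never calls into the runtime
  kCallOnSlowPath,         // only a slow path may call
  kCallOnMainOnly,         // the main path always calls, no slow path needed
  kCallOnMainAndSlowPath,  // the main path calls and a slow path may call too
};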
@@ -1250,7 +1336,7 @@ void IntrinsicCodeGeneratorARM::VisitStringIndexOfAfter(HInvoke* invoke) { void IntrinsicLocationsBuilderARM::VisitStringNewStringFromBytes(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1270,8 +1356,10 @@ void IntrinsicCodeGeneratorARM::VisitStringNewStringFromBytes(HInvoke* invoke) { codegen_->AddSlowPath(slow_path); __ b(slow_path->GetEntryLabel(), EQ); - __ LoadFromOffset( - kLoadWord, LR, TR, QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pAllocStringFromBytes).Int32Value()); + __ LoadFromOffset(kLoadWord, + LR, + TR, + QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, pAllocStringFromBytes).Int32Value()); CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>(); __ blx(LR); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1280,7 +1368,7 @@ void IntrinsicCodeGeneratorARM::VisitStringNewStringFromBytes(HInvoke* invoke) { void IntrinsicLocationsBuilderARM::VisitStringNewStringFromChars(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1298,8 +1386,10 @@ void IntrinsicCodeGeneratorARM::VisitStringNewStringFromChars(HInvoke* invoke) { // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data) // // all include a null check on `data` before calling that method. - __ LoadFromOffset( - kLoadWord, LR, TR, QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pAllocStringFromChars).Int32Value()); + __ LoadFromOffset(kLoadWord, + LR, + TR, + QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, pAllocStringFromChars).Int32Value()); CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>(); __ blx(LR); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1307,7 +1397,7 @@ void IntrinsicCodeGeneratorARM::VisitStringNewStringFromChars(HInvoke* invoke) { void IntrinsicLocationsBuilderARM::VisitStringNewStringFromString(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1325,7 +1415,7 @@ void IntrinsicCodeGeneratorARM::VisitStringNewStringFromString(HInvoke* invoke) __ b(slow_path->GetEntryLabel(), EQ); __ LoadFromOffset(kLoadWord, - LR, TR, QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pAllocStringFromString).Int32Value()); + LR, TR, QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, pAllocStringFromString).Int32Value()); CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>(); __ blx(LR); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1333,9 +1423,9 @@ void IntrinsicCodeGeneratorARM::VisitStringNewStringFromString(HInvoke* invoke) } void IntrinsicLocationsBuilderARM::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). 
- if (kEmitCompilerReadBarrier) { + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { return; } @@ -1358,6 +1448,13 @@ void IntrinsicLocationsBuilderARM::VisitSystemArrayCopy(HInvoke* invoke) { if (length != nullptr && !assembler_->ShifterOperandCanAlwaysHold(length->GetValue())) { locations->SetInAt(4, Location::RequiresRegister()); } + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // Temporary register IP cannot be used in + // ReadBarrierSystemArrayCopySlowPathARM (because that register + // is clobbered by ReadBarrierMarkRegX entry points). Get an extra + // temporary register from the register allocator. + locations->AddTemp(Location::RequiresRegister()); + } } static void CheckPosition(ArmAssembler* assembler, @@ -1423,9 +1520,9 @@ static void CheckPosition(ArmAssembler* assembler, } void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - DCHECK(!kEmitCompilerReadBarrier); + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); ArmAssembler* assembler = GetAssembler(); LocationSummary* locations = invoke->GetLocations(); @@ -1434,18 +1531,22 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); Register src = locations->InAt(0).AsRegister<Register>(); Location src_pos = locations->InAt(1); Register dest = locations->InAt(2).AsRegister<Register>(); Location dest_pos = locations->InAt(3); Location length = locations->InAt(4); - Register temp1 = locations->GetTemp(0).AsRegister<Register>(); - Register temp2 = locations->GetTemp(1).AsRegister<Register>(); - Register temp3 = locations->GetTemp(2).AsRegister<Register>(); + Location temp1_loc = locations->GetTemp(0); + Register temp1 = temp1_loc.AsRegister<Register>(); + Location temp2_loc = locations->GetTemp(1); + Register temp2 = temp2_loc.AsRegister<Register>(); + Location temp3_loc = locations->GetTemp(2); + Register temp3 = temp3_loc.AsRegister<Register>(); - SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathARM(invoke); - codegen_->AddSlowPath(slow_path); + SlowPathCode* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathARM(invoke); + codegen_->AddSlowPath(intrinsic_slow_path); Label conditions_on_positions_validated; SystemArrayCopyOptimizations optimizations(invoke); @@ -1461,7 +1562,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { DCHECK_GE(src_pos_constant, dest_pos_constant); } else if (src_pos_constant < dest_pos_constant) { __ cmp(src, ShifterOperand(dest)); - __ b(slow_path->GetEntryLabel(), EQ); + __ b(intrinsic_slow_path->GetEntryLabel(), EQ); } // Checked when building locations.
@@ -1473,7 +1574,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { __ b(&conditions_on_positions_validated, NE); } __ cmp(dest_pos.AsRegister<Register>(), ShifterOperand(src_pos_constant)); - __ b(slow_path->GetEntryLabel(), GT); + __ b(intrinsic_slow_path->GetEntryLabel(), GT); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -1486,19 +1587,19 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { } else { __ cmp(src_pos.AsRegister<Register>(), ShifterOperand(dest_pos.AsRegister<Register>())); } - __ b(slow_path->GetEntryLabel(), LT); + __ b(intrinsic_slow_path->GetEntryLabel(), LT); } __ Bind(&conditions_on_positions_validated); if (!optimizations.GetSourceIsNotNull()) { // Bail out if the source is null. - __ CompareAndBranchIfZero(src, slow_path->GetEntryLabel()); + __ CompareAndBranchIfZero(src, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) { // Bail out if the destination is null. - __ CompareAndBranchIfZero(dest, slow_path->GetEntryLabel()); + __ CompareAndBranchIfZero(dest, intrinsic_slow_path->GetEntryLabel()); } // If the length is negative, bail out. @@ -1507,7 +1608,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { !optimizations.GetCountIsSourceLength() && !optimizations.GetCountIsDestinationLength()) { __ cmp(length.AsRegister<Register>(), ShifterOperand(0)); - __ b(slow_path->GetEntryLabel(), LT); + __ b(intrinsic_slow_path->GetEntryLabel(), LT); } // Validity checks: source. @@ -1515,7 +1616,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { src_pos, src, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsSourceLength()); @@ -1524,7 +1625,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { dest_pos, dest, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsDestinationLength()); @@ -1533,112 +1634,287 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { // type of the destination array. We do two checks: the classes are the same, // or the destination is Object[]. If none of these checks succeed, we go to the // slow path. - __ LoadFromOffset(kLoadWord, temp1, dest, class_offset); - __ LoadFromOffset(kLoadWord, temp2, src, class_offset); - bool did_unpoison = false; - if (!optimizations.GetDestinationIsNonPrimitiveArray() || - !optimizations.GetSourceIsNonPrimitiveArray()) { - // One or two of the references need to be unpoisoned. Unpoison them - // both to make the identity check valid. - __ MaybeUnpoisonHeapReference(temp1); - __ MaybeUnpoisonHeapReference(temp2); - did_unpoison = true; - } - if (!optimizations.GetDestinationIsNonPrimitiveArray()) { - // Bail out if the destination is not a non primitive array. 
- // /* HeapReference<Class> */ temp3 = temp1->component_type_ - __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset); - __ CompareAndBranchIfZero(temp3, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp3); - __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); - static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ CompareAndBranchIfNonZero(temp3, slow_path->GetEntryLabel()); - } + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + if (!optimizations.GetSourceIsNonPrimitiveArray()) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false); + // Bail out if the source is not a non primitive array. + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + __ CompareAndBranchIfZero(temp1, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp1` has been unpoisoned + // by the previous call to GenerateFieldLoadWithBakerReadBarrier. + // /* uint16_t */ temp1 = static_cast<uint16>(temp1->primitive_type_); + __ LoadFromOffset(kLoadUnsignedHalfword, temp1, temp1, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel()); + } - if (!optimizations.GetSourceIsNonPrimitiveArray()) { - // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ temp3 = temp2->component_type_ - __ LoadFromOffset(kLoadWord, temp3, temp2, component_offset); - __ CompareAndBranchIfZero(temp3, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp3); - __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); - static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ CompareAndBranchIfNonZero(temp3, slow_path->GetEntryLabel()); - } + // /* HeapReference<Class> */ temp1 = dest->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, dest, class_offset, temp2_loc, /* needs_null_check */ false); + + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // + // Register `temp1` is not trashed by the read barrier emitted + // by GenerateFieldLoadWithBakerReadBarrier below, as that + // method produces a call to a ReadBarrierMarkRegX entry point, + // which saves all potentially live registers, including + // temporaries such as `temp1`. + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + __ CompareAndBranchIfZero(temp2, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp2` has been unpoisoned + // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+ // /* uint16_t */ temp2 = static_cast<uint16>(temp2->primitive_type_); + __ LoadFromOffset(kLoadUnsignedHalfword, temp2, temp2, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ CompareAndBranchIfNonZero(temp2, intrinsic_slow_path->GetEntryLabel()); + } - __ cmp(temp1, ShifterOperand(temp2)); + // For the same reason given earlier, `temp1` is not trashed by the + // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below. + // /* HeapReference<Class> */ temp2 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, src, class_offset, temp3_loc, /* needs_null_check */ false); + // Note: if heap poisoning is on, we are comparing two unpoisoned references here. + __ cmp(temp1, ShifterOperand(temp2)); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + Label do_copy; + __ b(&do_copy, EQ); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + // We do not need to emit a read barrier for the following + // heap reference load, as `temp1` is only used in a + // comparison with null below, and this reference is not + // kept afterwards. + __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset); + __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ b(intrinsic_slow_path->GetEntryLabel(), NE); + } + } else { + // Non read barrier code. + + // /* HeapReference<Class> */ temp1 = dest->klass_ + __ LoadFromOffset(kLoadWord, temp1, dest, class_offset); + // /* HeapReference<Class> */ temp2 = src->klass_ + __ LoadFromOffset(kLoadWord, temp2, src, class_offset); + bool did_unpoison = false; + if (!optimizations.GetDestinationIsNonPrimitiveArray() || + !optimizations.GetSourceIsNonPrimitiveArray()) { + // One or two of the references need to be unpoisoned. Unpoison them + // both to make the identity check valid. + __ MaybeUnpoisonHeapReference(temp1); + __ MaybeUnpoisonHeapReference(temp2); + did_unpoison = true; + } - if (optimizations.GetDestinationIsTypedObjectArray()) { - Label do_copy; - __ b(&do_copy, EQ); - if (!did_unpoison) { + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // /* HeapReference<Class> */ temp3 = temp1->component_type_ + __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset); + __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp3); + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); + __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel()); + } + + if (!optimizations.GetSourceIsNonPrimitiveArray()) { + // Bail out if the source is not a non primitive array. 
+ // /* HeapReference<Class> */ temp3 = temp2->component_type_ + __ LoadFromOffset(kLoadWord, temp3, temp2, component_offset); + __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp3); + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); + __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel()); + } + + __ cmp(temp1, ShifterOperand(temp2)); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + Label do_copy; + __ b(&do_copy, EQ); + if (!did_unpoison) { + __ MaybeUnpoisonHeapReference(temp1); + } + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset); __ MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset); + // No need to unpoison the result, we're comparing against null. + __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ b(intrinsic_slow_path->GetEntryLabel(), NE); } - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp1 = temp1->super_class_ - __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset); - // No need to unpoison the result, we're comparing against null. - __ CompareAndBranchIfNonZero(temp1, slow_path->GetEntryLabel()); - __ Bind(&do_copy); - } else { - __ b(slow_path->GetEntryLabel(), NE); } } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ temp1 = src->klass_ - __ LoadFromOffset(kLoadWord, temp1, src, class_offset); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp3 = temp1->component_type_ - __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset); - __ CompareAndBranchIfZero(temp3, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp3); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false); + // /* HeapReference<Class> */ temp3 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp3_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp3` has been unpoisoned + // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+ } else { + // /* HeapReference<Class> */ temp1 = src->klass_ + __ LoadFromOffset(kLoadWord, temp1, src, class_offset); + __ MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ temp3 = temp1->component_type_ + __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset); + __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp3); + } + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ CompareAndBranchIfNonZero(temp3, slow_path->GetEntryLabel()); + __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel()); } - // Compute base source address, base destination address, and end source address. - int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + uint32_t element_size_shift = Primitive::ComponentSizeShift(Primitive::kPrimNot); uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); + + // Compute the base source address in `temp1`. if (src_pos.IsConstant()) { int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue(); __ AddConstant(temp1, src, element_size * constant + offset); } else { - __ add(temp1, src, ShifterOperand(src_pos.AsRegister<Register>(), LSL, 2)); + __ add(temp1, src, ShifterOperand(src_pos.AsRegister<Register>(), LSL, element_size_shift)); __ AddConstant(temp1, offset); } - if (dest_pos.IsConstant()) { - int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); - __ AddConstant(temp2, dest, element_size * constant + offset); - } else { - __ add(temp2, dest, ShifterOperand(dest_pos.AsRegister<Register>(), LSL, 2)); - __ AddConstant(temp2, offset); - } - + // Compute the end source address in `temp3`. if (length.IsConstant()) { int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); __ AddConstant(temp3, temp1, element_size * constant); } else { - __ add(temp3, temp1, ShifterOperand(length.AsRegister<Register>(), LSL, 2)); + __ add(temp3, temp1, ShifterOperand(length.AsRegister<Register>(), LSL, element_size_shift)); } - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. - Label loop, done; - __ cmp(temp1, ShifterOperand(temp3)); - __ b(&done, EQ); - __ Bind(&loop); - __ ldr(IP, Address(temp1, element_size, Address::PostIndex)); - __ str(IP, Address(temp2, element_size, Address::PostIndex)); - __ cmp(temp1, ShifterOperand(temp3)); - __ b(&loop, NE); - __ Bind(&done); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // The base destination address is computed later, as `temp2` is + // used for intermediate computations. + + // SystemArrayCopy implementation for Baker read barriers (see + // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier): + // + // if (src_ptr != end_ptr) { + // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // bool is_gray = (rb_state == ReadBarrier::gray_ptr_); + // if (is_gray) { + // // Slow-path copy. + // do { + // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); + // } while (src_ptr != end_ptr) + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + // } + + Label loop, done; + + // Don't enter copy loop if `length == 0`. 
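Past the length check, the code that follows implements the dispatch sketched in the comment above without a fence: the lock word is loaded once, an artificial data dependency on it is folded into `src` (the `LSR #32` contributes zero, so the address is unchanged), and the gray bit is shifted into the carry flag to choose between the raw fast-path copy and the marking slow path. A C-like sketch of those two tricks, taking the shift and the gray/white values from the static_asserts in the emitted code:

#include <cstdint>

// Sketch only; constant names follow the static_asserts below.
uintptr_t AddArtificialDependency(uintptr_t src, uint32_t lock_word) {
  // Mirrors `add src, src, temp2, LSR #32`: an LSR by 32 yields 0 on AArch32, so
  // `src` keeps its value but now depends on the lock word load, which orders the
  // subsequent reference loads after it without a memory barrier.
  return src + (static_cast<uint64_t>(lock_word) >> 32);
}

bool IsSourceGray(uint32_t lock_word, uint32_t read_barrier_state_shift) {
  // The emitted `LSRS temp2, temp2, #(shift + 1)` moves exactly this bit into the
  // carry flag; with white_ptr_ == 0 and gray_ptr_ == 1, testing one bit suffices.
  return ((lock_word >> read_barrier_state_shift) & 1u) != 0u;
}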
+ __ cmp(temp1, ShifterOperand(temp3)); + __ b(&done, EQ); + + // /* int32_t */ monitor = src->monitor_ + __ LoadFromOffset(kLoadWord, temp2, src, monitor_offset); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using + // a memory barrier (which would be more expensive). + // `src` is unchanged by this operation, but its value now depends + // on `temp2`. + __ add(src, src, ShifterOperand(temp2, LSR, 32)); + + // Slow path used to copy array when `src` is gray. + SlowPathCode* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM(invoke); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ Lsrs(temp2, temp2, LockWord::kReadBarrierStateShift + 1); + // Carry flag is the last bit shifted out by LSRS. + __ b(read_barrier_slow_path->GetEntryLabel(), CS); + + // Fast-path copy. + + // Compute the base destination address in `temp2`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ AddConstant(temp2, dest, element_size * constant + offset); + } else { + __ add(temp2, dest, ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift)); + __ AddConstant(temp2, offset); + } + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + __ Bind(&loop); + __ ldr(IP, Address(temp1, element_size, Address::PostIndex)); + __ str(IP, Address(temp2, element_size, Address::PostIndex)); + __ cmp(temp1, ShifterOperand(temp3)); + __ b(&loop, NE); + + __ Bind(read_barrier_slow_path->GetExitLabel()); + __ Bind(&done); + } else { + // Non read barrier code. + + // Compute the base destination address in `temp2`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ AddConstant(temp2, dest, element_size * constant + offset); + } else { + __ add(temp2, dest, ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift)); + __ AddConstant(temp2, offset); + } + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + Label loop, done; + __ cmp(temp1, ShifterOperand(temp3)); + __ b(&done, EQ); + __ Bind(&loop); + __ ldr(IP, Address(temp1, element_size, Address::PostIndex)); + __ str(IP, Address(temp2, element_size, Address::PostIndex)); + __ cmp(temp1, ShifterOperand(temp3)); + __ b(&loop, NE); + __ Bind(&done); + } // We only need one card marking on the destination array. 
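A card mark is just a byte store into a shadow table that maps a fixed-size chunk of the heap to one byte; the collector later revisits regions whose cards are dirty, so the single mark noted above is all the bookkeeping this copy needs. A minimal sketch of the store, with the shift and the dirty value left as placeholders for the runtime's real constants:

#include <cstdint>

// Sketch only: one byte of card table per (1 << card_size_log2) bytes of heap.
void MarkCard(uint8_t* card_table, uintptr_t dest_addr, unsigned card_size_log2,
              uint8_t dirty_value) {
  card_table[dest_addr >> card_size_log2] = dirty_value;
}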
codegen_->MarkGCCard(temp1, @@ -1647,7 +1923,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { Register(kNoRegister), /* value_can_be_null */ false); - __ Bind(slow_path->GetExitLabel()); + __ Bind(intrinsic_slow_path->GetExitLabel()); } static void CreateFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { @@ -1665,7 +1941,7 @@ static void CreateFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { DCHECK_EQ(invoke->GetType(), Primitive::kPrimDouble); LocationSummary* const locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); const InvokeRuntimeCallingConvention calling_convention; @@ -1692,7 +1968,7 @@ static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) DCHECK_EQ(invoke->GetType(), Primitive::kPrimDouble); LocationSummary* const locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); const InvokeRuntimeCallingConvention calling_convention; @@ -1718,7 +1994,7 @@ static void GenFPToFPCall(HInvoke* invoke, DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(calling_convention.GetRegisterAt(0))); DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(calling_convention.GetRegisterAt(1))); - __ LoadFromOffset(kLoadWord, LR, TR, GetThreadOffset<kArmWordSize>(entry).Int32Value()); + __ LoadFromOffset(kLoadWord, LR, TR, GetThreadOffset<kArmPointerSize>(entry).Int32Value()); // Native code uses the soft float ABI. __ vmovrrd(calling_convention.GetRegisterAt(0), calling_convention.GetRegisterAt(1), @@ -1744,7 +2020,7 @@ static void GenFPFPToFPCall(HInvoke* invoke, DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(calling_convention.GetRegisterAt(2))); DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(calling_convention.GetRegisterAt(3))); - __ LoadFromOffset(kLoadWord, LR, TR, GetThreadOffset<kArmWordSize>(entry).Int32Value()); + __ LoadFromOffset(kLoadWord, LR, TR, GetThreadOffset<kArmPointerSize>(entry).Int32Value()); // Native code uses the soft float ABI. __ vmovrrd(calling_convention.GetRegisterAt(0), calling_convention.GetRegisterAt(1), @@ -1979,6 +2255,51 @@ void IntrinsicCodeGeneratorARM::VisitShortReverseBytes(HInvoke* invoke) { __ revsh(out, in); } +static void GenBitCount(HInvoke* instr, Primitive::Type type, ArmAssembler* assembler) { + DCHECK(Primitive::IsIntOrLongType(type)) << type; + DCHECK_EQ(instr->GetType(), Primitive::kPrimInt); + DCHECK_EQ(Primitive::PrimitiveKind(instr->InputAt(0)->GetType()), type); + + bool is_long = type == Primitive::kPrimLong; + LocationSummary* locations = instr->GetLocations(); + Location in = locations->InAt(0); + Register src_0 = is_long ? in.AsRegisterPairLow<Register>() : in.AsRegister<Register>(); + Register src_1 = is_long ? in.AsRegisterPairHigh<Register>() : src_0; + SRegister tmp_s = locations->GetTemp(0).AsFpuRegisterPairLow<SRegister>(); + DRegister tmp_d = FromLowSToD(tmp_s); + Register out_r = locations->Out().AsRegister<Register>(); + + // Move data from core register(s) to temp D-reg for bit count calculation, then move back. + // According to Cortex A57 and A72 optimization guides, compared to transferring to full D-reg, + // transferring data from core reg to upper or lower half of vfp D-reg requires extra latency, + // That's why for integer bit count, we use 'vmov d0, r0, r0' instead of 'vmov d0[0], r0'. 
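The vector sequence below is easier to read next to a scalar equivalent: `vcnt` yields one population count per byte lane, and each unsigned `vpaddl` pairwise-adds neighbouring lanes into lanes of twice the width until a single count remains (one extra fold for the long variant). A scalar sketch of the same computation, using a compiler builtin for the per-byte counts:

#include <cstdint>

// Scalar view of the vcnt + vpaddl chain (sketch only).
int PopCount64(uint64_t x) {
  int total = 0;
  for (int byte_index = 0; byte_index < 8; ++byte_index) {
    // vcnt: eight independent per-byte bit counts.
    total += __builtin_popcount(static_cast<unsigned>((x >> (8 * byte_index)) & 0xffu));
  }
  // The vpaddl steps fold those byte counts into 16-, 32- and then 64-bit lanes;
  // the running sum above reaches the same final value.
  return total;
}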
+ __ vmovdrr(tmp_d, src_1, src_0); // Temp DReg |--src_1|--src_0| + __ vcntd(tmp_d, tmp_d); // Temp DReg |c|c|c|c|c|c|c|c| + __ vpaddld(tmp_d, tmp_d, 8, /* is_unsigned */ true); // Temp DReg |--c|--c|--c|--c| + __ vpaddld(tmp_d, tmp_d, 16, /* is_unsigned */ true); // Temp DReg |------c|------c| + if (is_long) { + __ vpaddld(tmp_d, tmp_d, 32, /* is_unsigned */ true); // Temp DReg |--------------c| + } + __ vmovrs(out_r, tmp_s); +} + +void IntrinsicLocationsBuilderARM::VisitIntegerBitCount(HInvoke* invoke) { + CreateIntToIntLocations(arena_, invoke); + invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister()); +} + +void IntrinsicCodeGeneratorARM::VisitIntegerBitCount(HInvoke* invoke) { + GenBitCount(invoke, Primitive::kPrimInt, GetAssembler()); +} + +void IntrinsicLocationsBuilderARM::VisitLongBitCount(HInvoke* invoke) { + VisitIntegerBitCount(invoke); +} + +void IntrinsicCodeGeneratorARM::VisitLongBitCount(HInvoke* invoke) { + GenBitCount(invoke, Primitive::kPrimLong, GetAssembler()); +} + void IntrinsicLocationsBuilderARM::VisitStringGetCharsNoCheck(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, LocationSummary::kNoCall, @@ -2119,8 +2440,6 @@ void IntrinsicCodeGeneratorARM::VisitDoubleIsInfinite(HInvoke* invoke) { __ Lsr(out, out, 5); } -UNIMPLEMENTED_INTRINSIC(ARM, IntegerBitCount) -UNIMPLEMENTED_INTRINSIC(ARM, LongBitCount) UNIMPLEMENTED_INTRINSIC(ARM, MathMinDoubleDouble) UNIMPLEMENTED_INTRINSIC(ARM, MathMinFloatFloat) UNIMPLEMENTED_INTRINSIC(ARM, MathMaxDoubleDouble) diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 1d507530aa..91374b3108 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -26,12 +26,15 @@ #include "mirror/string.h" #include "thread.h" #include "utils/arm64/assembler_arm64.h" -#include "utils/arm64/constants_arm64.h" -#include "vixl/a64/disasm-a64.h" -#include "vixl/a64/macro-assembler-a64.h" +using namespace vixl::aarch64; // NOLINT(build/namespaces) -using namespace vixl; // NOLINT(build/namespaces) +// TODO(VIXL): Make VIXL compile with -Wshadow. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" +#include "aarch64/disasm-aarch64.h" +#include "aarch64/macro-assembler-aarch64.h" +#pragma GCC diagnostic pop namespace art { @@ -57,15 +60,15 @@ ALWAYS_INLINE inline MemOperand AbsoluteHeapOperandFrom(Location location, size_ } // namespace -vixl::MacroAssembler* IntrinsicCodeGeneratorARM64::GetVIXLAssembler() { - return codegen_->GetAssembler()->vixl_masm_; +MacroAssembler* IntrinsicCodeGeneratorARM64::GetVIXLAssembler() { + return codegen_->GetVIXLAssembler(); } ArenaAllocator* IntrinsicCodeGeneratorARM64::GetAllocator() { return codegen_->GetGraph()->GetArena(); } -#define __ codegen->GetAssembler()->vixl_masm_-> +#define __ codegen->GetVIXLAssembler()-> static void MoveFromReturnRegister(Location trg, Primitive::Type type, @@ -141,6 +144,73 @@ class IntrinsicSlowPathARM64 : public SlowPathCodeARM64 { DISALLOW_COPY_AND_ASSIGN(IntrinsicSlowPathARM64); }; +// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers. 
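As with the ARM version earlier, the slow path declared below walks the remaining elements one reference at a time and routes each one through ReadBarrier::Mark before storing it (heap-reference poisoning elided); in the pseudo-code style used above:

//   do {
//     ref = *src_curr_addr++;            // Ldr with post-index
//     ref = ReadBarrier::Mark(ref);      // via the per-register mark entry point
//     *dst_curr_addr++ = ref;            // Str with post-index
//   } while (src_curr_addr != src_stop_addr);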
+class ReadBarrierSystemArrayCopySlowPathARM64 : public SlowPathCodeARM64 { + public: + ReadBarrierSystemArrayCopySlowPathARM64(HInstruction* instruction, Location tmp) + : SlowPathCodeARM64(instruction), tmp_(tmp) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen_in) OVERRIDE { + CodeGeneratorARM64* codegen = down_cast<CodeGeneratorARM64*>(codegen_in); + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(instruction_->IsInvokeStaticOrDirect()) + << "Unexpected instruction in read barrier arraycopy slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy); + + const int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + + Register src_curr_addr = XRegisterFrom(locations->GetTemp(0)); + Register dst_curr_addr = XRegisterFrom(locations->GetTemp(1)); + Register src_stop_addr = XRegisterFrom(locations->GetTemp(2)); + Register tmp_reg = WRegisterFrom(tmp_); + + __ Bind(GetEntryLabel()); + vixl::aarch64::Label slow_copy_loop; + __ Bind(&slow_copy_loop); + __ Ldr(tmp_reg, MemOperand(src_curr_addr, element_size, PostIndex)); + codegen->GetAssembler()->MaybeUnpoisonHeapReference(tmp_reg); + // TODO: Inline the mark bit check before calling the runtime? + // tmp_reg = ReadBarrier::Mark(tmp_reg); + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + // (See ReadBarrierMarkSlowPathARM64::EmitNativeCode for more + // explanations.) + DCHECK_NE(tmp_.reg(), LR); + DCHECK_NE(tmp_.reg(), WSP); + DCHECK_NE(tmp_.reg(), WZR); + // IP0 is used internally by the ReadBarrierMarkRegX entry point + // as a temporary (and not preserved). It thus cannot be used by + // any live register in this slow path. + DCHECK_NE(LocationFrom(src_curr_addr).reg(), IP0); + DCHECK_NE(LocationFrom(dst_curr_addr).reg(), IP0); + DCHECK_NE(LocationFrom(src_stop_addr).reg(), IP0); + DCHECK_NE(tmp_.reg(), IP0); + DCHECK(0 <= tmp_.reg() && tmp_.reg() < kNumberOfWRegisters) << tmp_.reg(); + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(tmp_.reg()); + // This runtime call does not require a stack map. + codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + codegen->GetAssembler()->MaybePoisonHeapReference(tmp_reg); + __ Str(tmp_reg, MemOperand(dst_curr_addr, element_size, PostIndex)); + __ Cmp(src_curr_addr, src_stop_addr); + __ B(&slow_copy_loop, ne); + __ B(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathARM64"; } + + private: + Location tmp_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathARM64); +}; #undef __ bool IntrinsicLocationsBuilderARM64::TryDispatch(HInvoke* invoke) { @@ -170,14 +240,14 @@ static void CreateIntToFPLocations(ArenaAllocator* arena, HInvoke* invoke) { locations->SetOut(Location::RequiresFpuRegister()); } -static void MoveFPToInt(LocationSummary* locations, bool is64bit, vixl::MacroAssembler* masm) { +static void MoveFPToInt(LocationSummary* locations, bool is64bit, MacroAssembler* masm) { Location input = locations->InAt(0); Location output = locations->Out(); __ Fmov(is64bit ? XRegisterFrom(output) : WRegisterFrom(output), is64bit ? 
DRegisterFrom(input) : SRegisterFrom(input)); } -static void MoveIntToFP(LocationSummary* locations, bool is64bit, vixl::MacroAssembler* masm) { +static void MoveIntToFP(LocationSummary* locations, bool is64bit, MacroAssembler* masm) { Location input = locations->InAt(0); Location output = locations->Out(); __ Fmov(is64bit ? DRegisterFrom(output) : SRegisterFrom(output), @@ -222,7 +292,7 @@ static void CreateIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { static void GenReverseBytes(LocationSummary* locations, Primitive::Type type, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { Location in = locations->InAt(0); Location out = locations->Out(); @@ -276,7 +346,7 @@ static void CreateIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { static void GenNumberOfLeadingZeros(LocationSummary* locations, Primitive::Type type, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong); Location in = locations->InAt(0); @@ -303,7 +373,7 @@ void IntrinsicCodeGeneratorARM64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) static void GenNumberOfTrailingZeros(LocationSummary* locations, Primitive::Type type, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong); Location in = locations->InAt(0); @@ -331,7 +401,7 @@ void IntrinsicCodeGeneratorARM64::VisitLongNumberOfTrailingZeros(HInvoke* invoke static void GenReverse(LocationSummary* locations, Primitive::Type type, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong); Location in = locations->InAt(0); @@ -356,7 +426,7 @@ void IntrinsicCodeGeneratorARM64::VisitLongReverse(HInvoke* invoke) { GenReverse(invoke->GetLocations(), Primitive::kPrimLong, GetVIXLAssembler()); } -static void GenBitCount(HInvoke* instr, Primitive::Type type, vixl::MacroAssembler* masm) { +static void GenBitCount(HInvoke* instr, Primitive::Type type, MacroAssembler* masm) { DCHECK(Primitive::IsIntOrLongType(type)) << type; DCHECK_EQ(instr->GetType(), Primitive::kPrimInt); DCHECK_EQ(Primitive::PrimitiveKind(instr->InputAt(0)->GetType()), type); @@ -397,7 +467,7 @@ static void CreateFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) { locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); } -static void MathAbsFP(LocationSummary* locations, bool is64bit, vixl::MacroAssembler* masm) { +static void MathAbsFP(LocationSummary* locations, bool is64bit, MacroAssembler* masm) { Location in = locations->InAt(0); Location out = locations->Out(); @@ -433,7 +503,7 @@ static void CreateIntToInt(ArenaAllocator* arena, HInvoke* invoke) { static void GenAbsInteger(LocationSummary* locations, bool is64bit, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { Location in = locations->InAt(0); Location output = locations->Out(); @@ -463,7 +533,7 @@ void IntrinsicCodeGeneratorARM64::VisitMathAbsLong(HInvoke* invoke) { static void GenMinMaxFP(LocationSummary* locations, bool is_min, bool is_double, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { Location op1 = locations->InAt(0); Location op2 = locations->InAt(1); Location out = locations->Out(); @@ -523,7 +593,7 @@ void IntrinsicCodeGeneratorARM64::VisitMathMaxFloatFloat(HInvoke* invoke) { static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { Location op1 = locations->InAt(0); Location op2 
= locations->InAt(1); Location out = locations->Out(); @@ -574,7 +644,7 @@ void IntrinsicLocationsBuilderARM64::VisitMathSqrt(HInvoke* invoke) { void IntrinsicCodeGeneratorARM64::VisitMathSqrt(HInvoke* invoke) { LocationSummary* locations = invoke->GetLocations(); - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Fsqrt(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0))); } @@ -584,7 +654,7 @@ void IntrinsicLocationsBuilderARM64::VisitMathCeil(HInvoke* invoke) { void IntrinsicCodeGeneratorARM64::VisitMathCeil(HInvoke* invoke) { LocationSummary* locations = invoke->GetLocations(); - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Frintp(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0))); } @@ -594,7 +664,7 @@ void IntrinsicLocationsBuilderARM64::VisitMathFloor(HInvoke* invoke) { void IntrinsicCodeGeneratorARM64::VisitMathFloor(HInvoke* invoke) { LocationSummary* locations = invoke->GetLocations(); - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Frintm(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0))); } @@ -604,7 +674,7 @@ void IntrinsicLocationsBuilderARM64::VisitMathRint(HInvoke* invoke) { void IntrinsicCodeGeneratorARM64::VisitMathRint(HInvoke* invoke) { LocationSummary* locations = invoke->GetLocations(); - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Frintn(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0))); } @@ -617,7 +687,7 @@ static void CreateFPToIntPlusFPTempLocations(ArenaAllocator* arena, HInvoke* inv locations->AddTemp(Location::RequiresFpuRegister()); } -static void GenMathRound(HInvoke* invoke, bool is_double, vixl::MacroAssembler* masm) { +static void GenMathRound(HInvoke* invoke, bool is_double, vixl::aarch64::MacroAssembler* masm) { // Java 8 API definition for Math.round(): // Return the closest long or int to the argument, with ties rounding to positive infinity. // @@ -635,13 +705,13 @@ static void GenMathRound(HInvoke* invoke, bool is_double, vixl::MacroAssembler* FPRegister in_reg = is_double ? DRegisterFrom(l->InAt(0)) : SRegisterFrom(l->InAt(0)); FPRegister tmp_fp = is_double ? DRegisterFrom(l->GetTemp(0)) : SRegisterFrom(l->GetTemp(0)); Register out_reg = is_double ? XRegisterFrom(l->Out()) : WRegisterFrom(l->Out()); - vixl::Label done; + vixl::aarch64::Label done; // Round to nearest integer, ties away from zero. __ Fcvtas(out_reg, in_reg); // For positive values, zero or NaN inputs, rounding is done. - __ Tbz(out_reg, out_reg.size() - 1, &done); + __ Tbz(out_reg, out_reg.GetSizeInBits() - 1, &done); // Handle input < 0 cases. // If input is negative but not a tie, previous result (round to nearest) is valid. 
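A few concrete inputs show why only the negative-tie case needs fixing up: Fcvtas rounds to nearest with ties away from zero, which coincides with Java's ties-toward-positive-infinity rule everywhere except at a negative halfway value.

//   x =  2.5  ->  Fcvtas:  3    Math.round:  3   (ties agree for positive inputs)
//   x = -2.7  ->  Fcvtas: -3    Math.round: -3   (not a tie: nearest value is already correct)
//   x = -2.5  ->  Fcvtas: -3    Math.round: -2   (negative tie: handled by the input < 0 path)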
@@ -675,7 +745,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPeekByte(HInvoke* invoke) { } void IntrinsicCodeGeneratorARM64::VisitMemoryPeekByte(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Ldrsb(WRegisterFrom(invoke->GetLocations()->Out()), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -685,7 +755,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPeekIntNative(HInvoke* invoke) { } void IntrinsicCodeGeneratorARM64::VisitMemoryPeekIntNative(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Ldr(WRegisterFrom(invoke->GetLocations()->Out()), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -695,7 +765,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPeekLongNative(HInvoke* invoke) } void IntrinsicCodeGeneratorARM64::VisitMemoryPeekLongNative(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Ldr(XRegisterFrom(invoke->GetLocations()->Out()), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -705,7 +775,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPeekShortNative(HInvoke* invoke) } void IntrinsicCodeGeneratorARM64::VisitMemoryPeekShortNative(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Ldrsh(WRegisterFrom(invoke->GetLocations()->Out()), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -723,7 +793,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPokeByte(HInvoke* invoke) { } void IntrinsicCodeGeneratorARM64::VisitMemoryPokeByte(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Strb(WRegisterFrom(invoke->GetLocations()->InAt(1)), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -733,7 +803,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPokeIntNative(HInvoke* invoke) { } void IntrinsicCodeGeneratorARM64::VisitMemoryPokeIntNative(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Str(WRegisterFrom(invoke->GetLocations()->InAt(1)), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -743,7 +813,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPokeLongNative(HInvoke* invoke) } void IntrinsicCodeGeneratorARM64::VisitMemoryPokeLongNative(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Str(XRegisterFrom(invoke->GetLocations()->InAt(1)), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -753,7 +823,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPokeShortNative(HInvoke* invoke) } void IntrinsicCodeGeneratorARM64::VisitMemoryPokeShortNative(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Strh(WRegisterFrom(invoke->GetLocations()->InAt(1)), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -767,7 +837,7 @@ void IntrinsicLocationsBuilderARM64::VisitThreadCurrentThread(HInvoke* invoke) { void IntrinsicCodeGeneratorARM64::VisitThreadCurrentThread(HInvoke* invoke) { codegen_->Load(Primitive::kPrimNot, WRegisterFrom(invoke->GetLocations()->Out()), - MemOperand(tr, Thread::PeerOffset<8>().Int32Value())); + MemOperand(tr, Thread::PeerOffset<kArm64PointerSize>().Int32Value())); } static void 
GenUnsafeGet(HInvoke* invoke, @@ -778,7 +848,7 @@ static void GenUnsafeGet(HInvoke* invoke, DCHECK((type == Primitive::kPrimInt) || (type == Primitive::kPrimLong) || (type == Primitive::kPrimNot)); - vixl::MacroAssembler* masm = codegen->GetAssembler()->vixl_masm_; + MacroAssembler* masm = codegen->GetVIXLAssembler(); Location base_loc = locations->InAt(1); Register base = WRegisterFrom(base_loc); // Object pointer. Location offset_loc = locations->InAt(2); @@ -912,7 +982,7 @@ static void GenUnsafePut(LocationSummary* locations, bool is_volatile, bool is_ordered, CodeGeneratorARM64* codegen) { - vixl::MacroAssembler* masm = codegen->GetAssembler()->vixl_masm_; + MacroAssembler* masm = codegen->GetVIXLAssembler(); Register base = WRegisterFrom(locations->InAt(1)); // Object pointer. Register offset = XRegisterFrom(locations->InAt(2)); // Long offset. @@ -1031,7 +1101,7 @@ static void CreateIntIntIntIntIntToInt(ArenaAllocator* arena, } static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGeneratorARM64* codegen) { - vixl::MacroAssembler* masm = codegen->GetAssembler()->vixl_masm_; + MacroAssembler* masm = codegen->GetVIXLAssembler(); Register out = WRegisterFrom(locations->Out()); // Boolean result. @@ -1070,7 +1140,7 @@ static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGenerat // } while (tmp_value == 0 && failure([tmp_ptr] <- r_new_value)); // result = tmp_value != 0; - vixl::Label loop_head, exit_loop; + vixl::aarch64::Label loop_head, exit_loop; __ Bind(&loop_head); // TODO: When `type == Primitive::kPrimNot`, add a read barrier for // the reference stored in the object before attempting the CAS, @@ -1154,20 +1224,22 @@ void IntrinsicLocationsBuilderARM64::VisitStringCompareTo(HInvoke* invoke) { } void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); - Register str = XRegisterFrom(locations->InAt(0)); - Register arg = XRegisterFrom(locations->InAt(1)); + Register str = InputRegisterAt(invoke, 0); + Register arg = InputRegisterAt(invoke, 1); + DCHECK(str.IsW()); + DCHECK(arg.IsW()); Register out = OutputRegister(invoke); Register temp0 = WRegisterFrom(locations->GetTemp(0)); Register temp1 = WRegisterFrom(locations->GetTemp(1)); Register temp2 = WRegisterFrom(locations->GetTemp(2)); - vixl::Label loop; - vixl::Label find_char_diff; - vixl::Label end; + vixl::aarch64::Label loop; + vixl::aarch64::Label find_char_diff; + vixl::aarch64::Label end; // Get offsets of count and value fields within a string object. const int32_t count_offset = mirror::String::CountOffset().Int32Value(); @@ -1189,8 +1261,8 @@ void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) { __ Subs(out, str, arg); __ B(&end, eq); // Load lengths of this and argument strings. - __ Ldr(temp0, MemOperand(str.X(), count_offset)); - __ Ldr(temp1, MemOperand(arg.X(), count_offset)); + __ Ldr(temp0, HeapOperand(str, count_offset)); + __ Ldr(temp1, HeapOperand(arg, count_offset)); // Return zero if both strings are empty. __ Orr(out, temp0, temp1); __ Cbz(out, &end); @@ -1219,8 +1291,8 @@ void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) { // Loop to compare 4x16-bit characters at a time (ok because of string data alignment). 
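For reference, the loop below implements the usual compareTo contract over the shared prefix: return the difference of the first pair of characters that differ, and fall back to the length difference already held in `out` when the prefix matches. A scalar sketch of that contract, assuming 16-bit character data as in the comment above; the emitted code instead compares four characters per 64-bit load and locates the differing character with CLZ:

#include <cstdint>

// Scalar reference for the comparison loop (sketch only).
int32_t CompareSharedPrefix(const uint16_t* lhs, const uint16_t* rhs,
                            int32_t min_length, int32_t length_diff) {
  for (int32_t i = 0; i < min_length; ++i) {
    if (lhs[i] != rhs[i]) {
      return static_cast<int32_t>(lhs[i]) - static_cast<int32_t>(rhs[i]);
    }
  }
  return length_diff;  // identical prefix: the longer string compares greater
}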
__ Bind(&loop); - __ Ldr(temp4, MemOperand(str.X(), temp1)); - __ Ldr(temp0, MemOperand(arg.X(), temp1)); + __ Ldr(temp4, MemOperand(str.X(), temp1.X())); + __ Ldr(temp0, MemOperand(arg.X(), temp1.X())); __ Cmp(temp4, temp0); __ B(ne, &find_char_diff); __ Add(temp1, temp1, char_size * 4); @@ -1239,14 +1311,14 @@ void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) { __ Clz(temp1, temp1); // If the number of 16-bit chars remaining <= the index where the difference occurs (0-3), then // the difference occurs outside the remaining string data, so just return length diff (out). - __ Cmp(temp2, Operand(temp1, LSR, 4)); + __ Cmp(temp2, Operand(temp1.W(), LSR, 4)); __ B(le, &end); // Extract the characters and calculate the difference. __ Bic(temp1, temp1, 0xf); __ Lsr(temp0, temp0, temp1); __ Lsr(temp4, temp4, temp1); __ And(temp4, temp4, 0xffff); - __ Sub(out, temp4, Operand(temp0, UXTH)); + __ Sub(out, temp4.W(), Operand(temp0.W(), UXTH)); __ Bind(&end); @@ -1269,7 +1341,7 @@ void IntrinsicLocationsBuilderARM64::VisitStringEquals(HInvoke* invoke) { } void IntrinsicCodeGeneratorARM64::VisitStringEquals(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); Register str = WRegisterFrom(locations->InAt(0)); @@ -1281,10 +1353,10 @@ void IntrinsicCodeGeneratorARM64::VisitStringEquals(HInvoke* invoke) { Register temp1 = WRegisterFrom(locations->GetTemp(0)); Register temp2 = WRegisterFrom(locations->GetTemp(1)); - vixl::Label loop; - vixl::Label end; - vixl::Label return_true; - vixl::Label return_false; + vixl::aarch64::Label loop; + vixl::aarch64::Label end; + vixl::aarch64::Label return_true; + vixl::aarch64::Label return_false; // Get offsets of count, value, and class fields within a string object. const int32_t count_offset = mirror::String::CountOffset().Int32Value(); @@ -1357,7 +1429,7 @@ void IntrinsicCodeGeneratorARM64::VisitStringEquals(HInvoke* invoke) { } static void GenerateVisitStringIndexOf(HInvoke* invoke, - vixl::MacroAssembler* masm, + MacroAssembler* masm, CodeGeneratorARM64* codegen, ArenaAllocator* allocator, bool start_at_zero) { @@ -1394,7 +1466,7 @@ static void GenerateVisitStringIndexOf(HInvoke* invoke, __ Mov(tmp_reg, 0); } - __ Ldr(lr, MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pIndexOf).Int32Value())); + __ Ldr(lr, MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, pIndexOf).Int32Value())); CheckEntrypointTypes<kQuickIndexOf, int32_t, void*, uint32_t, uint32_t>(); __ Blr(lr); @@ -1405,7 +1477,7 @@ static void GenerateVisitStringIndexOf(HInvoke* invoke, void IntrinsicLocationsBuilderARM64::VisitStringIndexOf(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's // best to align the inputs accordingly. @@ -1425,7 +1497,7 @@ void IntrinsicCodeGeneratorARM64::VisitStringIndexOf(HInvoke* invoke) { void IntrinsicLocationsBuilderARM64::VisitStringIndexOfAfter(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's // best to align the inputs accordingly. 
@@ -1443,7 +1515,7 @@ void IntrinsicCodeGeneratorARM64::VisitStringIndexOfAfter(HInvoke* invoke) { void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromBytes(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); @@ -1454,7 +1526,7 @@ void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromBytes(HInvoke* invo } void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromBytes(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); Register byte_array = WRegisterFrom(locations->InAt(0)); @@ -1464,7 +1536,8 @@ void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromBytes(HInvoke* invoke) __ B(eq, slow_path->GetEntryLabel()); __ Ldr(lr, - MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pAllocStringFromBytes).Int32Value())); + MemOperand(tr, + QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, pAllocStringFromBytes).Int32Value())); CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>(); __ Blr(lr); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1473,7 +1546,7 @@ void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromBytes(HInvoke* invoke) void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromChars(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); @@ -1483,7 +1556,7 @@ void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromChars(HInvoke* invo } void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromChars(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); // No need to emit code checking whether `locations->InAt(2)` is a null // pointer, as callers of the native method @@ -1492,7 +1565,8 @@ void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromChars(HInvoke* invoke) // // all include a null check on `data` before calling that method. 
__ Ldr(lr, - MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pAllocStringFromChars).Int32Value())); + MemOperand(tr, + QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, pAllocStringFromChars).Int32Value())); CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>(); __ Blr(lr); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1500,7 +1574,7 @@ void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromChars(HInvoke* invoke) void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromString(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); @@ -1508,7 +1582,7 @@ void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromString(HInvoke* inv } void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromString(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); Register string_to_copy = WRegisterFrom(locations->InAt(0)); @@ -1518,7 +1592,8 @@ void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromString(HInvoke* invoke __ B(eq, slow_path->GetEntryLabel()); __ Ldr(lr, - MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pAllocStringFromString).Int32Value())); + MemOperand(tr, + QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, pAllocStringFromString).Int32Value())); CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>(); __ Blr(lr); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1531,7 +1606,7 @@ static void CreateFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { DCHECK(Primitive::IsFloatingPointType(invoke->GetType())); LocationSummary* const locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; @@ -1546,7 +1621,7 @@ static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) DCHECK(Primitive::IsFloatingPointType(invoke->GetType())); LocationSummary* const locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; @@ -1556,10 +1631,11 @@ static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) } static void GenFPToFPCall(HInvoke* invoke, - vixl::MacroAssembler* masm, + MacroAssembler* masm, CodeGeneratorARM64* codegen, QuickEntrypointEnum entry) { - __ Ldr(lr, MemOperand(tr, GetThreadOffset<kArm64WordSize>(entry).Int32Value())); + __ Ldr(lr, MemOperand(tr, + GetThreadOffset<kArm64PointerSize>(entry).Int32Value())); __ Blr(lr); codegen->RecordPcInfo(invoke, invoke->GetDexPc()); } @@ -1716,7 +1792,7 @@ void IntrinsicLocationsBuilderARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) } void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); // Check assumption that sizeof(Char) is 2 (used in scaling below). @@ -1756,9 +1832,9 @@ void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) { __ Sub(num_chr, srcEnd, srcBegin); // Do the copy. 
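The copy that follows has two stages: a bulk loop moving eight 16-bit characters per iteration with Ldp/Stp, then a one-character remainder loop with Ldrh/Strh. A scalar sketch of that shape (names are illustrative):

#include <cstdint>

// Sketch of the copy structure emitted below.
void CopyChars(const uint16_t* src, uint16_t* dst, int64_t num_chr) {
  while (num_chr >= 8) {                 // main loop: eight chars per iteration
    for (int i = 0; i < 8; ++i) {
      dst[i] = src[i];
    }
    src += 8;
    dst += 8;
    num_chr -= 8;
  }
  while (num_chr > 0) {                  // remainder: one char at a time
    *dst++ = *src++;
    --num_chr;
  }
}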
- vixl::Label loop; - vixl::Label done; - vixl::Label remainder; + vixl::aarch64::Label loop; + vixl::aarch64::Label done; + vixl::aarch64::Label remainder; // Early out for valid zero-length retrievals. __ Cbz(num_chr, &done); @@ -1773,9 +1849,9 @@ void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) { // Main loop used for longer fetches loads and stores 8x16-bit characters at a time. // (Unaligned addresses are acceptable here and not worth inlining extra code to rectify.) __ Bind(&loop); - __ Ldp(tmp1, tmp2, MemOperand(src_ptr, char_size * 8, vixl::PostIndex)); + __ Ldp(tmp1, tmp2, MemOperand(src_ptr, char_size * 8, PostIndex)); __ Subs(num_chr, num_chr, 8); - __ Stp(tmp1, tmp2, MemOperand(dst_ptr, char_size * 8, vixl::PostIndex)); + __ Stp(tmp1, tmp2, MemOperand(dst_ptr, char_size * 8, PostIndex)); __ B(ge, &loop); __ Adds(num_chr, num_chr, 8); @@ -1784,9 +1860,9 @@ void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) { // Main loop for < 8 character case and remainder handling. Loads and stores one // 16-bit Java character at a time. __ Bind(&remainder); - __ Ldrh(tmp1, MemOperand(src_ptr, char_size, vixl::PostIndex)); + __ Ldrh(tmp1, MemOperand(src_ptr, char_size, PostIndex)); __ Subs(num_chr, num_chr, 1); - __ Strh(tmp1, MemOperand(dst_ptr, char_size, vixl::PostIndex)); + __ Strh(tmp1, MemOperand(dst_ptr, char_size, PostIndex)); __ B(gt, &remainder); __ Bind(&done); @@ -1800,7 +1876,7 @@ static void SetSystemArrayCopyLocationRequires(LocationSummary* locations, uint32_t at, HInstruction* input) { HIntConstant* const_input = input->AsIntConstant(); - if (const_input != nullptr && !vixl::Assembler::IsImmAddSub(const_input->GetValue())) { + if (const_input != nullptr && !vixl::aarch64::Assembler::IsImmAddSub(const_input->GetValue())) { locations->SetInAt(at, Location::RequiresRegister()); } else { locations->SetInAt(at, Location::RegisterOrConstant(input)); @@ -1847,7 +1923,7 @@ void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopyChar(HInvoke* invoke) { locations->AddTemp(Location::RequiresRegister()); } -static void CheckSystemArrayCopyPosition(vixl::MacroAssembler* masm, +static void CheckSystemArrayCopyPosition(MacroAssembler* masm, const Location& pos, const Register& input, const Location& length, @@ -1880,7 +1956,7 @@ static void CheckSystemArrayCopyPosition(vixl::MacroAssembler* masm, } else { // Check that pos >= 0. Register pos_reg = WRegisterFrom(pos); - __ Tbnz(pos_reg, pos_reg.size() - 1, slow_path->GetEntryLabel()); + __ Tbnz(pos_reg, pos_reg.GetSizeInBits() - 1, slow_path->GetEntryLabel()); // Check that pos <= length(input) && (length(input) - pos) >= length. __ Ldr(temp, MemOperand(input, length_offset)); @@ -1893,7 +1969,7 @@ static void CheckSystemArrayCopyPosition(vixl::MacroAssembler* masm, // Compute base source address, base destination address, and end source address // for System.arraycopy* intrinsics. 
-static void GenSystemArrayCopyAddresses(vixl::MacroAssembler* masm, +static void GenSystemArrayCopyAddresses(MacroAssembler* masm, Primitive::Type type, const Register& src, const Location& src_pos, @@ -1934,7 +2010,7 @@ static void GenSystemArrayCopyAddresses(vixl::MacroAssembler* masm, } void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopyChar(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); Register src = XRegisterFrom(locations->InAt(0)); Location src_pos = locations->InAt(1); @@ -2007,12 +2083,12 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopyChar(HInvoke* invoke) { const int32_t char_size = Primitive::ComponentSize(Primitive::kPrimChar); UseScratchRegisterScope temps(masm); Register tmp = temps.AcquireW(); - vixl::Label loop, done; + vixl::aarch64::Label loop, done; __ Bind(&loop); __ Cmp(src_curr_addr, src_stop_addr); __ B(&done, eq); - __ Ldrh(tmp, MemOperand(src_curr_addr, char_size, vixl::PostIndex)); - __ Strh(tmp, MemOperand(dst_curr_addr, char_size, vixl::PostIndex)); + __ Ldrh(tmp, MemOperand(src_curr_addr, char_size, PostIndex)); + __ Strh(tmp, MemOperand(dst_curr_addr, char_size, PostIndex)); __ B(&loop); __ Bind(&done); @@ -2026,9 +2102,9 @@ static constexpr int32_t kSystemArrayCopyThreshold = 128; // We want to use two temporary registers in order to reduce the register pressure in arm64. // So we don't use the CodeGenerator::CreateSystemArrayCopyLocationSummary. void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - if (kEmitCompilerReadBarrier) { + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { return; } @@ -2081,20 +2157,29 @@ void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) { locations->AddTemp(Location::RequiresRegister()); locations->AddTemp(Location::RequiresRegister()); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // Temporary register IP0, obtained from the VIXL scratch register + // pool, cannot be used in ReadBarrierSystemArrayCopySlowPathARM64 + // (because that register is clobbered by ReadBarrierMarkRegX + // entry points). Get an extra temporary register from the + // register allocator. + locations->AddTemp(Location::RequiresRegister()); + } } void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - DCHECK(!kEmitCompilerReadBarrier); + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. 
+ DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); Register src = XRegisterFrom(locations->InAt(0)); Location src_pos = locations->InAt(1); @@ -2102,12 +2187,14 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { Location dest_pos = locations->InAt(3); Location length = locations->InAt(4); Register temp1 = WRegisterFrom(locations->GetTemp(0)); + Location temp1_loc = LocationFrom(temp1); Register temp2 = WRegisterFrom(locations->GetTemp(1)); + Location temp2_loc = LocationFrom(temp2); - SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke); - codegen_->AddSlowPath(slow_path); + SlowPathCodeARM64* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke); + codegen_->AddSlowPath(intrinsic_slow_path); - vixl::Label conditions_on_positions_validated; + vixl::aarch64::Label conditions_on_positions_validated; SystemArrayCopyOptimizations optimizations(invoke); // If source and destination are the same, we go to slow path if we need to do @@ -2121,7 +2208,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { DCHECK_GE(src_pos_constant, dest_pos_constant); } else if (src_pos_constant < dest_pos_constant) { __ Cmp(src, dest); - __ B(slow_path->GetEntryLabel(), eq); + __ B(intrinsic_slow_path->GetEntryLabel(), eq); } // Checked when building locations. DCHECK(!optimizations.GetDestinationIsSource() @@ -2132,7 +2219,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { __ B(&conditions_on_positions_validated, ne); } __ Cmp(WRegisterFrom(dest_pos), src_pos_constant); - __ B(slow_path->GetEntryLabel(), gt); + __ B(intrinsic_slow_path->GetEntryLabel(), gt); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -2141,19 +2228,19 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { } __ Cmp(RegisterFrom(src_pos, invoke->InputAt(1)->GetType()), OperandFrom(dest_pos, invoke->InputAt(3)->GetType())); - __ B(slow_path->GetEntryLabel(), lt); + __ B(intrinsic_slow_path->GetEntryLabel(), lt); } __ Bind(&conditions_on_positions_validated); if (!optimizations.GetSourceIsNotNull()) { // Bail out if the source is null. - __ Cbz(src, slow_path->GetEntryLabel()); + __ Cbz(src, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) { // Bail out if the destination is null. - __ Cbz(dest, slow_path->GetEntryLabel()); + __ Cbz(dest, intrinsic_slow_path->GetEntryLabel()); } // We have already checked in the LocationsBuilder for the constant case. @@ -2161,17 +2248,17 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { !optimizations.GetCountIsSourceLength() && !optimizations.GetCountIsDestinationLength()) { // If the length is negative, bail out. 
- __ Tbnz(WRegisterFrom(length), kWRegSize - 1, slow_path->GetEntryLabel()); + __ Tbnz(WRegisterFrom(length), kWRegSize - 1, intrinsic_slow_path->GetEntryLabel()); // If the length >= 128 then (currently) prefer native implementation. __ Cmp(WRegisterFrom(length), kSystemArrayCopyThreshold); - __ B(slow_path->GetEntryLabel(), ge); + __ B(intrinsic_slow_path->GetEntryLabel(), ge); } // Validity checks: source. CheckSystemArrayCopyPosition(masm, src_pos, src, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsSourceLength()); @@ -2180,90 +2267,236 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { dest_pos, dest, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsDestinationLength()); { // We use a block to end the scratch scope before the write barrier, thus // freeing the temporary registers so they can be used in `MarkGCCard`. UseScratchRegisterScope temps(masm); + // Note: Because it is acquired from VIXL's scratch register pool, + // `temp3` might be IP0, and thus cannot be used as `ref` argument + // of CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier + // calls below (see ReadBarrierMarkSlowPathARM64 for more details). Register temp3 = temps.AcquireW(); + if (!optimizations.GetDoesNotNeedTypeCheck()) { // Check whether all elements of the source array are assignable to the component // type of the destination array. We do two checks: the classes are the same, // or the destination is Object[]. If none of these checks succeed, we go to the // slow path. - __ Ldr(temp1, MemOperand(dest, class_offset)); - __ Ldr(temp2, MemOperand(src, class_offset)); - bool did_unpoison = false; - if (!optimizations.GetDestinationIsNonPrimitiveArray() || - !optimizations.GetSourceIsNonPrimitiveArray()) { - // One or two of the references need to be unpoisoned. Unpoison them - // both to make the identity check valid. - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2); - did_unpoison = true; - } - if (!optimizations.GetDestinationIsNonPrimitiveArray()) { - // Bail out if the destination is not a non primitive array. - // /* HeapReference<Class> */ temp3 = temp1->component_type_ - __ Ldr(temp3, HeapOperand(temp1, component_offset)); - __ Cbz(temp3, slow_path->GetEntryLabel()); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3); - __ Ldrh(temp3, HeapOperand(temp3, primitive_offset)); - static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ Cbnz(temp3, slow_path->GetEntryLabel()); - } + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + if (!optimizations.GetSourceIsNonPrimitiveArray()) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp1_loc, + src.W(), + class_offset, + temp2, + /* needs_null_check */ false, + /* use_load_acquire */ false); + // Bail out if the source is not a non primitive array. + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp1_loc, + temp1, + component_offset, + temp2, + /* needs_null_check */ false, + /* use_load_acquire */ false); + __ Cbz(temp1, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp1` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. 
+ // /* uint16_t */ temp1 = static_cast<uint16>(temp1->primitive_type_); + __ Ldrh(temp1, HeapOperand(temp1, primitive_offset)); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel()); + } - if (!optimizations.GetSourceIsNonPrimitiveArray()) { - // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ temp3 = temp2->component_type_ - __ Ldr(temp3, HeapOperand(temp2, component_offset)); - __ Cbz(temp3, slow_path->GetEntryLabel()); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3); - __ Ldrh(temp3, HeapOperand(temp3, primitive_offset)); - static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ Cbnz(temp3, slow_path->GetEntryLabel()); - } + // /* HeapReference<Class> */ temp1 = dest->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp1_loc, + dest.W(), + class_offset, + temp2, + /* needs_null_check */ false, + /* use_load_acquire */ false); + + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // + // Register `temp1` is not trashed by the read barrier emitted + // by GenerateFieldLoadWithBakerReadBarrier below, as that + // method produces a call to a ReadBarrierMarkRegX entry point, + // which saves all potentially live registers, including + // temporaries such a `temp1`. + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp2_loc, + temp1, + component_offset, + temp3, + /* needs_null_check */ false, + /* use_load_acquire */ false); + __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp2` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + // /* uint16_t */ temp2 = static_cast<uint16>(temp2->primitive_type_); + __ Ldrh(temp2, HeapOperand(temp2, primitive_offset)); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Cbnz(temp2, intrinsic_slow_path->GetEntryLabel()); + } + + // For the same reason given earlier, `temp1` is not trashed by the + // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below. + // /* HeapReference<Class> */ temp2 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp2_loc, + src.W(), + class_offset, + temp3, + /* needs_null_check */ false, + /* use_load_acquire */ false); + // Note: if heap poisoning is on, we are comparing two unpoisoned references here. + __ Cmp(temp1, temp2); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + vixl::aarch64::Label do_copy; + __ B(&do_copy, eq); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp1_loc, + temp1, + component_offset, + temp2, + /* needs_null_check */ false, + /* use_load_acquire */ false); + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + // We do not need to emit a read barrier for the following + // heap reference load, as `temp1` is only used in a + // comparison with null below, and this reference is not + // kept afterwards. + __ Ldr(temp1, HeapOperand(temp1, super_offset)); + __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ B(intrinsic_slow_path->GetEntryLabel(), ne); + } + } else { + // Non read barrier code. 
+ + // /* HeapReference<Class> */ temp1 = dest->klass_ + __ Ldr(temp1, MemOperand(dest, class_offset)); + // /* HeapReference<Class> */ temp2 = src->klass_ + __ Ldr(temp2, MemOperand(src, class_offset)); + bool did_unpoison = false; + if (!optimizations.GetDestinationIsNonPrimitiveArray() || + !optimizations.GetSourceIsNonPrimitiveArray()) { + // One or two of the references need to be unpoisoned. Unpoison them + // both to make the identity check valid. + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2); + did_unpoison = true; + } + + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // /* HeapReference<Class> */ temp3 = temp1->component_type_ + __ Ldr(temp3, HeapOperand(temp1, component_offset)); + __ Cbz(temp3, intrinsic_slow_path->GetEntryLabel()); + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3); + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); + __ Ldrh(temp3, HeapOperand(temp3, primitive_offset)); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Cbnz(temp3, intrinsic_slow_path->GetEntryLabel()); + } + + if (!optimizations.GetSourceIsNonPrimitiveArray()) { + // Bail out if the source is not a non primitive array. + // /* HeapReference<Class> */ temp3 = temp2->component_type_ + __ Ldr(temp3, HeapOperand(temp2, component_offset)); + __ Cbz(temp3, intrinsic_slow_path->GetEntryLabel()); + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3); + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); + __ Ldrh(temp3, HeapOperand(temp3, primitive_offset)); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Cbnz(temp3, intrinsic_slow_path->GetEntryLabel()); + } - __ Cmp(temp1, temp2); + __ Cmp(temp1, temp2); - if (optimizations.GetDestinationIsTypedObjectArray()) { - vixl::Label do_copy; - __ B(&do_copy, eq); - if (!did_unpoison) { + if (optimizations.GetDestinationIsTypedObjectArray()) { + vixl::aarch64::Label do_copy; + __ B(&do_copy, eq); + if (!did_unpoison) { + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); + } + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ Ldr(temp1, HeapOperand(temp1, component_offset)); codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + __ Ldr(temp1, HeapOperand(temp1, super_offset)); + // No need to unpoison the result, we're comparing against null. + __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ B(intrinsic_slow_path->GetEntryLabel(), ne); } - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ Ldr(temp1, HeapOperand(temp1, component_offset)); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp1 = temp1->super_class_ - __ Ldr(temp1, HeapOperand(temp1, super_offset)); - // No need to unpoison the result, we're comparing against null. - __ Cbnz(temp1, slow_path->GetEntryLabel()); - __ Bind(&do_copy); - } else { - __ B(slow_path->GetEntryLabel(), ne); } } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); // Bail out if the source is not a non primitive array. 
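// Taken together, the Baker and non-Baker paths above implement one
// assignability check. The sketch below is only a host-side model of that
// logic (ignoring the `optimizations` flags that allow some loads to be
// skipped); `KlassSketch` is a stand-in, not the real mirror::Class layout,
// and every `return true` corresponds to a branch to the intrinsic slow path.
#include <cstdint>

struct KlassSketch {
  KlassSketch* component_type;  // null if this is not an array class
  KlassSketch* super_class;     // null only for java.lang.Object
  uint16_t primitive_type;      // Primitive::kPrimNot (0) for reference types
};

static bool NeedsSlowPathTypeCheck(const KlassSketch* src_klass, const KlassSketch* dest_klass) {
  // Both source and destination must be arrays of references.
  if (dest_klass->component_type == nullptr ||
      dest_klass->component_type->primitive_type != 0) {
    return true;
  }
  if (src_klass->component_type == nullptr ||
      src_klass->component_type->primitive_type != 0) {
    return true;
  }
  if (src_klass == dest_klass) {
    return false;  // same element type: the copy cannot violate the destination type
  }
  // Otherwise the copy is only statically safe when the destination is
  // Object[], i.e. its component type has no super class.
  return dest_klass->component_type->super_class != nullptr;
}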
- // /* HeapReference<Class> */ temp1 = src->klass_ - __ Ldr(temp1, HeapOperand(src.W(), class_offset)); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp3 = temp1->component_type_ - __ Ldr(temp3, HeapOperand(temp1, component_offset)); - __ Cbz(temp3, slow_path->GetEntryLabel()); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3); - __ Ldrh(temp3, HeapOperand(temp3, primitive_offset)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp1_loc, + src.W(), + class_offset, + temp2, + /* needs_null_check */ false, + /* use_load_acquire */ false); + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp2_loc, + temp1, + component_offset, + temp3, + /* needs_null_check */ false, + /* use_load_acquire */ false); + __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp2` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ temp1 = src->klass_ + __ Ldr(temp1, HeapOperand(src.W(), class_offset)); + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + __ Ldr(temp2, HeapOperand(temp1, component_offset)); + __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel()); + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2); + } + // /* uint16_t */ temp2 = static_cast<uint16>(temp2->primitive_type_); + __ Ldrh(temp2, HeapOperand(temp2, primitive_offset)); static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ Cbnz(temp3, slow_path->GetEntryLabel()); + __ Cbnz(temp2, intrinsic_slow_path->GetEntryLabel()); } Register src_curr_addr = temp1.X(); Register dst_curr_addr = temp2.X(); - Register src_stop_addr = temp3.X(); + Register src_stop_addr; + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // Temporary register IP0, obtained from the VIXL scratch + // register pool as `temp3`, cannot be used in + // ReadBarrierSystemArrayCopySlowPathARM64 (because that + // register is clobbered by ReadBarrierMarkRegX entry points). + // So another temporary register allocated by the register + // allocator instead. + DCHECK_EQ(LocationFrom(temp3).reg(), IP0); + src_stop_addr = XRegisterFrom(locations->GetTemp(2)); + } else { + src_stop_addr = temp3.X(); + } GenSystemArrayCopyAddresses(masm, Primitive::kPrimNot, @@ -2276,30 +2509,103 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { dst_curr_addr, src_stop_addr); - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. - vixl::Label loop, done; const int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); - __ Bind(&loop); - __ Cmp(src_curr_addr, src_stop_addr); - __ B(&done, eq); - { + + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // SystemArrayCopy implementation for Baker read barriers (see + // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier): + // + // if (src_ptr != end_ptr) { + // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // bool is_gray = (rb_state == ReadBarrier::gray_ptr_); + // if (is_gray) { + // // Slow-path copy. 
+ // do { + // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); + // } while (src_ptr != end_ptr) + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + // } + + vixl::aarch64::Label loop, done; + + // Don't enter copy loop if `length == 0`. + __ Cmp(src_curr_addr, src_stop_addr); + __ B(&done, eq); + Register tmp = temps.AcquireW(); - __ Ldr(tmp, MemOperand(src_curr_addr, element_size, vixl::PostIndex)); - __ Str(tmp, MemOperand(dst_curr_addr, element_size, vixl::PostIndex)); + // Make sure `tmp` is not IP0, as it is clobbered by + // ReadBarrierMarkRegX entry points in + // ReadBarrierSystemArrayCopySlowPathARM64. + DCHECK_NE(LocationFrom(tmp).reg(), IP0); + + // /* int32_t */ monitor = src->monitor_ + __ Ldr(tmp, HeapOperand(src.W(), monitor_offset)); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including rb_state, + // to prevent load-load reordering, and without using + // a memory barrier (which would be more expensive). + // `src` is unchanged by this operation, but its value now depends + // on `tmp`. + __ Add(src.X(), src.X(), Operand(tmp.X(), LSR, 32)); + + // Slow path used to copy array when `src` is gray. + SlowPathCodeARM64* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM64(invoke, LocationFrom(tmp)); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the rb_state. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ Tbnz(tmp, LockWord::kReadBarrierStateShift, read_barrier_slow_path->GetEntryLabel()); + + // Fast-path copy. + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + __ Bind(&loop); + __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex)); + __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex)); + __ Cmp(src_curr_addr, src_stop_addr); + __ B(&loop, ne); + + __ Bind(read_barrier_slow_path->GetExitLabel()); + __ Bind(&done); + } else { + // Non read barrier code. + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + vixl::aarch64::Label loop, done; + __ Bind(&loop); + __ Cmp(src_curr_addr, src_stop_addr); + __ B(&done, eq); + { + Register tmp = temps.AcquireW(); + __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex)); + __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex)); + } + __ B(&loop); + __ Bind(&done); } - __ B(&loop); - __ Bind(&done); } // We only need one card marking on the destination array. 
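// The gray check and the two copy loops above can be summarized by a small
// host-side model; the types and Mark() below are stand-ins, not the ART
// runtime API, and `src_is_gray` abstracts the Tbnz test on
// LockWord::kReadBarrierStateShift.
using ObjRef = void*;

static ObjRef Mark(ObjRef ref) {  // placeholder for a ReadBarrierMarkRegX entry point
  return ref;                     // the real entry point may return a forwarded reference
}

static void BakerCopySketch(ObjRef* src_ptr, ObjRef* end_ptr, ObjRef* dest_ptr, bool src_is_gray) {
  if (src_ptr == end_ptr) {
    return;  // zero-length copy: skip both loops
  }
  if (src_is_gray) {
    do {  // slow path: every loaded reference goes through the mark entry point
      *dest_ptr++ = Mark(*src_ptr++);
    } while (src_ptr != end_ptr);
  } else {
    do {  // fast path: raw copy, no per-element work
      *dest_ptr++ = *src_ptr++;
    } while (src_ptr != end_ptr);
  }
}
// One detail worth spelling out: the `Add(src.X(), src.X(), Operand(tmp.X(), LSR, 32))`
// above is arithmetically a no-op, because the 32-bit Ldr of the monitor word
// zero-extends into tmp.X(), so the shifted operand is zero and `src` keeps its
// value. Its only purpose is to make later uses of `src` address-depend on the
// lock-word load, which orders that load before the element loads without a Dmb.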
codegen_->MarkGCCard(dest.W(), Register(), /* value_can_be_null */ false); - __ Bind(slow_path->GetExitLabel()); + __ Bind(intrinsic_slow_path->GetExitLabel()); } static void GenIsInfinite(LocationSummary* locations, bool is64bit, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { Operand infinity; Register out; @@ -2311,7 +2617,7 @@ static void GenIsInfinite(LocationSummary* locations, out = WRegisterFrom(locations->Out()); } - const Register zero = vixl::Assembler::AppropriateZeroRegFor(out); + const Register zero = vixl::aarch64::Assembler::AppropriateZeroRegFor(out); MoveFPToInt(locations, is64bit, masm); __ Eor(out, out, infinity); diff --git a/compiler/optimizing/intrinsics_arm64.h b/compiler/optimizing/intrinsics_arm64.h index d47448a9c3..525153621b 100644 --- a/compiler/optimizing/intrinsics_arm64.h +++ b/compiler/optimizing/intrinsics_arm64.h @@ -20,10 +20,11 @@ #include "intrinsics.h" namespace vixl { +namespace aarch64 { class MacroAssembler; -} // namespace vixl +}} // namespace vixl::aarch64 namespace art { @@ -73,7 +74,7 @@ INTRINSICS_LIST(OPTIMIZING_INTRINSICS) #undef OPTIMIZING_INTRINSICS private: - vixl::MacroAssembler* GetVIXLAssembler(); + vixl::aarch64::MacroAssembler* GetVIXLAssembler(); ArenaAllocator* GetAllocator(); diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc index 7137fd9c11..6e5eb6622b 100644 --- a/compiler/optimizing/intrinsics_mips.cc +++ b/compiler/optimizing/intrinsics_mips.cc @@ -1875,7 +1875,7 @@ void IntrinsicCodeGeneratorMIPS::VisitUnsafeCASObject(HInvoke* invoke) { // int java.lang.String.compareTo(String anotherString) void IntrinsicLocationsBuilderMIPS::VisitStringCompareTo(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1899,8 +1899,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringCompareTo(HInvoke* invoke) { __ LoadFromOffset(kLoadWord, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMipsWordSize, - pStringCompareTo).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pStringCompareTo).Int32Value()); __ Jalr(T9); __ Nop(); __ Bind(slow_path->GetExitLabel()); @@ -2059,7 +2058,7 @@ static void GenerateStringIndexOf(HInvoke* invoke, __ LoadFromOffset(kLoadWord, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMipsWordSize, pIndexOf).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pIndexOf).Int32Value()); __ Jalr(T9); __ Nop(); @@ -2071,7 +2070,7 @@ static void GenerateStringIndexOf(HInvoke* invoke, // int java.lang.String.indexOf(int ch) void IntrinsicLocationsBuilderMIPS::VisitStringIndexOf(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime // calling convention. So it's best to align the inputs accordingly. @@ -2096,7 +2095,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringIndexOf(HInvoke* invoke) { // int java.lang.String.indexOf(int ch, int fromIndex) void IntrinsicLocationsBuilderMIPS::VisitStringIndexOfAfter(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime // calling convention. 
So it's best to align the inputs accordingly. @@ -2122,7 +2121,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringIndexOfAfter(HInvoke* invoke) { // java.lang.StringFactory.newStringFromBytes(byte[] data, int high, int offset, int byteCount) void IntrinsicLocationsBuilderMIPS::VisitStringNewStringFromBytes(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -2145,7 +2144,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringNewStringFromBytes(HInvoke* invoke) __ LoadFromOffset(kLoadWord, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMipsWordSize, pAllocStringFromBytes).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pAllocStringFromBytes).Int32Value()); __ Jalr(T9); __ Nop(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -2155,7 +2154,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringNewStringFromBytes(HInvoke* invoke) // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data) void IntrinsicLocationsBuilderMIPS::VisitStringNewStringFromChars(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -2178,7 +2177,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringNewStringFromChars(HInvoke* invoke) __ LoadFromOffset(kLoadWord, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMipsWordSize, pAllocStringFromChars).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pAllocStringFromChars).Int32Value()); __ Jalr(T9); __ Nop(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -2187,7 +2186,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringNewStringFromChars(HInvoke* invoke) // java.lang.StringFactory.newStringFromString(String toCopy) void IntrinsicLocationsBuilderMIPS::VisitStringNewStringFromString(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -2207,7 +2206,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringNewStringFromString(HInvoke* invoke) __ LoadFromOffset(kLoadWord, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMipsWordSize, pAllocStringFromString).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pAllocStringFromString).Int32Value()); __ Jalr(T9); __ Nop(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc index cc4971b8f8..1e18540e1a 100644 --- a/compiler/optimizing/intrinsics_mips64.cc +++ b/compiler/optimizing/intrinsics_mips64.cc @@ -1519,7 +1519,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitUnsafeCASObject(HInvoke* invoke) { // int java.lang.String.compareTo(String anotherString) void IntrinsicLocationsBuilderMIPS64::VisitStringCompareTo(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, 
Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1543,7 +1543,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringCompareTo(HInvoke* invoke) { __ LoadFromOffset(kLoadDoubleword, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMips64DoublewordSize, pStringCompareTo).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, pStringCompareTo).Int32Value()); __ Jalr(T9); __ Nop(); __ Bind(slow_path->GetExitLabel()); @@ -1694,7 +1694,7 @@ static void GenerateStringIndexOf(HInvoke* invoke, __ LoadFromOffset(kLoadDoubleword, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMips64DoublewordSize, pIndexOf).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, pIndexOf).Int32Value()); CheckEntrypointTypes<kQuickIndexOf, int32_t, void*, uint32_t, uint32_t>(); __ Jalr(T9); __ Nop(); @@ -1707,7 +1707,7 @@ static void GenerateStringIndexOf(HInvoke* invoke, // int java.lang.String.indexOf(int ch) void IntrinsicLocationsBuilderMIPS64::VisitStringIndexOf(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime // calling convention. So it's best to align the inputs accordingly. @@ -1728,7 +1728,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringIndexOf(HInvoke* invoke) { // int java.lang.String.indexOf(int ch, int fromIndex) void IntrinsicLocationsBuilderMIPS64::VisitStringIndexOfAfter(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime // calling convention. So it's best to align the inputs accordingly. @@ -1748,7 +1748,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringIndexOfAfter(HInvoke* invoke) { // java.lang.StringFactory.newStringFromBytes(byte[] data, int high, int offset, int byteCount) void IntrinsicLocationsBuilderMIPS64::VisitStringNewStringFromBytes(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1771,7 +1771,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringNewStringFromBytes(HInvoke* invoke __ LoadFromOffset(kLoadDoubleword, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMips64DoublewordSize, + QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, pAllocStringFromBytes).Int32Value()); CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>(); __ Jalr(T9); @@ -1783,7 +1783,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringNewStringFromBytes(HInvoke* invoke // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data) void IntrinsicLocationsBuilderMIPS64::VisitStringNewStringFromChars(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1805,7 +1805,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringNewStringFromChars(HInvoke* invoke __ LoadFromOffset(kLoadDoubleword, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMips64DoublewordSize, + QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, 
pAllocStringFromChars).Int32Value()); CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>(); __ Jalr(T9); @@ -1816,7 +1816,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringNewStringFromChars(HInvoke* invoke // java.lang.StringFactory.newStringFromString(String toCopy) void IntrinsicLocationsBuilderMIPS64::VisitStringNewStringFromString(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1836,7 +1836,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringNewStringFromString(HInvoke* invok __ LoadFromOffset(kLoadDoubleword, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMips64DoublewordSize, + QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, pAllocStringFromString).Int32Value()); CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>(); __ Jalr(T9); @@ -1879,6 +1879,84 @@ void IntrinsicCodeGeneratorMIPS64::VisitDoubleIsInfinite(HInvoke* invoke) { GenIsInfinite(invoke->GetLocations(), /* is64bit */ true, GetAssembler()); } +static void GenHighestOneBit(LocationSummary* locations, + Primitive::Type type, + Mips64Assembler* assembler) { + DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong) << PrettyDescriptor(type); + + GpuRegister in = locations->InAt(0).AsRegister<GpuRegister>(); + GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + + if (type == Primitive::kPrimLong) { + __ Dclz(TMP, in); + __ LoadConst64(AT, INT64_C(0x8000000000000000)); + __ Dsrlv(out, AT, TMP); + } else { + __ Clz(TMP, in); + __ LoadConst32(AT, 0x80000000); + __ Srlv(out, AT, TMP); + } + // For either value of "type", when "in" is zero, "out" should also + // be zero. Without this extra "and" operation, when "in" is zero, + // "out" would be either Integer.MIN_VALUE, or Long.MIN_VALUE because + // the MIPS logical shift operations "dsrlv", and "srlv" don't use + // the shift amount (TMP) directly; they use either (TMP % 64) or + // (TMP % 32), respectively. 
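// A hypothetical 32-bit trace of the zero-input case makes the need for the
// final And concrete:
//   in = 0            =>  Clz(TMP, in) sets TMP = 32
//   Srlv uses TMP % 32 = 0, so out = 0x80000000 >> 0 = Integer.MIN_VALUE
//   And(out, out, in) then forces out back to 0, the required result.
// For a non-zero input the And is a no-op, since the single bit left in "out"
// is by construction also set in "in".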
+ __ And(out, out, in); +} + +// int java.lang.Integer.highestOneBit(int) +void IntrinsicLocationsBuilderMIPS64::VisitIntegerHighestOneBit(HInvoke* invoke) { + CreateIntToIntLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitIntegerHighestOneBit(HInvoke* invoke) { + GenHighestOneBit(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler()); +} + +// long java.lang.Long.highestOneBit(long) +void IntrinsicLocationsBuilderMIPS64::VisitLongHighestOneBit(HInvoke* invoke) { + CreateIntToIntLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitLongHighestOneBit(HInvoke* invoke) { + GenHighestOneBit(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler()); +} + +static void GenLowestOneBit(LocationSummary* locations, + Primitive::Type type, + Mips64Assembler* assembler) { + DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong) << PrettyDescriptor(type); + + GpuRegister in = locations->InAt(0).AsRegister<GpuRegister>(); + GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + + if (type == Primitive::kPrimLong) { + __ Dsubu(TMP, ZERO, in); + } else { + __ Subu(TMP, ZERO, in); + } + __ And(out, TMP, in); +} + +// int java.lang.Integer.lowestOneBit(int) +void IntrinsicLocationsBuilderMIPS64::VisitIntegerLowestOneBit(HInvoke* invoke) { + CreateIntToIntLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitIntegerLowestOneBit(HInvoke* invoke) { + GenLowestOneBit(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler()); +} + +// long java.lang.Long.lowestOneBit(long) +void IntrinsicLocationsBuilderMIPS64::VisitLongLowestOneBit(HInvoke* invoke) { + CreateIntToIntLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitLongLowestOneBit(HInvoke* invoke) { + GenLowestOneBit(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler()); +} + UNIMPLEMENTED_INTRINSIC(MIPS64, ReferenceGetReferent) UNIMPLEMENTED_INTRINSIC(MIPS64, StringGetCharsNoCheck) UNIMPLEMENTED_INTRINSIC(MIPS64, SystemArrayCopyChar) @@ -1902,11 +1980,6 @@ UNIMPLEMENTED_INTRINSIC(MIPS64, MathSinh) UNIMPLEMENTED_INTRINSIC(MIPS64, MathTan) UNIMPLEMENTED_INTRINSIC(MIPS64, MathTanh) -UNIMPLEMENTED_INTRINSIC(MIPS64, IntegerHighestOneBit) -UNIMPLEMENTED_INTRINSIC(MIPS64, LongHighestOneBit) -UNIMPLEMENTED_INTRINSIC(MIPS64, IntegerLowestOneBit) -UNIMPLEMENTED_INTRINSIC(MIPS64, LongLowestOneBit) - // 1.8. UNIMPLEMENTED_INTRINSIC(MIPS64, UnsafeGetAndAddInt) UNIMPLEMENTED_INTRINSIC(MIPS64, UnsafeGetAndAddLong) diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc index 812bdf550e..49d6c1952c 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -70,6 +70,105 @@ static void MoveArguments(HInvoke* invoke, CodeGeneratorX86* codegen) { using IntrinsicSlowPathX86 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86>; +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86Assembler*>(codegen->GetAssembler())-> // NOLINT + +// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers. 
+class ReadBarrierSystemArrayCopySlowPathX86 : public SlowPathCode { + public: + explicit ReadBarrierSystemArrayCopySlowPathX86(HInstruction* instruction) + : SlowPathCode(instruction) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen); + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(instruction_->IsInvokeStaticOrDirect()) + << "Unexpected instruction in read barrier arraycopy slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy); + + int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); + + Register src = locations->InAt(0).AsRegister<Register>(); + Location src_pos = locations->InAt(1); + Register dest = locations->InAt(2).AsRegister<Register>(); + Location dest_pos = locations->InAt(3); + Location length = locations->InAt(4); + Location temp1_loc = locations->GetTemp(0); + Register temp1 = temp1_loc.AsRegister<Register>(); + Register temp2 = locations->GetTemp(1).AsRegister<Register>(); + Register temp3 = locations->GetTemp(2).AsRegister<Register>(); + + __ Bind(GetEntryLabel()); + // In this code path, registers `temp1`, `temp2`, and `temp3` + // (resp.) are not used for the base source address, the base + // destination address, and the end source address (resp.), as in + // other SystemArrayCopy intrinsic code paths. Instead they are + // (resp.) used for: + // - the loop index (`i`); + // - the source index (`src_index`) and the loaded (source) + // reference (`value`); and + // - the destination index (`dest_index`). + + // i = 0 + __ xorl(temp1, temp1); + NearLabel loop; + __ Bind(&loop); + // value = src_array[i + src_pos] + if (src_pos.IsConstant()) { + int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue(); + int32_t adjusted_offset = offset + constant * element_size; + __ movl(temp2, Address(src, temp1, ScaleFactor::TIMES_4, adjusted_offset)); + } else { + __ leal(temp2, Address(src_pos.AsRegister<Register>(), temp1, ScaleFactor::TIMES_1, 0)); + __ movl(temp2, Address(src, temp2, ScaleFactor::TIMES_4, offset)); + } + __ MaybeUnpoisonHeapReference(temp2); + // TODO: Inline the mark bit check before calling the runtime? + // value = ReadBarrier::Mark(value) + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + // (See ReadBarrierMarkSlowPathX86::EmitNativeCode for more + // explanations.) + DCHECK_NE(temp2, ESP); + DCHECK(0 <= temp2 && temp2 < kNumberOfCpuRegisters) << temp2; + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(temp2); + // This runtime call does not require a stack map. 
+ x86_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + __ MaybePoisonHeapReference(temp2); + // dest_array[i + dest_pos] = value + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + int32_t adjusted_offset = offset + constant * element_size; + __ movl(Address(dest, temp1, ScaleFactor::TIMES_4, adjusted_offset), temp2); + } else { + __ leal(temp3, Address(dest_pos.AsRegister<Register>(), temp1, ScaleFactor::TIMES_1, 0)); + __ movl(Address(dest, temp3, ScaleFactor::TIMES_4, offset), temp2); + } + // ++i + __ addl(temp1, Immediate(1)); + // if (i != length) goto loop + x86_codegen->GenerateIntCompare(temp1_loc, length); + __ j(kNotEqual, &loop); + __ jmp(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathX86"; } + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86); +}; + +#undef __ + #define __ assembler-> static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke, bool is64bit) { @@ -706,7 +805,7 @@ static void CreateSSE41FPToFPLocations(ArenaAllocator* arena, // We have to fall back to a call to the intrinsic. LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall); + LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0))); locations->SetOut(Location::FpuRegisterLocation(XMM0)); @@ -752,20 +851,20 @@ void IntrinsicCodeGeneratorX86::VisitMathRint(HInvoke* invoke) { GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0); } -// Note that 32 bit x86 doesn't have the capability to inline MathRoundDouble, -// as it needs 64 bit instructions. void IntrinsicLocationsBuilderX86::VisitMathRoundFloat(HInvoke* invoke) { - // See intrinsics.h. - if (!kRoundIsPlusPointFive) { - return; - } - // Do we have instruction support? if (codegen_->GetInstructionSetFeatures().HasSSE4_1()) { + HInvokeStaticOrDirect* static_or_direct = invoke->AsInvokeStaticOrDirect(); + DCHECK(static_or_direct != nullptr); LocationSummary* locations = new (arena_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); locations->SetInAt(0, Location::RequiresFpuRegister()); + if (static_or_direct->HasSpecialInput() && + invoke->InputAt( + static_or_direct->GetSpecialInputIndex())->IsX86ComputeBaseMethodAddress()) { + locations->SetInAt(1, Location::RequiresRegister()); + } locations->SetOut(Location::RequiresRegister()); locations->AddTemp(Location::RequiresFpuRegister()); locations->AddTemp(Location::RequiresFpuRegister()); @@ -774,7 +873,7 @@ void IntrinsicLocationsBuilderX86::VisitMathRoundFloat(HInvoke* invoke) { // We have to fall back to a call to the intrinsic. LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall); + LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0))); locations->SetOut(Location::RegisterLocation(EAX)); @@ -784,54 +883,61 @@ void IntrinsicLocationsBuilderX86::VisitMathRoundFloat(HInvoke* invoke) { void IntrinsicCodeGeneratorX86::VisitMathRoundFloat(HInvoke* invoke) { LocationSummary* locations = invoke->GetLocations(); - if (locations->WillCall()) { + if (locations->WillCall()) { // TODO: can we reach this? 
InvokeOutOfLineIntrinsic(codegen_, invoke); return; } - // Implement RoundFloat as t1 = floor(input + 0.5f); convert to int. XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); Register out = locations->Out().AsRegister<Register>(); - XmmRegister maxInt = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); - XmmRegister inPlusPointFive = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); - NearLabel done, nan; + NearLabel skip_incr, done; X86Assembler* assembler = GetAssembler(); - // Generate 0.5 into inPlusPointFive. - __ movl(out, Immediate(bit_cast<int32_t, float>(0.5f))); - __ movd(inPlusPointFive, out); - - // Add in the input. - __ addss(inPlusPointFive, in); - - // And truncate to an integer. - __ roundss(inPlusPointFive, inPlusPointFive, Immediate(1)); - + // Since no direct x86 rounding instruction matches the required semantics, + // this intrinsic is implemented as follows: + // result = floor(in); + // if (in - result >= 0.5f) + // result = result + 1.0f; + __ movss(t2, in); + __ roundss(t1, in, Immediate(1)); + __ subss(t2, t1); + if (locations->GetInputCount() == 2 && locations->InAt(1).IsValid()) { + // Direct constant area available. + Register constant_area = locations->InAt(1).AsRegister<Register>(); + __ comiss(t2, codegen_->LiteralInt32Address(bit_cast<int32_t, float>(0.5f), constant_area)); + __ j(kBelow, &skip_incr); + __ addss(t1, codegen_->LiteralInt32Address(bit_cast<int32_t, float>(1.0f), constant_area)); + __ Bind(&skip_incr); + } else { + // No constant area: go through stack. + __ pushl(Immediate(bit_cast<int32_t, float>(0.5f))); + __ pushl(Immediate(bit_cast<int32_t, float>(1.0f))); + __ comiss(t2, Address(ESP, 4)); + __ j(kBelow, &skip_incr); + __ addss(t1, Address(ESP, 0)); + __ Bind(&skip_incr); + __ addl(ESP, Immediate(8)); + } + + // Final conversion to an integer. Unfortunately this also does not have a + // direct x86 instruction, since NaN should map to 0 and large positive + // values need to be clipped to the extreme value. __ movl(out, Immediate(kPrimIntMax)); - // maxInt = int-to-float(out) - __ cvtsi2ss(maxInt, out); - - // if inPlusPointFive >= maxInt goto done - __ comiss(inPlusPointFive, maxInt); - __ j(kAboveEqual, &done); - - // if input == NaN goto nan - __ j(kUnordered, &nan); - - // output = float-to-int-truncate(input) - __ cvttss2si(out, inPlusPointFive); - __ jmp(&done); - __ Bind(&nan); - - // output = 0 - __ xorl(out, out); + __ cvtsi2ss(t2, out); + __ comiss(t1, t2); + __ j(kAboveEqual, &done); // clipped to max (already in out), does not jump on unordered + __ movl(out, Immediate(0)); // does not change flags + __ j(kUnordered, &done); // NaN mapped to 0 (just moved in out) + __ cvttss2si(out, t1); __ Bind(&done); } static void CreateFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0))); @@ -857,7 +963,7 @@ static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorX86* codegen, QuickEntry } // Now do the actual call. 
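// The VisitMathRoundFloat sequence above is easier to audit as a host-side
// model. This sketch only captures the intended semantics (floor, add one when
// the fractional part is at least 0.5, NaN to zero, clamp at the int range);
// it is not the generated code and the helper name is made up.
#include <cmath>
#include <cstdint>
#include <limits>

static int32_t RoundFloatSketch(float in) {
  float result = std::floor(in);                       // roundss(t1, in, Immediate(1))
  if (in - result >= 0.5f) {                           // comiss against 0.5f
    result += 1.0f;                                    // addss of 1.0f
  }
  if (std::isnan(result)) {
    return 0;                                          // unordered comiss path
  }
  if (result >= static_cast<float>(std::numeric_limits<int32_t>::max())) {
    return std::numeric_limits<int32_t>::max();        // clipped to kPrimIntMax
  }
  if (result <= static_cast<float>(std::numeric_limits<int32_t>::min())) {
    return std::numeric_limits<int32_t>::min();        // cvttss2si produces 0x80000000 here
  }
  return static_cast<int32_t>(result);                 // cvttss2si truncation
}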
- __ fs()->call(Address::Absolute(GetThreadOffset<kX86WordSize>(entry))); + __ fs()->call(Address::Absolute(GetThreadOffset<kX86PointerSize>(entry))); // Extract the return value from the FP stack. __ fstpl(Address(ESP, 0)); @@ -985,7 +1091,7 @@ void IntrinsicCodeGeneratorX86::VisitMathTanh(HInvoke* invoke) { static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0))); @@ -1216,7 +1322,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyChar(HInvoke* invoke) { void IntrinsicLocationsBuilderX86::VisitStringCompareTo(HInvoke* invoke) { // The inputs plus one temp. LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1237,7 +1343,7 @@ void IntrinsicCodeGeneratorX86::VisitStringCompareTo(HInvoke* invoke) { codegen_->AddSlowPath(slow_path); __ j(kEqual, slow_path->GetEntryLabel()); - __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pStringCompareTo))); + __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86PointerSize, pStringCompareTo))); __ Bind(slow_path->GetExitLabel()); } @@ -1490,7 +1596,7 @@ void IntrinsicCodeGeneratorX86::VisitStringIndexOfAfter(HInvoke* invoke) { void IntrinsicLocationsBuilderX86::VisitStringNewStringFromBytes(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1510,7 +1616,7 @@ void IntrinsicCodeGeneratorX86::VisitStringNewStringFromBytes(HInvoke* invoke) { codegen_->AddSlowPath(slow_path); __ j(kEqual, slow_path->GetEntryLabel()); - __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pAllocStringFromBytes))); + __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86PointerSize, pAllocStringFromBytes))); CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); __ Bind(slow_path->GetExitLabel()); @@ -1518,7 +1624,7 @@ void IntrinsicCodeGeneratorX86::VisitStringNewStringFromBytes(HInvoke* invoke) { void IntrinsicLocationsBuilderX86::VisitStringNewStringFromChars(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1536,14 +1642,14 @@ void IntrinsicCodeGeneratorX86::VisitStringNewStringFromChars(HInvoke* invoke) { // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data) // // all include a null check on `data` before calling that method. 
- __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pAllocStringFromChars))); + __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86PointerSize, pAllocStringFromChars))); CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); } void IntrinsicLocationsBuilderX86::VisitStringNewStringFromString(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1560,7 +1666,8 @@ void IntrinsicCodeGeneratorX86::VisitStringNewStringFromString(HInvoke* invoke) codegen_->AddSlowPath(slow_path); __ j(kEqual, slow_path->GetEntryLabel()); - __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pAllocStringFromString))); + __ fs()->call( + Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86PointerSize, pAllocStringFromString))); CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); __ Bind(slow_path->GetExitLabel()); @@ -1801,7 +1908,7 @@ void IntrinsicLocationsBuilderX86::VisitThreadCurrentThread(HInvoke* invoke) { void IntrinsicCodeGeneratorX86::VisitThreadCurrentThread(HInvoke* invoke) { Register out = invoke->GetLocations()->Out().AsRegister<Register>(); - GetAssembler()->fs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86WordSize>())); + GetAssembler()->fs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86PointerSize>())); } static void GenUnsafeGet(HInvoke* invoke, @@ -2670,9 +2777,9 @@ static bool IsSameInput(HInstruction* instruction, size_t input0, size_t input1) } void IntrinsicLocationsBuilderX86::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - if (kEmitCompilerReadBarrier) { + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { return; } @@ -2702,9 +2809,9 @@ void IntrinsicLocationsBuilderX86::VisitSystemArrayCopy(HInvoke* invoke) { } void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - DCHECK(!kEmitCompilerReadBarrier); + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. 
+ DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); X86Assembler* assembler = GetAssembler(); LocationSummary* locations = invoke->GetLocations(); @@ -2713,17 +2820,21 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); Register src = locations->InAt(0).AsRegister<Register>(); Location src_pos = locations->InAt(1); Register dest = locations->InAt(2).AsRegister<Register>(); Location dest_pos = locations->InAt(3); - Location length = locations->InAt(4); - Register temp1 = locations->GetTemp(0).AsRegister<Register>(); - Register temp2 = locations->GetTemp(1).AsRegister<Register>(); + Location length_arg = locations->InAt(4); + Location length = length_arg; + Location temp1_loc = locations->GetTemp(0); + Register temp1 = temp1_loc.AsRegister<Register>(); + Location temp2_loc = locations->GetTemp(1); + Register temp2 = temp2_loc.AsRegister<Register>(); - SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke); - codegen_->AddSlowPath(slow_path); + SlowPathCode* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke); + codegen_->AddSlowPath(intrinsic_slow_path); NearLabel conditions_on_positions_validated; SystemArrayCopyOptimizations optimizations(invoke); @@ -2739,7 +2850,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { DCHECK_GE(src_pos_constant, dest_pos_constant); } else if (src_pos_constant < dest_pos_constant) { __ cmpl(src, dest); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -2747,7 +2858,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { __ j(kNotEqual, &conditions_on_positions_validated); } __ cmpl(dest_pos.AsRegister<Register>(), Immediate(src_pos_constant)); - __ j(kGreater, slow_path->GetEntryLabel()); + __ j(kGreater, intrinsic_slow_path->GetEntryLabel()); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -2757,10 +2868,10 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { if (dest_pos.IsConstant()) { int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); __ cmpl(src_pos.AsRegister<Register>(), Immediate(dest_pos_constant)); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } else { __ cmpl(src_pos.AsRegister<Register>(), dest_pos.AsRegister<Register>()); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } } @@ -2769,16 +2880,17 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { if (!optimizations.GetSourceIsNotNull()) { // Bail out if the source is null. __ testl(src, src); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) { // Bail out if the destination is null. 
__ testl(dest, dest); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } - Register temp3 = locations->GetTemp(2).AsRegister<Register>(); + Location temp3_loc = locations->GetTemp(2); + Register temp3 = temp3_loc.AsRegister<Register>(); if (length.IsStackSlot()) { __ movl(temp3, Address(ESP, length.GetStackIndex())); length = Location::RegisterLocation(temp3); @@ -2790,7 +2902,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { !optimizations.GetCountIsSourceLength() && !optimizations.GetCountIsDestinationLength()) { __ testl(length.AsRegister<Register>(), length.AsRegister<Register>()); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } // Validity checks: source. @@ -2798,7 +2910,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { src_pos, src, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsSourceLength()); @@ -2807,7 +2919,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { dest_pos, dest, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsDestinationLength()); @@ -2816,72 +2928,159 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { // type of the destination array. We do two checks: the classes are the same, // or the destination is Object[]. If none of these checks succeed, we go to the // slow path. + if (!optimizations.GetSourceIsNonPrimitiveArray()) { - // /* HeapReference<Class> */ temp1 = temp1->klass_ - __ movl(temp1, Address(src, class_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ movl(temp1, Address(temp1, component_offset)); - __ testl(temp1, temp1); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp1); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false); + // Bail out if the source is not a non primitive array. + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + __ testl(temp1, temp1); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp1` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ temp1 = src->klass_ + __ movl(temp1, Address(src, class_offset)); + __ MaybeUnpoisonHeapReference(temp1); + // Bail out if the source is not a non primitive array. 
+ // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ movl(temp1, Address(temp1, component_offset)); + __ testl(temp1, temp1); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp1); + } __ cmpw(Address(temp1, primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } - if (!optimizations.GetDestinationIsNonPrimitiveArray()) { - // /* HeapReference<Class> */ temp1 = temp1->klass_ - __ movl(temp1, Address(dest, class_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // Bail out if the destination is not a non primitive array. - // /* HeapReference<Class> */ temp2 = temp1->component_type_ - __ movl(temp2, Address(temp1, component_offset)); - __ testl(temp2, temp2); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp2); - __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); - // Re-poison the heap reference to make the compare instruction below - // compare two poisoned references. - __ PoisonHeapReference(temp1); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + if (length.Equals(Location::RegisterLocation(temp3))) { + // When Baker read barriers are enabled, register `temp3`, + // which in the present case contains the `length` parameter, + // will be overwritten below. Make the `length` location + // reference the original stack location; it will be moved + // back to `temp3` later if necessary. + DCHECK(length_arg.IsStackSlot()); + length = length_arg; + } + + // /* HeapReference<Class> */ temp1 = dest->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, dest, class_offset, temp2_loc, /* needs_null_check */ false); + + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // + // Register `temp1` is not trashed by the read barrier emitted + // by GenerateFieldLoadWithBakerReadBarrier below, as that + // method produces a call to a ReadBarrierMarkRegX entry point, + // which saves all potentially live registers, including + // temporaries such a `temp1`. + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + __ testl(temp2, temp2); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp2` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot)); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + } + + // For the same reason given earlier, `temp1` is not trashed by the + // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below. + // /* HeapReference<Class> */ temp2 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, src, class_offset, temp3_loc, /* needs_null_check */ false); + // Note: if heap poisoning is on, we are comparing two unpoisoned references here. 
+ __ cmpl(temp1, temp2); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + NearLabel do_copy; + __ j(kEqual, &do_copy); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + // We do not need to emit a read barrier for the following + // heap reference load, as `temp1` is only used in a + // comparison with null below, and this reference is not + // kept afterwards. + __ cmpl(Address(temp1, super_offset), Immediate(0)); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + } } else { - // /* HeapReference<Class> */ temp1 = temp1->klass_ - __ movl(temp1, Address(dest, class_offset)); - } + // Non read barrier code. - // Note: if poisoning is on, we are here comparing two poisoned references. - __ cmpl(temp1, Address(src, class_offset)); + // /* HeapReference<Class> */ temp1 = dest->klass_ + __ movl(temp1, Address(dest, class_offset)); + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + __ MaybeUnpoisonHeapReference(temp1); + // Bail out if the destination is not a non primitive array. + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + __ movl(temp2, Address(temp1, component_offset)); + __ testl(temp2, temp2); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp2); + __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot)); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + // Re-poison the heap reference to make the compare instruction below + // compare two poisoned references. + __ PoisonHeapReference(temp1); + } - if (optimizations.GetDestinationIsTypedObjectArray()) { - NearLabel do_copy; - __ j(kEqual, &do_copy); + // Note: if heap poisoning is on, we are comparing two poisoned references here. + __ cmpl(temp1, Address(src, class_offset)); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + NearLabel do_copy; + __ j(kEqual, &do_copy); + __ MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ movl(temp1, Address(temp1, component_offset)); + __ MaybeUnpoisonHeapReference(temp1); + __ cmpl(Address(temp1, super_offset), Immediate(0)); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + } + } + } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { + DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); + // Bail out if the source is not a non primitive array. + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + __ testl(temp1, temp1); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp1` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. 
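For readers less familiar with the mirror::Class layout, the following standalone model (illustrative only, not ART code; the field names merely mirror the offsets used in the generated checks) summarizes the type checks that guard the copy: the source must be a reference array, and the destination must either have the same class as the source or be Object[], which is detected by its component type having a null super class.

#include <cstdint>

struct Klass {                         // minimal stand-in for mirror::Class
  Klass* component_type_;              // null if this class is not an array
  Klass* super_class_;                 // null only for java.lang.Object
  uint16_t primitive_type_;            // kPrimNot for reference component types
};
static constexpr uint16_t kPrimNot = 0;  // assumed value for the reference case

static bool ArrayCopyNeedsSlowPath(const Klass* src_klass,
                                   const Klass* dest_klass,
                                   bool dest_is_typed_object_array) {
  // Source must be a non-primitive (reference) array.
  if (src_klass->component_type_ == nullptr ||
      src_klass->component_type_->primitive_type_ != kPrimNot) {
    return true;
  }
  if (dest_klass == src_klass) {
    return false;                      // identical array types: raw copy is safe
  }
  if (!dest_is_typed_object_array) {
    return true;                       // different types and no Object[] hint
  }
  // dest is Object[] exactly when its component type's super class is null.
  return dest_klass->component_type_->super_class_ != nullptr;
}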
+ } else { + // /* HeapReference<Class> */ temp1 = src->klass_ + __ movl(temp1, Address(src, class_offset)); __ MaybeUnpoisonHeapReference(temp1); // /* HeapReference<Class> */ temp1 = temp1->component_type_ __ movl(temp1, Address(temp1, component_offset)); + __ testl(temp1, temp1); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); __ MaybeUnpoisonHeapReference(temp1); - __ cmpl(Address(temp1, super_offset), Immediate(0)); - __ j(kNotEqual, slow_path->GetEntryLabel()); - __ Bind(&do_copy); - } else { - __ j(kNotEqual, slow_path->GetEntryLabel()); } - } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { - DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); - // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ temp1 = src->klass_ - __ movl(temp1, Address(src, class_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ movl(temp1, Address(temp1, component_offset)); - __ testl(temp1, temp1); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp1); __ cmpw(Address(temp1, primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } - // Compute base source address, base destination address, and end source address. + // Compute the base source address in `temp1`. int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); DCHECK_EQ(element_size, 4); uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); @@ -2892,35 +3091,136 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { __ leal(temp1, Address(src, src_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); } - if (dest_pos.IsConstant()) { - int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); - __ leal(temp2, Address(dest, element_size * constant + offset)); - } else { - __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); - } + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // If it is needed (in the case of the fast-path loop), the base + // destination address is computed later, as `temp2` is used for + // intermediate computations. - if (length.IsConstant()) { - int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); - __ leal(temp3, Address(temp1, element_size * constant)); + // Compute the end source address in `temp3`. + if (length.IsConstant()) { + int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); + __ leal(temp3, Address(temp1, element_size * constant)); + } else { + if (length.IsStackSlot()) { + // Location `length` is again pointing at a stack slot, as + // register `temp3` (which was containing the length parameter + // earlier) has been overwritten; restore it now + DCHECK(length.Equals(length_arg)); + __ movl(temp3, Address(ESP, length.GetStackIndex())); + length = Location::RegisterLocation(temp3); + } + __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0)); + } + + // SystemArrayCopy implementation for Baker read barriers (see + // also CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier): + // + // if (src_ptr != end_ptr) { + // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // bool is_gray = (rb_state == ReadBarrier::gray_ptr_); + // if (is_gray) { + // // Slow-path copy. 
+ // for (size_t i = 0; i != length; ++i) { + // dest_array[dest_pos + i] = + // MaybePoison(ReadBarrier::Mark(MaybeUnpoison(src_array[src_pos + i]))); + // } + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + // } + + NearLabel loop, done; + + // Don't enter copy loop if `length == 0`. + __ cmpl(temp1, temp3); + __ j(kEqual, &done); + + // /* int32_t */ monitor = src->monitor_ + __ movl(temp2, Address(src, monitor_offset)); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Load fence to prevent load-load reordering. + // Note that this is a no-op, thanks to the x86 memory model. + codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + + // Slow path used to copy array when `src` is gray. + SlowPathCode* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathX86(invoke); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with SHR. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ shrl(temp2, Immediate(LockWord::kReadBarrierStateShift + 1)); + __ j(kCarrySet, read_barrier_slow_path->GetEntryLabel()); + + // Fast-path copy. + + // Set the base destination address in `temp2`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ leal(temp2, Address(dest, element_size * constant + offset)); + } else { + __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); + } + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + __ Bind(&loop); + __ pushl(Address(temp1, 0)); + __ cfi().AdjustCFAOffset(4); + __ popl(Address(temp2, 0)); + __ cfi().AdjustCFAOffset(-4); + __ addl(temp1, Immediate(element_size)); + __ addl(temp2, Immediate(element_size)); + __ cmpl(temp1, temp3); + __ j(kNotEqual, &loop); + + __ Bind(read_barrier_slow_path->GetExitLabel()); + __ Bind(&done); } else { - __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0)); - } - - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. - NearLabel loop, done; - __ cmpl(temp1, temp3); - __ j(kEqual, &done); - __ Bind(&loop); - __ pushl(Address(temp1, 0)); - __ cfi().AdjustCFAOffset(4); - __ popl(Address(temp2, 0)); - __ cfi().AdjustCFAOffset(-4); - __ addl(temp1, Immediate(element_size)); - __ addl(temp2, Immediate(element_size)); - __ cmpl(temp1, temp3); - __ j(kNotEqual, &loop); - __ Bind(&done); + // Non read barrier code. + + // Compute the base destination address in `temp2`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ leal(temp2, Address(dest, element_size * constant + offset)); + } else { + __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); + } + + // Compute the end source address in `temp3`. 
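The "shift the read barrier state out of the lock word" trick is compact but easy to misread; the sketch below (standalone, not ART code; the shift amount is a placeholder, while the white/gray encoding matches the static_asserts in the generated code) shows the equivalent bit test that the SHR plus jump-on-carry performs.

#include <cstdint>

// Placeholder bit position standing in for LockWord::kReadBarrierStateShift.
static constexpr uint32_t kReadBarrierStateShift = 28;

static bool LockWordIsGray(uint32_t monitor_word) {
  // shrl(temp, kReadBarrierStateShift + 1) moves the low bit of rb_state into
  // the carry flag; testing that bit directly is the same check.
  // white == 0 (bit clear), gray == 1 (bit set).
  return ((monitor_word >> kReadBarrierStateShift) & 1u) != 0u;
}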
+ if (length.IsConstant()) { + int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); + __ leal(temp3, Address(temp1, element_size * constant)); + } else { + __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0)); + } + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + NearLabel loop, done; + __ cmpl(temp1, temp3); + __ j(kEqual, &done); + __ Bind(&loop); + __ pushl(Address(temp1, 0)); + __ cfi().AdjustCFAOffset(4); + __ popl(Address(temp2, 0)); + __ cfi().AdjustCFAOffset(-4); + __ addl(temp1, Immediate(element_size)); + __ addl(temp2, Immediate(element_size)); + __ cmpl(temp1, temp3); + __ j(kNotEqual, &loop); + __ Bind(&done); + } // We only need one card marking on the destination array. codegen_->MarkGCCard(temp1, @@ -2929,7 +3229,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { Register(kNoRegister), /* value_can_be_null */ false); - __ Bind(slow_path->GetExitLabel()); + __ Bind(intrinsic_slow_path->GetExitLabel()); } UNIMPLEMENTED_INTRINSIC(X86, MathRoundDouble) diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index 891aaf5ff9..311e1cd6eb 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -64,6 +64,65 @@ static void MoveArguments(HInvoke* invoke, CodeGeneratorX86_64* codegen) { using IntrinsicSlowPathX86_64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86_64>; +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())-> // NOLINT + +// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers. +class ReadBarrierSystemArrayCopySlowPathX86_64 : public SlowPathCode { + public: + explicit ReadBarrierSystemArrayCopySlowPathX86_64(HInstruction* instruction) + : SlowPathCode(instruction) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen); + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(instruction_->IsInvokeStaticOrDirect()) + << "Unexpected instruction in read barrier arraycopy slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy); + + int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + + CpuRegister src_curr_addr = locations->GetTemp(0).AsRegister<CpuRegister>(); + CpuRegister dst_curr_addr = locations->GetTemp(1).AsRegister<CpuRegister>(); + CpuRegister src_stop_addr = locations->GetTemp(2).AsRegister<CpuRegister>(); + + __ Bind(GetEntryLabel()); + NearLabel loop; + __ Bind(&loop); + __ movl(CpuRegister(TMP), Address(src_curr_addr, 0)); + __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + // TODO: Inline the mark bit check before calling the runtime? + // TMP = ReadBarrier::Mark(TMP); + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(TMP); + // This runtime call does not require a stack map. 
+ x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + __ MaybePoisonHeapReference(CpuRegister(TMP)); + __ movl(Address(dst_curr_addr, 0), CpuRegister(TMP)); + __ addl(src_curr_addr, Immediate(element_size)); + __ addl(dst_curr_addr, Immediate(element_size)); + __ cmpl(src_curr_addr, src_stop_addr); + __ j(kNotEqual, &loop); + __ jmp(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathX86_64"; } + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86_64); +}; + +#undef __ + #define __ assembler-> static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { @@ -526,7 +585,7 @@ static void CreateSSE41FPToFPLocations(ArenaAllocator* arena, // We have to fall back to a call to the intrinsic. LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall); + LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0))); locations->SetOut(Location::FpuRegisterLocation(XMM0)); @@ -583,12 +642,13 @@ static void CreateSSE41FPToIntLocations(ArenaAllocator* arena, locations->SetInAt(0, Location::RequiresFpuRegister()); locations->SetOut(Location::RequiresRegister()); locations->AddTemp(Location::RequiresFpuRegister()); + locations->AddTemp(Location::RequiresFpuRegister()); return; } // We have to fall back to a call to the intrinsic. LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall); + LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0))); locations->SetOut(Location::RegisterLocation(RAX)); @@ -597,10 +657,7 @@ static void CreateSSE41FPToIntLocations(ArenaAllocator* arena, } void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) { - // See intrinsics.h. - if (kRoundIsPlusPointFive) { - CreateSSE41FPToIntLocations(arena_, invoke, codegen_); - } + CreateSSE41FPToIntLocations(arena_, invoke, codegen_); } void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) { @@ -610,47 +667,41 @@ void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) { return; } - // Implement RoundFloat as t1 = floor(input + 0.5f); convert to int. XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>(); CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - XmmRegister inPlusPointFive = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); - NearLabel done, nan; + XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); + NearLabel skip_incr, done; X86_64Assembler* assembler = GetAssembler(); - // Load 0.5 into inPlusPointFive. - __ movss(inPlusPointFive, codegen_->LiteralFloatAddress(0.5f)); - - // Add in the input. - __ addss(inPlusPointFive, in); - - // And truncate to an integer. - __ roundss(inPlusPointFive, inPlusPointFive, Immediate(1)); - - // Load maxInt into out. 
- codegen_->Load64BitValue(out, kPrimIntMax); - - // if inPlusPointFive >= maxInt goto done - __ comiss(inPlusPointFive, codegen_->LiteralFloatAddress(static_cast<float>(kPrimIntMax))); - __ j(kAboveEqual, &done); - - // if input == NaN goto nan - __ j(kUnordered, &nan); - - // output = float-to-int-truncate(input) - __ cvttss2si(out, inPlusPointFive); - __ jmp(&done); - __ Bind(&nan); - - // output = 0 - __ xorl(out, out); + // Since no direct x86 rounding instruction matches the required semantics, + // this intrinsic is implemented as follows: + // result = floor(in); + // if (in - result >= 0.5f) + // result = result + 1.0f; + __ movss(t2, in); + __ roundss(t1, in, Immediate(1)); + __ subss(t2, t1); + __ comiss(t2, codegen_->LiteralFloatAddress(0.5f)); + __ j(kBelow, &skip_incr); + __ addss(t1, codegen_->LiteralFloatAddress(1.0f)); + __ Bind(&skip_incr); + + // Final conversion to an integer. Unfortunately this also does not have a + // direct x86 instruction, since NaN should map to 0 and large positive + // values need to be clipped to the extreme value. + codegen_->Load32BitValue(out, kPrimIntMax); + __ cvtsi2ss(t2, out); + __ comiss(t1, t2); + __ j(kAboveEqual, &done); // clipped to max (already in out), does not jump on unordered + __ movl(out, Immediate(0)); // does not change flags + __ j(kUnordered, &done); // NaN mapped to 0 (just moved in out) + __ cvttss2si(out, t1); __ Bind(&done); } void IntrinsicLocationsBuilderX86_64::VisitMathRoundDouble(HInvoke* invoke) { - // See intrinsics.h. - if (kRoundIsPlusPointFive) { - CreateSSE41FPToIntLocations(arena_, invoke, codegen_); - } + CreateSSE41FPToIntLocations(arena_, invoke, codegen_); } void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) { @@ -660,46 +711,43 @@ void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) { return; } - // Implement RoundDouble as t1 = floor(input + 0.5); convert to long. XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>(); CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - XmmRegister inPlusPointFive = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); - NearLabel done, nan; + XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); + NearLabel skip_incr, done; X86_64Assembler* assembler = GetAssembler(); - // Load 0.5 into inPlusPointFive. - __ movsd(inPlusPointFive, codegen_->LiteralDoubleAddress(0.5)); - - // Add in the input. - __ addsd(inPlusPointFive, in); - - // And truncate to an integer. - __ roundsd(inPlusPointFive, inPlusPointFive, Immediate(1)); - - // Load maxLong into out. + // Since no direct x86 rounding instruction matches the required semantics, + // this intrinsic is implemented as follows: + // result = floor(in); + // if (in - result >= 0.5) + // result = result + 1.0f; + __ movsd(t2, in); + __ roundsd(t1, in, Immediate(1)); + __ subsd(t2, t1); + __ comisd(t2, codegen_->LiteralDoubleAddress(0.5)); + __ j(kBelow, &skip_incr); + __ addsd(t1, codegen_->LiteralDoubleAddress(1.0f)); + __ Bind(&skip_incr); + + // Final conversion to an integer. Unfortunately this also does not have a + // direct x86 instruction, since NaN should map to 0 and large positive + // values need to be clipped to the extreme value. 
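The new Math.round sequences are easier to follow as plain C++; the sketch below (illustrative only, using the standard library rather than ART helpers) spells out the float variant: floor, a conditional increment when the fractional part is at least 0.5, then a final conversion in which NaN maps to 0 and out-of-range values clamp to the integer extremes.

#include <cmath>
#include <cstdint>
#include <limits>

static int32_t RoundFloatSketch(float in) {
  float result = std::floor(in);             // roundss with rounding mode 1 (floor)
  if (in - result >= 0.5f) {                 // subss + comiss against 0.5f
    result += 1.0f;                          // addss of 1.0f
  }
  // Final conversion: NaN -> 0, overflow clamps to the int32 extremes
  // (the generated code gets the negative case from cvttss2si's behavior).
  if (std::isnan(result)) {
    return 0;
  }
  if (result >= static_cast<float>(std::numeric_limits<int32_t>::max())) {
    return std::numeric_limits<int32_t>::max();
  }
  if (result <= static_cast<float>(std::numeric_limits<int32_t>::min())) {
    return std::numeric_limits<int32_t>::min();
  }
  return static_cast<int32_t>(result);       // cvttss2si
}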
codegen_->Load64BitValue(out, kPrimLongMax); - - // if inPlusPointFive >= maxLong goto done - __ comisd(inPlusPointFive, codegen_->LiteralDoubleAddress(static_cast<double>(kPrimLongMax))); - __ j(kAboveEqual, &done); - - // if input == NaN goto nan - __ j(kUnordered, &nan); - - // output = double-to-long-truncate(input) - __ cvttsd2si(out, inPlusPointFive, /* is64bit */ true); - __ jmp(&done); - __ Bind(&nan); - - // output = 0 - __ xorl(out, out); + __ cvtsi2sd(t2, out, /* is64bit */ true); + __ comisd(t1, t2); + __ j(kAboveEqual, &done); // clipped to max (already in out), does not jump on unordered + __ movl(out, Immediate(0)); // does not change flags, implicit zero extension to 64-bit + __ j(kUnordered, &done); // NaN mapped to 0 (just moved in out) + __ cvttsd2si(out, t1, /* is64bit */ true); __ Bind(&done); } static void CreateFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0))); @@ -720,7 +768,7 @@ static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorX86_64* codegen, DCHECK(invoke->IsInvokeStaticOrDirect()); X86_64Assembler* assembler = codegen->GetAssembler(); - __ gs()->call(Address::Absolute(GetThreadOffset<kX86_64WordSize>(entry), true)); + __ gs()->call(Address::Absolute(GetThreadOffset<kX86_64PointerSize>(entry), true)); codegen->RecordPcInfo(invoke, invoke->GetDexPc()); } @@ -839,7 +887,7 @@ void IntrinsicCodeGeneratorX86_64::VisitMathTanh(HInvoke* invoke) { static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0))); @@ -1064,9 +1112,9 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) { void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - if (kEmitCompilerReadBarrier) { + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { return; } @@ -1074,9 +1122,9 @@ void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) { } void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - DCHECK(!kEmitCompilerReadBarrier); + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. 
+ DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); X86_64Assembler* assembler = GetAssembler(); LocationSummary* locations = invoke->GetLocations(); @@ -1085,18 +1133,23 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>(); Location src_pos = locations->InAt(1); CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>(); Location dest_pos = locations->InAt(3); Location length = locations->InAt(4); - CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>(); - CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>(); - CpuRegister temp3 = locations->GetTemp(2).AsRegister<CpuRegister>(); + Location temp1_loc = locations->GetTemp(0); + CpuRegister temp1 = temp1_loc.AsRegister<CpuRegister>(); + Location temp2_loc = locations->GetTemp(1); + CpuRegister temp2 = temp2_loc.AsRegister<CpuRegister>(); + Location temp3_loc = locations->GetTemp(2); + CpuRegister temp3 = temp3_loc.AsRegister<CpuRegister>(); + Location TMP_loc = Location::RegisterLocation(TMP); - SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke); - codegen_->AddSlowPath(slow_path); + SlowPathCode* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke); + codegen_->AddSlowPath(intrinsic_slow_path); NearLabel conditions_on_positions_validated; SystemArrayCopyOptimizations optimizations(invoke); @@ -1112,7 +1165,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { DCHECK_GE(src_pos_constant, dest_pos_constant); } else if (src_pos_constant < dest_pos_constant) { __ cmpl(src, dest); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -1120,7 +1173,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { __ j(kNotEqual, &conditions_on_positions_validated); } __ cmpl(dest_pos.AsRegister<CpuRegister>(), Immediate(src_pos_constant)); - __ j(kGreater, slow_path->GetEntryLabel()); + __ j(kGreater, intrinsic_slow_path->GetEntryLabel()); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -1130,10 +1183,10 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { if (dest_pos.IsConstant()) { int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); __ cmpl(src_pos.AsRegister<CpuRegister>(), Immediate(dest_pos_constant)); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } else { __ cmpl(src_pos.AsRegister<CpuRegister>(), dest_pos.AsRegister<CpuRegister>()); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } } @@ -1142,13 +1195,13 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { if (!optimizations.GetSourceIsNotNull()) { // Bail out if the source is null. __ testl(src, src); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) { // Bail out if the destination is null. 
__ testl(dest, dest); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } // If the length is negative, bail out. @@ -1157,7 +1210,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { !optimizations.GetCountIsSourceLength() && !optimizations.GetCountIsDestinationLength()) { __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>()); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } // Validity checks: source. @@ -1165,7 +1218,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { src_pos, src, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsSourceLength()); @@ -1174,7 +1227,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { dest_pos, dest, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsDestinationLength()); @@ -1183,38 +1236,80 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { // type of the destination array. We do two checks: the classes are the same, // or the destination is Object[]. If none of these checks succeed, we go to the // slow path. - __ movl(temp1, Address(dest, class_offset)); - __ movl(temp2, Address(src, class_offset)); + bool did_unpoison = false; - if (!optimizations.GetDestinationIsNonPrimitiveArray() || - !optimizations.GetSourceIsNonPrimitiveArray()) { - // One or two of the references need to be unpoisoned. Unpoison them - // both to make the identity check valid. - __ MaybeUnpoisonHeapReference(temp1); - __ MaybeUnpoisonHeapReference(temp2); - did_unpoison = true; + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = dest->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, dest, class_offset, temp3_loc, /* needs_null_check */ false); + // Register `temp1` is not trashed by the read barrier emitted + // by GenerateFieldLoadWithBakerReadBarrier below, as that + // method produces a call to a ReadBarrierMarkRegX entry point, + // which saves all potentially live registers, including + // temporaries such a `temp1`. + // /* HeapReference<Class> */ temp2 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, src, class_offset, temp3_loc, /* needs_null_check */ false); + // If heap poisoning is enabled, `temp1` and `temp2` have been + // unpoisoned by the the previous calls to + // GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ temp1 = dest->klass_ + __ movl(temp1, Address(dest, class_offset)); + // /* HeapReference<Class> */ temp2 = src->klass_ + __ movl(temp2, Address(src, class_offset)); + if (!optimizations.GetDestinationIsNonPrimitiveArray() || + !optimizations.GetSourceIsNonPrimitiveArray()) { + // One or two of the references need to be unpoisoned. Unpoison them + // both to make the identity check valid. + __ MaybeUnpoisonHeapReference(temp1); + __ MaybeUnpoisonHeapReference(temp2); + did_unpoison = true; + } } if (!optimizations.GetDestinationIsNonPrimitiveArray()) { // Bail out if the destination is not a non primitive array. 
- // /* HeapReference<Class> */ TMP = temp1->component_type_ - __ movl(CpuRegister(TMP), Address(temp1, component_offset)); - __ testl(CpuRegister(TMP), CpuRegister(TMP)); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ TMP = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, TMP_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `TMP` has been unpoisoned by + // the the previous call to GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ TMP = temp1->component_type_ + __ movl(CpuRegister(TMP), Address(temp1, component_offset)); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + } __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetSourceIsNonPrimitiveArray()) { // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ TMP = temp2->component_type_ - __ movl(CpuRegister(TMP), Address(temp2, component_offset)); - __ testl(CpuRegister(TMP), CpuRegister(TMP)); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // For the same reason given earlier, `temp1` is not trashed by the + // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below. + // /* HeapReference<Class> */ TMP = temp2->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, TMP_loc, temp2, component_offset, temp3_loc, /* needs_null_check */ false); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `TMP` has been unpoisoned by + // the the previous call to GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ TMP = temp2->component_type_ + __ movl(CpuRegister(TMP), Address(temp2, component_offset)); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + } __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } __ cmpl(temp1, temp2); @@ -1222,34 +1317,56 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { if (optimizations.GetDestinationIsTypedObjectArray()) { NearLabel do_copy; __ j(kEqual, &do_copy); - if (!did_unpoison) { + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + // We do not need to emit a read barrier for the following + // heap reference load, as `temp1` is only used in a + // comparison with null below, and this reference is not + // kept afterwards. 
+ __ cmpl(Address(temp1, super_offset), Immediate(0)); + } else { + if (!did_unpoison) { + __ MaybeUnpoisonHeapReference(temp1); + } + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ movl(temp1, Address(temp1, component_offset)); __ MaybeUnpoisonHeapReference(temp1); + // No need to unpoison the following heap reference load, as + // we're comparing against null. + __ cmpl(Address(temp1, super_offset), Immediate(0)); } - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ movl(temp1, Address(temp1, component_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp1 = temp1->super_class_ - __ movl(temp1, Address(temp1, super_offset)); - // No need to unpoison the result, we're comparing against null. - __ testl(temp1, temp1); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); __ Bind(&do_copy); } else { - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ temp1 = src->klass_ - __ movl(temp1, Address(src, class_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ TMP = temp1->component_type_ - __ movl(CpuRegister(TMP), Address(temp1, component_offset)); - __ testl(CpuRegister(TMP), CpuRegister(TMP)); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp3_loc, /* needs_null_check */ false); + // /* HeapReference<Class> */ TMP = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, TMP_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + } else { + // /* HeapReference<Class> */ temp1 = src->klass_ + __ movl(temp1, Address(src, class_offset)); + __ MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ TMP = temp1->component_type_ + __ movl(CpuRegister(TMP), Address(temp1, component_offset)); + // No need to unpoison `TMP` now, as we're comparing against null. + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + } __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } // Compute base source address, base destination address, and end source address. @@ -1277,19 +1394,86 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { __ leal(temp3, Address(temp1, length.AsRegister<CpuRegister>(), ScaleFactor::TIMES_4, 0)); } - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. 
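The leal-based pointer setup is the same on x86 and x86-64; as a reference, this small standalone helper (illustrative, not ART code) computes the three addresses the copy loop works with.

#include <cstdint>

static void ComputeCopyBounds(uintptr_t src, uintptr_t dest,
                              uint32_t src_pos, uint32_t dest_pos, uint32_t length,
                              uint32_t data_offset,  // mirror::Array::DataOffset(4)
                              uintptr_t* src_ptr, uintptr_t* dest_ptr, uintptr_t* end_ptr) {
  constexpr uint32_t kElementSize = 4;  // size of a heap reference
  *src_ptr  = src  + data_offset + src_pos  * kElementSize;  // leal temp1, [src + src_pos*4 + offset]
  *dest_ptr = dest + data_offset + dest_pos * kElementSize;  // leal temp2, [dest + dest_pos*4 + offset]
  *end_ptr  = *src_ptr + length * kElementSize;              // leal temp3, [temp1 + length*4]
}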
- NearLabel loop, done; - __ cmpl(temp1, temp3); - __ j(kEqual, &done); - __ Bind(&loop); - __ movl(CpuRegister(TMP), Address(temp1, 0)); - __ movl(Address(temp2, 0), CpuRegister(TMP)); - __ addl(temp1, Immediate(element_size)); - __ addl(temp2, Immediate(element_size)); - __ cmpl(temp1, temp3); - __ j(kNotEqual, &loop); - __ Bind(&done); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // SystemArrayCopy implementation for Baker read barriers (see + // also CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier): + // + // if (src_ptr != end_ptr) { + // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // bool is_gray = (rb_state == ReadBarrier::gray_ptr_); + // if (is_gray) { + // // Slow-path copy. + // do { + // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); + // } while (src_ptr != end_ptr) + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + // } + + NearLabel loop, done; + + // Don't enter copy loop if `length == 0`. + __ cmpl(temp1, temp3); + __ j(kEqual, &done); + + // /* int32_t */ monitor = src->monitor_ + __ movl(CpuRegister(TMP), Address(src, monitor_offset)); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Load fence to prevent load-load reordering. + // Note that this is a no-op, thanks to the x86-64 memory model. + codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + + // Slow path used to copy array when `src` is gray. + SlowPathCode* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathX86_64(invoke); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with SHR. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ shrl(CpuRegister(TMP), Immediate(LockWord::kReadBarrierStateShift + 1)); + __ j(kCarrySet, read_barrier_slow_path->GetEntryLabel()); + + // Fast-path copy. + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + __ Bind(&loop); + __ movl(CpuRegister(TMP), Address(temp1, 0)); + __ movl(Address(temp2, 0), CpuRegister(TMP)); + __ addl(temp1, Immediate(element_size)); + __ addl(temp2, Immediate(element_size)); + __ cmpl(temp1, temp3); + __ j(kNotEqual, &loop); + + __ Bind(read_barrier_slow_path->GetExitLabel()); + __ Bind(&done); + } else { + // Non read barrier code. + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + NearLabel loop, done; + __ cmpl(temp1, temp3); + __ j(kEqual, &done); + __ Bind(&loop); + __ movl(CpuRegister(TMP), Address(temp1, 0)); + __ movl(Address(temp2, 0), CpuRegister(TMP)); + __ addl(temp1, Immediate(element_size)); + __ addl(temp2, Immediate(element_size)); + __ cmpl(temp1, temp3); + __ j(kNotEqual, &loop); + __ Bind(&done); + } // We only need one card marking on the destination array. 
codegen_->MarkGCCard(temp1, @@ -1298,12 +1482,12 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { CpuRegister(kNoRegister), /* value_can_be_null */ false); - __ Bind(slow_path->GetExitLabel()); + __ Bind(intrinsic_slow_path->GetExitLabel()); } void IntrinsicLocationsBuilderX86_64::VisitStringCompareTo(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1324,7 +1508,7 @@ void IntrinsicCodeGeneratorX86_64::VisitStringCompareTo(HInvoke* invoke) { codegen_->AddSlowPath(slow_path); __ j(kEqual, slow_path->GetEntryLabel()); - __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pStringCompareTo), + __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64PointerSize, pStringCompareTo), /* no_rip */ true)); __ Bind(slow_path->GetExitLabel()); } @@ -1577,7 +1761,7 @@ void IntrinsicCodeGeneratorX86_64::VisitStringIndexOfAfter(HInvoke* invoke) { void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1597,7 +1781,8 @@ void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromBytes(HInvoke* invoke codegen_->AddSlowPath(slow_path); __ j(kEqual, slow_path->GetEntryLabel()); - __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromBytes), + __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64PointerSize, + pAllocStringFromBytes), /* no_rip */ true)); CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1606,7 +1791,7 @@ void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromBytes(HInvoke* invoke void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromChars(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1624,7 +1809,8 @@ void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromChars(HInvoke* invoke // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data) // // all include a null check on `data` before calling that method. 
- __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromChars), + __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64PointerSize, + pAllocStringFromChars), /* no_rip */ true)); CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1632,7 +1818,7 @@ void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromChars(HInvoke* invoke void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromString(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1649,7 +1835,8 @@ void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromString(HInvoke* invok codegen_->AddSlowPath(slow_path); __ j(kEqual, slow_path->GetEntryLabel()); - __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromString), + __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64PointerSize, + pAllocStringFromString), /* no_rip */ true)); CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1875,7 +2062,7 @@ void IntrinsicLocationsBuilderX86_64::VisitThreadCurrentThread(HInvoke* invoke) void IntrinsicCodeGeneratorX86_64::VisitThreadCurrentThread(HInvoke* invoke) { CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>(); - GetAssembler()->gs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86_64WordSize>(), + GetAssembler()->gs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86_64PointerSize>(), /* no_rip */ true)); } diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h index 3f27c911be..5fdfb9b6ca 100644 --- a/compiler/optimizing/locations.h +++ b/compiler/optimizing/locations.h @@ -376,6 +376,10 @@ class Location : public ValueObject { return PolicyField::Decode(GetPayload()); } + bool RequiresRegisterKind() const { + return GetPolicy() == kRequiresRegister || GetPolicy() == kRequiresFpuRegister; + } + uintptr_t GetEncoding() const { return GetPayload(); } @@ -480,8 +484,9 @@ class LocationSummary : public ArenaObject<kArenaAllocLocationSummary> { public: enum CallKind { kNoCall, + kCallOnMainAndSlowPath, kCallOnSlowPath, - kCall + kCallOnMainOnly }; LocationSummary(HInstruction* instruction, @@ -540,10 +545,29 @@ class LocationSummary : public ArenaObject<kArenaAllocLocationSummary> { Location Out() const { return output_; } - bool CanCall() const { return call_kind_ != kNoCall; } - bool WillCall() const { return call_kind_ == kCall; } - bool OnlyCallsOnSlowPath() const { return call_kind_ == kCallOnSlowPath; } - bool NeedsSafepoint() const { return CanCall(); } + bool CanCall() const { + return call_kind_ != kNoCall; + } + + bool WillCall() const { + return call_kind_ == kCallOnMainOnly || call_kind_ == kCallOnMainAndSlowPath; + } + + bool CallsOnSlowPath() const { + return call_kind_ == kCallOnSlowPath || call_kind_ == kCallOnMainAndSlowPath; + } + + bool OnlyCallsOnSlowPath() const { + return call_kind_ == kCallOnSlowPath; + } + + bool CallsOnMainAndSlowPath() const { + return call_kind_ == kCallOnMainAndSlowPath; + } + + bool NeedsSafepoint() const { + return CanCall(); + } void SetStackBit(uint32_t index) { stack_mask_->SetBit(index); @@ -629,8 +653,7 @@ class LocationSummary 
: public ArenaObject<kArenaAllocLocationSummary> { // Whether these are locations for an intrinsified call. bool intrinsified_; - ART_FRIEND_TEST(RegisterAllocatorTest, ExpectedInRegisterHint); - ART_FRIEND_TEST(RegisterAllocatorTest, SameAsFirstInputHint); + friend class RegisterAllocatorTest; DISALLOW_COPY_AND_ASSIGN(LocationSummary); }; diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index d557f42968..2808e1b5fc 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -2632,4 +2632,23 @@ std::ostream& operator<<(std::ostream& os, TypeCheckKind rhs) { } } +std::ostream& operator<<(std::ostream& os, const MemBarrierKind& kind) { + switch (kind) { + case MemBarrierKind::kAnyStore: + return os << "AnyStore"; + case MemBarrierKind::kLoadAny: + return os << "LoadAny"; + case MemBarrierKind::kStoreStore: + return os << "StoreStore"; + case MemBarrierKind::kAnyAny: + return os << "AnyAny"; + case MemBarrierKind::kNTStoreStore: + return os << "NTStoreStore"; + + default: + LOG(FATAL) << "Unknown MemBarrierKind: " << static_cast<int>(kind); + UNREACHABLE(); + } +} + } // namespace art diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index 0f0ef26ea9..dfa8276651 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -25,7 +25,6 @@ #include "base/arena_containers.h" #include "base/arena_object.h" #include "base/stl_util.h" -#include "dex/compiler_enums.h" #include "dex_file.h" #include "entrypoints/quick/quick_entrypoints_enum.h" #include "handle.h" @@ -1289,7 +1288,8 @@ class HLoopInformationOutwardIterator : public ValueObject { #else #define FOR_EACH_CONCRETE_INSTRUCTION_SHARED(M) \ M(BitwiseNegatedRight, Instruction) \ - M(MultiplyAccumulate, Instruction) + M(MultiplyAccumulate, Instruction) \ + M(IntermediateAddress, Instruction) #endif #ifndef ART_ENABLE_CODEGEN_arm @@ -1303,8 +1303,7 @@ class HLoopInformationOutwardIterator : public ValueObject { #define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M) #else #define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M) \ - M(Arm64DataProcWithShifterOp, Instruction) \ - M(Arm64IntermediateAddress, Instruction) + M(Arm64DataProcWithShifterOp, Instruction) #endif #ifndef ART_ENABLE_CODEGEN_mips @@ -5626,9 +5625,12 @@ inline uint32_t HLoadClass::GetDexCacheElementOffset() const { // Note: defined outside class to see operator<<(., HLoadClass::LoadKind). inline void HLoadClass::AddSpecialInput(HInstruction* special_input) { - // The special input is used for PC-relative loads on some architectures. + // The special input is used for PC-relative loads on some architectures, + // including literal pool loads, which are PC-relative too. DCHECK(GetLoadKind() == LoadKind::kBootImageLinkTimePcRelative || - GetLoadKind() == LoadKind::kDexCachePcRelative) << GetLoadKind(); + GetLoadKind() == LoadKind::kDexCachePcRelative || + GetLoadKind() == LoadKind::kBootImageLinkTimeAddress || + GetLoadKind() == LoadKind::kBootImageAddress) << GetLoadKind(); DCHECK(special_input_.GetInstruction() == nullptr); special_input_ = HUserRecord<HInstruction*>(special_input); special_input->AddUseAt(this, 0); @@ -5836,9 +5838,12 @@ inline uint32_t HLoadString::GetDexCacheElementOffset() const { // Note: defined outside class to see operator<<(., HLoadString::LoadKind). inline void HLoadString::AddSpecialInput(HInstruction* special_input) { - // The special input is used for PC-relative loads on some architectures. 
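Since the old kCall kind is split in two, the intent of the new predicates in locations.h can be hard to keep straight from the diff alone; the standalone snippet below (illustrative only, mirroring the enum and predicate logic from the diff) spells out the resulting truth table, with the intrinsic examples named only as an interpretation of the call-kind changes above.

#include <cassert>

enum CallKind { kNoCall, kCallOnMainAndSlowPath, kCallOnSlowPath, kCallOnMainOnly };

static bool WillCall(CallKind k)            { return k == kCallOnMainOnly || k == kCallOnMainAndSlowPath; }
static bool CallsOnSlowPath(CallKind k)     { return k == kCallOnSlowPath || k == kCallOnMainAndSlowPath; }
static bool OnlyCallsOnSlowPath(CallKind k) { return k == kCallOnSlowPath; }

int main() {
  // StringCompareTo-style intrinsic: main-path runtime call plus an explicit slow path.
  assert(WillCall(kCallOnMainAndSlowPath) && CallsOnSlowPath(kCallOnMainAndSlowPath));
  // NewStringFromChars-style intrinsic: only the main-path runtime call.
  assert(WillCall(kCallOnMainOnly) && !CallsOnSlowPath(kCallOnMainOnly));
  // An instruction that only calls from its slow path.
  assert(!WillCall(kCallOnSlowPath) && OnlyCallsOnSlowPath(kCallOnSlowPath));
  return 0;
}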
+ // The special input is used for PC-relative loads on some architectures, + // including literal pool loads, which are PC-relative too. DCHECK(GetLoadKind() == LoadKind::kBootImageLinkTimePcRelative || - GetLoadKind() == LoadKind::kDexCachePcRelative) << GetLoadKind(); + GetLoadKind() == LoadKind::kDexCachePcRelative || + GetLoadKind() == LoadKind::kBootImageLinkTimeAddress || + GetLoadKind() == LoadKind::kBootImageAddress) << GetLoadKind(); // HLoadString::GetInputRecords() returns an empty array at this point, // so use the GetInputRecords() from the base class to set the input record. DCHECK(special_input_.GetInstruction() == nullptr); @@ -6305,6 +6310,32 @@ class HCheckCast FINAL : public HTemplateInstruction<2> { DISALLOW_COPY_AND_ASSIGN(HCheckCast); }; +/** + * @brief Memory barrier types (see "The JSR-133 Cookbook for Compiler Writers"). + * @details We define the combined barrier types that are actually required + * by the Java Memory Model, rather than using exactly the terminology from + * the JSR-133 cookbook. These should, in many cases, be replaced by acquire/release + * primitives. Note that the JSR-133 cookbook generally does not deal with + * store atomicity issues, and the recipes there are not always entirely sufficient. + * The current recipe is as follows: + * -# Use AnyStore ~= (LoadStore | StoreStore) ~= release barrier before volatile store. + * -# Use AnyAny barrier after volatile store. (StoreLoad is as expensive.) + * -# Use LoadAny barrier ~= (LoadLoad | LoadStore) ~= acquire barrier after each volatile load. + * -# Use StoreStore barrier after all stores but before return from any constructor whose + * class has final fields. + * -# Use NTStoreStore to order non-temporal stores with respect to all later + * store-to-memory instructions. Only generated together with non-temporal stores. + */ +enum MemBarrierKind { + kAnyStore, + kLoadAny, + kStoreStore, + kAnyAny, + kNTStoreStore, + kLastBarrierKind = kNTStoreStore +}; +std::ostream& operator<<(std::ostream& os, const MemBarrierKind& kind); + class HMemoryBarrier FINAL : public HTemplateInstruction<0> { public: explicit HMemoryBarrier(MemBarrierKind barrier_kind, uint32_t dex_pc = kNoDexPc) diff --git a/compiler/optimizing/nodes_arm64.h b/compiler/optimizing/nodes_arm64.h index 06b073c3e2..3f88717c2a 100644 --- a/compiler/optimizing/nodes_arm64.h +++ b/compiler/optimizing/nodes_arm64.h @@ -94,32 +94,6 @@ class HArm64DataProcWithShifterOp FINAL : public HExpression<2> { std::ostream& operator<<(std::ostream& os, const HArm64DataProcWithShifterOp::OpKind op); -// This instruction computes an intermediate address pointing in the 'middle' of an object. The -// result pointer cannot be handled by GC, so extra care is taken to make sure that this value is -// never used across anything that can trigger GC. 
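As a side note, the barrier kinds and the streaming operator introduced above can be mirrored in a small standalone program (plain C++, names copied from the enum; not ART code) to show how the kinds print and how the recipe orders barriers around a volatile store:

#include <iostream>

// Standalone mirror of the MemBarrierKind names above, for illustration only.
enum MemBarrierKind { kAnyStore, kLoadAny, kStoreStore, kAnyAny, kNTStoreStore };

std::ostream& operator<<(std::ostream& os, MemBarrierKind kind) {
  switch (kind) {
    case kAnyStore:     return os << "AnyStore";
    case kLoadAny:      return os << "LoadAny";
    case kStoreStore:   return os << "StoreStore";
    case kAnyAny:       return os << "AnyAny";
    case kNTStoreStore: return os << "NTStoreStore";
  }
  return os;
}

int main() {
  // Per the recipe above: a release-like AnyStore barrier precedes a volatile store,
  // and an AnyAny barrier follows it.
  std::cout << kAnyStore << " -> volatile store -> " << kAnyAny << std::endl;
  return 0;
}
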
-class HArm64IntermediateAddress FINAL : public HExpression<2> { - public: - HArm64IntermediateAddress(HInstruction* base_address, HInstruction* offset, uint32_t dex_pc) - : HExpression(Primitive::kPrimNot, SideEffects::DependsOnGC(), dex_pc) { - SetRawInputAt(0, base_address); - SetRawInputAt(1, offset); - } - - bool CanBeMoved() const OVERRIDE { return true; } - bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE { - return true; - } - bool IsActualObject() const OVERRIDE { return false; } - - HInstruction* GetBaseAddress() const { return InputAt(0); } - HInstruction* GetOffset() const { return InputAt(1); } - - DECLARE_INSTRUCTION(Arm64IntermediateAddress); - - private: - DISALLOW_COPY_AND_ASSIGN(HArm64IntermediateAddress); -}; - } // namespace art #endif // ART_COMPILER_OPTIMIZING_NODES_ARM64_H_ diff --git a/compiler/optimizing/nodes_shared.h b/compiler/optimizing/nodes_shared.h index f2d5cf3253..8bd8667f84 100644 --- a/compiler/optimizing/nodes_shared.h +++ b/compiler/optimizing/nodes_shared.h @@ -113,6 +113,34 @@ class HBitwiseNegatedRight FINAL : public HBinaryOperation { DISALLOW_COPY_AND_ASSIGN(HBitwiseNegatedRight); }; + +// This instruction computes an intermediate address pointing in the 'middle' of an object. The +// result pointer cannot be handled by GC, so extra care is taken to make sure that this value is +// never used across anything that can trigger GC. +class HIntermediateAddress FINAL : public HExpression<2> { + public: + HIntermediateAddress(HInstruction* base_address, HInstruction* offset, uint32_t dex_pc) + : HExpression(Primitive::kPrimNot, SideEffects::DependsOnGC(), dex_pc) { + SetRawInputAt(0, base_address); + SetRawInputAt(1, offset); + } + + bool CanBeMoved() const OVERRIDE { return true; } + bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE { + return true; + } + bool IsActualObject() const OVERRIDE { return false; } + + HInstruction* GetBaseAddress() const { return InputAt(0); } + HInstruction* GetOffset() const { return InputAt(1); } + + DECLARE_INSTRUCTION(IntermediateAddress); + + private: + DISALLOW_COPY_AND_ASSIGN(HIntermediateAddress); +}; + + } // namespace art #endif // ART_COMPILER_OPTIMIZING_NODES_SHARED_H_ diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h index 2f59d4cd5b..0819fb01ac 100644 --- a/compiler/optimizing/optimization.h +++ b/compiler/optimizing/optimization.h @@ -37,7 +37,10 @@ class HOptimization : public ArenaObject<kArenaAllocOptimization> { virtual ~HOptimization() {} - // Return the name of the pass. + // Return the name of the pass. Pass names for a single HOptimization should be of form + // <optimization_name> or <optimization_name>$<pass_name> for common <optimization_name> prefix. + // Example: 'instruction_simplifier', 'instruction_simplifier$after_bce', + // 'instruction_simplifier$before_codegen'. const char* GetPassName() const { return pass_name_; } // Perform the analysis itself. 
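A minimal standalone sketch (plain C++, not ART code) of the pass-name convention described above: the '$' separator corresponds to the kPassNameSeparator constant introduced later in optimizing_compiler.cc, and a name such as "instruction_simplifier$after_bce" splits into a base optimization name plus an instance suffix.

#include <iostream>
#include <string>
#include <utility>

static std::pair<std::string, std::string> SplitPassName(const std::string& pass_name) {
  size_t pos = pass_name.find('$');
  if (pos == std::string::npos) {
    return std::make_pair(pass_name, std::string());
  }
  return std::make_pair(pass_name.substr(0, pos), pass_name.substr(pos + 1));
}

int main() {
  std::pair<std::string, std::string> parts = SplitPassName("instruction_simplifier$after_bce");
  // Prints: instruction_simplifier (after_bce)
  std::cout << parts.first << " (" << parts.second << ")" << std::endl;
  return 0;
}
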
diff --git a/compiler/optimizing/optimizing_cfi_test.cc b/compiler/optimizing/optimizing_cfi_test.cc index a6d234d739..8c0231e1aa 100644 --- a/compiler/optimizing/optimizing_cfi_test.cc +++ b/compiler/optimizing/optimizing_cfi_test.cc @@ -157,13 +157,26 @@ class OptimizingCFITest : public CFITest { TestImpl(isa, #isa, expected_asm, expected_cfi); \ } +#ifdef ART_ENABLE_CODEGEN_arm TEST_ISA(kThumb2) +#endif +#ifdef ART_ENABLE_CODEGEN_arm64 TEST_ISA(kArm64) +#endif +#ifdef ART_ENABLE_CODEGEN_x86 TEST_ISA(kX86) +#endif +#ifdef ART_ENABLE_CODEGEN_x86_64 TEST_ISA(kX86_64) +#endif +#ifdef ART_ENABLE_CODEGEN_mips TEST_ISA(kMips) +#endif +#ifdef ART_ENABLE_CODEGEN_mips64 TEST_ISA(kMips64) +#endif +#ifdef ART_ENABLE_CODEGEN_arm TEST_F(OptimizingCFITest, kThumb2Adjust) { std::vector<uint8_t> expected_asm( expected_asm_kThumb2_adjust, @@ -184,7 +197,9 @@ TEST_F(OptimizingCFITest, kThumb2Adjust) { Finish(); Check(kThumb2, "kThumb2_adjust", expected_asm, expected_cfi); } +#endif +#ifdef ART_ENABLE_CODEGEN_mips TEST_F(OptimizingCFITest, kMipsAdjust) { // One NOP in delay slot, 1 << 15 NOPS have size 1 << 17 which exceeds 18-bit signed maximum. static constexpr size_t kNumNops = 1u + (1u << 15); @@ -212,7 +227,9 @@ TEST_F(OptimizingCFITest, kMipsAdjust) { Finish(); Check(kMips, "kMips_adjust", expected_asm, expected_cfi); } +#endif +#ifdef ART_ENABLE_CODEGEN_mips64 TEST_F(OptimizingCFITest, kMips64Adjust) { // One NOP in forbidden slot, 1 << 15 NOPS have size 1 << 17 which exceeds 18-bit signed maximum. static constexpr size_t kNumNops = 1u + (1u << 15); @@ -240,6 +257,7 @@ TEST_F(OptimizingCFITest, kMips64Adjust) { Finish(); Check(kMips64, "kMips64_adjust", expected_asm, expected_cfi); } +#endif #endif // ART_TARGET_ANDROID diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index d703b0f94f..a1da20bae4 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -37,6 +37,10 @@ #include "pc_relative_fixups_x86.h" #endif +#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) +#include "x86_memory_gen.h" +#endif + #include "art_method-inl.h" #include "base/arena_allocator.h" #include "base/arena_containers.h" @@ -77,7 +81,7 @@ #include "oat_quick_method_header.h" #include "prepare_for_register_allocation.h" #include "reference_type_propagation.h" -#include "register_allocator.h" +#include "register_allocator_linear_scan.h" #include "select_generator.h" #include "sharpening.h" #include "side_effects_analysis.h" @@ -91,6 +95,8 @@ namespace art { static constexpr size_t kArenaAllocatorMemoryReportThreshold = 8 * MB; +static constexpr const char* kPassNameSeparator = "$"; + /** * Used by the code generator, to allocate the code in a vector. */ @@ -174,6 +180,7 @@ class PassObserver : public ValueObject { private: void StartPass(const char* pass_name) { + VLOG(compiler) << "Starting pass: " << pass_name; // Dump graph first, then start timer. 
if (visualizer_enabled_) { visualizer_.DumpGraph(pass_name, /* is_after_pass */ false, graph_in_bad_state_); @@ -262,7 +269,7 @@ class PassScope : public ValueObject { class OptimizingCompiler FINAL : public Compiler { public: explicit OptimizingCompiler(CompilerDriver* driver); - ~OptimizingCompiler(); + ~OptimizingCompiler() OVERRIDE; bool CanCompileMethod(uint32_t method_idx, const DexFile& dex_file) const OVERRIDE; @@ -277,8 +284,13 @@ class OptimizingCompiler FINAL : public Compiler { CompiledMethod* JniCompile(uint32_t access_flags, uint32_t method_idx, - const DexFile& dex_file) const OVERRIDE { - return ArtQuickJniCompileMethod(GetCompilerDriver(), access_flags, method_idx, dex_file); + const DexFile& dex_file, + JniOptimizationFlags optimization_flags) const OVERRIDE { + return ArtQuickJniCompileMethod(GetCompilerDriver(), + access_flags, + method_idx, + dex_file, + optimization_flags); } uintptr_t GetEntryPointOf(ArtMethod* method) const OVERRIDE @@ -302,6 +314,18 @@ class OptimizingCompiler FINAL : public Compiler { SHARED_REQUIRES(Locks::mutator_lock_); private: + void RunOptimizations(HGraph* graph, + CodeGenerator* codegen, + CompilerDriver* driver, + const DexCompilationUnit& dex_compilation_unit, + PassObserver* pass_observer, + StackHandleScopeCollection* handles) const; + + void RunOptimizations(HOptimization* optimizations[], + size_t length, + PassObserver* pass_observer) const; + + private: // Create a 'CompiledMethod' for an optimized graph. CompiledMethod* Emit(ArenaAllocator* arena, CodeVectorAllocator* code_allocator, @@ -329,6 +353,18 @@ class OptimizingCompiler FINAL : public Compiler { ArtMethod* method, bool osr) const; + void MaybeRunInliner(HGraph* graph, + CodeGenerator* codegen, + CompilerDriver* driver, + const DexCompilationUnit& dex_compilation_unit, + PassObserver* pass_observer, + StackHandleScopeCollection* handles) const; + + void RunArchOptimizations(InstructionSet instruction_set, + HGraph* graph, + CodeGenerator* codegen, + PassObserver* pass_observer) const; + std::unique_ptr<OptimizingCompilerStats> compilation_stats_; std::unique_ptr<std::ostream> visualizer_output_; @@ -392,22 +428,143 @@ static bool InstructionSetSupportsReadBarrier(InstructionSet instruction_set) { || instruction_set == kX86_64; } -static void RunOptimizations(HOptimization* optimizations[], - size_t length, - PassObserver* pass_observer) { +static HOptimization* BuildOptimization( + const std::string& opt_name, + ArenaAllocator* arena, + HGraph* graph, + OptimizingCompilerStats* stats, + CodeGenerator* codegen, + CompilerDriver* driver, + const DexCompilationUnit& dex_compilation_unit, + StackHandleScopeCollection* handles, + SideEffectsAnalysis* most_recent_side_effects, + HInductionVarAnalysis* most_recent_induction) { + if (opt_name == BoundsCheckElimination::kBoundsCheckEliminationPassName) { + CHECK(most_recent_side_effects != nullptr && most_recent_induction != nullptr); + return new (arena) BoundsCheckElimination(graph, + *most_recent_side_effects, + most_recent_induction); + } else if (opt_name == GVNOptimization::kGlobalValueNumberingPassName) { + CHECK(most_recent_side_effects != nullptr); + return new (arena) GVNOptimization(graph, *most_recent_side_effects); + } else if (opt_name == HConstantFolding::kConstantFoldingPassName) { + return new (arena) HConstantFolding(graph); + } else if (opt_name == HDeadCodeElimination::kDeadCodeEliminationPassName) { + return new (arena) HDeadCodeElimination(graph, stats); + } else if (opt_name == HInliner::kInlinerPassName) { + 
size_t number_of_dex_registers = dex_compilation_unit.GetCodeItem()->registers_size_;
+ return new (arena) HInliner(graph, // outer_graph
+ graph, // outermost_graph
+ codegen,
+ dex_compilation_unit, // outer_compilation_unit
+ dex_compilation_unit, // outermost_compilation_unit
+ driver,
+ handles,
+ stats,
+ number_of_dex_registers,
+ /* depth */ 0);
+ } else if (opt_name == HSharpening::kSharpeningPassName) {
+ return new (arena) HSharpening(graph, codegen, dex_compilation_unit, driver);
+ } else if (opt_name == HSelectGenerator::kSelectGeneratorPassName) {
+ return new (arena) HSelectGenerator(graph, stats);
+ } else if (opt_name == HInductionVarAnalysis::kInductionPassName) {
+ return new (arena) HInductionVarAnalysis(graph);
+ } else if (opt_name == InstructionSimplifier::kInstructionSimplifierPassName) {
+ return new (arena) InstructionSimplifier(graph, stats);
+ } else if (opt_name == IntrinsicsRecognizer::kIntrinsicsRecognizerPassName) {
+ return new (arena) IntrinsicsRecognizer(graph, driver, stats);
+ } else if (opt_name == LICM::kLoopInvariantCodeMotionPassName) {
+ CHECK(most_recent_side_effects != nullptr);
+ return new (arena) LICM(graph, *most_recent_side_effects, stats);
+ } else if (opt_name == LoadStoreElimination::kLoadStoreEliminationPassName) {
+ CHECK(most_recent_side_effects != nullptr);
+ return new (arena) LoadStoreElimination(graph, *most_recent_side_effects);
+ } else if (opt_name == SideEffectsAnalysis::kSideEffectsAnalysisPassName) {
+ return new (arena) SideEffectsAnalysis(graph);
+#ifdef ART_ENABLE_CODEGEN_arm
+ } else if (opt_name == arm::DexCacheArrayFixups::kDexCacheArrayFixupsArmPassName) {
+ return new (arena) arm::DexCacheArrayFixups(graph, stats);
+ } else if (opt_name == arm::InstructionSimplifierArm::kInstructionSimplifierArmPassName) {
+ return new (arena) arm::InstructionSimplifierArm(graph, stats);
+#endif
+#ifdef ART_ENABLE_CODEGEN_arm64
+ } else if (opt_name == arm64::InstructionSimplifierArm64::kInstructionSimplifierArm64PassName) {
+ return new (arena) arm64::InstructionSimplifierArm64(graph, stats);
+#endif
+#ifdef ART_ENABLE_CODEGEN_mips
+ } else if (opt_name == mips::DexCacheArrayFixups::kDexCacheArrayFixupsMipsPassName) {
+ return new (arena) mips::DexCacheArrayFixups(graph, codegen, stats);
+ } else if (opt_name == mips::PcRelativeFixups::kPcRelativeFixupsMipsPassName) {
+ return new (arena) mips::PcRelativeFixups(graph, codegen, stats);
+#endif
+#ifdef ART_ENABLE_CODEGEN_x86
+ } else if (opt_name == x86::PcRelativeFixups::kPcRelativeFixupsX86PassName) {
+ return new (arena) x86::PcRelativeFixups(graph, codegen, stats);
+ } else if (opt_name == x86::X86MemoryOperandGeneration::kX86MemoryOperandGenerationPassName) {
+ return new (arena) x86::X86MemoryOperandGeneration(graph, codegen, stats);
+#endif
+ }
+ return nullptr;
+}
+
+static ArenaVector<HOptimization*> BuildOptimizations(
+ const std::vector<std::string>& pass_names,
+ ArenaAllocator* arena,
+ HGraph* graph,
+ OptimizingCompilerStats* stats,
+ CodeGenerator* codegen,
+ CompilerDriver* driver,
+ const DexCompilationUnit& dex_compilation_unit,
+ StackHandleScopeCollection* handles) {
+ // A few HOptimization constructors require SideEffectsAnalysis or HInductionVarAnalysis
+ // instances. This method assumes that each of them expects the nearest instance preceding it
+ // in the pass name list.
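The rule spelled out in the comment above can be illustrated with a standalone sketch (illustrative pass names, not the exact ART strings): a pass that needs an analysis picks up the most recently listed one, so the list order matters.

#include <cassert>
#include <string>
#include <vector>

int main() {
  // Illustrative ordering: GVN and BCE consume the nearest preceding analyses.
  std::vector<std::string> passes = {
      "side_effects", "GVN", "induction_var_analysis", "BCE"};
  int most_recent_side_effects = -1;
  int most_recent_induction = -1;
  for (size_t i = 0; i < passes.size(); ++i) {
    if (passes[i] == "side_effects") {
      most_recent_side_effects = static_cast<int>(i);
    } else if (passes[i] == "induction_var_analysis") {
      most_recent_induction = static_cast<int>(i);
    } else if (passes[i] == "GVN") {
      assert(most_recent_side_effects != -1);  // GVN requires a preceding SideEffectsAnalysis.
    } else if (passes[i] == "BCE") {
      // BCE requires both a preceding SideEffectsAnalysis and HInductionVarAnalysis.
      assert(most_recent_side_effects != -1);
      assert(most_recent_induction != -1);
    }
  }
  return 0;
}
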
+ SideEffectsAnalysis* most_recent_side_effects = nullptr; + HInductionVarAnalysis* most_recent_induction = nullptr; + ArenaVector<HOptimization*> ret(arena->Adapter()); + for (std::string pass_name : pass_names) { + size_t pos = pass_name.find(kPassNameSeparator); // Strip suffix to get base pass name. + std::string opt_name = pos == std::string::npos ? pass_name : pass_name.substr(0, pos); + + HOptimization* opt = BuildOptimization( + opt_name, + arena, + graph, + stats, + codegen, + driver, + dex_compilation_unit, + handles, + most_recent_side_effects, + most_recent_induction); + CHECK(opt != nullptr) << "Couldn't build optimization: \"" << pass_name << "\""; + ret.push_back(opt); + + if (opt_name == SideEffectsAnalysis::kSideEffectsAnalysisPassName) { + most_recent_side_effects = down_cast<SideEffectsAnalysis*>(opt); + } else if (opt_name == HInductionVarAnalysis::kInductionPassName) { + most_recent_induction = down_cast<HInductionVarAnalysis*>(opt); + } + } + return ret; +} + +void OptimizingCompiler::RunOptimizations(HOptimization* optimizations[], + size_t length, + PassObserver* pass_observer) const { for (size_t i = 0; i < length; ++i) { PassScope scope(optimizations[i]->GetPassName(), pass_observer); optimizations[i]->Run(); } } -static void MaybeRunInliner(HGraph* graph, - CodeGenerator* codegen, - CompilerDriver* driver, - OptimizingCompilerStats* stats, - const DexCompilationUnit& dex_compilation_unit, - PassObserver* pass_observer, - StackHandleScopeCollection* handles) { +void OptimizingCompiler::MaybeRunInliner(HGraph* graph, + CodeGenerator* codegen, + CompilerDriver* driver, + const DexCompilationUnit& dex_compilation_unit, + PassObserver* pass_observer, + StackHandleScopeCollection* handles) const { + OptimizingCompilerStats* stats = compilation_stats_.get(); const CompilerOptions& compiler_options = driver->GetCompilerOptions(); bool should_inline = (compiler_options.GetInlineDepthLimit() > 0) && (compiler_options.GetInlineMaxCodeUnits() > 0); @@ -416,11 +573,11 @@ static void MaybeRunInliner(HGraph* graph, } size_t number_of_dex_registers = dex_compilation_unit.GetCodeItem()->registers_size_; HInliner* inliner = new (graph->GetArena()) HInliner( - graph, - graph, + graph, // outer_graph + graph, // outermost_graph codegen, - dex_compilation_unit, - dex_compilation_unit, + dex_compilation_unit, // outer_compilation_unit + dex_compilation_unit, // outermost_compilation_unit driver, handles, stats, @@ -431,11 +588,12 @@ static void MaybeRunInliner(HGraph* graph, RunOptimizations(optimizations, arraysize(optimizations), pass_observer); } -static void RunArchOptimizations(InstructionSet instruction_set, - HGraph* graph, - CodeGenerator* codegen, - OptimizingCompilerStats* stats, - PassObserver* pass_observer) { +void OptimizingCompiler::RunArchOptimizations(InstructionSet instruction_set, + HGraph* graph, + CodeGenerator* codegen, + PassObserver* pass_observer) const { + UNUSED(codegen); // To avoid compilation error when compiling for svelte + OptimizingCompilerStats* stats = compilation_stats_.get(); ArenaAllocator* arena = graph->GetArena(); switch (instruction_set) { #ifdef ART_ENABLE_CODEGEN_arm @@ -444,8 +602,12 @@ static void RunArchOptimizations(InstructionSet instruction_set, arm::DexCacheArrayFixups* fixups = new (arena) arm::DexCacheArrayFixups(graph, stats); arm::InstructionSimplifierArm* simplifier = new (arena) arm::InstructionSimplifierArm(graph, stats); + SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph); + GVNOptimization* gvn = new 
(arena) GVNOptimization(graph, *side_effects, "GVN$after_arch"); HOptimization* arm_optimizations[] = { simplifier, + side_effects, + gvn, fixups }; RunOptimizations(arm_optimizations, arraysize(arm_optimizations), pass_observer); @@ -457,7 +619,7 @@ static void RunArchOptimizations(InstructionSet instruction_set, arm64::InstructionSimplifierArm64* simplifier = new (arena) arm64::InstructionSimplifierArm64(graph, stats); SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph); - GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects, "GVN_after_arch"); + GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects, "GVN$after_arch"); HOptimization* arm64_optimizations[] = { simplifier, side_effects, @@ -472,7 +634,7 @@ static void RunArchOptimizations(InstructionSet instruction_set, mips::PcRelativeFixups* pc_relative_fixups = new (arena) mips::PcRelativeFixups(graph, codegen, stats); mips::DexCacheArrayFixups* dex_cache_array_fixups = - new (arena) mips::DexCacheArrayFixups(graph, stats); + new (arena) mips::DexCacheArrayFixups(graph, codegen, stats); HOptimization* mips_optimizations[] = { pc_relative_fixups, dex_cache_array_fixups @@ -485,13 +647,27 @@ static void RunArchOptimizations(InstructionSet instruction_set, case kX86: { x86::PcRelativeFixups* pc_relative_fixups = new (arena) x86::PcRelativeFixups(graph, codegen, stats); + x86::X86MemoryOperandGeneration* memory_gen = + new (arena) x86::X86MemoryOperandGeneration(graph, codegen, stats); HOptimization* x86_optimizations[] = { - pc_relative_fixups + pc_relative_fixups, + memory_gen }; RunOptimizations(x86_optimizations, arraysize(x86_optimizations), pass_observer); break; } #endif +#ifdef ART_ENABLE_CODEGEN_x86_64 + case kX86_64: { + x86::X86MemoryOperandGeneration* memory_gen = + new (arena) x86::X86MemoryOperandGeneration(graph, codegen, stats); + HOptimization* x86_64_optimizations[] = { + memory_gen + }; + RunOptimizations(x86_64_optimizations, arraysize(x86_64_optimizations), pass_observer); + break; + } +#endif default: break; } @@ -500,7 +676,8 @@ static void RunArchOptimizations(InstructionSet instruction_set, NO_INLINE // Avoid increasing caller's frame size by large stack-allocated objects. 
static void AllocateRegisters(HGraph* graph, CodeGenerator* codegen, - PassObserver* pass_observer) { + PassObserver* pass_observer, + RegisterAllocator::Strategy strategy) { { PassScope scope(PrepareForRegisterAllocation::kPrepareForRegisterAllocationPassName, pass_observer); @@ -513,27 +690,42 @@ static void AllocateRegisters(HGraph* graph, } { PassScope scope(RegisterAllocator::kRegisterAllocatorPassName, pass_observer); - RegisterAllocator(graph->GetArena(), codegen, liveness).AllocateRegisters(); + RegisterAllocator::Create(graph->GetArena(), codegen, liveness, strategy)->AllocateRegisters(); } } -static void RunOptimizations(HGraph* graph, - CodeGenerator* codegen, - CompilerDriver* driver, - OptimizingCompilerStats* stats, - const DexCompilationUnit& dex_compilation_unit, - PassObserver* pass_observer, - StackHandleScopeCollection* handles) { +void OptimizingCompiler::RunOptimizations(HGraph* graph, + CodeGenerator* codegen, + CompilerDriver* driver, + const DexCompilationUnit& dex_compilation_unit, + PassObserver* pass_observer, + StackHandleScopeCollection* handles) const { + OptimizingCompilerStats* stats = compilation_stats_.get(); ArenaAllocator* arena = graph->GetArena(); + if (driver->GetCompilerOptions().GetPassesToRun() != nullptr) { + ArenaVector<HOptimization*> optimizations = BuildOptimizations( + *driver->GetCompilerOptions().GetPassesToRun(), + arena, + graph, + stats, + codegen, + driver, + dex_compilation_unit, + handles); + RunOptimizations(&optimizations[0], optimizations.size(), pass_observer); + return; + } + HDeadCodeElimination* dce1 = new (arena) HDeadCodeElimination( - graph, stats, HDeadCodeElimination::kInitialDeadCodeEliminationPassName); + graph, stats, "dead_code_elimination$initial"); HDeadCodeElimination* dce2 = new (arena) HDeadCodeElimination( - graph, stats, HDeadCodeElimination::kFinalDeadCodeEliminationPassName); + graph, stats, "dead_code_elimination$final"); HConstantFolding* fold1 = new (arena) HConstantFolding(graph); InstructionSimplifier* simplify1 = new (arena) InstructionSimplifier(graph, stats); HSelectGenerator* select_generator = new (arena) HSelectGenerator(graph, stats); - HConstantFolding* fold2 = new (arena) HConstantFolding(graph, "constant_folding_after_inlining"); - HConstantFolding* fold3 = new (arena) HConstantFolding(graph, "constant_folding_after_bce"); + HConstantFolding* fold2 = new (arena) HConstantFolding( + graph, "constant_folding$after_inlining"); + HConstantFolding* fold3 = new (arena) HConstantFolding(graph, "constant_folding$after_bce"); SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph); GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects); LICM* licm = new (arena) LICM(graph, *side_effects, stats); @@ -542,9 +734,9 @@ static void RunOptimizations(HGraph* graph, BoundsCheckElimination* bce = new (arena) BoundsCheckElimination(graph, *side_effects, induction); HSharpening* sharpening = new (arena) HSharpening(graph, codegen, dex_compilation_unit, driver); InstructionSimplifier* simplify2 = new (arena) InstructionSimplifier( - graph, stats, "instruction_simplifier_after_bce"); + graph, stats, "instruction_simplifier$after_bce"); InstructionSimplifier* simplify3 = new (arena) InstructionSimplifier( - graph, stats, "instruction_simplifier_before_codegen"); + graph, stats, "instruction_simplifier$before_codegen"); IntrinsicsRecognizer* intrinsics = new (arena) IntrinsicsRecognizer(graph, driver, stats); HOptimization* optimizations1[] = { @@ -556,7 +748,7 @@ static void 
RunOptimizations(HGraph* graph, }; RunOptimizations(optimizations1, arraysize(optimizations1), pass_observer); - MaybeRunInliner(graph, codegen, driver, stats, dex_compilation_unit, pass_observer, handles); + MaybeRunInliner(graph, codegen, driver, dex_compilation_unit, pass_observer, handles); HOptimization* optimizations2[] = { // SelectGenerator depends on the InstructionSimplifier removing @@ -579,8 +771,7 @@ static void RunOptimizations(HGraph* graph, }; RunOptimizations(optimizations2, arraysize(optimizations2), pass_observer); - RunArchOptimizations(driver->GetInstructionSet(), graph, codegen, stats, pass_observer); - AllocateRegisters(graph, codegen, pass_observer); + RunArchOptimizations(driver->GetInstructionSet(), graph, codegen, pass_observer); } static ArenaVector<LinkerPatch> EmitAndSortLinkerPatches(CodeGenerator* codegen) { @@ -791,11 +982,14 @@ CodeGenerator* OptimizingCompiler::TryCompile(ArenaAllocator* arena, RunOptimizations(graph, codegen.get(), compiler_driver, - compilation_stats_.get(), dex_compilation_unit, &pass_observer, &handles); + RegisterAllocator::Strategy regalloc_strategy = + compiler_options.GetRegisterAllocationStrategy(); + AllocateRegisters(graph, codegen.get(), &pass_observer, regalloc_strategy); + codegen->Compile(code_allocator); pass_observer.DumpDisassembly(); } diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h index 9cc6ea45d0..c8d1ce0bd5 100644 --- a/compiler/optimizing/optimizing_compiler_stats.h +++ b/compiler/optimizing/optimizing_compiler_stats.h @@ -65,6 +65,7 @@ enum MethodCompilationStat { kInlinedInvokeVirtualOrInterface, kImplicitNullCheckGenerated, kExplicitNullCheckGenerated, + kSimplifyIf, kLastStat }; @@ -143,6 +144,7 @@ class OptimizingCompilerStats { case kInlinedInvokeVirtualOrInterface: name = "InlinedInvokeVirtualOrInterface"; break; case kImplicitNullCheckGenerated: name = "ImplicitNullCheckGenerated"; break; case kExplicitNullCheckGenerated: name = "ExplicitNullCheckGenerated"; break; + case kSimplifyIf: name = "SimplifyIf"; break; case kLastStat: LOG(FATAL) << "invalid stat " diff --git a/compiler/optimizing/pc_relative_fixups_mips.cc b/compiler/optimizing/pc_relative_fixups_mips.cc index ba405cdb69..c6acc45581 100644 --- a/compiler/optimizing/pc_relative_fixups_mips.cc +++ b/compiler/optimizing/pc_relative_fixups_mips.cc @@ -37,6 +37,10 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { // entry block) and relieve some pressure on the register allocator // while avoiding recalculation of the base in a loop. base_->MoveBeforeFirstUserAndOutOfLoops(); + // Computing the base for PC-relative literals will clobber RA with + // the NAL instruction on R2. Take a note of this before generating + // the method entry. + codegen_->ClobberRA(); } } @@ -58,6 +62,36 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { DCHECK(base_ != nullptr); } + void VisitLoadClass(HLoadClass* load_class) OVERRIDE { + HLoadClass::LoadKind load_kind = load_class->GetLoadKind(); + switch (load_kind) { + case HLoadClass::LoadKind::kBootImageLinkTimeAddress: + case HLoadClass::LoadKind::kBootImageAddress: + case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: + // Add a base register for PC-relative literals on R2. 
+ InitializePCRelativeBasePointer(); + load_class->AddSpecialInput(base_); + break; + default: + break; + } + } + + void VisitLoadString(HLoadString* load_string) OVERRIDE { + HLoadString::LoadKind load_kind = load_string->GetLoadKind(); + switch (load_kind) { + case HLoadString::LoadKind::kBootImageLinkTimeAddress: + case HLoadString::LoadKind::kBootImageAddress: + case HLoadString::LoadKind::kBootImageLinkTimePcRelative: + // Add a base register for PC-relative literals on R2. + InitializePCRelativeBasePointer(); + load_string->AddSpecialInput(base_); + break; + default: + break; + } + } + void HandleInvoke(HInvoke* invoke) { // If this is an invoke-static/-direct with PC-relative dex cache array // addressing, we need the PC-relative address base. @@ -77,7 +111,7 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { // method pointer from the invoke. if (invoke_static_or_direct->HasCurrentMethodInput()) { DCHECK(!invoke_static_or_direct->HasPcRelativeDexCache()); - CHECK(!has_extra_input); // TODO: review this. + CHECK(!has_extra_input); return; } @@ -116,7 +150,6 @@ void PcRelativeFixups::Run() { CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen_); if (mips_codegen->GetInstructionSetFeatures().IsR6()) { // Do nothing for R6 because it has PC-relative addressing. - // TODO: review. Move this check into RunArchOptimizations()? return; } if (graph_->HasIrreducibleLoops()) { diff --git a/compiler/optimizing/pc_relative_fixups_mips.h b/compiler/optimizing/pc_relative_fixups_mips.h index 1e8b071bb3..5a7397bf9d 100644 --- a/compiler/optimizing/pc_relative_fixups_mips.h +++ b/compiler/optimizing/pc_relative_fixups_mips.h @@ -32,6 +32,8 @@ class PcRelativeFixups : public HOptimization { : HOptimization(graph, "pc_relative_fixups_mips", stats), codegen_(codegen) {} + static constexpr const char* kPcRelativeFixupsMipsPassName = "pc_relative_fixups_mips"; + void Run() OVERRIDE; private: diff --git a/compiler/optimizing/pc_relative_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc index 921f3dfff6..ad0921d7e6 100644 --- a/compiler/optimizing/pc_relative_fixups_x86.cc +++ b/compiler/optimizing/pc_relative_fixups_x86.cc @@ -227,6 +227,7 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { case Intrinsics::kMathMaxFloatFloat: case Intrinsics::kMathMinDoubleDouble: case Intrinsics::kMathMinFloatFloat: + case Intrinsics::kMathRoundFloat: if (!base_added) { DCHECK(invoke_static_or_direct != nullptr); DCHECK(!invoke_static_or_direct->HasCurrentMethodInput()); diff --git a/compiler/optimizing/pc_relative_fixups_x86.h b/compiler/optimizing/pc_relative_fixups_x86.h index 03de2fcece..72fa71ea94 100644 --- a/compiler/optimizing/pc_relative_fixups_x86.h +++ b/compiler/optimizing/pc_relative_fixups_x86.h @@ -29,9 +29,11 @@ namespace x86 { class PcRelativeFixups : public HOptimization { public: PcRelativeFixups(HGraph* graph, CodeGenerator* codegen, OptimizingCompilerStats* stats) - : HOptimization(graph, "pc_relative_fixups_x86", stats), + : HOptimization(graph, kPcRelativeFixupsX86PassName, stats), codegen_(codegen) {} + static constexpr const char* kPcRelativeFixupsX86PassName = "pc_relative_fixups_x86"; + void Run() OVERRIDE; private: diff --git a/compiler/optimizing/reference_type_propagation.cc b/compiler/optimizing/reference_type_propagation.cc index 965d5ee4f9..e96ab1918c 100644 --- a/compiler/optimizing/reference_type_propagation.cc +++ b/compiler/optimizing/reference_type_propagation.cc @@ -16,6 +16,7 @@ #include "reference_type_propagation.h" +#include 
"base/enums.h" #include "class_linker-inl.h" #include "mirror/class-inl.h" #include "mirror/dex_cache.h" @@ -775,7 +776,7 @@ void ReferenceTypePropagation::RTPVisitor::VisitInvoke(HInvoke* instr) { ClassLinker* cl = Runtime::Current()->GetClassLinker(); mirror::DexCache* dex_cache = FindDexCacheWithHint(soa.Self(), instr->GetDexFile(), hint_dex_cache_); - size_t pointer_size = cl->GetImagePointerSize(); + PointerSize pointer_size = cl->GetImagePointerSize(); ArtMethod* method = dex_cache->GetResolvedMethod(instr->GetDexMethodIndex(), pointer_size); mirror::Class* klass = (method == nullptr) ? nullptr : method->GetReturnType(false, pointer_size); SetClassAsTypeInfo(instr, klass, /* is_exact */ false); diff --git a/compiler/optimizing/register_allocation_resolver.cc b/compiler/optimizing/register_allocation_resolver.cc new file mode 100644 index 0000000000..34502869e4 --- /dev/null +++ b/compiler/optimizing/register_allocation_resolver.cc @@ -0,0 +1,653 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "register_allocation_resolver.h" + +#include "code_generator.h" +#include "ssa_liveness_analysis.h" + +namespace art { + +RegisterAllocationResolver::RegisterAllocationResolver(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& liveness) + : allocator_(allocator), + codegen_(codegen), + liveness_(liveness) {} + +void RegisterAllocationResolver::Resolve(size_t max_safepoint_live_core_regs, + size_t max_safepoint_live_fp_regs, + size_t reserved_out_slots, + size_t int_spill_slots, + size_t long_spill_slots, + size_t float_spill_slots, + size_t double_spill_slots, + size_t catch_phi_spill_slots, + const ArenaVector<LiveInterval*>& temp_intervals) { + size_t spill_slots = int_spill_slots + + long_spill_slots + + float_spill_slots + + double_spill_slots + + catch_phi_spill_slots; + + // Computes frame size and spill mask. + codegen_->InitializeCodeGeneration(spill_slots, + max_safepoint_live_core_regs, + max_safepoint_live_fp_regs, + reserved_out_slots, // Includes slot(s) for the art method. + codegen_->GetGraph()->GetLinearOrder()); + + // Resolve outputs, including stack locations. + // TODO: Use pointers of Location inside LiveInterval to avoid doing another iteration. + for (size_t i = 0, e = liveness_.GetNumberOfSsaValues(); i < e; ++i) { + HInstruction* instruction = liveness_.GetInstructionFromSsaIndex(i); + LiveInterval* current = instruction->GetLiveInterval(); + LocationSummary* locations = instruction->GetLocations(); + Location location = locations->Out(); + if (instruction->IsParameterValue()) { + // Now that we know the frame size, adjust the parameter's location. 
+ if (location.IsStackSlot()) { + location = Location::StackSlot(location.GetStackIndex() + codegen_->GetFrameSize()); + current->SetSpillSlot(location.GetStackIndex()); + locations->UpdateOut(location); + } else if (location.IsDoubleStackSlot()) { + location = Location::DoubleStackSlot(location.GetStackIndex() + codegen_->GetFrameSize()); + current->SetSpillSlot(location.GetStackIndex()); + locations->UpdateOut(location); + } else if (current->HasSpillSlot()) { + current->SetSpillSlot(current->GetSpillSlot() + codegen_->GetFrameSize()); + } + } else if (instruction->IsCurrentMethod()) { + // The current method is always at offset 0. + DCHECK(!current->HasSpillSlot() || (current->GetSpillSlot() == 0)); + } else if (instruction->IsPhi() && instruction->AsPhi()->IsCatchPhi()) { + DCHECK(current->HasSpillSlot()); + size_t slot = current->GetSpillSlot() + + spill_slots + + reserved_out_slots + - catch_phi_spill_slots; + current->SetSpillSlot(slot * kVRegSize); + } else if (current->HasSpillSlot()) { + // Adjust the stack slot, now that we know the number of them for each type. + // The way this implementation lays out the stack is the following: + // [parameter slots ] + // [catch phi spill slots ] + // [double spill slots ] + // [long spill slots ] + // [float spill slots ] + // [int/ref values ] + // [maximum out values ] (number of arguments for calls) + // [art method ]. + size_t slot = current->GetSpillSlot(); + switch (current->GetType()) { + case Primitive::kPrimDouble: + slot += long_spill_slots; + FALLTHROUGH_INTENDED; + case Primitive::kPrimLong: + slot += float_spill_slots; + FALLTHROUGH_INTENDED; + case Primitive::kPrimFloat: + slot += int_spill_slots; + FALLTHROUGH_INTENDED; + case Primitive::kPrimNot: + case Primitive::kPrimInt: + case Primitive::kPrimChar: + case Primitive::kPrimByte: + case Primitive::kPrimBoolean: + case Primitive::kPrimShort: + slot += reserved_out_slots; + break; + case Primitive::kPrimVoid: + LOG(FATAL) << "Unexpected type for interval " << current->GetType(); + } + current->SetSpillSlot(slot * kVRegSize); + } + + Location source = current->ToLocation(); + + if (location.IsUnallocated()) { + if (location.GetPolicy() == Location::kSameAsFirstInput) { + if (locations->InAt(0).IsUnallocated()) { + locations->SetInAt(0, source); + } else { + DCHECK(locations->InAt(0).Equals(source)); + } + } + locations->UpdateOut(source); + } else { + DCHECK(source.Equals(location)); + } + } + + // Connect siblings and resolve inputs. + for (size_t i = 0, e = liveness_.GetNumberOfSsaValues(); i < e; ++i) { + HInstruction* instruction = liveness_.GetInstructionFromSsaIndex(i); + ConnectSiblings(instruction->GetLiveInterval(), + max_safepoint_live_core_regs + max_safepoint_live_fp_regs); + } + + // Resolve non-linear control flow across branches. Order does not matter. + for (HLinearOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { + HBasicBlock* block = it.Current(); + if (block->IsCatchBlock() || + (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible())) { + // Instructions live at the top of catch blocks or irreducible loop header + // were forced to spill. 
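A standalone sketch (assumed slot counts, not ART code) of the slot adjustment performed by the fall-through switch above: each spill area is stacked on top of the areas listed below it, so a double slot index is shifted by the long, float, and int/ref areas plus the reserved out slots.

#include <cstdio>

enum class Kind { kIntOrRef, kFloat, kLong, kDouble };

// Mirrors the fall-through adjustment above: later areas are stacked on top of
// the earlier ones, so their slot indices are shifted by the sizes of those areas.
static size_t AdjustSlot(size_t slot, Kind kind, size_t reserved_out,
                         size_t int_slots, size_t float_slots, size_t long_slots) {
  switch (kind) {
    case Kind::kDouble:   slot += long_slots;   [[fallthrough]];
    case Kind::kLong:     slot += float_slots;  [[fallthrough]];
    case Kind::kFloat:    slot += int_slots;    [[fallthrough]];
    case Kind::kIntOrRef: slot += reserved_out; break;
  }
  return slot;
}

int main() {
  // A double in relative slot 0, with 3 reserved out slots, 2 int, 1 float and 2 long slots:
  // 0 + 2 (longs) + 1 (floats) + 2 (ints) + 3 (reserved out) = 8.
  std::printf("%zu\n", AdjustSlot(0, Kind::kDouble, 3, 2, 1, 2));
  return 0;
}
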
+ if (kIsDebugBuild) {
+ BitVector* live = liveness_.GetLiveInSet(*block);
+ for (uint32_t idx : live->Indexes()) {
+ LiveInterval* interval = liveness_.GetInstructionFromSsaIndex(idx)->GetLiveInterval();
+ LiveInterval* sibling = interval->GetSiblingAt(block->GetLifetimeStart());
+ // `GetSiblingAt` returns the sibling that contains a position, but there could be
+ // a lifetime hole in it. `CoversSlow` returns whether the interval is live at that
+ // position.
+ if ((sibling != nullptr) && sibling->CoversSlow(block->GetLifetimeStart())) {
+ DCHECK(!sibling->HasRegister());
+ }
+ }
+ }
+ } else {
+ BitVector* live = liveness_.GetLiveInSet(*block);
+ for (uint32_t idx : live->Indexes()) {
+ LiveInterval* interval = liveness_.GetInstructionFromSsaIndex(idx)->GetLiveInterval();
+ for (HBasicBlock* predecessor : block->GetPredecessors()) {
+ ConnectSplitSiblings(interval, predecessor, block);
+ }
+ }
+ }
+ }
+
+ // Resolve phi inputs. Order does not matter.
+ for (HLinearOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) {
+ HBasicBlock* current = it.Current();
+ if (current->IsCatchBlock()) {
+ // Catch phi values are set at runtime by the exception delivery mechanism.
+ } else {
+ for (HInstructionIterator inst_it(current->GetPhis()); !inst_it.Done(); inst_it.Advance()) {
+ HInstruction* phi = inst_it.Current();
+ for (size_t i = 0, e = current->GetPredecessors().size(); i < e; ++i) {
+ HBasicBlock* predecessor = current->GetPredecessors()[i];
+ DCHECK_EQ(predecessor->GetNormalSuccessors().size(), 1u);
+ HInstruction* input = phi->InputAt(i);
+ Location source = input->GetLiveInterval()->GetLocationAt(
+ predecessor->GetLifetimeEnd() - 1);
+ Location destination = phi->GetLiveInterval()->ToLocation();
+ InsertParallelMoveAtExitOf(predecessor, phi, source, destination);
+ }
+ }
+ }
+ }
+
+ // Resolve temp locations.
+ for (LiveInterval* temp : temp_intervals) {
+ if (temp->IsHighInterval()) {
+ // High intervals can be skipped, they are already handled by the low interval.
+ continue;
+ }
+ HInstruction* at = liveness_.GetTempUser(temp);
+ size_t temp_index = liveness_.GetTempIndex(temp);
+ LocationSummary* locations = at->GetLocations();
+ switch (temp->GetType()) {
+ case Primitive::kPrimInt:
+ locations->SetTempAt(temp_index, Location::RegisterLocation(temp->GetRegister()));
+ break;
+
+ case Primitive::kPrimDouble:
+ if (codegen_->NeedsTwoRegisters(Primitive::kPrimDouble)) {
+ Location location = Location::FpuRegisterPairLocation(
+ temp->GetRegister(), temp->GetHighInterval()->GetRegister());
+ locations->SetTempAt(temp_index, location);
+ } else {
+ locations->SetTempAt(temp_index, Location::FpuRegisterLocation(temp->GetRegister()));
+ }
+ break;
+
+ default:
+ LOG(FATAL) << "Unexpected type for temporary location "
+ << temp->GetType();
+ }
+ }
+}
+
+void RegisterAllocationResolver::ConnectSiblings(LiveInterval* interval,
+ size_t max_safepoint_live_regs) {
+ LiveInterval* current = interval;
+ if (current->HasSpillSlot()
+ && current->HasRegister()
+ // Currently, we unconditionally spill the current method in the code generators.
+ && !interval->GetDefinedBy()->IsCurrentMethod()) {
+ // We spill eagerly, so move must be at definition.
+ InsertMoveAfter(interval->GetDefinedBy(),
+ interval->ToLocation(),
+ interval->NeedsTwoSpillSlots()
+ ?
Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot()) + : Location::StackSlot(interval->GetParent()->GetSpillSlot())); + } + UsePosition* use = current->GetFirstUse(); + UsePosition* env_use = current->GetFirstEnvironmentUse(); + + // Walk over all siblings, updating locations of use positions, and + // connecting them when they are adjacent. + do { + Location source = current->ToLocation(); + + // Walk over all uses covered by this interval, and update the location + // information. + + LiveRange* range = current->GetFirstRange(); + while (range != nullptr) { + while (use != nullptr && use->GetPosition() < range->GetStart()) { + DCHECK(use->IsSynthesized()); + use = use->GetNext(); + } + while (use != nullptr && use->GetPosition() <= range->GetEnd()) { + DCHECK(!use->GetIsEnvironment()); + DCHECK(current->CoversSlow(use->GetPosition()) || (use->GetPosition() == range->GetEnd())); + if (!use->IsSynthesized()) { + LocationSummary* locations = use->GetUser()->GetLocations(); + Location expected_location = locations->InAt(use->GetInputIndex()); + // The expected (actual) location may be invalid in case the input is unused. Currently + // this only happens for intrinsics. + if (expected_location.IsValid()) { + if (expected_location.IsUnallocated()) { + locations->SetInAt(use->GetInputIndex(), source); + } else if (!expected_location.IsConstant()) { + AddInputMoveFor(interval->GetDefinedBy(), use->GetUser(), source, expected_location); + } + } else { + DCHECK(use->GetUser()->IsInvoke()); + DCHECK(use->GetUser()->AsInvoke()->GetIntrinsic() != Intrinsics::kNone); + } + } + use = use->GetNext(); + } + + // Walk over the environment uses, and update their locations. + while (env_use != nullptr && env_use->GetPosition() < range->GetStart()) { + env_use = env_use->GetNext(); + } + + while (env_use != nullptr && env_use->GetPosition() <= range->GetEnd()) { + DCHECK(current->CoversSlow(env_use->GetPosition()) + || (env_use->GetPosition() == range->GetEnd())); + HEnvironment* environment = env_use->GetEnvironment(); + environment->SetLocationAt(env_use->GetInputIndex(), source); + env_use = env_use->GetNext(); + } + + range = range->GetNext(); + } + + // If the next interval starts just after this one, and has a register, + // insert a move. 
+ LiveInterval* next_sibling = current->GetNextSibling(); + if (next_sibling != nullptr + && next_sibling->HasRegister() + && current->GetEnd() == next_sibling->GetStart()) { + Location destination = next_sibling->ToLocation(); + InsertParallelMoveAt(current->GetEnd(), interval->GetDefinedBy(), source, destination); + } + + for (SafepointPosition* safepoint_position = current->GetFirstSafepoint(); + safepoint_position != nullptr; + safepoint_position = safepoint_position->GetNext()) { + DCHECK(current->CoversSlow(safepoint_position->GetPosition())); + + LocationSummary* locations = safepoint_position->GetLocations(); + if ((current->GetType() == Primitive::kPrimNot) && current->GetParent()->HasSpillSlot()) { + DCHECK(interval->GetDefinedBy()->IsActualObject()) + << interval->GetDefinedBy()->DebugName() + << "@" << safepoint_position->GetInstruction()->DebugName(); + locations->SetStackBit(current->GetParent()->GetSpillSlot() / kVRegSize); + } + + switch (source.GetKind()) { + case Location::kRegister: { + locations->AddLiveRegister(source); + if (kIsDebugBuild && locations->OnlyCallsOnSlowPath()) { + DCHECK_LE(locations->GetNumberOfLiveRegisters(), + max_safepoint_live_regs); + } + if (current->GetType() == Primitive::kPrimNot) { + DCHECK(interval->GetDefinedBy()->IsActualObject()) + << interval->GetDefinedBy()->DebugName() + << "@" << safepoint_position->GetInstruction()->DebugName(); + locations->SetRegisterBit(source.reg()); + } + break; + } + case Location::kFpuRegister: { + locations->AddLiveRegister(source); + break; + } + + case Location::kRegisterPair: + case Location::kFpuRegisterPair: { + locations->AddLiveRegister(source.ToLow()); + locations->AddLiveRegister(source.ToHigh()); + break; + } + case Location::kStackSlot: // Fall-through + case Location::kDoubleStackSlot: // Fall-through + case Location::kConstant: { + // Nothing to do. + break; + } + default: { + LOG(FATAL) << "Unexpected location for object"; + } + } + } + current = next_sibling; + } while (current != nullptr); + + if (kIsDebugBuild) { + // Following uses can only be synthesized uses. + while (use != nullptr) { + DCHECK(use->IsSynthesized()); + use = use->GetNext(); + } + } +} + +static bool IsMaterializableEntryBlockInstructionOfGraphWithIrreducibleLoop( + HInstruction* instruction) { + return instruction->GetBlock()->GetGraph()->HasIrreducibleLoops() && + (instruction->IsConstant() || instruction->IsCurrentMethod()); +} + +void RegisterAllocationResolver::ConnectSplitSiblings(LiveInterval* interval, + HBasicBlock* from, + HBasicBlock* to) const { + if (interval->GetNextSibling() == nullptr) { + // Nothing to connect. The whole range was allocated to the same location. + return; + } + + // Find the intervals that cover `from` and `to`. + size_t destination_position = to->GetLifetimeStart(); + size_t source_position = from->GetLifetimeEnd() - 1; + LiveInterval* destination = interval->GetSiblingAt(destination_position); + LiveInterval* source = interval->GetSiblingAt(source_position); + + if (destination == source) { + // Interval was not split. + return; + } + + LiveInterval* parent = interval->GetParent(); + HInstruction* defined_by = parent->GetDefinedBy(); + if (codegen_->GetGraph()->HasIrreducibleLoops() && + (destination == nullptr || !destination->CoversSlow(destination_position))) { + // Our live_in fixed point calculation has found that the instruction is live + // in the `to` block because it will eventually enter an irreducible loop. 
Our + // live interval computation however does not compute a fixed point, and + // therefore will not have a location for that instruction for `to`. + // Because the instruction is a constant or the ArtMethod, we don't need to + // do anything: it will be materialized in the irreducible loop. + DCHECK(IsMaterializableEntryBlockInstructionOfGraphWithIrreducibleLoop(defined_by)) + << defined_by->DebugName() << ":" << defined_by->GetId() + << " " << from->GetBlockId() << " -> " << to->GetBlockId(); + return; + } + + if (!destination->HasRegister()) { + // Values are eagerly spilled. Spill slot already contains appropriate value. + return; + } + + Location location_source; + // `GetSiblingAt` returns the interval whose start and end cover `position`, + // but does not check whether the interval is inactive at that position. + // The only situation where the interval is inactive at that position is in the + // presence of irreducible loops for constants and ArtMethod. + if (codegen_->GetGraph()->HasIrreducibleLoops() && + (source == nullptr || !source->CoversSlow(source_position))) { + DCHECK(IsMaterializableEntryBlockInstructionOfGraphWithIrreducibleLoop(defined_by)); + if (defined_by->IsConstant()) { + location_source = defined_by->GetLocations()->Out(); + } else { + DCHECK(defined_by->IsCurrentMethod()); + location_source = parent->NeedsTwoSpillSlots() + ? Location::DoubleStackSlot(parent->GetSpillSlot()) + : Location::StackSlot(parent->GetSpillSlot()); + } + } else { + DCHECK(source != nullptr); + DCHECK(source->CoversSlow(source_position)); + DCHECK(destination->CoversSlow(destination_position)); + location_source = source->ToLocation(); + } + + // If `from` has only one successor, we can put the moves at the exit of it. Otherwise + // we need to put the moves at the entry of `to`. + if (from->GetNormalSuccessors().size() == 1) { + InsertParallelMoveAtExitOf(from, + defined_by, + location_source, + destination->ToLocation()); + } else { + DCHECK_EQ(to->GetPredecessors().size(), 1u); + InsertParallelMoveAtEntryOf(to, + defined_by, + location_source, + destination->ToLocation()); + } +} + +static bool IsValidDestination(Location destination) { + return destination.IsRegister() + || destination.IsRegisterPair() + || destination.IsFpuRegister() + || destination.IsFpuRegisterPair() + || destination.IsStackSlot() + || destination.IsDoubleStackSlot(); +} + +void RegisterAllocationResolver::AddMove(HParallelMove* move, + Location source, + Location destination, + HInstruction* instruction, + Primitive::Type type) const { + if (type == Primitive::kPrimLong + && codegen_->ShouldSplitLongMoves() + // The parallel move resolver knows how to deal with long constants. 
+ && !source.IsConstant()) { + move->AddMove(source.ToLow(), destination.ToLow(), Primitive::kPrimInt, instruction); + move->AddMove(source.ToHigh(), destination.ToHigh(), Primitive::kPrimInt, nullptr); + } else { + move->AddMove(source, destination, type, instruction); + } +} + +void RegisterAllocationResolver::AddInputMoveFor(HInstruction* input, + HInstruction* user, + Location source, + Location destination) const { + if (source.Equals(destination)) return; + + DCHECK(!user->IsPhi()); + + HInstruction* previous = user->GetPrevious(); + HParallelMove* move = nullptr; + if (previous == nullptr + || !previous->IsParallelMove() + || previous->GetLifetimePosition() < user->GetLifetimePosition()) { + move = new (allocator_) HParallelMove(allocator_); + move->SetLifetimePosition(user->GetLifetimePosition()); + user->GetBlock()->InsertInstructionBefore(move, user); + } else { + move = previous->AsParallelMove(); + } + DCHECK_EQ(move->GetLifetimePosition(), user->GetLifetimePosition()); + AddMove(move, source, destination, nullptr, input->GetType()); +} + +static bool IsInstructionStart(size_t position) { + return (position & 1) == 0; +} + +static bool IsInstructionEnd(size_t position) { + return (position & 1) == 1; +} + +void RegisterAllocationResolver::InsertParallelMoveAt(size_t position, + HInstruction* instruction, + Location source, + Location destination) const { + DCHECK(IsValidDestination(destination)) << destination; + if (source.Equals(destination)) return; + + HInstruction* at = liveness_.GetInstructionFromPosition(position / 2); + HParallelMove* move; + if (at == nullptr) { + if (IsInstructionStart(position)) { + // Block boundary, don't do anything the connection of split siblings will handle it. + return; + } else { + // Move must happen before the first instruction of the block. + at = liveness_.GetInstructionFromPosition((position + 1) / 2); + // Note that parallel moves may have already been inserted, so we explicitly + // ask for the first instruction of the block: `GetInstructionFromPosition` does + // not contain the `HParallelMove` instructions. + at = at->GetBlock()->GetFirstInstruction(); + + if (at->GetLifetimePosition() < position) { + // We may insert moves for split siblings and phi spills at the beginning of the block. + // Since this is a different lifetime position, we need to go to the next instruction. + DCHECK(at->IsParallelMove()); + at = at->GetNext(); + } + + if (at->GetLifetimePosition() != position) { + DCHECK_GT(at->GetLifetimePosition(), position); + move = new (allocator_) HParallelMove(allocator_); + move->SetLifetimePosition(position); + at->GetBlock()->InsertInstructionBefore(move, at); + } else { + DCHECK(at->IsParallelMove()); + move = at->AsParallelMove(); + } + } + } else if (IsInstructionEnd(position)) { + // Move must happen after the instruction. + DCHECK(!at->IsControlFlow()); + move = at->GetNext()->AsParallelMove(); + // This is a parallel move for connecting siblings in a same block. We need to + // differentiate it with moves for connecting blocks, and input moves. + if (move == nullptr || move->GetLifetimePosition() > position) { + move = new (allocator_) HParallelMove(allocator_); + move->SetLifetimePosition(position); + at->GetBlock()->InsertInstructionBefore(move, at->GetNext()); + } + } else { + // Move must happen before the instruction. 
+ HInstruction* previous = at->GetPrevious(); + if (previous == nullptr + || !previous->IsParallelMove() + || previous->GetLifetimePosition() != position) { + // If the previous is a parallel move, then its position must be lower + // than the given `position`: it was added just after the non-parallel + // move instruction that precedes `instruction`. + DCHECK(previous == nullptr + || !previous->IsParallelMove() + || previous->GetLifetimePosition() < position); + move = new (allocator_) HParallelMove(allocator_); + move->SetLifetimePosition(position); + at->GetBlock()->InsertInstructionBefore(move, at); + } else { + move = previous->AsParallelMove(); + } + } + DCHECK_EQ(move->GetLifetimePosition(), position); + AddMove(move, source, destination, instruction, instruction->GetType()); +} + +void RegisterAllocationResolver::InsertParallelMoveAtExitOf(HBasicBlock* block, + HInstruction* instruction, + Location source, + Location destination) const { + DCHECK(IsValidDestination(destination)) << destination; + if (source.Equals(destination)) return; + + DCHECK_EQ(block->GetNormalSuccessors().size(), 1u); + HInstruction* last = block->GetLastInstruction(); + // We insert moves at exit for phi predecessors and connecting blocks. + // A block ending with an if or a packed switch cannot branch to a block + // with phis because we do not allow critical edges. It can also not connect + // a split interval between two blocks: the move has to happen in the successor. + DCHECK(!last->IsIf() && !last->IsPackedSwitch()); + HInstruction* previous = last->GetPrevious(); + HParallelMove* move; + // This is a parallel move for connecting blocks. We need to differentiate + // it with moves for connecting siblings in a same block, and output moves. + size_t position = last->GetLifetimePosition(); + if (previous == nullptr || !previous->IsParallelMove() + || previous->AsParallelMove()->GetLifetimePosition() != position) { + move = new (allocator_) HParallelMove(allocator_); + move->SetLifetimePosition(position); + block->InsertInstructionBefore(move, last); + } else { + move = previous->AsParallelMove(); + } + AddMove(move, source, destination, instruction, instruction->GetType()); +} + +void RegisterAllocationResolver::InsertParallelMoveAtEntryOf(HBasicBlock* block, + HInstruction* instruction, + Location source, + Location destination) const { + DCHECK(IsValidDestination(destination)) << destination; + if (source.Equals(destination)) return; + + HInstruction* first = block->GetFirstInstruction(); + HParallelMove* move = first->AsParallelMove(); + size_t position = block->GetLifetimeStart(); + // This is a parallel move for connecting blocks. We need to differentiate + // it with moves for connecting siblings in a same block, and input moves. 
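The IsInstructionStart/IsInstructionEnd helpers above rely on lifetime positions being spaced two apart per instruction; a minimal standalone check (same parity convention, otherwise assumed encoding):

#include <cassert>
#include <cstddef>

static bool IsInstructionStart(size_t position) { return (position & 1) == 0; }
static bool IsInstructionEnd(size_t position) { return (position & 1) == 1; }

int main() {
  size_t position = 14;                        // Even position: the instruction itself.
  assert(IsInstructionStart(position));
  assert(IsInstructionEnd(position + 1));      // Odd position: right after the instruction.
  assert(position / 2 == (position + 1) / 2);  // Both map back to the same instruction index.
  return 0;
}
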
+ if (move == nullptr || move->GetLifetimePosition() != position) { + move = new (allocator_) HParallelMove(allocator_); + move->SetLifetimePosition(position); + block->InsertInstructionBefore(move, first); + } + AddMove(move, source, destination, instruction, instruction->GetType()); +} + +void RegisterAllocationResolver::InsertMoveAfter(HInstruction* instruction, + Location source, + Location destination) const { + DCHECK(IsValidDestination(destination)) << destination; + if (source.Equals(destination)) return; + + if (instruction->IsPhi()) { + InsertParallelMoveAtEntryOf(instruction->GetBlock(), instruction, source, destination); + return; + } + + size_t position = instruction->GetLifetimePosition() + 1; + HParallelMove* move = instruction->GetNext()->AsParallelMove(); + // This is a parallel move for moving the output of an instruction. We need + // to differentiate with input moves, moves for connecting siblings in a + // and moves for connecting blocks. + if (move == nullptr || move->GetLifetimePosition() != position) { + move = new (allocator_) HParallelMove(allocator_); + move->SetLifetimePosition(position); + instruction->GetBlock()->InsertInstructionBefore(move, instruction->GetNext()); + } + AddMove(move, source, destination, instruction, instruction->GetType()); +} + +} // namespace art diff --git a/compiler/optimizing/register_allocation_resolver.h b/compiler/optimizing/register_allocation_resolver.h new file mode 100644 index 0000000000..6ceb9bc955 --- /dev/null +++ b/compiler/optimizing/register_allocation_resolver.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATION_RESOLVER_H_ +#define ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATION_RESOLVER_H_ + +#include "base/arena_containers.h" +#include "base/value_object.h" +#include "primitive.h" + +namespace art { + +class ArenaAllocator; +class CodeGenerator; +class HBasicBlock; +class HInstruction; +class HParallelMove; +class LiveInterval; +class Location; +class SsaLivenessAnalysis; + +/** + * Reconciles the locations assigned to live intervals with the location + * summary of each instruction, and inserts moves to resolve split intervals, + * nonlinear control flow, and phi inputs. + */ +class RegisterAllocationResolver : ValueObject { + public: + RegisterAllocationResolver(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& liveness); + + void Resolve(size_t max_safepoint_live_core_regs, + size_t max_safepoint_live_fp_regs, + size_t reserved_out_slots, // Includes slot(s) for the art method. + size_t int_spill_slots, + size_t long_spill_slots, + size_t float_spill_slots, + size_t double_spill_slots, + size_t catch_phi_spill_slots, + const ArenaVector<LiveInterval*>& temp_intervals); + + private: + // Connect adjacent siblings within blocks, and resolve inputs along the way. 
+ // Uses max_safepoint_live_regs to check that we did not underestimate the + // number of live registers at safepoints. + void ConnectSiblings(LiveInterval* interval, size_t max_safepoint_live_regs); + + // Connect siblings between block entries and exits. + void ConnectSplitSiblings(LiveInterval* interval, HBasicBlock* from, HBasicBlock* to) const; + + // Helper methods for inserting parallel moves in the graph. + void InsertParallelMoveAtExitOf(HBasicBlock* block, + HInstruction* instruction, + Location source, + Location destination) const; + void InsertParallelMoveAtEntryOf(HBasicBlock* block, + HInstruction* instruction, + Location source, + Location destination) const; + void InsertMoveAfter(HInstruction* instruction, Location source, Location destination) const; + void AddInputMoveFor(HInstruction* input, + HInstruction* user, + Location source, + Location destination) const; + void InsertParallelMoveAt(size_t position, + HInstruction* instruction, + Location source, + Location destination) const; + void AddMove(HParallelMove* move, + Location source, + Location destination, + HInstruction* instruction, + Primitive::Type type) const; + + ArenaAllocator* const allocator_; + CodeGenerator* const codegen_; + const SsaLivenessAnalysis& liveness_; + + DISALLOW_COPY_AND_ASSIGN(RegisterAllocationResolver); +}; + +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATION_RESOLVER_H_ diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc index 9d99668484..5b768d5d67 100644 --- a/compiler/optimizing/register_allocator.cc +++ b/compiler/optimizing/register_allocator.cc @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014 The Android Open Source Project + * Copyright (C) 2016 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,65 +21,33 @@ #include "base/bit_vector-inl.h" #include "code_generator.h" +#include "register_allocator_graph_color.h" +#include "register_allocator_linear_scan.h" #include "ssa_liveness_analysis.h" -namespace art { - -static constexpr size_t kMaxLifetimePosition = -1; -static constexpr size_t kDefaultNumberOfSpillSlots = 4; -// For simplicity, we implement register pairs as (reg, reg + 1). -// Note that this is a requirement for double registers on ARM, since we -// allocate SRegister. 
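For illustration only (this sketch is not part of the change): the (reg, reg + 1) pair convention described in the removed comment above, reduced to a standalone program. The helper names mirror the ones deleted just below, and the register numbers are hypothetical.

#include <cassert>

// Sketch: a register pair is modeled as (low, low + 1), and the low half must
// be even so that, for example, ARM D0 maps onto the pair (S0, S1).
static int GetHighForLowRegister(int reg) { return reg + 1; }
static bool IsLowRegister(int reg) { return (reg & 1) == 0; }

int main() {
  assert(IsLowRegister(0) && GetHighForLowRegister(0) == 1);  // (S0, S1) is a valid pair.
  assert(!IsLowRegister(1));  // (S1, S2) would be an unaligned pair and is rejected.
  return 0;
}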
-static int GetHighForLowRegister(int reg) { return reg + 1; } -static bool IsLowRegister(int reg) { return (reg & 1) == 0; } -static bool IsLowOfUnalignedPairInterval(LiveInterval* low) { - return GetHighForLowRegister(low->GetRegister()) != low->GetHighInterval()->GetRegister(); -} +namespace art { RegisterAllocator::RegisterAllocator(ArenaAllocator* allocator, CodeGenerator* codegen, const SsaLivenessAnalysis& liveness) - : allocator_(allocator), - codegen_(codegen), - liveness_(liveness), - unhandled_core_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), - unhandled_fp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), - unhandled_(nullptr), - handled_(allocator->Adapter(kArenaAllocRegisterAllocator)), - active_(allocator->Adapter(kArenaAllocRegisterAllocator)), - inactive_(allocator->Adapter(kArenaAllocRegisterAllocator)), - physical_core_register_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), - physical_fp_register_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), - temp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), - int_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), - long_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), - float_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), - double_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), - catch_phi_spill_slots_(0), - safepoints_(allocator->Adapter(kArenaAllocRegisterAllocator)), - processing_core_registers_(false), - number_of_registers_(-1), - registers_array_(nullptr), - blocked_core_registers_(codegen->GetBlockedCoreRegisters()), - blocked_fp_registers_(codegen->GetBlockedFloatingPointRegisters()), - reserved_out_slots_(0), - maximum_number_of_live_core_registers_(0), - maximum_number_of_live_fp_registers_(0) { - temp_intervals_.reserve(4); - int_spill_slots_.reserve(kDefaultNumberOfSpillSlots); - long_spill_slots_.reserve(kDefaultNumberOfSpillSlots); - float_spill_slots_.reserve(kDefaultNumberOfSpillSlots); - double_spill_slots_.reserve(kDefaultNumberOfSpillSlots); + : allocator_(allocator), + codegen_(codegen), + liveness_(liveness) {} - codegen->SetupBlockedRegisters(); - physical_core_register_intervals_.resize(codegen->GetNumberOfCoreRegisters(), nullptr); - physical_fp_register_intervals_.resize(codegen->GetNumberOfFloatingPointRegisters(), nullptr); - // Always reserve for the current method and the graph's max out registers. - // TODO: compute it instead. - // ArtMethod* takes 2 vregs for 64 bits. 
- reserved_out_slots_ = InstructionSetPointerSize(codegen->GetInstructionSet()) / kVRegSize + - codegen->GetGraph()->GetMaximumNumberOfOutVRegs(); +RegisterAllocator* RegisterAllocator::Create(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& analysis, + Strategy strategy) { + switch (strategy) { + case kRegisterAllocatorLinearScan: + return new (allocator) RegisterAllocatorLinearScan(allocator, codegen, analysis); + case kRegisterAllocatorGraphColor: + return new (allocator) RegisterAllocatorGraphColor(allocator, codegen, analysis); + default: + LOG(FATAL) << "Invalid register allocation strategy: " << strategy; + UNREACHABLE(); + } } bool RegisterAllocator::CanAllocateRegistersFor(const HGraph& graph ATTRIBUTE_UNUSED, @@ -93,328 +61,6 @@ bool RegisterAllocator::CanAllocateRegistersFor(const HGraph& graph ATTRIBUTE_UN || instruction_set == kX86_64; } -static bool ShouldProcess(bool processing_core_registers, LiveInterval* interval) { - if (interval == nullptr) return false; - bool is_core_register = (interval->GetType() != Primitive::kPrimDouble) - && (interval->GetType() != Primitive::kPrimFloat); - return processing_core_registers == is_core_register; -} - -void RegisterAllocator::AllocateRegisters() { - AllocateRegistersInternal(); - Resolve(); - - if (kIsDebugBuild) { - processing_core_registers_ = true; - ValidateInternal(true); - processing_core_registers_ = false; - ValidateInternal(true); - // Check that the linear order is still correct with regards to lifetime positions. - // Since only parallel moves have been inserted during the register allocation, - // these checks are mostly for making sure these moves have been added correctly. - size_t current_liveness = 0; - for (HLinearOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { - HBasicBlock* block = it.Current(); - for (HInstructionIterator inst_it(block->GetPhis()); !inst_it.Done(); inst_it.Advance()) { - HInstruction* instruction = inst_it.Current(); - DCHECK_LE(current_liveness, instruction->GetLifetimePosition()); - current_liveness = instruction->GetLifetimePosition(); - } - for (HInstructionIterator inst_it(block->GetInstructions()); - !inst_it.Done(); - inst_it.Advance()) { - HInstruction* instruction = inst_it.Current(); - DCHECK_LE(current_liveness, instruction->GetLifetimePosition()) << instruction->DebugName(); - current_liveness = instruction->GetLifetimePosition(); - } - } - } -} - -void RegisterAllocator::BlockRegister(Location location, size_t start, size_t end) { - int reg = location.reg(); - DCHECK(location.IsRegister() || location.IsFpuRegister()); - LiveInterval* interval = location.IsRegister() - ? physical_core_register_intervals_[reg] - : physical_fp_register_intervals_[reg]; - Primitive::Type type = location.IsRegister() - ? 
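To show how the new RegisterAllocator::Create() factory added above is meant to be used, here is a hedged sketch of the same strategy dispatch in isolation. Only the enum value names are taken from the patch; the Allocator, LinearScan, and GraphColor types below are stand-ins rather than ART classes, and the real code returns arena-allocated allocators instead of unique_ptrs.

#include <cstdio>
#include <memory>

enum Strategy { kRegisterAllocatorLinearScan, kRegisterAllocatorGraphColor };

// Stand-in hierarchy for the sketch.
struct Allocator {
  virtual ~Allocator() = default;
  virtual void AllocateRegisters() = 0;
};
struct LinearScan : Allocator {
  void AllocateRegisters() override { std::puts("linear scan"); }
};
struct GraphColor : Allocator {
  void AllocateRegisters() override { std::puts("graph coloring"); }
};

static std::unique_ptr<Allocator> Create(Strategy strategy) {
  switch (strategy) {
    case kRegisterAllocatorLinearScan: return std::make_unique<LinearScan>();
    case kRegisterAllocatorGraphColor: return std::make_unique<GraphColor>();
  }
  return nullptr;  // Unreachable for valid strategy values.
}

int main() {
  Create(kRegisterAllocatorLinearScan)->AllocateRegisters();
  return 0;
}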
Primitive::kPrimInt - : Primitive::kPrimFloat; - if (interval == nullptr) { - interval = LiveInterval::MakeFixedInterval(allocator_, reg, type); - if (location.IsRegister()) { - physical_core_register_intervals_[reg] = interval; - } else { - physical_fp_register_intervals_[reg] = interval; - } - } - DCHECK(interval->GetRegister() == reg); - interval->AddRange(start, end); -} - -void RegisterAllocator::BlockRegisters(size_t start, size_t end, bool caller_save_only) { - for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) { - if (!caller_save_only || !codegen_->IsCoreCalleeSaveRegister(i)) { - BlockRegister(Location::RegisterLocation(i), start, end); - } - } - for (size_t i = 0; i < codegen_->GetNumberOfFloatingPointRegisters(); ++i) { - if (!caller_save_only || !codegen_->IsFloatingPointCalleeSaveRegister(i)) { - BlockRegister(Location::FpuRegisterLocation(i), start, end); - } - } -} - -void RegisterAllocator::AllocateRegistersInternal() { - // Iterate post-order, to ensure the list is sorted, and the last added interval - // is the one with the lowest start position. - for (HLinearPostOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { - HBasicBlock* block = it.Current(); - for (HBackwardInstructionIterator back_it(block->GetInstructions()); !back_it.Done(); - back_it.Advance()) { - ProcessInstruction(back_it.Current()); - } - for (HInstructionIterator inst_it(block->GetPhis()); !inst_it.Done(); inst_it.Advance()) { - ProcessInstruction(inst_it.Current()); - } - - if (block->IsCatchBlock() || - (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible())) { - // By blocking all registers at the top of each catch block or irreducible loop, we force - // intervals belonging to the live-in set of the catch/header block to be spilled. - // TODO(ngeoffray): Phis in this block could be allocated in register. - size_t position = block->GetLifetimeStart(); - BlockRegisters(position, position + 1); - } - } - - number_of_registers_ = codegen_->GetNumberOfCoreRegisters(); - registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_, - kArenaAllocRegisterAllocator); - processing_core_registers_ = true; - unhandled_ = &unhandled_core_intervals_; - for (LiveInterval* fixed : physical_core_register_intervals_) { - if (fixed != nullptr) { - // Fixed interval is added to inactive_ instead of unhandled_. - // It's also the only type of inactive interval whose start position - // can be after the current interval during linear scan. - // Fixed interval is never split and never moves to unhandled_. - inactive_.push_back(fixed); - } - } - LinearScan(); - - inactive_.clear(); - active_.clear(); - handled_.clear(); - - number_of_registers_ = codegen_->GetNumberOfFloatingPointRegisters(); - registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_, - kArenaAllocRegisterAllocator); - processing_core_registers_ = false; - unhandled_ = &unhandled_fp_intervals_; - for (LiveInterval* fixed : physical_fp_register_intervals_) { - if (fixed != nullptr) { - // Fixed interval is added to inactive_ instead of unhandled_. - // It's also the only type of inactive interval whose start position - // can be after the current interval during linear scan. - // Fixed interval is never split and never moves to unhandled_. 
- inactive_.push_back(fixed); - } - } - LinearScan(); -} - -void RegisterAllocator::ProcessInstruction(HInstruction* instruction) { - LocationSummary* locations = instruction->GetLocations(); - size_t position = instruction->GetLifetimePosition(); - - if (locations == nullptr) return; - - // Create synthesized intervals for temporaries. - for (size_t i = 0; i < locations->GetTempCount(); ++i) { - Location temp = locations->GetTemp(i); - if (temp.IsRegister() || temp.IsFpuRegister()) { - BlockRegister(temp, position, position + 1); - // Ensure that an explicit temporary register is marked as being allocated. - codegen_->AddAllocatedRegister(temp); - } else { - DCHECK(temp.IsUnallocated()); - switch (temp.GetPolicy()) { - case Location::kRequiresRegister: { - LiveInterval* interval = - LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimInt); - temp_intervals_.push_back(interval); - interval->AddTempUse(instruction, i); - unhandled_core_intervals_.push_back(interval); - break; - } - - case Location::kRequiresFpuRegister: { - LiveInterval* interval = - LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimDouble); - temp_intervals_.push_back(interval); - interval->AddTempUse(instruction, i); - if (codegen_->NeedsTwoRegisters(Primitive::kPrimDouble)) { - interval->AddHighInterval(/* is_temp */ true); - LiveInterval* high = interval->GetHighInterval(); - temp_intervals_.push_back(high); - unhandled_fp_intervals_.push_back(high); - } - unhandled_fp_intervals_.push_back(interval); - break; - } - - default: - LOG(FATAL) << "Unexpected policy for temporary location " - << temp.GetPolicy(); - } - } - } - - bool core_register = (instruction->GetType() != Primitive::kPrimDouble) - && (instruction->GetType() != Primitive::kPrimFloat); - - if (locations->NeedsSafepoint()) { - if (codegen_->IsLeafMethod()) { - // TODO: We do this here because we do not want the suspend check to artificially - // create live registers. We should find another place, but this is currently the - // simplest. - DCHECK(instruction->IsSuspendCheckEntry()); - instruction->GetBlock()->RemoveInstruction(instruction); - return; - } - safepoints_.push_back(instruction); - if (locations->OnlyCallsOnSlowPath()) { - // We add a synthesized range at this position to record the live registers - // at this position. Ideally, we could just update the safepoints when locations - // are updated, but we currently need to know the full stack size before updating - // locations (because of parameters and the fact that we don't have a frame pointer). - // And knowing the full stack size requires to know the maximum number of live - // registers at calls in slow paths. - // By adding the following interval in the algorithm, we can compute this - // maximum before updating locations. 
- LiveInterval* interval = LiveInterval::MakeSlowPathInterval(allocator_, instruction); - interval->AddRange(position, position + 1); - AddSorted(&unhandled_core_intervals_, interval); - AddSorted(&unhandled_fp_intervals_, interval); - } - } - - if (locations->WillCall()) { - BlockRegisters(position, position + 1, /* caller_save_only */ true); - } - - for (size_t i = 0; i < locations->GetInputCount(); ++i) { - Location input = locations->InAt(i); - if (input.IsRegister() || input.IsFpuRegister()) { - BlockRegister(input, position, position + 1); - } else if (input.IsPair()) { - BlockRegister(input.ToLow(), position, position + 1); - BlockRegister(input.ToHigh(), position, position + 1); - } - } - - LiveInterval* current = instruction->GetLiveInterval(); - if (current == nullptr) return; - - ArenaVector<LiveInterval*>& unhandled = core_register - ? unhandled_core_intervals_ - : unhandled_fp_intervals_; - - DCHECK(unhandled.empty() || current->StartsBeforeOrAt(unhandled.back())); - - if (codegen_->NeedsTwoRegisters(current->GetType())) { - current->AddHighInterval(); - } - - for (size_t safepoint_index = safepoints_.size(); safepoint_index > 0; --safepoint_index) { - HInstruction* safepoint = safepoints_[safepoint_index - 1u]; - size_t safepoint_position = safepoint->GetLifetimePosition(); - - // Test that safepoints are ordered in the optimal way. - DCHECK(safepoint_index == safepoints_.size() || - safepoints_[safepoint_index]->GetLifetimePosition() < safepoint_position); - - if (safepoint_position == current->GetStart()) { - // The safepoint is for this instruction, so the location of the instruction - // does not need to be saved. - DCHECK_EQ(safepoint_index, safepoints_.size()); - DCHECK_EQ(safepoint, instruction); - continue; - } else if (current->IsDeadAt(safepoint_position)) { - break; - } else if (!current->Covers(safepoint_position)) { - // Hole in the interval. - continue; - } - current->AddSafepoint(safepoint); - } - current->ResetSearchCache(); - - // Some instructions define their output in fixed register/stack slot. We need - // to ensure we know these locations before doing register allocation. For a - // given register, we create an interval that covers these locations. The register - // will be unavailable at these locations when trying to allocate one for an - // interval. - // - // The backwards walking ensures the ranges are ordered on increasing start positions. - Location output = locations->Out(); - if (output.IsUnallocated() && output.GetPolicy() == Location::kSameAsFirstInput) { - Location first = locations->InAt(0); - if (first.IsRegister() || first.IsFpuRegister()) { - current->SetFrom(position + 1); - current->SetRegister(first.reg()); - } else if (first.IsPair()) { - current->SetFrom(position + 1); - current->SetRegister(first.low()); - LiveInterval* high = current->GetHighInterval(); - high->SetRegister(first.high()); - high->SetFrom(position + 1); - } - } else if (output.IsRegister() || output.IsFpuRegister()) { - // Shift the interval's start by one to account for the blocked register. 
- current->SetFrom(position + 1); - current->SetRegister(output.reg()); - BlockRegister(output, position, position + 1); - } else if (output.IsPair()) { - current->SetFrom(position + 1); - current->SetRegister(output.low()); - LiveInterval* high = current->GetHighInterval(); - high->SetRegister(output.high()); - high->SetFrom(position + 1); - BlockRegister(output.ToLow(), position, position + 1); - BlockRegister(output.ToHigh(), position, position + 1); - } else if (output.IsStackSlot() || output.IsDoubleStackSlot()) { - current->SetSpillSlot(output.GetStackIndex()); - } else { - DCHECK(output.IsUnallocated() || output.IsConstant()); - } - - if (instruction->IsPhi() && instruction->AsPhi()->IsCatchPhi()) { - AllocateSpillSlotForCatchPhi(instruction->AsPhi()); - } - - // If needed, add interval to the list of unhandled intervals. - if (current->HasSpillSlot() || instruction->IsConstant()) { - // Split just before first register use. - size_t first_register_use = current->FirstRegisterUse(); - if (first_register_use != kNoLifetime) { - LiveInterval* split = SplitBetween(current, current->GetStart(), first_register_use - 1); - // Don't add directly to `unhandled`, it needs to be sorted and the start - // of this new interval might be after intervals already in the list. - AddSorted(&unhandled, split); - } else { - // Nothing to do, we won't allocate a register for this value. - } - } else { - // Don't add directly to `unhandled`, temp or safepoint intervals - // for this instruction may have been added, and those can be - // processed first. - AddSorted(&unhandled, current); - } -} - class AllRangesIterator : public ValueObject { public: explicit AllRangesIterator(LiveInterval* interval) @@ -442,36 +88,6 @@ class AllRangesIterator : public ValueObject { DISALLOW_COPY_AND_ASSIGN(AllRangesIterator); }; -bool RegisterAllocator::ValidateInternal(bool log_fatal_on_failure) const { - // To simplify unit testing, we eagerly create the array of intervals, and - // call the helper method. - ArenaVector<LiveInterval*> intervals(allocator_->Adapter(kArenaAllocRegisterAllocatorValidate)); - for (size_t i = 0; i < liveness_.GetNumberOfSsaValues(); ++i) { - HInstruction* instruction = liveness_.GetInstructionFromSsaIndex(i); - if (ShouldProcess(processing_core_registers_, instruction->GetLiveInterval())) { - intervals.push_back(instruction->GetLiveInterval()); - } - } - - const ArenaVector<LiveInterval*>* physical_register_intervals = processing_core_registers_ - ? 
&physical_core_register_intervals_ - : &physical_fp_register_intervals_; - for (LiveInterval* fixed : *physical_register_intervals) { - if (fixed != nullptr) { - intervals.push_back(fixed); - } - } - - for (LiveInterval* temp : temp_intervals_) { - if (ShouldProcess(processing_core_registers_, temp)) { - intervals.push_back(temp); - } - } - - return ValidateIntervals(intervals, GetNumberOfSpillSlots(), reserved_out_slots_, *codegen_, - allocator_, processing_core_registers_, log_fatal_on_failure); -} - bool RegisterAllocator::ValidateIntervals(const ArenaVector<LiveInterval*>& intervals, size_t number_of_spill_slots, size_t number_of_out_slots, @@ -550,6 +166,19 @@ bool RegisterAllocator::ValidateIntervals(const ArenaVector<LiveInterval*>& inte } else { codegen.DumpFloatingPointRegister(message, current->GetRegister()); } + for (LiveInterval* interval : intervals) { + if (interval->HasRegister() + && interval->GetRegister() == current->GetRegister() + && interval->CoversSlow(j)) { + message << std::endl; + if (interval->GetDefinedBy() != nullptr) { + message << interval->GetDefinedBy()->GetKind() << " "; + } else { + message << "physical "; + } + interval->Dump(message); + } + } LOG(FATAL) << message.str(); } else { return false; @@ -564,638 +193,30 @@ bool RegisterAllocator::ValidateIntervals(const ArenaVector<LiveInterval*>& inte return true; } -void RegisterAllocator::DumpInterval(std::ostream& stream, LiveInterval* interval) const { - interval->Dump(stream); - stream << ": "; - if (interval->HasRegister()) { - if (interval->IsFloatingPoint()) { - codegen_->DumpFloatingPointRegister(stream, interval->GetRegister()); - } else { - codegen_->DumpCoreRegister(stream, interval->GetRegister()); - } - } else { - stream << "spilled"; - } - stream << std::endl; -} - -void RegisterAllocator::DumpAllIntervals(std::ostream& stream) const { - stream << "inactive: " << std::endl; - for (LiveInterval* inactive_interval : inactive_) { - DumpInterval(stream, inactive_interval); - } - stream << "active: " << std::endl; - for (LiveInterval* active_interval : active_) { - DumpInterval(stream, active_interval); - } - stream << "unhandled: " << std::endl; - auto unhandled = (unhandled_ != nullptr) ? - unhandled_ : &unhandled_core_intervals_; - for (LiveInterval* unhandled_interval : *unhandled) { - DumpInterval(stream, unhandled_interval); - } - stream << "handled: " << std::endl; - for (LiveInterval* handled_interval : handled_) { - DumpInterval(stream, handled_interval); - } -} - -// By the book implementation of a linear scan register allocator. -void RegisterAllocator::LinearScan() { - while (!unhandled_->empty()) { - // (1) Remove interval with the lowest start position from unhandled. - LiveInterval* current = unhandled_->back(); - unhandled_->pop_back(); - - // Make sure the interval is an expected state. - DCHECK(!current->IsFixed() && !current->HasSpillSlot()); - // Make sure we are going in the right order. - DCHECK(unhandled_->empty() || unhandled_->back()->GetStart() >= current->GetStart()); - // Make sure a low interval is always with a high. - DCHECK(!current->IsLowInterval() || unhandled_->back()->IsHighInterval()); - // Make sure a high interval is always with a low. - DCHECK(current->IsLowInterval() || - unhandled_->empty() || - !unhandled_->back()->IsHighInterval()); - - size_t position = current->GetStart(); - - // Remember the inactive_ size here since the ones moved to inactive_ from - // active_ below shouldn't need to be re-checked. 
- size_t inactive_intervals_to_handle = inactive_.size(); - - // (2) Remove currently active intervals that are dead at this position. - // Move active intervals that have a lifetime hole at this position - // to inactive. - auto active_kept_end = std::remove_if( - active_.begin(), - active_.end(), - [this, position](LiveInterval* interval) { - if (interval->IsDeadAt(position)) { - handled_.push_back(interval); - return true; - } else if (!interval->Covers(position)) { - inactive_.push_back(interval); - return true; - } else { - return false; // Keep this interval. - } - }); - active_.erase(active_kept_end, active_.end()); - - // (3) Remove currently inactive intervals that are dead at this position. - // Move inactive intervals that cover this position to active. - auto inactive_to_handle_end = inactive_.begin() + inactive_intervals_to_handle; - auto inactive_kept_end = std::remove_if( - inactive_.begin(), - inactive_to_handle_end, - [this, position](LiveInterval* interval) { - DCHECK(interval->GetStart() < position || interval->IsFixed()); - if (interval->IsDeadAt(position)) { - handled_.push_back(interval); - return true; - } else if (interval->Covers(position)) { - active_.push_back(interval); - return true; - } else { - return false; // Keep this interval. - } - }); - inactive_.erase(inactive_kept_end, inactive_to_handle_end); - - if (current->IsSlowPathSafepoint()) { - // Synthesized interval to record the maximum number of live registers - // at safepoints. No need to allocate a register for it. - if (processing_core_registers_) { - maximum_number_of_live_core_registers_ = - std::max(maximum_number_of_live_core_registers_, active_.size()); - } else { - maximum_number_of_live_fp_registers_ = - std::max(maximum_number_of_live_fp_registers_, active_.size()); - } - DCHECK(unhandled_->empty() || unhandled_->back()->GetStart() > current->GetStart()); - continue; - } - - if (current->IsHighInterval() && !current->GetLowInterval()->HasRegister()) { - DCHECK(!current->HasRegister()); - // Allocating the low part was unsucessful. The splitted interval for the high part - // will be handled next (it is in the `unhandled_` list). - continue; - } - - // (4) Try to find an available register. - bool success = TryAllocateFreeReg(current); - - // (5) If no register could be found, we need to spill. - if (!success) { - success = AllocateBlockedReg(current); - } - - // (6) If the interval had a register allocated, add it to the list of active - // intervals. - if (success) { - codegen_->AddAllocatedRegister(processing_core_registers_ - ? Location::RegisterLocation(current->GetRegister()) - : Location::FpuRegisterLocation(current->GetRegister())); - active_.push_back(current); - if (current->HasHighInterval() && !current->GetHighInterval()->HasRegister()) { - current->GetHighInterval()->SetRegister(GetHighForLowRegister(current->GetRegister())); - } - } - } -} - -static void FreeIfNotCoverAt(LiveInterval* interval, size_t position, size_t* free_until) { - DCHECK(!interval->IsHighInterval()); - // Note that the same instruction may occur multiple times in the input list, - // so `free_until` may have changed already. - // Since `position` is not the current scan position, we need to use CoversSlow. - if (interval->IsDeadAt(position)) { - // Set the register to be free. Note that inactive intervals might later - // update this. 
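The LinearScan() routine removed in the hunk above follows the textbook shape: pop the unhandled interval with the lowest start position, retire or deactivate intervals that no longer cover that position, then try a free register before falling back to spilling. Below is a deliberately simplified, self-contained sketch of that shape, not the removed implementation: it assumes a single register class and intervals without lifetime holes (so there is no inactive set, i.e. step (3) above is skipped), and it ignores pairs, fixed intervals, and safepoint bookkeeping.

#include <algorithm>
#include <cassert>
#include <vector>

struct Interval { int start; int end; int reg; bool spilled; };

void LinearScan(std::vector<Interval*>& intervals, int num_regs) {
  // (1) Process intervals in order of increasing start position.
  std::sort(intervals.begin(), intervals.end(),
            [](const Interval* a, const Interval* b) { return a->start < b->start; });
  std::vector<Interval*> active;
  std::vector<bool> used(num_regs, false);
  for (Interval* current : intervals) {
    // (2) Retire active intervals that end at or before the current position.
    for (auto it = active.begin(); it != active.end();) {
      if ((*it)->end <= current->start) {
        used[(*it)->reg] = false;
        it = active.erase(it);
      } else {
        ++it;
      }
    }
    // (4) Try to find a free register.
    int reg = -1;
    for (int i = 0; i < num_regs; ++i) {
      if (!used[i]) { reg = i; break; }
    }
    if (reg == -1) {
      // (5) No register left: spill whichever of `current` and the
      // furthest-ending active interval dies last.
      auto it = std::max_element(active.begin(), active.end(),
          [](const Interval* a, const Interval* b) { return a->end < b->end; });
      if ((*it)->end > current->end) {
        current->reg = (*it)->reg;
        (*it)->reg = -1;
        (*it)->spilled = true;
        *it = current;
      } else {
        current->spilled = true;
      }
      continue;
    }
    // (6) Register found: keep the interval in the active set.
    current->reg = reg;
    used[reg] = true;
    active.push_back(current);
  }
}

int main() {
  Interval a{0, 10, -1, false}, b{2, 4, -1, false}, c{3, 12, -1, false};
  std::vector<Interval*> worklist{&a, &b, &c};
  LinearScan(worklist, /*num_regs=*/2);
  assert(a.reg != -1 && b.reg != -1);  // Two registers are enough for a and b.
  assert(c.spilled);                   // c overlaps both and ends last, so it spills.
  return 0;
}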
- free_until[interval->GetRegister()] = kMaxLifetimePosition; - if (interval->HasHighInterval()) { - DCHECK(interval->GetHighInterval()->IsDeadAt(position)); - free_until[interval->GetHighInterval()->GetRegister()] = kMaxLifetimePosition; - } - } else if (!interval->CoversSlow(position)) { - // The interval becomes inactive at `defined_by`. We make its register - // available only until the next use strictly after `defined_by`. - free_until[interval->GetRegister()] = interval->FirstUseAfter(position); +LiveInterval* RegisterAllocator::Split(LiveInterval* interval, size_t position) { + DCHECK_GE(position, interval->GetStart()); + DCHECK(!interval->IsDeadAt(position)); + if (position == interval->GetStart()) { + // Spill slot will be allocated when handling `interval` again. + interval->ClearRegister(); if (interval->HasHighInterval()) { - DCHECK(!interval->GetHighInterval()->CoversSlow(position)); - free_until[interval->GetHighInterval()->GetRegister()] = free_until[interval->GetRegister()]; - } - } -} - -// Find a free register. If multiple are found, pick the register that -// is free the longest. -bool RegisterAllocator::TryAllocateFreeReg(LiveInterval* current) { - size_t* free_until = registers_array_; - - // First set all registers to be free. - for (size_t i = 0; i < number_of_registers_; ++i) { - free_until[i] = kMaxLifetimePosition; - } - - // For each active interval, set its register to not free. - for (LiveInterval* interval : active_) { - DCHECK(interval->HasRegister()); - free_until[interval->GetRegister()] = 0; - } - - // An interval that starts an instruction (that is, it is not split), may - // re-use the registers used by the inputs of that instruciton, based on the - // location summary. - HInstruction* defined_by = current->GetDefinedBy(); - if (defined_by != nullptr && !current->IsSplit()) { - LocationSummary* locations = defined_by->GetLocations(); - if (!locations->OutputCanOverlapWithInputs() && locations->Out().IsUnallocated()) { - HInputsRef inputs = defined_by->GetInputs(); - for (size_t i = 0; i < inputs.size(); ++i) { - // Take the last interval of the input. It is the location of that interval - // that will be used at `defined_by`. - LiveInterval* interval = inputs[i]->GetLiveInterval()->GetLastSibling(); - // Note that interval may have not been processed yet. - // TODO: Handle non-split intervals last in the work list. - if (locations->InAt(i).IsValid() - && interval->HasRegister() - && interval->SameRegisterKind(*current)) { - // The input must be live until the end of `defined_by`, to comply to - // the linear scan algorithm. So we use `defined_by`'s end lifetime - // position to check whether the input is dead or is inactive after - // `defined_by`. - DCHECK(interval->CoversSlow(defined_by->GetLifetimePosition())); - size_t position = defined_by->GetLifetimePosition() + 1; - FreeIfNotCoverAt(interval, position, free_until); - } - } - } - } - - // For each inactive interval, set its register to be free until - // the next intersection with `current`. - for (LiveInterval* inactive : inactive_) { - // Temp/Slow-path-safepoint interval has no holes. - DCHECK(!inactive->IsTemp() && !inactive->IsSlowPathSafepoint()); - if (!current->IsSplit() && !inactive->IsFixed()) { - // Neither current nor inactive are fixed. - // Thanks to SSA, a non-split interval starting in a hole of an - // inactive interval should never intersect with that inactive interval. - // Only if it's not fixed though, because fixed intervals don't come from SSA. 
- DCHECK_EQ(inactive->FirstIntersectionWith(current), kNoLifetime); - continue; - } - - DCHECK(inactive->HasRegister()); - if (free_until[inactive->GetRegister()] == 0) { - // Already used by some active interval. No need to intersect. - continue; - } - size_t next_intersection = inactive->FirstIntersectionWith(current); - if (next_intersection != kNoLifetime) { - free_until[inactive->GetRegister()] = - std::min(free_until[inactive->GetRegister()], next_intersection); - } - } - - int reg = kNoRegister; - if (current->HasRegister()) { - // Some instructions have a fixed register output. - reg = current->GetRegister(); - if (free_until[reg] == 0) { - DCHECK(current->IsHighInterval()); - // AllocateBlockedReg will spill the holder of the register. - return false; - } - } else { - DCHECK(!current->IsHighInterval()); - int hint = current->FindFirstRegisterHint(free_until, liveness_); - if ((hint != kNoRegister) - // For simplicity, if the hint we are getting for a pair cannot be used, - // we are just going to allocate a new pair. - && !(current->IsLowInterval() && IsBlocked(GetHighForLowRegister(hint)))) { - DCHECK(!IsBlocked(hint)); - reg = hint; - } else if (current->IsLowInterval()) { - reg = FindAvailableRegisterPair(free_until, current->GetStart()); - } else { - reg = FindAvailableRegister(free_until, current); - } - } - - DCHECK_NE(reg, kNoRegister); - // If we could not find a register, we need to spill. - if (free_until[reg] == 0) { - return false; - } - - if (current->IsLowInterval()) { - // If the high register of this interval is not available, we need to spill. - int high_reg = current->GetHighInterval()->GetRegister(); - if (high_reg == kNoRegister) { - high_reg = GetHighForLowRegister(reg); - } - if (free_until[high_reg] == 0) { - return false; - } - } - - current->SetRegister(reg); - if (!current->IsDeadAt(free_until[reg])) { - // If the register is only available for a subset of live ranges - // covered by `current`, split `current` before the position where - // the register is not available anymore. - LiveInterval* split = SplitBetween(current, current->GetStart(), free_until[reg]); - DCHECK(split != nullptr); - AddSorted(unhandled_, split); - } - return true; -} - -bool RegisterAllocator::IsBlocked(int reg) const { - return processing_core_registers_ - ? blocked_core_registers_[reg] - : blocked_fp_registers_[reg]; -} - -int RegisterAllocator::FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const { - int reg = kNoRegister; - // Pick the register pair that is used the last. - for (size_t i = 0; i < number_of_registers_; ++i) { - if (IsBlocked(i)) continue; - if (!IsLowRegister(i)) continue; - int high_register = GetHighForLowRegister(i); - if (IsBlocked(high_register)) continue; - int existing_high_register = GetHighForLowRegister(reg); - if ((reg == kNoRegister) || (next_use[i] >= next_use[reg] - && next_use[high_register] >= next_use[existing_high_register])) { - reg = i; - if (next_use[i] == kMaxLifetimePosition - && next_use[high_register] == kMaxLifetimePosition) { - break; - } - } else if (next_use[reg] <= starting_at || next_use[existing_high_register] <= starting_at) { - // If one of the current register is known to be unavailable, just unconditionally - // try a new one. - reg = i; - } - } - return reg; -} - -bool RegisterAllocator::IsCallerSaveRegister(int reg) const { - return processing_core_registers_ - ? 
!codegen_->IsCoreCalleeSaveRegister(reg) - : !codegen_->IsFloatingPointCalleeSaveRegister(reg); -} - -int RegisterAllocator::FindAvailableRegister(size_t* next_use, LiveInterval* current) const { - // We special case intervals that do not span a safepoint to try to find a caller-save - // register if one is available. We iterate from 0 to the number of registers, - // so if there are caller-save registers available at the end, we continue the iteration. - bool prefers_caller_save = !current->HasWillCallSafepoint(); - int reg = kNoRegister; - for (size_t i = 0; i < number_of_registers_; ++i) { - if (IsBlocked(i)) { - // Register cannot be used. Continue. - continue; - } - - // Best case: we found a register fully available. - if (next_use[i] == kMaxLifetimePosition) { - if (prefers_caller_save && !IsCallerSaveRegister(i)) { - // We can get shorter encodings on some platforms by using - // small register numbers. So only update the candidate if the previous - // one was not available for the whole method. - if (reg == kNoRegister || next_use[reg] != kMaxLifetimePosition) { - reg = i; - } - // Continue the iteration in the hope of finding a caller save register. - continue; - } else { - reg = i; - // We know the register is good enough. Return it. - break; - } - } - - // If we had no register before, take this one as a reference. - if (reg == kNoRegister) { - reg = i; - continue; - } - - // Pick the register that is used the last. - if (next_use[i] > next_use[reg]) { - reg = i; - continue; - } - } - return reg; -} - -// Remove interval and its other half if any. Return iterator to the following element. -static ArenaVector<LiveInterval*>::iterator RemoveIntervalAndPotentialOtherHalf( - ArenaVector<LiveInterval*>* intervals, ArenaVector<LiveInterval*>::iterator pos) { - DCHECK(intervals->begin() <= pos && pos < intervals->end()); - LiveInterval* interval = *pos; - if (interval->IsLowInterval()) { - DCHECK(pos + 1 < intervals->end()); - DCHECK_EQ(*(pos + 1), interval->GetHighInterval()); - return intervals->erase(pos, pos + 2); - } else if (interval->IsHighInterval()) { - DCHECK(intervals->begin() < pos); - DCHECK_EQ(*(pos - 1), interval->GetLowInterval()); - return intervals->erase(pos - 1, pos + 1); - } else { - return intervals->erase(pos); - } -} - -bool RegisterAllocator::TrySplitNonPairOrUnalignedPairIntervalAt(size_t position, - size_t first_register_use, - size_t* next_use) { - for (auto it = active_.begin(), end = active_.end(); it != end; ++it) { - LiveInterval* active = *it; - DCHECK(active->HasRegister()); - if (active->IsFixed()) continue; - if (active->IsHighInterval()) continue; - if (first_register_use > next_use[active->GetRegister()]) continue; - - // Split the first interval found that is either: - // 1) A non-pair interval. - // 2) A pair interval whose high is not low + 1. - // 3) A pair interval whose low is not even. - if (!active->IsLowInterval() || - IsLowOfUnalignedPairInterval(active) || - !IsLowRegister(active->GetRegister())) { - LiveInterval* split = Split(active, position); - if (split != active) { - handled_.push_back(active); - } - RemoveIntervalAndPotentialOtherHalf(&active_, it); - AddSorted(unhandled_, split); - return true; - } - } - return false; -} - -// Find the register that is used the last, and spill the interval -// that holds it. If the first use of `current` is after that register -// we spill `current` instead. 
-bool RegisterAllocator::AllocateBlockedReg(LiveInterval* current) { - size_t first_register_use = current->FirstRegisterUse(); - if (current->HasRegister()) { - DCHECK(current->IsHighInterval()); - // The low interval has allocated the register for the high interval. In - // case the low interval had to split both intervals, we may end up in a - // situation where the high interval does not have a register use anymore. - // We must still proceed in order to split currently active and inactive - // uses of the high interval's register, and put the high interval in the - // active set. - DCHECK(first_register_use != kNoLifetime || (current->GetNextSibling() != nullptr)); - } else if (first_register_use == kNoLifetime) { - AllocateSpillSlotFor(current); - return false; - } - - // First set all registers as not being used. - size_t* next_use = registers_array_; - for (size_t i = 0; i < number_of_registers_; ++i) { - next_use[i] = kMaxLifetimePosition; - } - - // For each active interval, find the next use of its register after the - // start of current. - for (LiveInterval* active : active_) { - DCHECK(active->HasRegister()); - if (active->IsFixed()) { - next_use[active->GetRegister()] = current->GetStart(); - } else { - size_t use = active->FirstRegisterUseAfter(current->GetStart()); - if (use != kNoLifetime) { - next_use[active->GetRegister()] = use; - } - } - } - - // For each inactive interval, find the next use of its register after the - // start of current. - for (LiveInterval* inactive : inactive_) { - // Temp/Slow-path-safepoint interval has no holes. - DCHECK(!inactive->IsTemp() && !inactive->IsSlowPathSafepoint()); - if (!current->IsSplit() && !inactive->IsFixed()) { - // Neither current nor inactive are fixed. - // Thanks to SSA, a non-split interval starting in a hole of an - // inactive interval should never intersect with that inactive interval. - // Only if it's not fixed though, because fixed intervals don't come from SSA. - DCHECK_EQ(inactive->FirstIntersectionWith(current), kNoLifetime); - continue; - } - DCHECK(inactive->HasRegister()); - size_t next_intersection = inactive->FirstIntersectionWith(current); - if (next_intersection != kNoLifetime) { - if (inactive->IsFixed()) { - next_use[inactive->GetRegister()] = - std::min(next_intersection, next_use[inactive->GetRegister()]); - } else { - size_t use = inactive->FirstUseAfter(current->GetStart()); - if (use != kNoLifetime) { - next_use[inactive->GetRegister()] = std::min(use, next_use[inactive->GetRegister()]); - } - } - } - } - - int reg = kNoRegister; - bool should_spill = false; - if (current->HasRegister()) { - DCHECK(current->IsHighInterval()); - reg = current->GetRegister(); - // When allocating the low part, we made sure the high register was available. - DCHECK_LT(first_register_use, next_use[reg]); - } else if (current->IsLowInterval()) { - reg = FindAvailableRegisterPair(next_use, first_register_use); - // We should spill if both registers are not available. 
- should_spill = (first_register_use >= next_use[reg]) - || (first_register_use >= next_use[GetHighForLowRegister(reg)]); - } else { - DCHECK(!current->IsHighInterval()); - reg = FindAvailableRegister(next_use, current); - should_spill = (first_register_use >= next_use[reg]); - } - - DCHECK_NE(reg, kNoRegister); - if (should_spill) { - DCHECK(!current->IsHighInterval()); - bool is_allocation_at_use_site = (current->GetStart() >= (first_register_use - 1)); - if (is_allocation_at_use_site) { - if (!current->IsLowInterval()) { - DumpInterval(std::cerr, current); - DumpAllIntervals(std::cerr); - // This situation has the potential to infinite loop, so we make it a non-debug CHECK. - HInstruction* at = liveness_.GetInstructionFromPosition(first_register_use / 2); - CHECK(false) << "There is not enough registers available for " - << current->GetParent()->GetDefinedBy()->DebugName() << " " - << current->GetParent()->GetDefinedBy()->GetId() - << " at " << first_register_use - 1 << " " - << (at == nullptr ? "" : at->DebugName()); - } - - // If we're allocating a register for `current` because the instruction at - // that position requires it, but we think we should spill, then there are - // non-pair intervals or unaligned pair intervals blocking the allocation. - // We split the first interval found, and put ourselves first in the - // `unhandled_` list. - bool success = TrySplitNonPairOrUnalignedPairIntervalAt(current->GetStart(), - first_register_use, - next_use); - DCHECK(success); - LiveInterval* existing = unhandled_->back(); - DCHECK(existing->IsHighInterval()); - DCHECK_EQ(existing->GetLowInterval(), current); - unhandled_->push_back(current); - } else { - // If the first use of that instruction is after the last use of the found - // register, we split this interval just before its first register use. - AllocateSpillSlotFor(current); - LiveInterval* split = SplitBetween(current, current->GetStart(), first_register_use - 1); - DCHECK(current != split); - AddSorted(unhandled_, split); + interval->GetHighInterval()->ClearRegister(); + } else if (interval->HasLowInterval()) { + interval->GetLowInterval()->ClearRegister(); } - return false; + return interval; } else { - // Use this register and spill the active and inactives interval that - // have that register. - current->SetRegister(reg); - - for (auto it = active_.begin(), end = active_.end(); it != end; ++it) { - LiveInterval* active = *it; - if (active->GetRegister() == reg) { - DCHECK(!active->IsFixed()); - LiveInterval* split = Split(active, current->GetStart()); - if (split != active) { - handled_.push_back(active); - } - RemoveIntervalAndPotentialOtherHalf(&active_, it); - AddSorted(unhandled_, split); - break; - } - } - - // NOTE: Retrieve end() on each iteration because we're removing elements in the loop body. - for (auto it = inactive_.begin(); it != inactive_.end(); ) { - LiveInterval* inactive = *it; - bool erased = false; - if (inactive->GetRegister() == reg) { - if (!current->IsSplit() && !inactive->IsFixed()) { - // Neither current nor inactive are fixed. - // Thanks to SSA, a non-split interval starting in a hole of an - // inactive interval should never intersect with that inactive interval. - // Only if it's not fixed though, because fixed intervals don't come from SSA. 
- DCHECK_EQ(inactive->FirstIntersectionWith(current), kNoLifetime); - } else { - size_t next_intersection = inactive->FirstIntersectionWith(current); - if (next_intersection != kNoLifetime) { - if (inactive->IsFixed()) { - LiveInterval* split = Split(current, next_intersection); - DCHECK_NE(split, current); - AddSorted(unhandled_, split); - } else { - // Split at the start of `current`, which will lead to splitting - // at the end of the lifetime hole of `inactive`. - LiveInterval* split = Split(inactive, current->GetStart()); - // If it's inactive, it must start before the current interval. - DCHECK_NE(split, inactive); - it = RemoveIntervalAndPotentialOtherHalf(&inactive_, it); - erased = true; - handled_.push_back(inactive); - AddSorted(unhandled_, split); - } - } - } - } - // If we have erased the element, `it` already points to the next element. - // Otherwise we need to move to the next element. - if (!erased) { - ++it; - } - } - - return true; - } -} - -void RegisterAllocator::AddSorted(ArenaVector<LiveInterval*>* array, LiveInterval* interval) { - DCHECK(!interval->IsFixed() && !interval->HasSpillSlot()); - size_t insert_at = 0; - for (size_t i = array->size(); i > 0; --i) { - LiveInterval* current = (*array)[i - 1u]; - // High intervals must be processed right after their low equivalent. - if (current->StartsAfter(interval) && !current->IsHighInterval()) { - insert_at = i; - break; - } else if ((current->GetStart() == interval->GetStart()) && current->IsSlowPathSafepoint()) { - // Ensure the slow path interval is the last to be processed at its location: we want the - // interval to know all live registers at this location. - DCHECK(i == 1 || (*array)[i - 2u]->StartsAfter(current)); - insert_at = i; - break; + LiveInterval* new_interval = interval->SplitAt(position); + if (interval->HasHighInterval()) { + LiveInterval* high = interval->GetHighInterval()->SplitAt(position); + new_interval->SetHighInterval(high); + high->SetLowInterval(new_interval); + } else if (interval->HasLowInterval()) { + LiveInterval* low = interval->GetLowInterval()->SplitAt(position); + new_interval->SetLowInterval(low); + low->SetHighInterval(new_interval); } - } - - // Insert the high interval before the low, to ensure the low is processed before. - auto insert_pos = array->begin() + insert_at; - if (interval->HasHighInterval()) { - array->insert(insert_pos, { interval->GetHighInterval(), interval }); - } else if (interval->HasLowInterval()) { - array->insert(insert_pos, { interval, interval->GetLowInterval() }); - } else { - array->insert(insert_pos, interval); + return new_interval; } } @@ -1258,748 +279,4 @@ LiveInterval* RegisterAllocator::SplitBetween(LiveInterval* interval, size_t fro return Split(interval, block_to->GetLifetimeStart()); } -LiveInterval* RegisterAllocator::Split(LiveInterval* interval, size_t position) { - DCHECK_GE(position, interval->GetStart()); - DCHECK(!interval->IsDeadAt(position)); - if (position == interval->GetStart()) { - // Spill slot will be allocated when handling `interval` again. 
- interval->ClearRegister(); - if (interval->HasHighInterval()) { - interval->GetHighInterval()->ClearRegister(); - } else if (interval->HasLowInterval()) { - interval->GetLowInterval()->ClearRegister(); - } - return interval; - } else { - LiveInterval* new_interval = interval->SplitAt(position); - if (interval->HasHighInterval()) { - LiveInterval* high = interval->GetHighInterval()->SplitAt(position); - new_interval->SetHighInterval(high); - high->SetLowInterval(new_interval); - } else if (interval->HasLowInterval()) { - LiveInterval* low = interval->GetLowInterval()->SplitAt(position); - new_interval->SetLowInterval(low); - low->SetHighInterval(new_interval); - } - return new_interval; - } -} - -void RegisterAllocator::AllocateSpillSlotFor(LiveInterval* interval) { - if (interval->IsHighInterval()) { - // The low interval already took care of allocating the spill slot. - DCHECK(!interval->GetLowInterval()->HasRegister()); - DCHECK(interval->GetLowInterval()->GetParent()->HasSpillSlot()); - return; - } - - LiveInterval* parent = interval->GetParent(); - - // An instruction gets a spill slot for its entire lifetime. If the parent - // of this interval already has a spill slot, there is nothing to do. - if (parent->HasSpillSlot()) { - return; - } - - HInstruction* defined_by = parent->GetDefinedBy(); - DCHECK(!defined_by->IsPhi() || !defined_by->AsPhi()->IsCatchPhi()); - - if (defined_by->IsParameterValue()) { - // Parameters have their own stack slot. - parent->SetSpillSlot(codegen_->GetStackSlotOfParameter(defined_by->AsParameterValue())); - return; - } - - if (defined_by->IsCurrentMethod()) { - parent->SetSpillSlot(0); - return; - } - - if (defined_by->IsConstant()) { - // Constants don't need a spill slot. - return; - } - - ArenaVector<size_t>* spill_slots = nullptr; - switch (interval->GetType()) { - case Primitive::kPrimDouble: - spill_slots = &double_spill_slots_; - break; - case Primitive::kPrimLong: - spill_slots = &long_spill_slots_; - break; - case Primitive::kPrimFloat: - spill_slots = &float_spill_slots_; - break; - case Primitive::kPrimNot: - case Primitive::kPrimInt: - case Primitive::kPrimChar: - case Primitive::kPrimByte: - case Primitive::kPrimBoolean: - case Primitive::kPrimShort: - spill_slots = &int_spill_slots_; - break; - case Primitive::kPrimVoid: - LOG(FATAL) << "Unexpected type for interval " << interval->GetType(); - } - - // Find an available spill slot. - size_t slot = 0; - for (size_t e = spill_slots->size(); slot < e; ++slot) { - if ((*spill_slots)[slot] <= parent->GetStart() - && (slot == (e - 1) || (*spill_slots)[slot + 1] <= parent->GetStart())) { - break; - } - } - - size_t end = interval->GetLastSibling()->GetEnd(); - if (parent->NeedsTwoSpillSlots()) { - if (slot + 2u > spill_slots->size()) { - // We need a new spill slot. - spill_slots->resize(slot + 2u, end); - } - (*spill_slots)[slot] = end; - (*spill_slots)[slot + 1] = end; - } else { - if (slot == spill_slots->size()) { - // We need a new spill slot. - spill_slots->push_back(end); - } else { - (*spill_slots)[slot] = end; - } - } - - // Note that the exact spill slot location will be computed when we resolve, - // that is when we know the number of spill slots for each type. 
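The slot search in AllocateSpillSlotFor() above reuses a spill slot once the interval that last occupied it has ended. The following is a minimal sketch of that reuse policy, tracking only the recorded end position per slot; it deliberately omits the adjacent-slot check the real code performs for values that need two slots. The AllocateSlot name is hypothetical.

#include <cassert>
#include <vector>

// Each entry records the end position of the interval that last used the slot;
// a slot can be recycled once that end is not after the new interval's start.
size_t AllocateSlot(std::vector<size_t>* slot_ends, size_t start, size_t end) {
  for (size_t slot = 0; slot < slot_ends->size(); ++slot) {
    if ((*slot_ends)[slot] <= start) {
      (*slot_ends)[slot] = end;
      return slot;
    }
  }
  slot_ends->push_back(end);  // No reusable slot: grow the pool.
  return slot_ends->size() - 1;
}

int main() {
  std::vector<size_t> int_slots;
  size_t s0 = AllocateSlot(&int_slots, /*start=*/0, /*end=*/10);
  size_t s1 = AllocateSlot(&int_slots, /*start=*/4, /*end=*/20);   // Still overlaps slot 0.
  size_t s2 = AllocateSlot(&int_slots, /*start=*/12, /*end=*/30);  // Slot 0 is free again.
  assert(s0 == 0 && s1 == 1 && s2 == 0);
  return 0;
}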
- parent->SetSpillSlot(slot); -} - -static bool IsValidDestination(Location destination) { - return destination.IsRegister() - || destination.IsRegisterPair() - || destination.IsFpuRegister() - || destination.IsFpuRegisterPair() - || destination.IsStackSlot() - || destination.IsDoubleStackSlot(); -} - -void RegisterAllocator::AllocateSpillSlotForCatchPhi(HPhi* phi) { - LiveInterval* interval = phi->GetLiveInterval(); - - HInstruction* previous_phi = phi->GetPrevious(); - DCHECK(previous_phi == nullptr || - previous_phi->AsPhi()->GetRegNumber() <= phi->GetRegNumber()) - << "Phis expected to be sorted by vreg number, so that equivalent phis are adjacent."; - - if (phi->IsVRegEquivalentOf(previous_phi)) { - // This is an equivalent of the previous phi. We need to assign the same - // catch phi slot. - DCHECK(previous_phi->GetLiveInterval()->HasSpillSlot()); - interval->SetSpillSlot(previous_phi->GetLiveInterval()->GetSpillSlot()); - } else { - // Allocate a new spill slot for this catch phi. - // TODO: Reuse spill slots when intervals of phis from different catch - // blocks do not overlap. - interval->SetSpillSlot(catch_phi_spill_slots_); - catch_phi_spill_slots_ += interval->NeedsTwoSpillSlots() ? 2 : 1; - } -} - -void RegisterAllocator::AddMove(HParallelMove* move, - Location source, - Location destination, - HInstruction* instruction, - Primitive::Type type) const { - if (type == Primitive::kPrimLong - && codegen_->ShouldSplitLongMoves() - // The parallel move resolver knows how to deal with long constants. - && !source.IsConstant()) { - move->AddMove(source.ToLow(), destination.ToLow(), Primitive::kPrimInt, instruction); - move->AddMove(source.ToHigh(), destination.ToHigh(), Primitive::kPrimInt, nullptr); - } else { - move->AddMove(source, destination, type, instruction); - } -} - -void RegisterAllocator::AddInputMoveFor(HInstruction* input, - HInstruction* user, - Location source, - Location destination) const { - if (source.Equals(destination)) return; - - DCHECK(!user->IsPhi()); - - HInstruction* previous = user->GetPrevious(); - HParallelMove* move = nullptr; - if (previous == nullptr - || !previous->IsParallelMove() - || previous->GetLifetimePosition() < user->GetLifetimePosition()) { - move = new (allocator_) HParallelMove(allocator_); - move->SetLifetimePosition(user->GetLifetimePosition()); - user->GetBlock()->InsertInstructionBefore(move, user); - } else { - move = previous->AsParallelMove(); - } - DCHECK_EQ(move->GetLifetimePosition(), user->GetLifetimePosition()); - AddMove(move, source, destination, nullptr, input->GetType()); -} - -static bool IsInstructionStart(size_t position) { - return (position & 1) == 0; -} - -static bool IsInstructionEnd(size_t position) { - return (position & 1) == 1; -} - -void RegisterAllocator::InsertParallelMoveAt(size_t position, - HInstruction* instruction, - Location source, - Location destination) const { - DCHECK(IsValidDestination(destination)) << destination; - if (source.Equals(destination)) return; - - HInstruction* at = liveness_.GetInstructionFromPosition(position / 2); - HParallelMove* move; - if (at == nullptr) { - if (IsInstructionStart(position)) { - // Block boundary, don't do anything the connection of split siblings will handle it. - return; - } else { - // Move must happen before the first instruction of the block. 
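InsertParallelMoveAt() above keys off the parity of a lifetime position: even positions fall on the instruction itself (where input moves are placed) and odd positions just after it (where the output move is placed), which is also why GetInstructionFromPosition() divides the position by two. A small sketch of that encoding follows; the instruction index used is hypothetical.

#include <cassert>
#include <cstddef>

static bool IsInstructionStart(size_t position) { return (position & 1) == 0; }
static bool IsInstructionEnd(size_t position) { return (position & 1) == 1; }

int main() {
  size_t instruction_index = 3;          // Hypothetical index in linear order.
  size_t start = 2 * instruction_index;  // Even: the input side of the instruction.
  size_t end = start + 1;                // Odd: the output side of the instruction.
  assert(IsInstructionStart(start) && IsInstructionEnd(end));
  assert(start / 2 == instruction_index && end / 2 == instruction_index);
  return 0;
}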
- at = liveness_.GetInstructionFromPosition((position + 1) / 2); - // Note that parallel moves may have already been inserted, so we explicitly - // ask for the first instruction of the block: `GetInstructionFromPosition` does - // not contain the `HParallelMove` instructions. - at = at->GetBlock()->GetFirstInstruction(); - - if (at->GetLifetimePosition() < position) { - // We may insert moves for split siblings and phi spills at the beginning of the block. - // Since this is a different lifetime position, we need to go to the next instruction. - DCHECK(at->IsParallelMove()); - at = at->GetNext(); - } - - if (at->GetLifetimePosition() != position) { - DCHECK_GT(at->GetLifetimePosition(), position); - move = new (allocator_) HParallelMove(allocator_); - move->SetLifetimePosition(position); - at->GetBlock()->InsertInstructionBefore(move, at); - } else { - DCHECK(at->IsParallelMove()); - move = at->AsParallelMove(); - } - } - } else if (IsInstructionEnd(position)) { - // Move must happen after the instruction. - DCHECK(!at->IsControlFlow()); - move = at->GetNext()->AsParallelMove(); - // This is a parallel move for connecting siblings in a same block. We need to - // differentiate it with moves for connecting blocks, and input moves. - if (move == nullptr || move->GetLifetimePosition() > position) { - move = new (allocator_) HParallelMove(allocator_); - move->SetLifetimePosition(position); - at->GetBlock()->InsertInstructionBefore(move, at->GetNext()); - } - } else { - // Move must happen before the instruction. - HInstruction* previous = at->GetPrevious(); - if (previous == nullptr - || !previous->IsParallelMove() - || previous->GetLifetimePosition() != position) { - // If the previous is a parallel move, then its position must be lower - // than the given `position`: it was added just after the non-parallel - // move instruction that precedes `instruction`. - DCHECK(previous == nullptr - || !previous->IsParallelMove() - || previous->GetLifetimePosition() < position); - move = new (allocator_) HParallelMove(allocator_); - move->SetLifetimePosition(position); - at->GetBlock()->InsertInstructionBefore(move, at); - } else { - move = previous->AsParallelMove(); - } - } - DCHECK_EQ(move->GetLifetimePosition(), position); - AddMove(move, source, destination, instruction, instruction->GetType()); -} - -void RegisterAllocator::InsertParallelMoveAtExitOf(HBasicBlock* block, - HInstruction* instruction, - Location source, - Location destination) const { - DCHECK(IsValidDestination(destination)) << destination; - if (source.Equals(destination)) return; - - DCHECK_EQ(block->GetNormalSuccessors().size(), 1u); - HInstruction* last = block->GetLastInstruction(); - // We insert moves at exit for phi predecessors and connecting blocks. - // A block ending with an if or a packed switch cannot branch to a block - // with phis because we do not allow critical edges. It can also not connect - // a split interval between two blocks: the move has to happen in the successor. - DCHECK(!last->IsIf() && !last->IsPackedSwitch()); - HInstruction* previous = last->GetPrevious(); - HParallelMove* move; - // This is a parallel move for connecting blocks. We need to differentiate - // it with moves for connecting siblings in a same block, and output moves. 
- size_t position = last->GetLifetimePosition(); - if (previous == nullptr || !previous->IsParallelMove() - || previous->AsParallelMove()->GetLifetimePosition() != position) { - move = new (allocator_) HParallelMove(allocator_); - move->SetLifetimePosition(position); - block->InsertInstructionBefore(move, last); - } else { - move = previous->AsParallelMove(); - } - AddMove(move, source, destination, instruction, instruction->GetType()); -} - -void RegisterAllocator::InsertParallelMoveAtEntryOf(HBasicBlock* block, - HInstruction* instruction, - Location source, - Location destination) const { - DCHECK(IsValidDestination(destination)) << destination; - if (source.Equals(destination)) return; - - HInstruction* first = block->GetFirstInstruction(); - HParallelMove* move = first->AsParallelMove(); - size_t position = block->GetLifetimeStart(); - // This is a parallel move for connecting blocks. We need to differentiate - // it with moves for connecting siblings in a same block, and input moves. - if (move == nullptr || move->GetLifetimePosition() != position) { - move = new (allocator_) HParallelMove(allocator_); - move->SetLifetimePosition(position); - block->InsertInstructionBefore(move, first); - } - AddMove(move, source, destination, instruction, instruction->GetType()); -} - -void RegisterAllocator::InsertMoveAfter(HInstruction* instruction, - Location source, - Location destination) const { - DCHECK(IsValidDestination(destination)) << destination; - if (source.Equals(destination)) return; - - if (instruction->IsPhi()) { - InsertParallelMoveAtEntryOf(instruction->GetBlock(), instruction, source, destination); - return; - } - - size_t position = instruction->GetLifetimePosition() + 1; - HParallelMove* move = instruction->GetNext()->AsParallelMove(); - // This is a parallel move for moving the output of an instruction. We need - // to differentiate with input moves, moves for connecting siblings in a - // and moves for connecting blocks. - if (move == nullptr || move->GetLifetimePosition() != position) { - move = new (allocator_) HParallelMove(allocator_); - move->SetLifetimePosition(position); - instruction->GetBlock()->InsertInstructionBefore(move, instruction->GetNext()); - } - AddMove(move, source, destination, instruction, instruction->GetType()); -} - -void RegisterAllocator::ConnectSiblings(LiveInterval* interval) { - LiveInterval* current = interval; - if (current->HasSpillSlot() - && current->HasRegister() - // Currently, we spill unconditionnally the current method in the code generators. - && !interval->GetDefinedBy()->IsCurrentMethod()) { - // We spill eagerly, so move must be at definition. - InsertMoveAfter(interval->GetDefinedBy(), - interval->ToLocation(), - interval->NeedsTwoSpillSlots() - ? Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot()) - : Location::StackSlot(interval->GetParent()->GetSpillSlot())); - } - UsePosition* use = current->GetFirstUse(); - UsePosition* env_use = current->GetFirstEnvironmentUse(); - - // Walk over all siblings, updating locations of use positions, and - // connecting them when they are adjacent. - do { - Location source = current->ToLocation(); - - // Walk over all uses covered by this interval, and update the location - // information. 
- - LiveRange* range = current->GetFirstRange(); - while (range != nullptr) { - while (use != nullptr && use->GetPosition() < range->GetStart()) { - DCHECK(use->IsSynthesized()); - use = use->GetNext(); - } - while (use != nullptr && use->GetPosition() <= range->GetEnd()) { - DCHECK(!use->GetIsEnvironment()); - DCHECK(current->CoversSlow(use->GetPosition()) || (use->GetPosition() == range->GetEnd())); - if (!use->IsSynthesized()) { - LocationSummary* locations = use->GetUser()->GetLocations(); - Location expected_location = locations->InAt(use->GetInputIndex()); - // The expected (actual) location may be invalid in case the input is unused. Currently - // this only happens for intrinsics. - if (expected_location.IsValid()) { - if (expected_location.IsUnallocated()) { - locations->SetInAt(use->GetInputIndex(), source); - } else if (!expected_location.IsConstant()) { - AddInputMoveFor(interval->GetDefinedBy(), use->GetUser(), source, expected_location); - } - } else { - DCHECK(use->GetUser()->IsInvoke()); - DCHECK(use->GetUser()->AsInvoke()->GetIntrinsic() != Intrinsics::kNone); - } - } - use = use->GetNext(); - } - - // Walk over the environment uses, and update their locations. - while (env_use != nullptr && env_use->GetPosition() < range->GetStart()) { - env_use = env_use->GetNext(); - } - - while (env_use != nullptr && env_use->GetPosition() <= range->GetEnd()) { - DCHECK(current->CoversSlow(env_use->GetPosition()) - || (env_use->GetPosition() == range->GetEnd())); - HEnvironment* environment = env_use->GetEnvironment(); - environment->SetLocationAt(env_use->GetInputIndex(), source); - env_use = env_use->GetNext(); - } - - range = range->GetNext(); - } - - // If the next interval starts just after this one, and has a register, - // insert a move. 
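The use-updating loop above is a merge of two sorted sequences, the interval's live ranges and its use positions: uses that fall in a hole before the current range are skipped, and uses covered by the range receive the interval's current location. A simplified standalone sketch of that two-pointer walk, with illustrative types only, not the ART classes:

#include <cstddef>
#include <cstdio>
#include <vector>

struct Range { size_t start; size_t end; };      // Live range, roughly [start, end].
struct Use   { size_t position; int location; }; // Location is filled in during the walk.

// Walk uses (sorted by position) against ranges (sorted by start) and record
// `current_location` for every use covered by some range, mirroring the way
// sibling connection rewrites the use locations above.
void UpdateUseLocations(const std::vector<Range>& ranges,
                        std::vector<Use>* uses,
                        int current_location) {
  size_t u = 0;
  for (const Range& range : ranges) {
    // Skip uses that fall in a lifetime hole before this range.
    while (u < uses->size() && (*uses)[u].position < range.start) {
      ++u;
    }
    // Uses covered by the range (or exactly at its end) get the current location.
    while (u < uses->size() && (*uses)[u].position <= range.end) {
      (*uses)[u].location = current_location;
      ++u;
    }
  }
}

int main() {
  std::vector<Range> ranges = {{2, 6}, {10, 14}};
  std::vector<Use> uses = {{4, -1}, {8, -1}, {12, -1}};
  UpdateUseLocations(ranges, &uses, /*current_location=*/3);
  for (const Use& use : uses) {
    std::printf("use@%zu -> %d\n", use.position, use.location);  // 3, -1 (in a hole), 3.
  }
  return 0;
}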
- LiveInterval* next_sibling = current->GetNextSibling(); - if (next_sibling != nullptr - && next_sibling->HasRegister() - && current->GetEnd() == next_sibling->GetStart()) { - Location destination = next_sibling->ToLocation(); - InsertParallelMoveAt(current->GetEnd(), interval->GetDefinedBy(), source, destination); - } - - for (SafepointPosition* safepoint_position = current->GetFirstSafepoint(); - safepoint_position != nullptr; - safepoint_position = safepoint_position->GetNext()) { - DCHECK(current->CoversSlow(safepoint_position->GetPosition())); - - LocationSummary* locations = safepoint_position->GetLocations(); - if ((current->GetType() == Primitive::kPrimNot) && current->GetParent()->HasSpillSlot()) { - DCHECK(interval->GetDefinedBy()->IsActualObject()) - << interval->GetDefinedBy()->DebugName() - << "@" << safepoint_position->GetInstruction()->DebugName(); - locations->SetStackBit(current->GetParent()->GetSpillSlot() / kVRegSize); - } - - switch (source.GetKind()) { - case Location::kRegister: { - locations->AddLiveRegister(source); - if (kIsDebugBuild && locations->OnlyCallsOnSlowPath()) { - DCHECK_LE(locations->GetNumberOfLiveRegisters(), - maximum_number_of_live_core_registers_ + - maximum_number_of_live_fp_registers_); - } - if (current->GetType() == Primitive::kPrimNot) { - DCHECK(interval->GetDefinedBy()->IsActualObject()) - << interval->GetDefinedBy()->DebugName() - << "@" << safepoint_position->GetInstruction()->DebugName(); - locations->SetRegisterBit(source.reg()); - } - break; - } - case Location::kFpuRegister: { - locations->AddLiveRegister(source); - break; - } - - case Location::kRegisterPair: - case Location::kFpuRegisterPair: { - locations->AddLiveRegister(source.ToLow()); - locations->AddLiveRegister(source.ToHigh()); - break; - } - case Location::kStackSlot: // Fall-through - case Location::kDoubleStackSlot: // Fall-through - case Location::kConstant: { - // Nothing to do. - break; - } - default: { - LOG(FATAL) << "Unexpected location for object"; - } - } - } - current = next_sibling; - } while (current != nullptr); - - if (kIsDebugBuild) { - // Following uses can only be synthesized uses. - while (use != nullptr) { - DCHECK(use->IsSynthesized()); - use = use->GetNext(); - } - } -} - -static bool IsMaterializableEntryBlockInstructionOfGraphWithIrreducibleLoop( - HInstruction* instruction) { - return instruction->GetBlock()->GetGraph()->HasIrreducibleLoops() && - (instruction->IsConstant() || instruction->IsCurrentMethod()); -} - -void RegisterAllocator::ConnectSplitSiblings(LiveInterval* interval, - HBasicBlock* from, - HBasicBlock* to) const { - if (interval->GetNextSibling() == nullptr) { - // Nothing to connect. The whole range was allocated to the same location. - return; - } - - // Find the intervals that cover `from` and `to`. - size_t destination_position = to->GetLifetimeStart(); - size_t source_position = from->GetLifetimeEnd() - 1; - LiveInterval* destination = interval->GetSiblingAt(destination_position); - LiveInterval* source = interval->GetSiblingAt(source_position); - - if (destination == source) { - // Interval was not split. - return; - } - - LiveInterval* parent = interval->GetParent(); - HInstruction* defined_by = parent->GetDefinedBy(); - if (codegen_->GetGraph()->HasIrreducibleLoops() && - (destination == nullptr || !destination->CoversSlow(destination_position))) { - // Our live_in fixed point calculation has found that the instruction is live - // in the `to` block because it will eventually enter an irreducible loop. 
Our - // live interval computation however does not compute a fixed point, and - // therefore will not have a location for that instruction for `to`. - // Because the instruction is a constant or the ArtMethod, we don't need to - // do anything: it will be materialized in the irreducible loop. - DCHECK(IsMaterializableEntryBlockInstructionOfGraphWithIrreducibleLoop(defined_by)) - << defined_by->DebugName() << ":" << defined_by->GetId() - << " " << from->GetBlockId() << " -> " << to->GetBlockId(); - return; - } - - if (!destination->HasRegister()) { - // Values are eagerly spilled. Spill slot already contains appropriate value. - return; - } - - Location location_source; - // `GetSiblingAt` returns the interval whose start and end cover `position`, - // but does not check whether the interval is inactive at that position. - // The only situation where the interval is inactive at that position is in the - // presence of irreducible loops for constants and ArtMethod. - if (codegen_->GetGraph()->HasIrreducibleLoops() && - (source == nullptr || !source->CoversSlow(source_position))) { - DCHECK(IsMaterializableEntryBlockInstructionOfGraphWithIrreducibleLoop(defined_by)); - if (defined_by->IsConstant()) { - location_source = defined_by->GetLocations()->Out(); - } else { - DCHECK(defined_by->IsCurrentMethod()); - location_source = parent->NeedsTwoSpillSlots() - ? Location::DoubleStackSlot(parent->GetSpillSlot()) - : Location::StackSlot(parent->GetSpillSlot()); - } - } else { - DCHECK(source != nullptr); - DCHECK(source->CoversSlow(source_position)); - DCHECK(destination->CoversSlow(destination_position)); - location_source = source->ToLocation(); - } - - // If `from` has only one successor, we can put the moves at the exit of it. Otherwise - // we need to put the moves at the entry of `to`. - if (from->GetNormalSuccessors().size() == 1) { - InsertParallelMoveAtExitOf(from, - defined_by, - location_source, - destination->ToLocation()); - } else { - DCHECK_EQ(to->GetPredecessors().size(), 1u); - InsertParallelMoveAtEntryOf(to, - defined_by, - location_source, - destination->ToLocation()); - } -} - -void RegisterAllocator::Resolve() { - codegen_->InitializeCodeGeneration(GetNumberOfSpillSlots(), - maximum_number_of_live_core_registers_, - maximum_number_of_live_fp_registers_, - reserved_out_slots_, - codegen_->GetGraph()->GetLinearOrder()); - - // Adjust the Out Location of instructions. - // TODO: Use pointers of Location inside LiveInterval to avoid doing another iteration. - for (size_t i = 0, e = liveness_.GetNumberOfSsaValues(); i < e; ++i) { - HInstruction* instruction = liveness_.GetInstructionFromSsaIndex(i); - LiveInterval* current = instruction->GetLiveInterval(); - LocationSummary* locations = instruction->GetLocations(); - Location location = locations->Out(); - if (instruction->IsParameterValue()) { - // Now that we know the frame size, adjust the parameter's location. 
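As the last comment above says, parameter locations are rebased once the frame size is known; the removed code that follows performs exactly that adjustment for single and double stack slots. A tiny standalone sketch of the rebasing, with a made-up slot type:

#include <cstdio>

// Toy stand-in for a stack-slot location: parameters start out addressed
// relative to the caller's frame and must be rebased by the callee frame size
// once that size is known.
struct StackSlot {
  int byte_offset;
};

StackSlot AdjustParameterSlot(StackSlot slot, int frame_size) {
  // The same rebasing applies whether the value occupies one or two slots.
  return StackSlot{slot.byte_offset + frame_size};
}

int main() {
  constexpr int kFrameSize = 64;
  StackSlot param{8};  // 8 bytes into the caller's argument area.
  StackSlot adjusted = AdjustParameterSlot(param, kFrameSize);
  std::printf("parameter slot: %d -> %d\n", param.byte_offset, adjusted.byte_offset);
  return 0;
}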
- if (location.IsStackSlot()) { - location = Location::StackSlot(location.GetStackIndex() + codegen_->GetFrameSize()); - current->SetSpillSlot(location.GetStackIndex()); - locations->UpdateOut(location); - } else if (location.IsDoubleStackSlot()) { - location = Location::DoubleStackSlot(location.GetStackIndex() + codegen_->GetFrameSize()); - current->SetSpillSlot(location.GetStackIndex()); - locations->UpdateOut(location); - } else if (current->HasSpillSlot()) { - current->SetSpillSlot(current->GetSpillSlot() + codegen_->GetFrameSize()); - } - } else if (instruction->IsCurrentMethod()) { - // The current method is always at offset 0. - DCHECK(!current->HasSpillSlot() || (current->GetSpillSlot() == 0)); - } else if (instruction->IsPhi() && instruction->AsPhi()->IsCatchPhi()) { - DCHECK(current->HasSpillSlot()); - size_t slot = current->GetSpillSlot() - + GetNumberOfSpillSlots() - + reserved_out_slots_ - - catch_phi_spill_slots_; - current->SetSpillSlot(slot * kVRegSize); - } else if (current->HasSpillSlot()) { - // Adjust the stack slot, now that we know the number of them for each type. - // The way this implementation lays out the stack is the following: - // [parameter slots ] - // [catch phi spill slots ] - // [double spill slots ] - // [long spill slots ] - // [float spill slots ] - // [int/ref values ] - // [maximum out values ] (number of arguments for calls) - // [art method ]. - size_t slot = current->GetSpillSlot(); - switch (current->GetType()) { - case Primitive::kPrimDouble: - slot += long_spill_slots_.size(); - FALLTHROUGH_INTENDED; - case Primitive::kPrimLong: - slot += float_spill_slots_.size(); - FALLTHROUGH_INTENDED; - case Primitive::kPrimFloat: - slot += int_spill_slots_.size(); - FALLTHROUGH_INTENDED; - case Primitive::kPrimNot: - case Primitive::kPrimInt: - case Primitive::kPrimChar: - case Primitive::kPrimByte: - case Primitive::kPrimBoolean: - case Primitive::kPrimShort: - slot += reserved_out_slots_; - break; - case Primitive::kPrimVoid: - LOG(FATAL) << "Unexpected type for interval " << current->GetType(); - } - current->SetSpillSlot(slot * kVRegSize); - } - - Location source = current->ToLocation(); - - if (location.IsUnallocated()) { - if (location.GetPolicy() == Location::kSameAsFirstInput) { - if (locations->InAt(0).IsUnallocated()) { - locations->SetInAt(0, source); - } else { - DCHECK(locations->InAt(0).Equals(source)); - } - } - locations->UpdateOut(source); - } else { - DCHECK(source.Equals(location)); - } - } - - // Connect siblings. - for (size_t i = 0, e = liveness_.GetNumberOfSsaValues(); i < e; ++i) { - HInstruction* instruction = liveness_.GetInstructionFromSsaIndex(i); - ConnectSiblings(instruction->GetLiveInterval()); - } - - // Resolve non-linear control flow across branches. Order does not matter. - for (HLinearOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { - HBasicBlock* block = it.Current(); - if (block->IsCatchBlock() || - (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible())) { - // Instructions live at the top of catch blocks or irreducible loop header - // were forced to spill. - if (kIsDebugBuild) { - BitVector* live = liveness_.GetLiveInSet(*block); - for (uint32_t idx : live->Indexes()) { - LiveInterval* interval = liveness_.GetInstructionFromSsaIndex(idx)->GetLiveInterval(); - LiveInterval* sibling = interval->GetSiblingAt(block->GetLifetimeStart()); - // `GetSiblingAt` returns the sibling that contains a position, but there could be - // a lifetime hole in it. 
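The spill-slot adjustment above turns a per-type slot index into a final stack offset by accumulating the sizes of the areas laid out below it, with the reserved outgoing-argument slots at the bottom. A self-contained sketch of the same fall-through arithmetic, using invented slot counts to make the layout concrete (not the ART code itself):

#include <cstddef>
#include <cstdio>

// Slot counts for each area, stacked bottom to top following the layout
// comment above: out-arguments, then int/ref, float, long, double spill slots
// (catch phi slots and parameters sit above these).
struct SpillAreas {
  size_t reserved_out_slots;
  size_t int_slots;
  size_t float_slots;
  size_t long_slots;
  size_t double_slots;
};

enum class SlotKind { kIntOrRef, kFloat, kLong, kDouble };

// Mirrors the fall-through accumulation: each kind skips over every area
// allocated below its own, then over the reserved outgoing-argument slots.
size_t FinalSlotIndex(const SpillAreas& areas, SlotKind kind, size_t slot_in_area) {
  size_t slot = slot_in_area;
  switch (kind) {
    case SlotKind::kDouble: slot += areas.long_slots;   [[fallthrough]];
    case SlotKind::kLong:   slot += areas.float_slots;  [[fallthrough]];
    case SlotKind::kFloat:  slot += areas.int_slots;    [[fallthrough]];
    case SlotKind::kIntOrRef:
      slot += areas.reserved_out_slots;
      break;
  }
  return slot;  // Multiply by the vreg size to get a byte offset.
}

int main() {
  SpillAreas areas{/*out*/ 4, /*int*/ 3, /*float*/ 2, /*long*/ 1, /*double*/ 2};
  constexpr size_t kVRegSize = 4;
  // The first double slot sits above the long, float, int/ref and out areas:
  // (0 + 1 + 2 + 3 + 4) * 4 = 40.
  std::printf("first double slot byte offset: %zu\n",
              FinalSlotIndex(areas, SlotKind::kDouble, 0) * kVRegSize);
  return 0;
}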
`CoversSlow` returns whether the interval is live at that - // position. - if ((sibling != nullptr) && sibling->CoversSlow(block->GetLifetimeStart())) { - DCHECK(!sibling->HasRegister()); - } - } - } - } else { - BitVector* live = liveness_.GetLiveInSet(*block); - for (uint32_t idx : live->Indexes()) { - LiveInterval* interval = liveness_.GetInstructionFromSsaIndex(idx)->GetLiveInterval(); - for (HBasicBlock* predecessor : block->GetPredecessors()) { - ConnectSplitSiblings(interval, predecessor, block); - } - } - } - } - - // Resolve phi inputs. Order does not matter. - for (HLinearOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { - HBasicBlock* current = it.Current(); - if (current->IsCatchBlock()) { - // Catch phi values are set at runtime by the exception delivery mechanism. - } else { - for (HInstructionIterator inst_it(current->GetPhis()); !inst_it.Done(); inst_it.Advance()) { - HInstruction* phi = inst_it.Current(); - for (size_t i = 0, e = current->GetPredecessors().size(); i < e; ++i) { - HBasicBlock* predecessor = current->GetPredecessors()[i]; - DCHECK_EQ(predecessor->GetNormalSuccessors().size(), 1u); - HInstruction* input = phi->InputAt(i); - Location source = input->GetLiveInterval()->GetLocationAt( - predecessor->GetLifetimeEnd() - 1); - Location destination = phi->GetLiveInterval()->ToLocation(); - InsertParallelMoveAtExitOf(predecessor, phi, source, destination); - } - } - } - } - - // Assign temp locations. - for (LiveInterval* temp : temp_intervals_) { - if (temp->IsHighInterval()) { - // High intervals can be skipped, they are already handled by the low interval. - continue; - } - HInstruction* at = liveness_.GetTempUser(temp); - size_t temp_index = liveness_.GetTempIndex(temp); - LocationSummary* locations = at->GetLocations(); - switch (temp->GetType()) { - case Primitive::kPrimInt: - locations->SetTempAt(temp_index, Location::RegisterLocation(temp->GetRegister())); - break; - - case Primitive::kPrimDouble: - if (codegen_->NeedsTwoRegisters(Primitive::kPrimDouble)) { - Location location = Location::FpuRegisterPairLocation( - temp->GetRegister(), temp->GetHighInterval()->GetRegister()); - locations->SetTempAt(temp_index, location); - } else { - locations->SetTempAt(temp_index, Location::FpuRegisterLocation(temp->GetRegister())); - } - break; - - default: - LOG(FATAL) << "Unexpected type for temporary location " - << temp->GetType(); - } - } -} - } // namespace art diff --git a/compiler/optimizing/register_allocator.h b/compiler/optimizing/register_allocator.h index 58600b789b..7e1fff8e2b 100644 --- a/compiler/optimizing/register_allocator.h +++ b/compiler/optimizing/register_allocator.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014 The Android Open Source Project + * Copyright (C) 2016 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include "arch/instruction_set.h" #include "base/arena_containers.h" +#include "base/arena_object.h" #include "base/macros.h" #include "primitive.h" @@ -29,36 +30,41 @@ class HBasicBlock; class HGraph; class HInstruction; class HParallelMove; -class HPhi; class LiveInterval; class Location; class SsaLivenessAnalysis; /** - * An implementation of a linear scan register allocator on an `HGraph` with SSA form. + * Base class for any register allocator. 
*/ -class RegisterAllocator { +class RegisterAllocator : public ArenaObject<kArenaAllocRegisterAllocator> { public: - RegisterAllocator(ArenaAllocator* allocator, - CodeGenerator* codegen, - const SsaLivenessAnalysis& analysis); + enum Strategy { + kRegisterAllocatorLinearScan, + kRegisterAllocatorGraphColor + }; + + static constexpr Strategy kRegisterAllocatorDefault = kRegisterAllocatorLinearScan; + + static RegisterAllocator* Create(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& analysis, + Strategy strategy = kRegisterAllocatorDefault); + + virtual ~RegisterAllocator() = default; // Main entry point for the register allocator. Given the liveness analysis, // allocates registers to live intervals. - void AllocateRegisters(); + virtual void AllocateRegisters() = 0; // Validate that the register allocator did not allocate the same register to - // intervals that intersect each other. Returns false if it did not. - bool Validate(bool log_fatal_on_failure) { - processing_core_registers_ = true; - if (!ValidateInternal(log_fatal_on_failure)) { - return false; - } - processing_core_registers_ = false; - return ValidateInternal(log_fatal_on_failure); - } - - // Helper method for validation. Used by unit testing. + // intervals that intersect each other. Returns false if it failed. + virtual bool Validate(bool log_fatal_on_failure) = 0; + + static bool CanAllocateRegistersFor(const HGraph& graph, + InstructionSet instruction_set); + + // Verifies that live intervals do not conflict. Used by unit testing. static bool ValidateIntervals(const ArenaVector<LiveInterval*>& intervals, size_t number_of_spill_slots, size_t number_of_out_slots, @@ -67,178 +73,25 @@ class RegisterAllocator { bool processing_core_registers, bool log_fatal_on_failure); - static bool CanAllocateRegistersFor(const HGraph& graph, InstructionSet instruction_set); - - size_t GetNumberOfSpillSlots() const { - return int_spill_slots_.size() - + long_spill_slots_.size() - + float_spill_slots_.size() - + double_spill_slots_.size() - + catch_phi_spill_slots_; - } - static constexpr const char* kRegisterAllocatorPassName = "register"; - private: - // Main methods of the allocator. - void LinearScan(); - bool TryAllocateFreeReg(LiveInterval* interval); - bool AllocateBlockedReg(LiveInterval* interval); - void Resolve(); - - // Add `interval` in the given sorted list. - static void AddSorted(ArenaVector<LiveInterval*>* array, LiveInterval* interval); + protected: + RegisterAllocator(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& analysis); // Split `interval` at the position `position`. The new interval starts at `position`. - LiveInterval* Split(LiveInterval* interval, size_t position); + // If `position` is at the start of `interval`, returns `interval` with its + // register location(s) cleared. + static LiveInterval* Split(LiveInterval* interval, size_t position); // Split `interval` at a position between `from` and `to`. The method will try // to find an optimal split position. LiveInterval* SplitBetween(LiveInterval* interval, size_t from, size_t to); - // Returns whether `reg` is blocked by the code generator. - bool IsBlocked(int reg) const; - - // Update the interval for the register in `location` to cover [start, end). - void BlockRegister(Location location, size_t start, size_t end); - void BlockRegisters(size_t start, size_t end, bool caller_save_only = false); - - // Allocate a spill slot for the given interval. 
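The new header reshapes RegisterAllocator into an abstract base class with a Create() factory keyed by a Strategy enum, so linear scan and graph coloring share one entry point. A toy standalone sketch of that base-class-plus-factory shape (the class names below are illustrative only, not the ART declarations):

#include <cstdio>
#include <memory>

// Two allocation strategies behind one abstract interface, selected by an enum.
class Allocator {
 public:
  enum class Strategy { kLinearScan, kGraphColor };
  virtual ~Allocator() = default;
  virtual void AllocateRegisters() = 0;

  static std::unique_ptr<Allocator> Create(Strategy strategy);
};

class LinearScanAllocator : public Allocator {
 public:
  void AllocateRegisters() override { std::puts("linear scan"); }
};

class GraphColorAllocator : public Allocator {
 public:
  void AllocateRegisters() override { std::puts("graph color"); }
};

std::unique_ptr<Allocator> Allocator::Create(Strategy strategy) {
  switch (strategy) {
    case Strategy::kLinearScan: return std::make_unique<LinearScanAllocator>();
    case Strategy::kGraphColor: return std::make_unique<GraphColorAllocator>();
  }
  return nullptr;
}

int main() {
  Allocator::Create(Allocator::Strategy::kGraphColor)->AllocateRegisters();
  return 0;
}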
Should be called in linear - // order of interval starting positions. - void AllocateSpillSlotFor(LiveInterval* interval); - - // Allocate a spill slot for the given catch phi. Will allocate the same slot - // for phis which share the same vreg. Must be called in reverse linear order - // of lifetime positions and ascending vreg numbers for correctness. - void AllocateSpillSlotForCatchPhi(HPhi* phi); - - // Connect adjacent siblings within blocks. - void ConnectSiblings(LiveInterval* interval); - - // Connect siblings between block entries and exits. - void ConnectSplitSiblings(LiveInterval* interval, HBasicBlock* from, HBasicBlock* to) const; - - // Helper methods to insert parallel moves in the graph. - void InsertParallelMoveAtExitOf(HBasicBlock* block, - HInstruction* instruction, - Location source, - Location destination) const; - void InsertParallelMoveAtEntryOf(HBasicBlock* block, - HInstruction* instruction, - Location source, - Location destination) const; - void InsertMoveAfter(HInstruction* instruction, Location source, Location destination) const; - void AddInputMoveFor(HInstruction* input, - HInstruction* user, - Location source, - Location destination) const; - void InsertParallelMoveAt(size_t position, - HInstruction* instruction, - Location source, - Location destination) const; - - void AddMove(HParallelMove* move, - Location source, - Location destination, - HInstruction* instruction, - Primitive::Type type) const; - - // Helper methods. - void AllocateRegistersInternal(); - void ProcessInstruction(HInstruction* instruction); - bool ValidateInternal(bool log_fatal_on_failure) const; - void DumpInterval(std::ostream& stream, LiveInterval* interval) const; - void DumpAllIntervals(std::ostream& stream) const; - int FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const; - int FindAvailableRegister(size_t* next_use, LiveInterval* current) const; - bool IsCallerSaveRegister(int reg) const; - - // Try splitting an active non-pair or unaligned pair interval at the given `position`. - // Returns whether it was successful at finding such an interval. - bool TrySplitNonPairOrUnalignedPairIntervalAt(size_t position, - size_t first_register_use, - size_t* next_use); - ArenaAllocator* const allocator_; CodeGenerator* const codegen_; const SsaLivenessAnalysis& liveness_; - - // List of intervals for core registers that must be processed, ordered by start - // position. Last entry is the interval that has the lowest start position. - // This list is initially populated before doing the linear scan. - ArenaVector<LiveInterval*> unhandled_core_intervals_; - - // List of intervals for floating-point registers. Same comments as above. - ArenaVector<LiveInterval*> unhandled_fp_intervals_; - - // Currently processed list of unhandled intervals. Either `unhandled_core_intervals_` - // or `unhandled_fp_intervals_`. - ArenaVector<LiveInterval*>* unhandled_; - - // List of intervals that have been processed. - ArenaVector<LiveInterval*> handled_; - - // List of intervals that are currently active when processing a new live interval. - // That is, they have a live range that spans the start of the new interval. - ArenaVector<LiveInterval*> active_; - - // List of intervals that are currently inactive when processing a new live interval. - // That is, they have a lifetime hole that spans the start of the new interval. - ArenaVector<LiveInterval*> inactive_; - - // Fixed intervals for physical registers. 
Such intervals cover the positions - // where an instruction requires a specific register. - ArenaVector<LiveInterval*> physical_core_register_intervals_; - ArenaVector<LiveInterval*> physical_fp_register_intervals_; - - // Intervals for temporaries. Such intervals cover the positions - // where an instruction requires a temporary. - ArenaVector<LiveInterval*> temp_intervals_; - - // The spill slots allocated for live intervals. We ensure spill slots - // are typed to avoid (1) doing moves and swaps between two different kinds - // of registers, and (2) swapping between a single stack slot and a double - // stack slot. This simplifies the parallel move resolver. - ArenaVector<size_t> int_spill_slots_; - ArenaVector<size_t> long_spill_slots_; - ArenaVector<size_t> float_spill_slots_; - ArenaVector<size_t> double_spill_slots_; - - // Spill slots allocated to catch phis. This category is special-cased because - // (1) slots are allocated prior to linear scan and in reverse linear order, - // (2) equivalent phis need to share slots despite having different types. - size_t catch_phi_spill_slots_; - - // Instructions that need a safepoint. - ArenaVector<HInstruction*> safepoints_; - - // True if processing core registers. False if processing floating - // point registers. - bool processing_core_registers_; - - // Number of registers for the current register kind (core or floating point). - size_t number_of_registers_; - - // Temporary array, allocated ahead of time for simplicity. - size_t* registers_array_; - - // Blocked registers, as decided by the code generator. - bool* const blocked_core_registers_; - bool* const blocked_fp_registers_; - - // Slots reserved for out arguments. - size_t reserved_out_slots_; - - // The maximum live core registers at safepoints. - size_t maximum_number_of_live_core_registers_; - - // The maximum live FP registers at safepoints. - size_t maximum_number_of_live_fp_registers_; - - ART_FRIEND_TEST(RegisterAllocatorTest, FreeUntil); - ART_FRIEND_TEST(RegisterAllocatorTest, SpillInactive); - - DISALLOW_COPY_AND_ASSIGN(RegisterAllocator); }; } // namespace art diff --git a/compiler/optimizing/register_allocator_graph_color.cc b/compiler/optimizing/register_allocator_graph_color.cc new file mode 100644 index 0000000000..a21595fe03 --- /dev/null +++ b/compiler/optimizing/register_allocator_graph_color.cc @@ -0,0 +1,2105 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "register_allocator_graph_color.h" + +#include "code_generator.h" +#include "register_allocation_resolver.h" +#include "ssa_liveness_analysis.h" +#include "thread-inl.h" + +namespace art { + +// Highest number of registers that we support for any platform. This can be used for std::bitset, +// for example, which needs to know its size at compile time. +static constexpr size_t kMaxNumRegs = 32; + +// The maximum number of graph coloring attempts before triggering a DCHECK. 
+// This is meant to catch changes to the graph coloring algorithm that undermine its forward +// progress guarantees. Forward progress for the algorithm means splitting live intervals on +// every graph coloring attempt so that eventually the interference graph will be sparse enough +// to color. The main threat to forward progress is trying to split short intervals which cannot be +// split further; this could cause infinite looping because the interference graph would never +// change. This is avoided by prioritizing short intervals before long ones, so that long +// intervals are split when coloring fails. +static constexpr size_t kMaxGraphColoringAttemptsDebug = 100; + +// We always want to avoid spilling inside loops. +static constexpr size_t kLoopSpillWeightMultiplier = 10; + +// If we avoid moves in single jump blocks, we can avoid jumps to jumps. +static constexpr size_t kSingleJumpBlockWeightMultiplier = 2; + +// We avoid moves in blocks that dominate the exit block, since these blocks will +// be executed on every path through the method. +static constexpr size_t kDominatesExitBlockWeightMultiplier = 2; + +enum class CoalesceKind { + kAdjacentSibling, // Prevents moves at interval split points. + kFixedOutputSibling, // Prevents moves from a fixed output location. + kFixedInput, // Prevents moves into a fixed input location. + kNonlinearControlFlow, // Prevents moves between blocks. + kPhi, // Prevents phi resolution moves. + kFirstInput, // Prevents a single input move. + kAnyInput, // May lead to better instruction selection / smaller encodings. +}; + +std::ostream& operator<<(std::ostream& os, const CoalesceKind& kind) { + return os << static_cast<typename std::underlying_type<CoalesceKind>::type>(kind); +} + +static size_t LoopDepthAt(HBasicBlock* block) { + HLoopInformation* loop_info = block->GetLoopInformation(); + size_t depth = 0; + while (loop_info != nullptr) { + ++depth; + loop_info = loop_info->GetPreHeader()->GetLoopInformation(); + } + return depth; +} + +// Return the runtime cost of inserting a move instruction at the specified location. +static size_t CostForMoveAt(size_t position, const SsaLivenessAnalysis& liveness) { + HBasicBlock* block = liveness.GetBlockFromPosition(position / 2); + DCHECK(block != nullptr); + size_t cost = 1; + if (block->IsSingleJump()) { + cost *= kSingleJumpBlockWeightMultiplier; + } + if (block->Dominates(block->GetGraph()->GetExitBlock())) { + cost *= kDominatesExitBlockWeightMultiplier; + } + for (size_t loop_depth = LoopDepthAt(block); loop_depth > 0; --loop_depth) { + cost *= kLoopSpillWeightMultiplier; + } + return cost; +} + +// In general, we estimate coalesce priority by whether it will definitely avoid a move, +// and by how likely it is to create an interference graph that's harder to color. +static size_t ComputeCoalescePriority(CoalesceKind kind, + size_t position, + const SsaLivenessAnalysis& liveness) { + if (kind == CoalesceKind::kAnyInput) { + // This type of coalescing can affect instruction selection, but not moves, so we + // give it the lowest priority. + return 0; + } else { + return CostForMoveAt(position, liveness); + } +} + +enum class CoalesceStage { + kWorklist, // Currently in the iterative coalescing worklist. + kActive, // Not in a worklist, but could be considered again during iterative coalescing. + kInactive, // No longer considered until last-chance coalescing. + kDefunct, // Either the two nodes interfere, or have already been coalesced. 
+}; + +std::ostream& operator<<(std::ostream& os, const CoalesceStage& stage) { + return os << static_cast<typename std::underlying_type<CoalesceStage>::type>(stage); +} + +// Represents a coalesce opportunity between two nodes. +struct CoalesceOpportunity : public ArenaObject<kArenaAllocRegisterAllocator> { + CoalesceOpportunity(InterferenceNode* a, + InterferenceNode* b, + CoalesceKind kind, + size_t position, + const SsaLivenessAnalysis& liveness) + : node_a(a), + node_b(b), + stage(CoalesceStage::kWorklist), + priority(ComputeCoalescePriority(kind, position, liveness)) {} + + // Compare two coalesce opportunities based on their priority. + // Return true if lhs has a lower priority than that of rhs. + static bool CmpPriority(const CoalesceOpportunity* lhs, + const CoalesceOpportunity* rhs) { + return lhs->priority < rhs->priority; + } + + InterferenceNode* const node_a; + InterferenceNode* const node_b; + + // The current stage of this coalesce opportunity, indicating whether it is in a worklist, + // and whether it should still be considered. + CoalesceStage stage; + + // The priority of this coalesce opportunity, based on heuristics. + const size_t priority; +}; + +enum class NodeStage { + kInitial, // Uninitialized. + kPrecolored, // Marks fixed nodes. + kSafepoint, // Marks safepoint nodes. + kPrunable, // Marks uncolored nodes in the interference graph. + kSimplifyWorklist, // Marks non-move-related nodes with degree less than the number of registers. + kFreezeWorklist, // Marks move-related nodes with degree less than the number of registers. + kSpillWorklist, // Marks nodes with degree greater or equal to the number of registers. + kPruned // Marks nodes already pruned from the interference graph. +}; + +std::ostream& operator<<(std::ostream& os, const NodeStage& stage) { + return os << static_cast<typename std::underlying_type<NodeStage>::type>(stage); +} + +// Returns the estimated cost of spilling a particular live interval. +static float ComputeSpillWeight(LiveInterval* interval, const SsaLivenessAnalysis& liveness) { + if (interval->HasRegister()) { + // Intervals with a fixed register cannot be spilled. + return std::numeric_limits<float>::min(); + } + + size_t length = interval->GetLength(); + if (length == 1) { + // Tiny intervals should have maximum priority, since they cannot be split any further. + return std::numeric_limits<float>::max(); + } + + size_t use_weight = 0; + if (interval->GetDefinedBy() != nullptr && interval->DefinitionRequiresRegister()) { + // Cost for spilling at a register definition point. + use_weight += CostForMoveAt(interval->GetStart() + 1, liveness); + } + + UsePosition* use = interval->GetFirstUse(); + while (use != nullptr && use->GetPosition() <= interval->GetStart()) { + // Skip uses before the start of this live interval. + use = use->GetNext(); + } + + while (use != nullptr && use->GetPosition() <= interval->GetEnd()) { + if (use->GetUser() != nullptr && use->RequiresRegister()) { + // Cost for spilling at a register use point. + use_weight += CostForMoveAt(use->GetUser()->GetLifetimePosition() - 1, liveness); + } + use = use->GetNext(); + } + + // We divide by the length of the interval because we want to prioritize + // short intervals; we do not benefit much if we split them further. + return static_cast<float>(use_weight) / static_cast<float>(length); +} + +// Interference nodes make up the interference graph, which is the primary data structure in +// graph coloring register allocation. 
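ComputeSpillWeight above is essentially a use-density metric: sum the loop-depth-scaled cost of every register use in the interval, then divide by the interval's length, so that short, use-dense intervals are prioritized for registers while long sparse ones become the preferred spill candidates. A standalone numerical sketch of that formula, with simplified costs rather than the exact ART heuristics:

#include <cstddef>
#include <cstdio>
#include <limits>
#include <vector>

struct UseSite {
  size_t loop_depth;          // Nesting depth of the block containing the use.
  bool requires_register;
};

constexpr size_t kLoopSpillWeightMultiplier = 10;

// Cost of a move at a given loop depth: multiplied by 10 per loop level.
size_t MoveCost(size_t loop_depth) {
  size_t cost = 1;
  for (size_t d = 0; d < loop_depth; ++d) cost *= kLoopSpillWeightMultiplier;
  return cost;
}

// Spill weight ~ (sum of costs of register uses) / (interval length).
float SpillWeight(const std::vector<UseSite>& uses, size_t length, bool has_fixed_register) {
  if (has_fixed_register) return std::numeric_limits<float>::min();  // Never spilled.
  if (length == 1) return std::numeric_limits<float>::max();         // Cannot be split further.
  size_t use_weight = 0;
  for (const UseSite& use : uses) {
    if (use.requires_register) use_weight += MoveCost(use.loop_depth);
  }
  return static_cast<float>(use_weight) / static_cast<float>(length);
}

int main() {
  // Two register uses inside a doubly nested loop, over a short interval...
  std::printf("dense:  %.2f\n", SpillWeight({{2, true}, {2, true}}, /*length=*/8, false));
  // ...versus one shallow use spread over a long interval.
  std::printf("sparse: %.2f\n", SpillWeight({{0, true}}, /*length=*/200, false));
  return 0;
}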
Each node represents a single live interval, and contains +// a set of adjacent nodes corresponding to intervals overlapping with its own. To save memory, +// pre-colored nodes never contain outgoing edges (only incoming ones). +// +// As nodes are pruned from the interference graph, incoming edges of the pruned node are removed, +// but outgoing edges remain in order to later color the node based on the colors of its neighbors. +// +// Note that a pair interval is represented by a single node in the interference graph, which +// essentially requires two colors. One consequence of this is that the degree of a node is not +// necessarily equal to the number of adjacent nodes--instead, the degree reflects the maximum +// number of colors with which a node could interfere. We model this by giving edges different +// weights (1 or 2) to control how much it increases the degree of adjacent nodes. +// For example, the edge between two single nodes will have weight 1. On the other hand, +// the edge between a single node and a pair node will have weight 2. This is because the pair +// node could block up to two colors for the single node, and because the single node could +// block an entire two-register aligned slot for the pair node. +// The degree is defined this way because we use it to decide whether a node is guaranteed a color, +// and thus whether it is safe to prune it from the interference graph early on. +class InterferenceNode : public ArenaObject<kArenaAllocRegisterAllocator> { + public: + InterferenceNode(ArenaAllocator* allocator, + LiveInterval* interval, + const SsaLivenessAnalysis& liveness) + : stage(NodeStage::kInitial), + interval_(interval), + adjacent_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)), + coalesce_opportunities_(allocator->Adapter(kArenaAllocRegisterAllocator)), + out_degree_(interval->HasRegister() ? 
std::numeric_limits<size_t>::max() : 0), + alias_(this), + spill_weight_(ComputeSpillWeight(interval, liveness)), + requires_color_(interval->RequiresRegister()), + needs_spill_slot_(false) { + DCHECK(!interval->IsHighInterval()) << "Pair nodes should be represented by the low interval"; + } + + void AddInterference(InterferenceNode* other, bool guaranteed_not_interfering_yet) { + DCHECK(!IsPrecolored()) << "To save memory, fixed nodes should not have outgoing interferences"; + DCHECK_NE(this, other) << "Should not create self loops in the interference graph"; + DCHECK_EQ(this, alias_) << "Should not add interferences to a node that aliases another"; + DCHECK_NE(stage, NodeStage::kPruned); + DCHECK_NE(other->stage, NodeStage::kPruned); + if (guaranteed_not_interfering_yet) { + DCHECK(std::find(adjacent_nodes_.begin(), adjacent_nodes_.end(), other) + == adjacent_nodes_.end()); + adjacent_nodes_.push_back(other); + out_degree_ += EdgeWeightWith(other); + } else { + auto it = std::find(adjacent_nodes_.begin(), adjacent_nodes_.end(), other); + if (it == adjacent_nodes_.end()) { + adjacent_nodes_.push_back(other); + out_degree_ += EdgeWeightWith(other); + } + } + } + + void RemoveInterference(InterferenceNode* other) { + DCHECK_EQ(this, alias_) << "Should not remove interferences from a coalesced node"; + DCHECK_EQ(other->stage, NodeStage::kPruned) << "Should only remove interferences when pruning"; + auto it = std::find(adjacent_nodes_.begin(), adjacent_nodes_.end(), other); + if (it != adjacent_nodes_.end()) { + adjacent_nodes_.erase(it); + out_degree_ -= EdgeWeightWith(other); + } + } + + bool ContainsInterference(InterferenceNode* other) const { + DCHECK(!IsPrecolored()) << "Should not query fixed nodes for interferences"; + DCHECK_EQ(this, alias_) << "Should not query a coalesced node for interferences"; + auto it = std::find(adjacent_nodes_.begin(), adjacent_nodes_.end(), other); + return it != adjacent_nodes_.end(); + } + + LiveInterval* GetInterval() const { + return interval_; + } + + const ArenaVector<InterferenceNode*>& GetAdjacentNodes() const { + return adjacent_nodes_; + } + + size_t GetOutDegree() const { + // Pre-colored nodes have infinite degree. + DCHECK(!IsPrecolored() || out_degree_ == std::numeric_limits<size_t>::max()); + return out_degree_; + } + + void AddCoalesceOpportunity(CoalesceOpportunity* opportunity) { + coalesce_opportunities_.push_back(opportunity); + } + + void ClearCoalesceOpportunities() { + coalesce_opportunities_.clear(); + } + + bool IsMoveRelated() const { + for (CoalesceOpportunity* opportunity : coalesce_opportunities_) { + if (opportunity->stage == CoalesceStage::kWorklist || + opportunity->stage == CoalesceStage::kActive) { + return true; + } + } + return false; + } + + // Return whether this node already has a color. + // Used to find fixed nodes in the interference graph before coloring. + bool IsPrecolored() const { + return interval_->HasRegister(); + } + + bool IsPair() const { + return interval_->HasHighInterval(); + } + + void SetAlias(InterferenceNode* rep) { + DCHECK_NE(rep->stage, NodeStage::kPruned); + DCHECK_EQ(this, alias_) << "Should only set a node's alias once"; + alias_ = rep; + } + + InterferenceNode* GetAlias() { + if (alias_ != this) { + // Recurse in order to flatten tree of alias pointers. 
+ alias_ = alias_->GetAlias(); + } + return alias_; + } + + const ArenaVector<CoalesceOpportunity*>& GetCoalesceOpportunities() const { + return coalesce_opportunities_; + } + + float GetSpillWeight() const { + return spill_weight_; + } + + bool RequiresColor() const { + return requires_color_; + } + + // We give extra weight to edges adjacent to pair nodes. See the general comment on the + // interference graph above. + size_t EdgeWeightWith(const InterferenceNode* other) const { + return (IsPair() || other->IsPair()) ? 2 : 1; + } + + bool NeedsSpillSlot() const { + return needs_spill_slot_; + } + + void SetNeedsSpillSlot() { + needs_spill_slot_ = true; + } + + // The current stage of this node, indicating which worklist it belongs to. + NodeStage stage; + + private: + // The live interval that this node represents. + LiveInterval* const interval_; + + // All nodes interfering with this one. + // We use an unsorted vector as a set, since a tree or hash set is too heavy for the + // set sizes that we encounter. Using a vector leads to much better performance. + ArenaVector<InterferenceNode*> adjacent_nodes_; + + // Interference nodes that this node should be coalesced with to reduce moves. + ArenaVector<CoalesceOpportunity*> coalesce_opportunities_; + + // The maximum number of colors with which this node could interfere. This could be more than + // the number of adjacent nodes if this is a pair node, or if some adjacent nodes are pair nodes. + // We use "out" degree because incoming edges come from nodes already pruned from the graph, + // and do not affect the coloring of this node. + // Pre-colored nodes are treated as having infinite degree. + size_t out_degree_; + + // The node representing this node in the interference graph. + // Initially set to `this`, and only changed if this node is coalesced into another. + InterferenceNode* alias_; + + // The cost of splitting and spilling this interval to the stack. + // Nodes with a higher spill weight should be prioritized when assigning registers. + // This is essentially based on use density and location; short intervals with many uses inside + // deeply nested loops have a high spill weight. + const float spill_weight_; + + const bool requires_color_; + + bool needs_spill_slot_; + + DISALLOW_COPY_AND_ASSIGN(InterferenceNode); +}; + +// The order in which we color nodes is important. To guarantee forward progress, +// we prioritize intervals that require registers, and after that we prioritize +// short intervals. That way, if we fail to color a node, it either won't require a +// register, or it will be a long interval that can be split in order to make the +// interference graph sparser. +// To improve code quality, we prioritize intervals used frequently in deeply nested loops. +// (This metric is secondary to the forward progress requirements above.) +// TODO: May also want to consider: +// - Constants (since they can be rematerialized) +// - Allocated spill slots +static bool HasGreaterNodePriority(const InterferenceNode* lhs, + const InterferenceNode* rhs) { + // (1) Prioritize the node that requires a color. + if (lhs->RequiresColor() != rhs->RequiresColor()) { + return lhs->RequiresColor(); + } + + // (2) Prioritize the interval that has a higher spill weight. + return lhs->GetSpillWeight() > rhs->GetSpillWeight(); +} + +// A ColoringIteration holds the many data structures needed for a single graph coloring attempt, +// and provides methods for each phase of the attempt. 
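HasGreaterNodePriority above encodes a two-level ordering: nodes that must have a register outrank those that merely prefer one, and ties are broken by spill weight so that hot intervals are colored first. A small standalone sketch of that comparator applied to toy candidates (not the ART types):

#include <algorithm>
#include <cstdio>
#include <vector>

struct Candidate {
  const char* name;
  bool requires_color;
  float spill_weight;
};

// (1) Nodes that require a color first; (2) then higher spill weight first.
bool HasGreaterPriority(const Candidate& lhs, const Candidate& rhs) {
  if (lhs.requires_color != rhs.requires_color) {
    return lhs.requires_color;
  }
  return lhs.spill_weight > rhs.spill_weight;
}

int main() {
  std::vector<Candidate> nodes = {
      {"cold, optional", false, 0.5f},
      {"hot, optional", false, 9.0f},
      {"cold, required", true, 0.1f},
  };
  std::sort(nodes.begin(), nodes.end(), HasGreaterPriority);
  for (const Candidate& node : nodes) {
    std::printf("%s\n", node.name);  // "cold, required" sorts ahead of both optional nodes.
  }
  return 0;
}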
+class ColoringIteration {
+ public:
+  ColoringIteration(RegisterAllocatorGraphColor* register_allocator,
+                    ArenaAllocator* allocator,
+                    bool processing_core_regs,
+                    size_t num_regs)
+        : register_allocator_(register_allocator),
+          allocator_(allocator),
+          processing_core_regs_(processing_core_regs),
+          num_regs_(num_regs),
+          interval_node_map_(allocator->Adapter(kArenaAllocRegisterAllocator)),
+          prunable_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)),
+          pruned_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)),
+          simplify_worklist_(allocator->Adapter(kArenaAllocRegisterAllocator)),
+          freeze_worklist_(allocator->Adapter(kArenaAllocRegisterAllocator)),
+          spill_worklist_(HasGreaterNodePriority, allocator->Adapter(kArenaAllocRegisterAllocator)),
+          coalesce_worklist_(CoalesceOpportunity::CmpPriority,
+                             allocator->Adapter(kArenaAllocRegisterAllocator)) {}
+
+  // Use the intervals collected from instructions to construct an
+  // interference graph mapping intervals to adjacency lists.
+  // Also, collect synthesized safepoint nodes, used to keep
+  // track of live intervals across safepoints.
+  // TODO: Should build safepoints elsewhere.
+  void BuildInterferenceGraph(const ArenaVector<LiveInterval*>& intervals,
+                              const ArenaVector<InterferenceNode*>& physical_nodes,
+                              ArenaVector<InterferenceNode*>* safepoints);
+
+  // Add coalesce opportunities to interference nodes.
+  void FindCoalesceOpportunities();
+
+  // Prune nodes from the interference graph to be colored later. Build
+  // a stack (pruned_nodes) containing these intervals in an order determined
+  // by various heuristics.
+  void PruneInterferenceGraph();
+
+  // Process pruned_nodes_ to color the interference graph, spilling when
+  // necessary. Returns true if successful. Else, some intervals have been
+  // split, and the interference graph should be rebuilt for another attempt.
+  bool ColorInterferenceGraph();
+
+  // Return prunable nodes.
+  // The register allocator will need to access prunable nodes after coloring
+  // in order to tell the code generator which registers have been assigned.
+  const ArenaVector<InterferenceNode*>& GetPrunableNodes() const {
+    return prunable_nodes_;
+  }
+
+ private:
+  // Create a coalesce opportunity between two nodes.
+  void CreateCoalesceOpportunity(InterferenceNode* a,
+                                 InterferenceNode* b,
+                                 CoalesceKind kind,
+                                 size_t position);
+
+  // Add an edge in the interference graph, if valid.
+  // Note that `guaranteed_not_interfering_yet` is used to optimize adjacency set insertion
+  // when possible.
+  void AddPotentialInterference(InterferenceNode* from,
+                                InterferenceNode* to,
+                                bool guaranteed_not_interfering_yet,
+                                bool both_directions = true);
+
+  // Invalidate all coalesce opportunities this node has, so that it (and possibly its neighbors)
+  // may be pruned from the interference graph.
+  void FreezeMoves(InterferenceNode* node);
+
+  // Prune a node from the interference graph, updating worklists if necessary.
+  void PruneNode(InterferenceNode* node);
+
+  // Add coalesce opportunities associated with this node to the coalesce worklist.
+  void EnableCoalesceOpportunities(InterferenceNode* node);
+
+  // If needed, move `node` from the freeze worklist to the simplify worklist.
+  void CheckTransitionFromFreezeWorklist(InterferenceNode* node);
+
+  // Return true if `into` is colored, and `from` can be coalesced with `into` conservatively.
+  bool PrecoloredHeuristic(InterferenceNode* from, InterferenceNode* into);
+
+  // Return true if `from` and `into` are uncolored, and can be coalesced conservatively.
+  bool UncoloredHeuristic(InterferenceNode* from, InterferenceNode* into);
+
+  void Coalesce(CoalesceOpportunity* opportunity);
+
+  // Merge `from` into `into` in the interference graph.
+  void Combine(InterferenceNode* from, InterferenceNode* into);
+
+  // A reference to the register allocator instance,
+  // needed to split intervals and assign spill slots.
+  RegisterAllocatorGraphColor* register_allocator_;
+
+  // An arena allocator used for a single graph coloring attempt.
+  ArenaAllocator* allocator_;
+
+  const bool processing_core_regs_;
+
+  const size_t num_regs_;
+
+  // A map from live intervals to interference nodes.
+  ArenaHashMap<LiveInterval*, InterferenceNode*> interval_node_map_;
+
+  // Uncolored nodes that should be pruned from the interference graph.
+  ArenaVector<InterferenceNode*> prunable_nodes_;
+
+  // A stack of nodes pruned from the interference graph, waiting to be colored.
+  ArenaStdStack<InterferenceNode*> pruned_nodes_;
+
+  // A queue containing low degree, non-move-related nodes that can be pruned immediately.
+  ArenaDeque<InterferenceNode*> simplify_worklist_;
+
+  // A queue containing low degree, move-related nodes.
+  ArenaDeque<InterferenceNode*> freeze_worklist_;
+
+  // A queue containing high degree nodes.
+  // If we have to prune from the spill worklist, we cannot guarantee
+  // the pruned node a color, so we order the worklist by priority.
+  ArenaPriorityQueue<InterferenceNode*, decltype(&HasGreaterNodePriority)> spill_worklist_;
+
+  // A queue containing coalesce opportunities.
+  // We order the coalesce worklist by priority, since some coalesce opportunities (e.g., those
+  // inside of loops) are more important than others.
+ ArenaPriorityQueue<CoalesceOpportunity*, + decltype(&CoalesceOpportunity::CmpPriority)> coalesce_worklist_; + + DISALLOW_COPY_AND_ASSIGN(ColoringIteration); +}; + +static bool IsCoreInterval(LiveInterval* interval) { + return !Primitive::IsFloatingPointType(interval->GetType()); +} + +static size_t ComputeReservedArtMethodSlots(const CodeGenerator& codegen) { + return static_cast<size_t>(InstructionSetPointerSize(codegen.GetInstructionSet())) / kVRegSize; +} + +RegisterAllocatorGraphColor::RegisterAllocatorGraphColor(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& liveness, + bool iterative_move_coalescing) + : RegisterAllocator(allocator, codegen, liveness), + iterative_move_coalescing_(iterative_move_coalescing), + core_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + fp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + temp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + safepoints_(allocator->Adapter(kArenaAllocRegisterAllocator)), + physical_core_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)), + physical_fp_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)), + num_int_spill_slots_(0), + num_double_spill_slots_(0), + num_float_spill_slots_(0), + num_long_spill_slots_(0), + catch_phi_spill_slot_counter_(0), + reserved_art_method_slots_(ComputeReservedArtMethodSlots(*codegen)), + reserved_out_slots_(codegen->GetGraph()->GetMaximumNumberOfOutVRegs()), + number_of_globally_blocked_core_regs_(0), + number_of_globally_blocked_fp_regs_(0), + max_safepoint_live_core_regs_(0), + max_safepoint_live_fp_regs_(0) { + // Before we ask for blocked registers, set them up in the code generator. + codegen->SetupBlockedRegisters(); + + // Initialize physical core register live intervals and blocked registers. + // This includes globally blocked registers, such as the stack pointer. + physical_core_nodes_.resize(codegen_->GetNumberOfCoreRegisters(), nullptr); + for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) { + LiveInterval* interval = LiveInterval::MakeFixedInterval(allocator_, i, Primitive::kPrimInt); + physical_core_nodes_[i] = + new (allocator_) InterferenceNode(allocator_, interval, liveness); + physical_core_nodes_[i]->stage = NodeStage::kPrecolored; + core_intervals_.push_back(interval); + if (codegen_->IsBlockedCoreRegister(i)) { + ++number_of_globally_blocked_core_regs_; + interval->AddRange(0, liveness.GetMaxLifetimePosition()); + } + } + // Initialize physical floating point register live intervals and blocked registers. + physical_fp_nodes_.resize(codegen_->GetNumberOfFloatingPointRegisters(), nullptr); + for (size_t i = 0; i < codegen_->GetNumberOfFloatingPointRegisters(); ++i) { + LiveInterval* interval = LiveInterval::MakeFixedInterval(allocator_, i, Primitive::kPrimFloat); + physical_fp_nodes_[i] = + new (allocator_) InterferenceNode(allocator_, interval, liveness); + physical_fp_nodes_[i]->stage = NodeStage::kPrecolored; + fp_intervals_.push_back(interval); + if (codegen_->IsBlockedFloatingPointRegister(i)) { + ++number_of_globally_blocked_fp_regs_; + interval->AddRange(0, liveness.GetMaxLifetimePosition()); + } + } +} + +void RegisterAllocatorGraphColor::AllocateRegisters() { + // (1) Collect and prepare live intervals. + ProcessInstructions(); + + for (bool processing_core_regs : {true, false}) { + ArenaVector<LiveInterval*>& intervals = processing_core_regs + ? core_intervals_ + : fp_intervals_; + size_t num_registers = processing_core_regs + ? 
codegen_->GetNumberOfCoreRegisters() + : codegen_->GetNumberOfFloatingPointRegisters(); + + size_t attempt = 0; + while (true) { + ++attempt; + DCHECK(attempt <= kMaxGraphColoringAttemptsDebug) + << "Exceeded debug max graph coloring register allocation attempts. " + << "This could indicate that the register allocator is not making forward progress, " + << "which could be caused by prioritizing the wrong live intervals. (Short intervals " + << "should be prioritized over long ones, because they cannot be split further.)"; + + // Many data structures are cleared between graph coloring attempts, so we reduce + // total memory usage by using a new arena allocator for each attempt. + ArenaAllocator coloring_attempt_allocator(allocator_->GetArenaPool()); + ColoringIteration iteration(this, + &coloring_attempt_allocator, + processing_core_regs, + num_registers); + + // (2) Build the interference graph. Also gather safepoints. + ArenaVector<InterferenceNode*> safepoints( + coloring_attempt_allocator.Adapter(kArenaAllocRegisterAllocator)); + ArenaVector<InterferenceNode*>& physical_nodes = processing_core_regs + ? physical_core_nodes_ + : physical_fp_nodes_; + iteration.BuildInterferenceGraph(intervals, physical_nodes, &safepoints); + + // (3) Add coalesce opportunities. + // If we have tried coloring the graph a suspiciously high number of times, give + // up on move coalescing, just in case the coalescing heuristics are not conservative. + // (This situation will be caught if DCHECKs are turned on.) + if (iterative_move_coalescing_ && attempt <= kMaxGraphColoringAttemptsDebug) { + iteration.FindCoalesceOpportunities(); + } + + // (4) Prune all uncolored nodes from interference graph. + iteration.PruneInterferenceGraph(); + + // (5) Color pruned nodes based on interferences. + bool successful = iteration.ColorInterferenceGraph(); + + // We manually clear coalesce opportunities for physical nodes, + // since they persist across coloring attempts. + for (InterferenceNode* node : physical_core_nodes_) { + node->ClearCoalesceOpportunities(); + } + for (InterferenceNode* node : physical_fp_nodes_) { + node->ClearCoalesceOpportunities(); + } + + if (successful) { + // Assign spill slots. + AllocateSpillSlots(iteration.GetPrunableNodes()); + + // Compute the maximum number of live registers across safepoints. + // Notice that we do not count globally blocked registers, such as the stack pointer. + if (safepoints.size() > 0) { + size_t max_safepoint_live_regs = ComputeMaxSafepointLiveRegisters(safepoints); + if (processing_core_regs) { + max_safepoint_live_core_regs_ = + max_safepoint_live_regs - number_of_globally_blocked_core_regs_; + } else { + max_safepoint_live_fp_regs_= + max_safepoint_live_regs - number_of_globally_blocked_fp_regs_; + } + } + + // Tell the code generator which registers were allocated. + // We only look at prunable_nodes because we already told the code generator about + // fixed intervals while processing instructions. We also ignore the fixed intervals + // placed at the top of catch blocks. + for (InterferenceNode* node : iteration.GetPrunableNodes()) { + LiveInterval* interval = node->GetInterval(); + if (interval->HasRegister()) { + Location low_reg = processing_core_regs + ? 
Location::RegisterLocation(interval->GetRegister()) + : Location::FpuRegisterLocation(interval->GetRegister()); + codegen_->AddAllocatedRegister(low_reg); + if (interval->HasHighInterval()) { + LiveInterval* high = interval->GetHighInterval(); + DCHECK(high->HasRegister()); + Location high_reg = processing_core_regs + ? Location::RegisterLocation(high->GetRegister()) + : Location::FpuRegisterLocation(high->GetRegister()); + codegen_->AddAllocatedRegister(high_reg); + } + } else { + DCHECK(!interval->HasHighInterval() || !interval->GetHighInterval()->HasRegister()); + } + } + + break; + } + } // while unsuccessful + } // for processing_core_instructions + + // (6) Resolve locations and deconstruct SSA form. + RegisterAllocationResolver(allocator_, codegen_, liveness_) + .Resolve(max_safepoint_live_core_regs_, + max_safepoint_live_fp_regs_, + reserved_art_method_slots_ + reserved_out_slots_, + num_int_spill_slots_, + num_long_spill_slots_, + num_float_spill_slots_, + num_double_spill_slots_, + catch_phi_spill_slot_counter_, + temp_intervals_); + + if (kIsDebugBuild) { + Validate(/*log_fatal_on_failure*/ true); + } +} + +bool RegisterAllocatorGraphColor::Validate(bool log_fatal_on_failure) { + for (bool processing_core_regs : {true, false}) { + ArenaVector<LiveInterval*> intervals( + allocator_->Adapter(kArenaAllocRegisterAllocatorValidate)); + for (size_t i = 0; i < liveness_.GetNumberOfSsaValues(); ++i) { + HInstruction* instruction = liveness_.GetInstructionFromSsaIndex(i); + LiveInterval* interval = instruction->GetLiveInterval(); + if (interval != nullptr && IsCoreInterval(interval) == processing_core_regs) { + intervals.push_back(instruction->GetLiveInterval()); + } + } + + ArenaVector<InterferenceNode*>& physical_nodes = processing_core_regs + ? physical_core_nodes_ + : physical_fp_nodes_; + for (InterferenceNode* fixed : physical_nodes) { + LiveInterval* interval = fixed->GetInterval(); + if (interval->GetFirstRange() != nullptr) { + // Ideally we would check fixed ranges as well, but currently there are times when + // two fixed intervals for the same register will overlap. For example, a fixed input + // and a fixed output may sometimes share the same register, in which there will be two + // fixed intervals for the same place. + } + } + + for (LiveInterval* temp : temp_intervals_) { + if (IsCoreInterval(temp) == processing_core_regs) { + intervals.push_back(temp); + } + } + + size_t spill_slots = num_int_spill_slots_ + + num_long_spill_slots_ + + num_float_spill_slots_ + + num_double_spill_slots_ + + catch_phi_spill_slot_counter_; + bool ok = ValidateIntervals(intervals, + spill_slots, + reserved_art_method_slots_ + reserved_out_slots_, + *codegen_, + allocator_, + processing_core_regs, + log_fatal_on_failure); + if (!ok) { + return false; + } + } // for processing_core_regs + + return true; +} + +void RegisterAllocatorGraphColor::ProcessInstructions() { + for (HLinearPostOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { + HBasicBlock* block = it.Current(); + + // Note that we currently depend on this ordering, since some helper + // code is designed for linear scan register allocation. 
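AllocateRegisters above drives graph coloring as a retry loop: build the interference graph, optionally coalesce, prune, and color; on failure, intervals have been split, so the graph is rebuilt, and an attempt counter guards forward progress. A toy standalone skeleton of that loop, where ColorOnce() is a made-up stand-in for one ColoringIteration:

#include <cassert>
#include <cstddef>
#include <cstdio>

constexpr size_t kMaxAttemptsDebug = 100;

// Stand-in for one coloring attempt; assumed to split some live intervals
// whenever it fails, which is what guarantees forward progress.
bool ColorOnce(size_t attempt) {
  // Pretend the interference graph only becomes sparse enough on attempt 3.
  return attempt >= 3;
}

void AllocateWithRetries() {
  size_t attempt = 0;
  while (true) {
    ++attempt;
    assert(attempt <= kMaxAttemptsDebug && "allocator is not making forward progress");
    // In the real allocator a fresh arena is created here so every attempt
    // starts from clean per-iteration data structures.
    if (ColorOnce(attempt)) {
      std::printf("colored after %zu attempt(s)\n", attempt);
      return;
    }
    // Failure path: intervals were split; rebuild the graph and try again.
  }
}

int main() {
  AllocateWithRetries();
  return 0;
}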
+ for (HBackwardInstructionIterator instr_it(block->GetInstructions()); + !instr_it.Done(); + instr_it.Advance()) { + ProcessInstruction(instr_it.Current()); + } + + for (HInstructionIterator phi_it(block->GetPhis()); !phi_it.Done(); phi_it.Advance()) { + ProcessInstruction(phi_it.Current()); + } + + if (block->IsCatchBlock() + || (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible())) { + // By blocking all registers at the top of each catch block or irreducible loop, we force + // intervals belonging to the live-in set of the catch/header block to be spilled. + // TODO(ngeoffray): Phis in this block could be allocated in register. + size_t position = block->GetLifetimeStart(); + BlockRegisters(position, position + 1); + } + } +} + +void RegisterAllocatorGraphColor::ProcessInstruction(HInstruction* instruction) { + LocationSummary* locations = instruction->GetLocations(); + if (locations == nullptr) { + return; + } + if (locations->NeedsSafepoint() && codegen_->IsLeafMethod()) { + // We do this here because we do not want the suspend check to artificially + // create live registers. + DCHECK(instruction->IsSuspendCheckEntry()); + DCHECK_EQ(locations->GetTempCount(), 0u); + instruction->GetBlock()->RemoveInstruction(instruction); + return; + } + + CheckForTempLiveIntervals(instruction); + CheckForSafepoint(instruction); + if (instruction->GetLocations()->WillCall()) { + // If a call will happen, create fixed intervals for caller-save registers. + // TODO: Note that it may be beneficial to later split intervals at this point, + // so that we allow last-minute moves from a caller-save register + // to a callee-save register. + BlockRegisters(instruction->GetLifetimePosition(), + instruction->GetLifetimePosition() + 1, + /*caller_save_only*/ true); + } + CheckForFixedInputs(instruction); + + LiveInterval* interval = instruction->GetLiveInterval(); + if (interval == nullptr) { + // Instructions lacking a valid output location do not have a live interval. + DCHECK(!locations->Out().IsValid()); + return; + } + + // Low intervals act as representatives for their corresponding high interval. + DCHECK(!interval->IsHighInterval()); + if (codegen_->NeedsTwoRegisters(interval->GetType())) { + interval->AddHighInterval(); + } + AddSafepointsFor(instruction); + CheckForFixedOutput(instruction); + AllocateSpillSlotForCatchPhi(instruction); + + ArenaVector<LiveInterval*>& intervals = IsCoreInterval(interval) + ? core_intervals_ + : fp_intervals_; + if (interval->HasSpillSlot() || instruction->IsConstant()) { + // Note that if an interval already has a spill slot, then its value currently resides + // in the stack (e.g., parameters). Thus we do not have to allocate a register until its first + // register use. This is also true for constants, which can be materialized at any point. + size_t first_register_use = interval->FirstRegisterUse(); + if (first_register_use != kNoLifetime) { + LiveInterval* split = SplitBetween(interval, interval->GetStart(), first_register_use - 1); + intervals.push_back(split); + } else { + // We won't allocate a register for this value. + } + } else { + intervals.push_back(interval); + } +} + +void RegisterAllocatorGraphColor::CheckForFixedInputs(HInstruction* instruction) { + // We simply block physical registers where necessary. 
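As the comment above notes, fixed inputs are handled by blocking the required physical register for just the one position where it is needed, rather than coalescing with it. A standalone sketch of that blocked-range bookkeeping (a toy table, not the ART BlockRegister implementation):

#include <cstddef>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

// For each physical register, remember the lifetime ranges during which it is
// unavailable because some instruction requires it as a fixed input.
class BlockedRegisterTable {
 public:
  void BlockRegister(int reg, size_t start, size_t end) {
    blocked_[reg].emplace_back(start, end);
  }

  bool IsBlockedAt(int reg, size_t position) const {
    auto it = blocked_.find(reg);
    if (it == blocked_.end()) return false;
    for (const auto& range : it->second) {
      if (range.first <= position && position < range.second) return true;
    }
    return false;
  }

 private:
  std::map<int, std::vector<std::pair<size_t, size_t>>> blocked_;
};

int main() {
  BlockedRegisterTable table;
  // An instruction at lifetime position 14 requires register 2 as a fixed input:
  // block it for exactly [position, position + 1), as in the code above.
  size_t position = 14;
  table.BlockRegister(/*reg=*/2, position, position + 1);
  std::printf("r2 blocked at 14: %d, at 16: %d\n",
              table.IsBlockedAt(2, 14), table.IsBlockedAt(2, 16));
  return 0;
}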
+ // TODO: Ideally we would coalesce the physical register with the register + // allocated to the input value, but this can be tricky if, e.g., there + // could be multiple physical register uses of the same value at the + // same instruction. Furthermore, there's currently no distinction between + // fixed inputs to a call (which will be clobbered) and other fixed inputs (which + // may not be clobbered). + LocationSummary* locations = instruction->GetLocations(); + size_t position = instruction->GetLifetimePosition(); + for (size_t i = 0; i < locations->GetInputCount(); ++i) { + Location input = locations->InAt(i); + if (input.IsRegister() || input.IsFpuRegister()) { + BlockRegister(input, position, position + 1); + codegen_->AddAllocatedRegister(input); + } else if (input.IsPair()) { + BlockRegister(input.ToLow(), position, position + 1); + BlockRegister(input.ToHigh(), position, position + 1); + codegen_->AddAllocatedRegister(input.ToLow()); + codegen_->AddAllocatedRegister(input.ToHigh()); + } + } +} + +void RegisterAllocatorGraphColor::CheckForFixedOutput(HInstruction* instruction) { + // If an instruction has a fixed output location, we give the live interval a register and then + // proactively split it just after the definition point to avoid creating too many interferences + // with a fixed node. + LiveInterval* interval = instruction->GetLiveInterval(); + Location out = interval->GetDefinedBy()->GetLocations()->Out(); + size_t position = instruction->GetLifetimePosition(); + DCHECK_GE(interval->GetEnd() - position, 2u); + + if (out.IsUnallocated() && out.GetPolicy() == Location::kSameAsFirstInput) { + out = instruction->GetLocations()->InAt(0); + } + + if (out.IsRegister() || out.IsFpuRegister()) { + interval->SetRegister(out.reg()); + codegen_->AddAllocatedRegister(out); + Split(interval, position + 1); + } else if (out.IsPair()) { + interval->SetRegister(out.low()); + interval->GetHighInterval()->SetRegister(out.high()); + codegen_->AddAllocatedRegister(out.ToLow()); + codegen_->AddAllocatedRegister(out.ToHigh()); + Split(interval, position + 1); + } else if (out.IsStackSlot() || out.IsDoubleStackSlot()) { + interval->SetSpillSlot(out.GetStackIndex()); + } else { + DCHECK(out.IsUnallocated() || out.IsConstant()); + } +} + +void RegisterAllocatorGraphColor::AddSafepointsFor(HInstruction* instruction) { + LiveInterval* interval = instruction->GetLiveInterval(); + for (size_t safepoint_index = safepoints_.size(); safepoint_index > 0; --safepoint_index) { + HInstruction* safepoint = safepoints_[safepoint_index - 1u]; + size_t safepoint_position = safepoint->GetLifetimePosition(); + + // Test that safepoints_ are ordered in the optimal way. + DCHECK(safepoint_index == safepoints_.size() || + safepoints_[safepoint_index]->GetLifetimePosition() < safepoint_position); + + if (safepoint_position == interval->GetStart()) { + // The safepoint is for this instruction, so the location of the instruction + // does not need to be saved. + DCHECK_EQ(safepoint_index, safepoints_.size()); + DCHECK_EQ(safepoint, instruction); + continue; + } else if (interval->IsDeadAt(safepoint_position)) { + break; + } else if (!interval->Covers(safepoint_position)) { + // Hole in the interval. 
+ continue; + } + interval->AddSafepoint(safepoint); + } +} + +void RegisterAllocatorGraphColor::CheckForTempLiveIntervals(HInstruction* instruction) { + LocationSummary* locations = instruction->GetLocations(); + size_t position = instruction->GetLifetimePosition(); + for (size_t i = 0; i < locations->GetTempCount(); ++i) { + Location temp = locations->GetTemp(i); + if (temp.IsRegister() || temp.IsFpuRegister()) { + BlockRegister(temp, position, position + 1); + codegen_->AddAllocatedRegister(temp); + } else { + DCHECK(temp.IsUnallocated()); + switch (temp.GetPolicy()) { + case Location::kRequiresRegister: { + LiveInterval* interval = + LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimInt); + interval->AddTempUse(instruction, i); + core_intervals_.push_back(interval); + temp_intervals_.push_back(interval); + break; + } + + case Location::kRequiresFpuRegister: { + LiveInterval* interval = + LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimDouble); + interval->AddTempUse(instruction, i); + fp_intervals_.push_back(interval); + temp_intervals_.push_back(interval); + if (codegen_->NeedsTwoRegisters(Primitive::kPrimDouble)) { + interval->AddHighInterval(/*is_temp*/ true); + temp_intervals_.push_back(interval->GetHighInterval()); + } + break; + } + + default: + LOG(FATAL) << "Unexpected policy for temporary location " + << temp.GetPolicy(); + } + } + } +} + +void RegisterAllocatorGraphColor::CheckForSafepoint(HInstruction* instruction) { + LocationSummary* locations = instruction->GetLocations(); + size_t position = instruction->GetLifetimePosition(); + + if (locations->NeedsSafepoint()) { + safepoints_.push_back(instruction); + if (locations->OnlyCallsOnSlowPath()) { + // We add a synthesized range at this position to record the live registers + // at this position. Ideally, we could just update the safepoints when locations + // are updated, but we currently need to know the full stack size before updating + // locations (because of parameters and the fact that we don't have a frame pointer). + // And knowing the full stack size requires to know the maximum number of live + // registers at calls in slow paths. + // By adding the following interval in the algorithm, we can compute this + // maximum before updating locations. + LiveInterval* interval = LiveInterval::MakeSlowPathInterval(allocator_, instruction); + interval->AddRange(position, position + 1); + core_intervals_.push_back(interval); + fp_intervals_.push_back(interval); + } + } +} + +LiveInterval* RegisterAllocatorGraphColor::TrySplit(LiveInterval* interval, size_t position) { + if (interval->GetStart() < position && position < interval->GetEnd()) { + return Split(interval, position); + } else { + return interval; + } +} + +void RegisterAllocatorGraphColor::SplitAtRegisterUses(LiveInterval* interval) { + DCHECK(!interval->IsHighInterval()); + + // Split just after a register definition. + if (interval->IsParent() && interval->DefinitionRequiresRegister()) { + interval = TrySplit(interval, interval->GetStart() + 1); + } + + UsePosition* use = interval->GetFirstUse(); + while (use != nullptr && use->GetPosition() < interval->GetStart()) { + use = use->GetNext(); + } + + // Split around register uses. 
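The loop that follows carves the interval up around its register uses, so that only short slices carry register requirements and everything in between can live on the stack. A simplified standalone sketch of that carving, over a plain list of use positions instead of ART's UsePosition chain (ART splits just before each use and special-cases uses at block boundaries; here each use simply gets its own unit-length piece):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // A piece of a carved-up interval; each piece is [start, end).
    struct Piece {
      size_t start;
      size_t end;
    };

    // Splits [start, end) so that each register use sits in its own short piece.
    std::vector<Piece> SplitAroundRegisterUses(size_t start,
                                               size_t end,
                                               std::vector<size_t> register_uses) {
      std::sort(register_uses.begin(), register_uses.end());
      std::vector<Piece> pieces;
      size_t current = start;
      for (size_t use : register_uses) {
        if (use < current || use >= end) {
          continue;  // Already covered by an earlier piece, or outside the range.
        }
        if (use > current) {
          pieces.push_back({current, use});  // Stretch with no register needs.
        }
        pieces.push_back({use, use + 1});    // Short piece covering the register use.
        current = use + 1;
      }
      if (current < end) {
        pieces.push_back({current, end});
      }
      return pieces;
    }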
+ size_t end = interval->GetEnd(); + while (use != nullptr && use->GetPosition() <= end) { + if (use->RequiresRegister()) { + size_t position = use->GetPosition(); + interval = TrySplit(interval, position - 1); + if (liveness_.GetInstructionFromPosition(position / 2)->IsControlFlow()) { + // If we are at the very end of a basic block, we cannot split right + // at the use. Split just after instead. + interval = TrySplit(interval, position + 1); + } else { + interval = TrySplit(interval, position); + } + } + use = use->GetNext(); + } +} + +void RegisterAllocatorGraphColor::AllocateSpillSlotForCatchPhi(HInstruction* instruction) { + if (instruction->IsPhi() && instruction->AsPhi()->IsCatchPhi()) { + HPhi* phi = instruction->AsPhi(); + LiveInterval* interval = phi->GetLiveInterval(); + + HInstruction* previous_phi = phi->GetPrevious(); + DCHECK(previous_phi == nullptr || + previous_phi->AsPhi()->GetRegNumber() <= phi->GetRegNumber()) + << "Phis expected to be sorted by vreg number, " + << "so that equivalent phis are adjacent."; + + if (phi->IsVRegEquivalentOf(previous_phi)) { + // Assign the same spill slot. + DCHECK(previous_phi->GetLiveInterval()->HasSpillSlot()); + interval->SetSpillSlot(previous_phi->GetLiveInterval()->GetSpillSlot()); + } else { + interval->SetSpillSlot(catch_phi_spill_slot_counter_); + catch_phi_spill_slot_counter_ += interval->NeedsTwoSpillSlots() ? 2 : 1; + } + } +} + +void RegisterAllocatorGraphColor::BlockRegister(Location location, + size_t start, + size_t end) { + DCHECK(location.IsRegister() || location.IsFpuRegister()); + int reg = location.reg(); + LiveInterval* interval = location.IsRegister() + ? physical_core_nodes_[reg]->GetInterval() + : physical_fp_nodes_[reg]->GetInterval(); + DCHECK(interval->GetRegister() == reg); + bool blocked_by_codegen = location.IsRegister() + ? codegen_->IsBlockedCoreRegister(reg) + : codegen_->IsBlockedFloatingPointRegister(reg); + if (blocked_by_codegen) { + // We've already blocked this register for the entire method. (And adding a + // range inside another range violates the preconditions of AddRange). + } else { + interval->AddRange(start, end); + } +} + +void RegisterAllocatorGraphColor::BlockRegisters(size_t start, size_t end, bool caller_save_only) { + for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) { + if (!caller_save_only || !codegen_->IsCoreCalleeSaveRegister(i)) { + BlockRegister(Location::RegisterLocation(i), start, end); + } + } + for (size_t i = 0; i < codegen_->GetNumberOfFloatingPointRegisters(); ++i) { + if (!caller_save_only || !codegen_->IsFloatingPointCalleeSaveRegister(i)) { + BlockRegister(Location::FpuRegisterLocation(i), start, end); + } + } +} + +void ColoringIteration::AddPotentialInterference(InterferenceNode* from, + InterferenceNode* to, + bool guaranteed_not_interfering_yet, + bool both_directions) { + if (from->IsPrecolored()) { + // We save space by ignoring outgoing edges from fixed nodes. + } else if (to->GetInterval()->IsSlowPathSafepoint()) { + // Safepoint intervals are only there to count max live registers, + // so no need to give them incoming interference edges. + // This is also necessary for correctness, because we don't want nodes + // to remove themselves from safepoint adjacency sets when they're pruned. + } else if (to->IsPrecolored()) { + // It is important that only a single node represents a given fixed register in the + // interference graph. We retrieve that node here. 
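BlockRegister and BlockRegisters above express fixed constraints by giving each physical register's interval a one-position range. The same bookkeeping in a self-contained form, with a vector of blocked ranges per register standing in for the fixed intervals, and a caller_save_only flag as used at call sites:

    #include <cstddef>
    #include <utility>
    #include <vector>

    // One list of blocked [start, end) ranges per physical register.
    using BlockedRanges = std::vector<std::vector<std::pair<size_t, size_t>>>;

    // Blocks registers for the single position `call_position`..`call_position + 1`.
    // If `caller_save_only` is true, callee-save registers stay usable across the
    // call, mirroring BlockRegisters(position, position + 1, /*caller_save_only*/ true).
    void BlockAtCall(BlockedRanges* blocked,
                     const std::vector<bool>& is_callee_save,
                     size_t call_position,
                     bool caller_save_only) {
      for (size_t reg = 0; reg < blocked->size(); ++reg) {
        if (caller_save_only && is_callee_save[reg]) {
          continue;  // Callee-save registers survive the call.
        }
        (*blocked)[reg].push_back({call_position, call_position + 1});
      }
    }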
+ const ArenaVector<InterferenceNode*>& physical_nodes = to->GetInterval()->IsFloatingPoint() + ? register_allocator_->physical_fp_nodes_ + : register_allocator_->physical_core_nodes_; + InterferenceNode* physical_node = physical_nodes[to->GetInterval()->GetRegister()]; + from->AddInterference(physical_node, /*guaranteed_not_interfering_yet*/ false); + DCHECK_EQ(to->GetInterval()->GetRegister(), physical_node->GetInterval()->GetRegister()); + DCHECK_EQ(to->GetAlias(), physical_node) << "Fixed nodes should alias the canonical fixed node"; + + // If a node interferes with a fixed pair node, the weight of the edge may + // be inaccurate after using the alias of the pair node, because the alias of the pair node + // is a singular node. + // We could make special pair fixed nodes, but that ends up being too conservative because + // a node could then interfere with both {r1} and {r1,r2}, leading to a degree of + // three rather than two. + // Instead, we explicitly add an interference with the high node of the fixed pair node. + // TODO: This is too conservative at time for pair nodes, but the fact that fixed pair intervals + // can be unaligned on x86 complicates things. + if (to->IsPair()) { + InterferenceNode* high_node = + physical_nodes[to->GetInterval()->GetHighInterval()->GetRegister()]; + DCHECK_EQ(to->GetInterval()->GetHighInterval()->GetRegister(), + high_node->GetInterval()->GetRegister()); + from->AddInterference(high_node, /*guaranteed_not_interfering_yet*/ false); + } + } else { + // Standard interference between two uncolored nodes. + from->AddInterference(to, guaranteed_not_interfering_yet); + } + + if (both_directions) { + AddPotentialInterference(to, from, guaranteed_not_interfering_yet, /*both_directions*/ false); + } +} + +// Returns true if `in_node` represents an input interval of `out_node`, and the output interval +// is allowed to have the same register as the input interval. +// TODO: Ideally we should just produce correct intervals in liveness analysis. +// We would need to refactor the current live interval layout to do so, which is +// no small task. +static bool CheckInputOutputCanOverlap(InterferenceNode* in_node, InterferenceNode* out_node) { + LiveInterval* output_interval = out_node->GetInterval(); + HInstruction* defined_by = output_interval->GetDefinedBy(); + if (defined_by == nullptr) { + // This must not be a definition point. + return false; + } + + LocationSummary* locations = defined_by->GetLocations(); + if (locations->OutputCanOverlapWithInputs()) { + // This instruction does not allow the output to reuse a register from an input. + return false; + } + + LiveInterval* input_interval = in_node->GetInterval(); + LiveInterval* next_sibling = input_interval->GetNextSibling(); + size_t def_position = defined_by->GetLifetimePosition(); + size_t use_position = def_position + 1; + if (next_sibling != nullptr && next_sibling->GetStart() == use_position) { + // The next sibling starts at the use position, so reusing the input register in the output + // would clobber the input before it's moved into the sibling interval location. + return false; + } + + if (!input_interval->IsDeadAt(use_position) && input_interval->CoversSlow(use_position)) { + // The input interval is live after the use position. 
+ return false; + } + + HInputsRef inputs = defined_by->GetInputs(); + for (size_t i = 0; i < inputs.size(); ++i) { + if (inputs[i]->GetLiveInterval()->GetSiblingAt(def_position) == input_interval) { + DCHECK(input_interval->SameRegisterKind(*output_interval)); + return true; + } + } + + // The input interval was not an input for this instruction. + return false; +} + +void ColoringIteration::BuildInterferenceGraph( + const ArenaVector<LiveInterval*>& intervals, + const ArenaVector<InterferenceNode*>& physical_nodes, + ArenaVector<InterferenceNode*>* safepoints) { + DCHECK(interval_node_map_.Empty() && prunable_nodes_.empty()); + // Build the interference graph efficiently by ordering range endpoints + // by position and doing a linear sweep to find interferences. (That is, we + // jump from endpoint to endpoint, maintaining a set of intervals live at each + // point. If two nodes are ever in the live set at the same time, then they + // interfere with each other.) + // + // We order by both position and (secondarily) by whether the endpoint + // begins or ends a range; we want to process range endings before range + // beginnings at the same position because they should not conflict. + // + // For simplicity, we create a tuple for each endpoint, and then sort the tuples. + // Tuple contents: (position, is_range_beginning, node). + ArenaVector<std::tuple<size_t, bool, InterferenceNode*>> range_endpoints( + allocator_->Adapter(kArenaAllocRegisterAllocator)); + + // We reserve plenty of space to avoid excessive copying. + range_endpoints.reserve(4 * prunable_nodes_.size()); + + for (LiveInterval* parent : intervals) { + for (LiveInterval* sibling = parent; sibling != nullptr; sibling = sibling->GetNextSibling()) { + LiveRange* range = sibling->GetFirstRange(); + if (range != nullptr) { + InterferenceNode* node = new (allocator_) InterferenceNode( + allocator_, sibling, register_allocator_->liveness_); + interval_node_map_.Insert(std::make_pair(sibling, node)); + + if (sibling->HasRegister()) { + // Fixed nodes should alias the canonical node for the corresponding register. + node->stage = NodeStage::kPrecolored; + InterferenceNode* physical_node = physical_nodes[sibling->GetRegister()]; + node->SetAlias(physical_node); + DCHECK_EQ(node->GetInterval()->GetRegister(), + physical_node->GetInterval()->GetRegister()); + } else if (sibling->IsSlowPathSafepoint()) { + // Safepoint intervals are synthesized to count max live registers. + // They will be processed separately after coloring. + node->stage = NodeStage::kSafepoint; + safepoints->push_back(node); + } else { + node->stage = NodeStage::kPrunable; + prunable_nodes_.push_back(node); + } + + while (range != nullptr) { + range_endpoints.push_back(std::make_tuple(range->GetStart(), true, node)); + range_endpoints.push_back(std::make_tuple(range->GetEnd(), false, node)); + range = range->GetNext(); + } + } + } + } + + // Sort the endpoints. + // We explicitly ignore the third entry of each tuple (the node pointer) in order + // to maintain determinism. + std::sort(range_endpoints.begin(), range_endpoints.end(), + [] (const std::tuple<size_t, bool, InterferenceNode*>& lhs, + const std::tuple<size_t, bool, InterferenceNode*>& rhs) { + return std::tie(std::get<0>(lhs), std::get<1>(lhs)) + < std::tie(std::get<0>(rhs), std::get<1>(rhs)); + }); + + // Nodes live at the current position in the linear sweep. + ArenaVector<InterferenceNode*> live( + allocator_->Adapter(kArenaAllocRegisterAllocator)); + + // Linear sweep. 
When we encounter the beginning of a range, we add the corresponding node to the + // live set. When we encounter the end of a range, we remove the corresponding node + // from the live set. Nodes interfere if they are in the live set at the same time. + for (auto it = range_endpoints.begin(); it != range_endpoints.end(); ++it) { + bool is_range_beginning; + InterferenceNode* node; + size_t position; + // Extract information from the tuple, including the node this tuple represents. + std::tie(position, is_range_beginning, node) = *it; + + if (is_range_beginning) { + bool guaranteed_not_interfering_yet = position == node->GetInterval()->GetStart(); + for (InterferenceNode* conflicting : live) { + DCHECK_NE(node, conflicting); + if (CheckInputOutputCanOverlap(conflicting, node)) { + // We do not add an interference, because the instruction represented by `node` allows + // its output to share a register with an input, represented here by `conflicting`. + } else { + AddPotentialInterference(node, conflicting, guaranteed_not_interfering_yet); + } + } + DCHECK(std::find(live.begin(), live.end(), node) == live.end()); + live.push_back(node); + } else { + // End of range. + auto live_it = std::find(live.begin(), live.end(), node); + DCHECK(live_it != live.end()); + live.erase(live_it); + } + } + DCHECK(live.empty()); +} + +void ColoringIteration::CreateCoalesceOpportunity(InterferenceNode* a, + InterferenceNode* b, + CoalesceKind kind, + size_t position) { + DCHECK_EQ(a->IsPair(), b->IsPair()) + << "Nodes of different memory widths should never be coalesced"; + CoalesceOpportunity* opportunity = + new (allocator_) CoalesceOpportunity(a, b, kind, position, register_allocator_->liveness_); + a->AddCoalesceOpportunity(opportunity); + b->AddCoalesceOpportunity(opportunity); + coalesce_worklist_.push(opportunity); +} + +// When looking for coalesce opportunities, we use the interval_node_map_ to find the node +// corresponding to an interval. Note that not all intervals are in this map, notably the parents +// of constants and stack arguments. (However, these interval should not be involved in coalesce +// opportunities anyway, because they're not going to be in registers.) +void ColoringIteration::FindCoalesceOpportunities() { + DCHECK(coalesce_worklist_.empty()); + + for (InterferenceNode* node : prunable_nodes_) { + LiveInterval* interval = node->GetInterval(); + + // Coalesce siblings. + LiveInterval* next_sibling = interval->GetNextSibling(); + if (next_sibling != nullptr && interval->GetEnd() == next_sibling->GetStart()) { + auto it = interval_node_map_.Find(next_sibling); + if (it != interval_node_map_.end()) { + InterferenceNode* sibling_node = it->second; + CreateCoalesceOpportunity(node, + sibling_node, + CoalesceKind::kAdjacentSibling, + interval->GetEnd()); + } + } + + // Coalesce fixed outputs with this interval if this interval is an adjacent sibling. + LiveInterval* parent = interval->GetParent(); + if (parent->HasRegister() + && parent->GetNextSibling() == interval + && parent->GetEnd() == interval->GetStart()) { + auto it = interval_node_map_.Find(parent); + if (it != interval_node_map_.end()) { + InterferenceNode* parent_node = it->second; + CreateCoalesceOpportunity(node, + parent_node, + CoalesceKind::kFixedOutputSibling, + parent->GetEnd()); + } + } + + // Try to prevent moves across blocks. + // Note that this does not lead to many succeeding coalesce attempts, so could be removed + // if found to add to compile time. 
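The sweep just described is easy to reproduce in isolation. The sketch below uses integer node ids in place of InterferenceNode and omits ART's refinements (edge weights, fixed-node aliasing, the input/output overlap exception); it sorts (position, is_range_beginning) endpoints so that endings are processed before beginnings at the same position, and records an interference whenever a range begins while others are live:

    #include <algorithm>
    #include <cstddef>
    #include <set>
    #include <tuple>
    #include <utility>
    #include <vector>

    struct Range {
      size_t start;  // inclusive
      size_t end;    // exclusive, strictly greater than start
      int node;      // id of the owning node
    };

    // Returns the interfering node pairs (smaller id first).
    std::set<std::pair<int, int>> BuildInterferences(const std::vector<Range>& ranges) {
      // (position, is_range_beginning, node); false sorts before true, so a range
      // ending at position p does not interfere with a range beginning at p.
      std::vector<std::tuple<size_t, bool, int>> endpoints;
      for (const Range& r : ranges) {
        endpoints.push_back({r.start, true, r.node});
        endpoints.push_back({r.end, false, r.node});
      }
      std::sort(endpoints.begin(), endpoints.end());

      std::set<std::pair<int, int>> interferences;
      std::vector<int> live;  // Nodes whose ranges cover the current position.
      for (const auto& endpoint : endpoints) {
        bool is_beginning = std::get<1>(endpoint);
        int node = std::get<2>(endpoint);
        if (is_beginning) {
          for (int other : live) {
            interferences.insert({std::min(node, other), std::max(node, other)});
          }
          live.push_back(node);
        } else {
          live.erase(std::find(live.begin(), live.end(), node));
        }
      }
      return interferences;
    }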
+ const SsaLivenessAnalysis& liveness = register_allocator_->liveness_; + if (interval->IsSplit() && liveness.IsAtBlockBoundary(interval->GetStart() / 2)) { + // If the start of this interval is at a block boundary, we look at the + // location of the interval in blocks preceding the block this interval + // starts at. This can avoid a move between the two blocks. + HBasicBlock* block = liveness.GetBlockFromPosition(interval->GetStart() / 2); + for (HBasicBlock* predecessor : block->GetPredecessors()) { + size_t position = predecessor->GetLifetimeEnd() - 1; + LiveInterval* existing = interval->GetParent()->GetSiblingAt(position); + if (existing != nullptr) { + auto it = interval_node_map_.Find(existing); + if (it != interval_node_map_.end()) { + InterferenceNode* existing_node = it->second; + CreateCoalesceOpportunity(node, + existing_node, + CoalesceKind::kNonlinearControlFlow, + position); + } + } + } + } + + // Coalesce phi inputs with the corresponding output. + HInstruction* defined_by = interval->GetDefinedBy(); + if (defined_by != nullptr && defined_by->IsPhi()) { + const ArenaVector<HBasicBlock*>& predecessors = defined_by->GetBlock()->GetPredecessors(); + HInputsRef inputs = defined_by->GetInputs(); + + for (size_t i = 0, e = inputs.size(); i < e; ++i) { + // We want the sibling at the end of the appropriate predecessor block. + size_t position = predecessors[i]->GetLifetimeEnd() - 1; + LiveInterval* input_interval = inputs[i]->GetLiveInterval()->GetSiblingAt(position); + + auto it = interval_node_map_.Find(input_interval); + if (it != interval_node_map_.end()) { + InterferenceNode* input_node = it->second; + CreateCoalesceOpportunity(node, input_node, CoalesceKind::kPhi, position); + } + } + } + + // Coalesce output with first input when policy is kSameAsFirstInput. + if (defined_by != nullptr) { + Location out = defined_by->GetLocations()->Out(); + if (out.IsUnallocated() && out.GetPolicy() == Location::kSameAsFirstInput) { + LiveInterval* input_interval + = defined_by->InputAt(0)->GetLiveInterval()->GetSiblingAt(interval->GetStart() - 1); + // TODO: Could we consider lifetime holes here? + if (input_interval->GetEnd() == interval->GetStart()) { + auto it = interval_node_map_.Find(input_interval); + if (it != interval_node_map_.end()) { + InterferenceNode* input_node = it->second; + CreateCoalesceOpportunity(node, + input_node, + CoalesceKind::kFirstInput, + interval->GetStart()); + } + } + } + } + + // An interval that starts an instruction (that is, it is not split), may + // re-use the registers used by the inputs of that instruction, based on the + // location summary. + if (defined_by != nullptr) { + DCHECK(!interval->IsSplit()); + LocationSummary* locations = defined_by->GetLocations(); + if (!locations->OutputCanOverlapWithInputs()) { + HInputsRef inputs = defined_by->GetInputs(); + for (size_t i = 0; i < inputs.size(); ++i) { + size_t def_point = defined_by->GetLifetimePosition(); + // TODO: Getting the sibling at the def_point might not be quite what we want + // for fixed inputs, since the use will be *at* the def_point rather than after. 
+ LiveInterval* input_interval = inputs[i]->GetLiveInterval()->GetSiblingAt(def_point); + if (input_interval != nullptr && + input_interval->HasHighInterval() == interval->HasHighInterval()) { + auto it = interval_node_map_.Find(input_interval); + if (it != interval_node_map_.end()) { + InterferenceNode* input_node = it->second; + CreateCoalesceOpportunity(node, + input_node, + CoalesceKind::kAnyInput, + interval->GetStart()); + } + } + } + } + } + + // Try to prevent moves into fixed input locations. + UsePosition* use = interval->GetFirstUse(); + for (; use != nullptr && use->GetPosition() <= interval->GetStart(); use = use->GetNext()) { + // Skip past uses before the start of this interval. + } + for (; use != nullptr && use->GetPosition() <= interval->GetEnd(); use = use->GetNext()) { + HInstruction* user = use->GetUser(); + if (user == nullptr) { + // User may be null for certain intervals, such as temp intervals. + continue; + } + LocationSummary* locations = user->GetLocations(); + Location input = locations->InAt(use->GetInputIndex()); + if (input.IsRegister() || input.IsFpuRegister()) { + // TODO: Could try to handle pair interval too, but coalescing with fixed pair nodes + // is currently not supported. + InterferenceNode* fixed_node = input.IsRegister() + ? register_allocator_->physical_core_nodes_[input.reg()] + : register_allocator_->physical_fp_nodes_[input.reg()]; + CreateCoalesceOpportunity(node, + fixed_node, + CoalesceKind::kFixedInput, + user->GetLifetimePosition()); + } + } + } // for node in prunable_nodes +} + +static bool IsLowDegreeNode(InterferenceNode* node, size_t num_regs) { + return node->GetOutDegree() < num_regs; +} + +static bool IsHighDegreeNode(InterferenceNode* node, size_t num_regs) { + return !IsLowDegreeNode(node, num_regs); +} + +void ColoringIteration::PruneInterferenceGraph() { + DCHECK(pruned_nodes_.empty() + && simplify_worklist_.empty() + && freeze_worklist_.empty() + && spill_worklist_.empty()); + // When pruning the graph, we refer to nodes with degree less than num_regs as low degree nodes, + // and all others as high degree nodes. The distinction is important: low degree nodes are + // guaranteed a color, while high degree nodes are not. + + // Build worklists. Note that the coalesce worklist has already been + // filled by FindCoalesceOpportunities(). + for (InterferenceNode* node : prunable_nodes_) { + DCHECK(!node->IsPrecolored()) << "Fixed nodes should never be pruned"; + DCHECK(!node->GetInterval()->IsSlowPathSafepoint()) << "Safepoint nodes should never be pruned"; + if (IsLowDegreeNode(node, num_regs_)) { + if (node->GetCoalesceOpportunities().empty()) { + // Simplify Worklist. + node->stage = NodeStage::kSimplifyWorklist; + simplify_worklist_.push_back(node); + } else { + // Freeze Worklist. + node->stage = NodeStage::kFreezeWorklist; + freeze_worklist_.push_back(node); + } + } else { + // Spill worklist. + node->stage = NodeStage::kSpillWorklist; + spill_worklist_.push(node); + } + } + + // Prune graph. + // Note that we do not remove a node from its current worklist if it moves to another, so it may + // be in multiple worklists at once; the node's `phase` says which worklist it is really in. + while (true) { + if (!simplify_worklist_.empty()) { + // Prune low-degree nodes. + // TODO: pop_back() should work as well, but it didn't; we get a + // failed check while pruning. We should look into this. 
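As background for this worklist-driven pruning: stripped of coalescing, freezing and spill-cost ordering, the phase reduces to the classic Chaitin-Briggs simplify step. A compact standalone version over an adjacency-set graph, kept only to show where the stack of pruned nodes comes from:

    #include <cstddef>
    #include <set>
    #include <vector>

    // Repeatedly prune a node with fewer than `k` neighbors, recording the prune
    // order (coloring later happens in reverse of this order). Nodes that remain
    // unpruned would be spill candidates in a full allocator.
    std::vector<int> SimplifyPhase(std::vector<std::set<int>>& graph, size_t k) {
      std::vector<int> prune_order;
      std::vector<bool> pruned(graph.size(), false);
      bool changed = true;
      while (changed) {
        changed = false;
        for (int node = 0; node < static_cast<int>(graph.size()); ++node) {
          if (!pruned[node] && graph[node].size() < k) {
            // Remove the node from the graph and remember when it was pruned.
            for (int neighbor : graph[node]) {
              graph[neighbor].erase(node);
            }
            graph[node].clear();
            pruned[node] = true;
            prune_order.push_back(node);
            changed = true;
          }
        }
      }
      return prune_order;
    }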
+ InterferenceNode* node = simplify_worklist_.front(); + simplify_worklist_.pop_front(); + DCHECK_EQ(node->stage, NodeStage::kSimplifyWorklist) << "Cannot move from simplify list"; + DCHECK_LT(node->GetOutDegree(), num_regs_) << "Nodes in simplify list should be low degree"; + DCHECK(!node->IsMoveRelated()) << "Nodes in simplify list should not be move related"; + PruneNode(node); + } else if (!coalesce_worklist_.empty()) { + // Coalesce. + CoalesceOpportunity* opportunity = coalesce_worklist_.top(); + coalesce_worklist_.pop(); + if (opportunity->stage == CoalesceStage::kWorklist) { + Coalesce(opportunity); + } + } else if (!freeze_worklist_.empty()) { + // Freeze moves and prune a low-degree move-related node. + InterferenceNode* node = freeze_worklist_.front(); + freeze_worklist_.pop_front(); + if (node->stage == NodeStage::kFreezeWorklist) { + DCHECK_LT(node->GetOutDegree(), num_regs_) << "Nodes in freeze list should be low degree"; + DCHECK(node->IsMoveRelated()) << "Nodes in freeze list should be move related"; + FreezeMoves(node); + PruneNode(node); + } + } else if (!spill_worklist_.empty()) { + // We spill the lowest-priority node, because pruning a node earlier + // gives it a higher chance of being spilled. + InterferenceNode* node = spill_worklist_.top(); + spill_worklist_.pop(); + if (node->stage == NodeStage::kSpillWorklist) { + DCHECK_GE(node->GetOutDegree(), num_regs_) << "Nodes in spill list should be high degree"; + FreezeMoves(node); + PruneNode(node); + } + } else { + // Pruning complete. + break; + } + } + DCHECK_EQ(prunable_nodes_.size(), pruned_nodes_.size()); +} + +void ColoringIteration::EnableCoalesceOpportunities(InterferenceNode* node) { + for (CoalesceOpportunity* opportunity : node->GetCoalesceOpportunities()) { + if (opportunity->stage == CoalesceStage::kActive) { + opportunity->stage = CoalesceStage::kWorklist; + coalesce_worklist_.push(opportunity); + } + } +} + +void ColoringIteration::PruneNode(InterferenceNode* node) { + DCHECK_NE(node->stage, NodeStage::kPruned); + DCHECK(!node->IsPrecolored()); + node->stage = NodeStage::kPruned; + pruned_nodes_.push(node); + + for (InterferenceNode* adj : node->GetAdjacentNodes()) { + DCHECK(!adj->GetInterval()->IsSlowPathSafepoint()) + << "Nodes should never interfere with synthesized safepoint nodes"; + DCHECK_NE(adj->stage, NodeStage::kPruned) << "Should be no interferences with pruned nodes"; + + if (adj->IsPrecolored()) { + // No effect on pre-colored nodes; they're never pruned. + } else { + // Remove the interference. + bool was_high_degree = IsHighDegreeNode(adj, num_regs_); + DCHECK(adj->ContainsInterference(node)) + << "Missing reflexive interference from non-fixed node"; + adj->RemoveInterference(node); + + // Handle transitions from high degree to low degree. 
+ if (was_high_degree && IsLowDegreeNode(adj, num_regs_)) { + EnableCoalesceOpportunities(adj); + for (InterferenceNode* adj_adj : adj->GetAdjacentNodes()) { + EnableCoalesceOpportunities(adj_adj); + } + + DCHECK_EQ(adj->stage, NodeStage::kSpillWorklist); + if (adj->IsMoveRelated()) { + adj->stage = NodeStage::kFreezeWorklist; + freeze_worklist_.push_back(adj); + } else { + adj->stage = NodeStage::kSimplifyWorklist; + simplify_worklist_.push_back(adj); + } + } + } + } +} + +void ColoringIteration::CheckTransitionFromFreezeWorklist(InterferenceNode* node) { + if (IsLowDegreeNode(node, num_regs_) && !node->IsMoveRelated()) { + DCHECK_EQ(node->stage, NodeStage::kFreezeWorklist); + node->stage = NodeStage::kSimplifyWorklist; + simplify_worklist_.push_back(node); + } +} + +void ColoringIteration::FreezeMoves(InterferenceNode* node) { + for (CoalesceOpportunity* opportunity : node->GetCoalesceOpportunities()) { + if (opportunity->stage == CoalesceStage::kDefunct) { + // Constrained moves should remain constrained, since they will not be considered + // during last-chance coalescing. + } else { + opportunity->stage = CoalesceStage::kInactive; + } + InterferenceNode* other = opportunity->node_a->GetAlias() == node + ? opportunity->node_b->GetAlias() + : opportunity->node_a->GetAlias(); + if (other != node && other->stage == NodeStage::kFreezeWorklist) { + DCHECK(IsLowDegreeNode(node, num_regs_)); + CheckTransitionFromFreezeWorklist(other); + } + } +} + +bool ColoringIteration::PrecoloredHeuristic(InterferenceNode* from, + InterferenceNode* into) { + if (!into->IsPrecolored()) { + // The uncolored heuristic will cover this case. + return false; + } + if (from->IsPair() || into->IsPair()) { + // TODO: Merging from a pair node is currently not supported, since fixed pair nodes + // are currently represented as two single fixed nodes in the graph, and `into` is + // only one of them. (We may lose the implicit connections to the second one in a merge.) + return false; + } + + // If all adjacent nodes of `from` are "ok", then we can conservatively merge with `into`. + // Reasons an adjacent node `adj` can be "ok": + // (1) If `adj` is low degree, interference with `into` will not affect its existing + // colorable guarantee. (Notice that coalescing cannot increase its degree.) + // (2) If `adj` is pre-colored, it already interferes with `into`. See (3). + // (3) If there's already an interference with `into`, coalescing will not add interferences. + for (InterferenceNode* adj : from->GetAdjacentNodes()) { + if (IsLowDegreeNode(adj, num_regs_) || adj->IsPrecolored() || adj->ContainsInterference(into)) { + // Ok. + } else { + return false; + } + } + return true; +} + +bool ColoringIteration::UncoloredHeuristic(InterferenceNode* from, + InterferenceNode* into) { + if (into->IsPrecolored()) { + // The pre-colored heuristic will handle this case. + return false; + } + + // Arbitrary cap to improve compile time. Tests show that this has negligible affect + // on generated code. + if (from->GetOutDegree() + into->GetOutDegree() > 2 * num_regs_) { + return false; + } + + // It's safe to coalesce two nodes if the resulting node has fewer than `num_regs` neighbors + // of high degree. (Low degree neighbors can be ignored, because they will eventually be + // pruned from the interference graph in the simplify stage.) 
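The counting below implements a weighted form of the Briggs test. For reference, the unweighted textbook version on a plain adjacency-set graph is shown here: merging a and b is considered safe when the combined node would have fewer than k neighbors of significant (>= k) degree.

    #include <cstddef>
    #include <set>
    #include <vector>

    // Briggs conservative coalescing test, without ART's edge weights or the
    // already-counted-neighbor refinement used in the surrounding code.
    bool BriggsCanCoalesce(const std::vector<std::set<int>>& graph, int a, int b, size_t k) {
      std::set<int> combined_neighbors(graph[a].begin(), graph[a].end());
      combined_neighbors.insert(graph[b].begin(), graph[b].end());
      combined_neighbors.erase(a);
      combined_neighbors.erase(b);

      size_t significant_degree_neighbors = 0;
      for (int neighbor : combined_neighbors) {
        if (graph[neighbor].size() >= k) {
          ++significant_degree_neighbors;
        }
      }
      return significant_degree_neighbors < k;
    }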
+ size_t high_degree_interferences = 0; + for (InterferenceNode* adj : from->GetAdjacentNodes()) { + if (IsHighDegreeNode(adj, num_regs_)) { + high_degree_interferences += from->EdgeWeightWith(adj); + } + } + for (InterferenceNode* adj : into->GetAdjacentNodes()) { + if (IsHighDegreeNode(adj, num_regs_)) { + if (from->ContainsInterference(adj)) { + // We've already counted this adjacent node. + // Furthermore, its degree will decrease if coalescing succeeds. Thus, it's possible that + // we should not have counted it at all. (This extends the textbook Briggs coalescing test, + // but remains conservative.) + if (adj->GetOutDegree() - into->EdgeWeightWith(adj) < num_regs_) { + high_degree_interferences -= from->EdgeWeightWith(adj); + } + } else { + high_degree_interferences += into->EdgeWeightWith(adj); + } + } + } + + return high_degree_interferences < num_regs_; +} + +void ColoringIteration::Combine(InterferenceNode* from, + InterferenceNode* into) { + from->SetAlias(into); + + // Add interferences. + for (InterferenceNode* adj : from->GetAdjacentNodes()) { + bool was_low_degree = IsLowDegreeNode(adj, num_regs_); + AddPotentialInterference(adj, into, /*guaranteed_not_interfering_yet*/ false); + if (was_low_degree && IsHighDegreeNode(adj, num_regs_)) { + // This is a (temporary) transition to a high degree node. Its degree will decrease again + // when we prune `from`, but it's best to be consistent about the current worklist. + adj->stage = NodeStage::kSpillWorklist; + spill_worklist_.push(adj); + } + } + + // Add coalesce opportunities. + for (CoalesceOpportunity* opportunity : from->GetCoalesceOpportunities()) { + if (opportunity->stage != CoalesceStage::kDefunct) { + into->AddCoalesceOpportunity(opportunity); + } + } + EnableCoalesceOpportunities(from); + + // Prune and update worklists. + PruneNode(from); + if (IsLowDegreeNode(into, num_regs_)) { + // Coalesce(...) takes care of checking for a transition to the simplify worklist. + DCHECK_EQ(into->stage, NodeStage::kFreezeWorklist); + } else if (into->stage == NodeStage::kFreezeWorklist) { + // This is a transition to a high degree node. + into->stage = NodeStage::kSpillWorklist; + spill_worklist_.push(into); + } else { + DCHECK(into->stage == NodeStage::kSpillWorklist || into->stage == NodeStage::kPrecolored); + } +} + +void ColoringIteration::Coalesce(CoalesceOpportunity* opportunity) { + InterferenceNode* from = opportunity->node_a->GetAlias(); + InterferenceNode* into = opportunity->node_b->GetAlias(); + DCHECK_NE(from->stage, NodeStage::kPruned); + DCHECK_NE(into->stage, NodeStage::kPruned); + + if (from->IsPrecolored()) { + // If we have one pre-colored node, make sure it's the `into` node. + std::swap(from, into); + } + + if (from == into) { + // These nodes have already been coalesced. + opportunity->stage = CoalesceStage::kDefunct; + CheckTransitionFromFreezeWorklist(from); + } else if (from->IsPrecolored() || from->ContainsInterference(into)) { + // These nodes interfere. + opportunity->stage = CoalesceStage::kDefunct; + CheckTransitionFromFreezeWorklist(from); + CheckTransitionFromFreezeWorklist(into); + } else if (PrecoloredHeuristic(from, into) + || UncoloredHeuristic(from, into)) { + // We can coalesce these nodes. + opportunity->stage = CoalesceStage::kDefunct; + Combine(from, into); + CheckTransitionFromFreezeWorklist(into); + } else { + // We cannot coalesce, but we may be able to later. 
+      opportunity->stage = CoalesceStage::kActive;
+  }
+}
+
+// Build a mask with a bit set for each register assigned to some
+// interval in `intervals`.
+template <typename Container>
+static std::bitset<kMaxNumRegs> BuildConflictMask(Container& intervals) {
+  std::bitset<kMaxNumRegs> conflict_mask;
+  for (InterferenceNode* adjacent : intervals) {
+    LiveInterval* conflicting = adjacent->GetInterval();
+    if (conflicting->HasRegister()) {
+      conflict_mask.set(conflicting->GetRegister());
+      if (conflicting->HasHighInterval()) {
+        DCHECK(conflicting->GetHighInterval()->HasRegister());
+        conflict_mask.set(conflicting->GetHighInterval()->GetRegister());
+      }
+    } else {
+      DCHECK(!conflicting->HasHighInterval()
+          || !conflicting->GetHighInterval()->HasRegister());
+    }
+  }
+  return conflict_mask;
+}
+
+bool RegisterAllocatorGraphColor::IsCallerSave(size_t reg, bool processing_core_regs) {
+  return processing_core_regs
+      ? !codegen_->IsCoreCalleeSaveRegister(reg)
+      : !codegen_->IsFloatingPointCalleeSaveRegister(reg);
+}
+
+static bool RegisterIsAligned(size_t reg) {
+  return reg % 2 == 0;
+}
+
+static size_t FindFirstZeroInConflictMask(std::bitset<kMaxNumRegs> conflict_mask) {
+  // We use CTZ (count trailing zeros) to quickly find the lowest 0 bit.
+  // Note that CTZ is undefined if all bits are 0, so we special-case it.
+  return conflict_mask.all() ? conflict_mask.size() : CTZ(~conflict_mask.to_ulong());
+}
+
+bool ColoringIteration::ColorInterferenceGraph() {
+  DCHECK_LE(num_regs_, kMaxNumRegs) << "kMaxNumRegs is too small";
+  ArenaVector<LiveInterval*> colored_intervals(
+      allocator_->Adapter(kArenaAllocRegisterAllocator));
+  bool successful = true;
+
+  while (!pruned_nodes_.empty()) {
+    InterferenceNode* node = pruned_nodes_.top();
+    pruned_nodes_.pop();
+    LiveInterval* interval = node->GetInterval();
+    size_t reg = 0;
+
+    InterferenceNode* alias = node->GetAlias();
+    if (alias != node) {
+      // This node was coalesced with another.
+      LiveInterval* alias_interval = alias->GetInterval();
+      if (alias_interval->HasRegister()) {
+        reg = alias_interval->GetRegister();
+        DCHECK(!BuildConflictMask(node->GetAdjacentNodes())[reg])
+            << "This node conflicts with the register it was coalesced with";
+      } else {
+        DCHECK(false) << node->GetOutDegree() << " " << alias->GetOutDegree() << " "
+            << "Move coalescing was not conservative, causing a node to be coalesced "
+            << "with another node that could not be colored";
+        if (interval->RequiresRegister()) {
+          successful = false;
+        }
+      }
+    } else {
+      // Search for free register(s).
+      std::bitset<kMaxNumRegs> conflict_mask = BuildConflictMask(node->GetAdjacentNodes());
+      if (interval->HasHighInterval()) {
+        // Note that the graph coloring allocator assumes that pair intervals are aligned here,
+        // excluding pre-colored pair intervals (which can currently be unaligned on x86). If we
+        // change the alignment requirements here, we will have to update the algorithm (e.g.,
+        // be more conservative about the weight of edges adjacent to pair nodes.)
+        while (reg < num_regs_ - 1 && (conflict_mask[reg] || conflict_mask[reg + 1])) {
+          reg += 2;
+        }
+
+        // Try to use a caller-save register first.
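Before the caller-save preference pass below, it may help to see FindFirstZeroInConflictMask in a standalone form. The sketch assumes a GCC/Clang-style __builtin_ctzl for the count-trailing-zeros step and a 32-register mask:

    #include <bitset>
    #include <cstddef>

    constexpr size_t kMaxRegs = 32;

    // Returns the lowest register whose bit is clear in `conflict_mask`, or
    // kMaxRegs if every register conflicts. Count-trailing-zeros is undefined on
    // zero, which can only happen when the mask is all ones, so handle that first.
    size_t FirstFreeRegister(const std::bitset<kMaxRegs>& conflict_mask) {
      if (conflict_mask.all()) {
        return kMaxRegs;
      }
      return static_cast<size_t>(__builtin_ctzl(~conflict_mask.to_ulong()));
    }

    // Example: with registers 0, 1 and 3 taken, FirstFreeRegister returns 2.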
+ for (size_t i = 0; i < num_regs_ - 1; i += 2) { + bool low_caller_save = register_allocator_->IsCallerSave(i, processing_core_regs_); + bool high_caller_save = register_allocator_->IsCallerSave(i + 1, processing_core_regs_); + if (!conflict_mask[i] && !conflict_mask[i + 1]) { + if (low_caller_save && high_caller_save) { + reg = i; + break; + } else if (low_caller_save || high_caller_save) { + reg = i; + // Keep looking to try to get both parts in caller-save registers. + } + } + } + } else { + // Not a pair interval. + reg = FindFirstZeroInConflictMask(conflict_mask); + + // Try to use caller-save registers first. + for (size_t i = 0; i < num_regs_; ++i) { + if (!conflict_mask[i] && register_allocator_->IsCallerSave(i, processing_core_regs_)) { + reg = i; + break; + } + } + } + + // Last-chance coalescing. + for (CoalesceOpportunity* opportunity : node->GetCoalesceOpportunities()) { + if (opportunity->stage == CoalesceStage::kDefunct) { + continue; + } + LiveInterval* other_interval = opportunity->node_a->GetAlias() == node + ? opportunity->node_b->GetAlias()->GetInterval() + : opportunity->node_a->GetAlias()->GetInterval(); + if (other_interval->HasRegister()) { + size_t coalesce_register = other_interval->GetRegister(); + if (interval->HasHighInterval()) { + if (!conflict_mask[coalesce_register] && + !conflict_mask[coalesce_register + 1] && + RegisterIsAligned(coalesce_register)) { + reg = coalesce_register; + break; + } + } else if (!conflict_mask[coalesce_register]) { + reg = coalesce_register; + break; + } + } + } + } + + if (reg < (interval->HasHighInterval() ? num_regs_ - 1 : num_regs_)) { + // Assign register. + DCHECK(!interval->HasRegister()); + interval->SetRegister(reg); + colored_intervals.push_back(interval); + if (interval->HasHighInterval()) { + DCHECK(!interval->GetHighInterval()->HasRegister()); + interval->GetHighInterval()->SetRegister(reg + 1); + colored_intervals.push_back(interval->GetHighInterval()); + } + } else if (interval->RequiresRegister()) { + // The interference graph is too dense to color. Make it sparser by + // splitting this live interval. + successful = false; + register_allocator_->SplitAtRegisterUses(interval); + // We continue coloring, because there may be additional intervals that cannot + // be colored, and that we should split. + } else { + // Spill. + node->SetNeedsSpillSlot(); + } + } + + // If unsuccessful, reset all register assignments. + if (!successful) { + for (LiveInterval* interval : colored_intervals) { + interval->ClearRegister(); + } + } + + return successful; +} + +size_t RegisterAllocatorGraphColor::ComputeMaxSafepointLiveRegisters( + const ArenaVector<InterferenceNode*>& safepoints) { + size_t max_safepoint_live_regs = 0; + for (InterferenceNode* safepoint : safepoints) { + DCHECK(safepoint->GetInterval()->IsSlowPathSafepoint()); + std::bitset<kMaxNumRegs> conflict_mask = BuildConflictMask(safepoint->GetAdjacentNodes()); + size_t live_regs = conflict_mask.count(); + max_safepoint_live_regs = std::max(max_safepoint_live_regs, live_regs); + } + return max_safepoint_live_regs; +} + +void RegisterAllocatorGraphColor::AllocateSpillSlots(const ArenaVector<InterferenceNode*>& nodes) { + // The register allocation resolver will organize the stack based on value type, + // so we assign stack slots for each value type separately. 
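ComputeMaxSafepointLiveRegisters, shown a few lines up, boils down to a population count per safepoint. A sketch with plain bitsets in place of the safepoint nodes' adjacency sets:

    #include <algorithm>
    #include <bitset>
    #include <cstddef>
    #include <vector>

    constexpr size_t kMaxRegs = 32;

    // Each entry is the mask of registers holding live values at one safepoint;
    // the result bounds the register save area needed by slow paths.
    size_t MaxLiveRegistersAtSafepoints(const std::vector<std::bitset<kMaxRegs>>& safepoint_masks) {
      size_t max_live = 0;
      for (const std::bitset<kMaxRegs>& mask : safepoint_masks) {
        max_live = std::max(max_live, mask.count());
      }
      return max_live;
    }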
+ ArenaVector<LiveInterval*> double_intervals(allocator_->Adapter(kArenaAllocRegisterAllocator)); + ArenaVector<LiveInterval*> long_intervals(allocator_->Adapter(kArenaAllocRegisterAllocator)); + ArenaVector<LiveInterval*> float_intervals(allocator_->Adapter(kArenaAllocRegisterAllocator)); + ArenaVector<LiveInterval*> int_intervals(allocator_->Adapter(kArenaAllocRegisterAllocator)); + + // The set of parent intervals already handled. + ArenaSet<LiveInterval*> seen(allocator_->Adapter(kArenaAllocRegisterAllocator)); + + // Find nodes that need spill slots. + for (InterferenceNode* node : nodes) { + if (!node->NeedsSpillSlot()) { + continue; + } + + LiveInterval* parent = node->GetInterval()->GetParent(); + if (seen.find(parent) != seen.end()) { + // We've already handled this interval. + // This can happen if multiple siblings of the same interval request a stack slot. + continue; + } + seen.insert(parent); + + HInstruction* defined_by = parent->GetDefinedBy(); + if (parent->HasSpillSlot()) { + // We already have a spill slot for this value that we can reuse. + } else if (defined_by->IsParameterValue()) { + // Parameters already have a stack slot. + parent->SetSpillSlot(codegen_->GetStackSlotOfParameter(defined_by->AsParameterValue())); + } else if (defined_by->IsCurrentMethod()) { + // The current method is always at stack slot 0. + parent->SetSpillSlot(0); + } else if (defined_by->IsConstant()) { + // Constants don't need a spill slot. + } else { + // We need to find a spill slot for this interval. Place it in the correct + // worklist to be processed later. + switch (node->GetInterval()->GetType()) { + case Primitive::kPrimDouble: + double_intervals.push_back(parent); + break; + case Primitive::kPrimLong: + long_intervals.push_back(parent); + break; + case Primitive::kPrimFloat: + float_intervals.push_back(parent); + break; + case Primitive::kPrimNot: + case Primitive::kPrimInt: + case Primitive::kPrimChar: + case Primitive::kPrimByte: + case Primitive::kPrimBoolean: + case Primitive::kPrimShort: + int_intervals.push_back(parent); + break; + case Primitive::kPrimVoid: + LOG(FATAL) << "Unexpected type for interval " << node->GetInterval()->GetType(); + UNREACHABLE(); + } + } + } + + // Color spill slots for each value type. + ColorSpillSlots(&double_intervals, &num_double_spill_slots_); + ColorSpillSlots(&long_intervals, &num_long_spill_slots_); + ColorSpillSlots(&float_intervals, &num_float_spill_slots_); + ColorSpillSlots(&int_intervals, &num_int_spill_slots_); +} + +void RegisterAllocatorGraphColor::ColorSpillSlots(ArenaVector<LiveInterval*>* intervals, + size_t* num_stack_slots_used) { + // We cannot use the original interference graph here because spill slots are assigned to + // all of the siblings of an interval, whereas an interference node represents only a single + // sibling. So, we assign spill slots linear-scan-style by sorting all the interval endpoints + // by position, and assigning the lowest spill slot available when we encounter an interval + // beginning. We ignore lifetime holes for simplicity. 
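The slot assignment just described can be sketched independently of LiveInterval. The version below sorts (position, is_beginning) endpoints, hands the lowest free slot to each value at its start and releases it at its end; the pair-slot sizing and alignment handled by the real code are left out:

    #include <algorithm>
    #include <cstddef>
    #include <tuple>
    #include <vector>

    struct SpillInterval {
      size_t start;  // inclusive, strictly less than end
      size_t end;    // exclusive, end of the last sibling
      size_t slot;   // output: assigned spill slot
    };

    // Assigns each interval the lowest spill slot free over its whole lifetime.
    // Returns the number of slots used.
    size_t AssignSpillSlots(std::vector<SpillInterval>* intervals) {
      std::vector<std::tuple<size_t, bool, size_t>> endpoints;  // (position, is_beginning, index)
      for (size_t i = 0; i < intervals->size(); ++i) {
        endpoints.push_back({(*intervals)[i].start, true, i});
        endpoints.push_back({(*intervals)[i].end, false, i});
      }
      // Ends sort before beginnings at the same position, so back-to-back
      // lifetimes can share a slot.
      std::sort(endpoints.begin(), endpoints.end());

      std::vector<bool> taken;
      size_t slots_used = 0;
      for (const auto& endpoint : endpoints) {
        bool is_beginning = std::get<1>(endpoint);
        SpillInterval& interval = (*intervals)[std::get<2>(endpoint)];
        if (is_beginning) {
          size_t slot = 0;
          while (slot < taken.size() && taken[slot]) {
            ++slot;  // Skip taken slots.
          }
          if (slot == taken.size()) {
            taken.push_back(false);
          }
          taken[slot] = true;
          interval.slot = slot;
          slots_used = std::max(slots_used, slot + 1);
        } else {
          taken[interval.slot] = false;  // The lifetime ended; free its slot.
        }
      }
      return slots_used;
    }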
+  ArenaVector<std::tuple<size_t, bool, LiveInterval*>> interval_endpoints(
+      allocator_->Adapter(kArenaAllocRegisterAllocator));
+
+  for (auto it = intervals->begin(), e = intervals->end(); it != e; ++it) {
+    LiveInterval* parent_interval = *it;
+    DCHECK(parent_interval->IsParent());
+    DCHECK(!parent_interval->HasSpillSlot());
+    size_t start = parent_interval->GetStart();
+    size_t end = parent_interval->GetLastSibling()->GetEnd();
+    DCHECK_LT(start, end);
+    interval_endpoints.push_back(std::make_tuple(start, true, parent_interval));
+    interval_endpoints.push_back(std::make_tuple(end, false, parent_interval));
+  }
+
+  // Sort by position.
+  // We explicitly ignore the third entry of each tuple (the interval pointer) in order
+  // to maintain determinism.
+  std::sort(interval_endpoints.begin(), interval_endpoints.end(),
+            [] (const std::tuple<size_t, bool, LiveInterval*>& lhs,
+                const std::tuple<size_t, bool, LiveInterval*>& rhs) {
+    return std::tie(std::get<0>(lhs), std::get<1>(lhs))
+         < std::tie(std::get<0>(rhs), std::get<1>(rhs));
+  });
+
+  ArenaBitVector taken(allocator_, 0, true);
+  for (auto it = interval_endpoints.begin(), end = interval_endpoints.end(); it != end; ++it) {
+    // Extract information from the current tuple.
+    LiveInterval* parent_interval;
+    bool is_interval_beginning;
+    size_t position;
+    std::tie(position, is_interval_beginning, parent_interval) = *it;
+
+    bool needs_two_slots = parent_interval->NeedsTwoSpillSlots();
+
+    if (is_interval_beginning) {
+      DCHECK(!parent_interval->HasSpillSlot());
+      DCHECK_EQ(position, parent_interval->GetStart());
+
+      // Find a free stack slot.
+      size_t slot = 0;
+      for (; taken.IsBitSet(slot) || (needs_two_slots && taken.IsBitSet(slot + 1)); ++slot) {
+        // Skip taken slots.
+      }
+      parent_interval->SetSpillSlot(slot);
+
+      *num_stack_slots_used = std::max(*num_stack_slots_used,
+                                       needs_two_slots ? slot + 2 : slot + 1);
+      if (needs_two_slots && *num_stack_slots_used % 2 != 0) {
+        // The parallel move resolver requires that there be an even number of spill slots
+        // allocated for pair value types.
+        ++(*num_stack_slots_used);
+      }
+
+      taken.SetBit(slot);
+      if (needs_two_slots) {
+        taken.SetBit(slot + 1);
+      }
+    } else {
+      DCHECK_EQ(position, parent_interval->GetLastSibling()->GetEnd());
+      DCHECK(parent_interval->HasSpillSlot());
+
+      // Free up the stack slot used by this interval.
+      size_t slot = parent_interval->GetSpillSlot();
+      DCHECK(taken.IsBitSet(slot));
+      DCHECK(!needs_two_slots || taken.IsBitSet(slot + 1));
+      taken.ClearBit(slot);
+      if (needs_two_slots) {
+        taken.ClearBit(slot + 1);
+      }
+    }
+  }
+  DCHECK_EQ(taken.NumSetBits(), 0u);
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/register_allocator_graph_color.h b/compiler/optimizing/register_allocator_graph_color.h
new file mode 100644
index 0000000000..ed12561d2c
--- /dev/null
+++ b/compiler/optimizing/register_allocator_graph_color.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_GRAPH_COLOR_H_ +#define ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_GRAPH_COLOR_H_ + +#include "arch/instruction_set.h" +#include "base/arena_containers.h" +#include "base/arena_object.h" +#include "base/macros.h" +#include "primitive.h" +#include "register_allocator.h" + +namespace art { + +class CodeGenerator; +class HBasicBlock; +class HGraph; +class HInstruction; +class HParallelMove; +class Location; +class SsaLivenessAnalysis; +class InterferenceNode; +struct CoalesceOpportunity; +enum class CoalesceKind; + +/** + * A graph coloring register allocator. + * + * The algorithm proceeds as follows: + * (1) Build an interference graph, where nodes represent live intervals, and edges represent + * interferences between two intervals. Coloring this graph with k colors is isomorphic to + * finding a valid register assignment with k registers. + * (2) To color the graph, first prune all nodes with degree less than k, since these nodes are + * guaranteed a color. (No matter how we color their adjacent nodes, we can give them a + * different color.) As we prune nodes from the graph, more nodes may drop below degree k, + * enabling further pruning. The key is to maintain the pruning order in a stack, so that we + * can color the nodes in the reverse order. + * When there are no more nodes with degree less than k, we start pruning alternate nodes based + * on heuristics. Since these nodes are not guaranteed a color, we are careful to + * prioritize nodes that require a register. We also prioritize short intervals, because + * short intervals cannot be split very much if coloring fails (see below). "Prioritizing" + * a node amounts to pruning it later, since it will have fewer interferences if we prune other + * nodes first. + * (3) We color nodes in the reverse order in which we pruned them. If we cannot assign + * a node a color, we do one of two things: + * - If the node requires a register, we consider the current coloring attempt a failure. + * However, we split the node's live interval in order to make the interference graph + * sparser, so that future coloring attempts may succeed. + * - If the node does not require a register, we simply assign it a location on the stack. + * + * If iterative move coalescing is enabled, the algorithm also attempts to conservatively + * combine nodes in the graph that would prefer to have the same color. (For example, the output + * of a phi instruction would prefer to have the same register as at least one of its inputs.) + * There are several additional steps involved with this: + * - We look for coalesce opportunities by examining each live interval, a step similar to that + * used by linear scan when looking for register hints. + * - When pruning the graph, we maintain a worklist of coalesce opportunities, as well as a worklist + * of low degree nodes that have associated coalesce opportunities. Only when we run out of + * coalesce opportunities do we start pruning coalesce-associated nodes. + * - When pruning a node, if any nodes transition from high degree to low degree, we add + * associated coalesce opportunities to the worklist, since these opportunities may now succeed. + * - Whether two nodes can be combined is decided by two different heuristics--one used when + * coalescing uncolored nodes, and one used for coalescing an uncolored node with a colored node. + * It is vital that we only combine two nodes if the node that remains is guaranteed to receive + * a color. 
This is because additionally spilling is more costly than failing to coalesce. + * - Even if nodes are not coalesced while pruning, we keep the coalesce opportunities around + * to be used as last-chance register hints when coloring. If nothing else, we try to use + * caller-save registers before callee-save registers. + * + * A good reference for graph coloring register allocation is + * "Modern Compiler Implementation in Java" (Andrew W. Appel, 2nd Edition). + */ +class RegisterAllocatorGraphColor : public RegisterAllocator { + public: + RegisterAllocatorGraphColor(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& analysis, + bool iterative_move_coalescing = true); + ~RegisterAllocatorGraphColor() OVERRIDE {} + + void AllocateRegisters() OVERRIDE; + + bool Validate(bool log_fatal_on_failure); + + private: + // Collect all intervals and prepare for register allocation. + void ProcessInstructions(); + void ProcessInstruction(HInstruction* instruction); + + // If any inputs require specific registers, block those registers + // at the position of this instruction. + void CheckForFixedInputs(HInstruction* instruction); + + // If the output of an instruction requires a specific register, split + // the interval and assign the register to the first part. + void CheckForFixedOutput(HInstruction* instruction); + + // Add all applicable safepoints to a live interval. + // Currently depends on instruction processing order. + void AddSafepointsFor(HInstruction* instruction); + + // Collect all live intervals associated with the temporary locations + // needed by an instruction. + void CheckForTempLiveIntervals(HInstruction* instruction); + + // If a safe point is needed, add a synthesized interval to later record + // the number of live registers at this point. + void CheckForSafepoint(HInstruction* instruction); + + // Split an interval, but only if `position` is inside of `interval`. + // Return either the new interval, or the original interval if not split. + static LiveInterval* TrySplit(LiveInterval* interval, size_t position); + + // To ensure every graph can be colored, split live intervals + // at their register defs and uses. This creates short intervals with low + // degree in the interference graph, which are prioritized during graph + // coloring. + void SplitAtRegisterUses(LiveInterval* interval); + + // If the given instruction is a catch phi, give it a spill slot. + void AllocateSpillSlotForCatchPhi(HInstruction* instruction); + + // Ensure that the given register cannot be allocated for a given range. + void BlockRegister(Location location, size_t start, size_t end); + void BlockRegisters(size_t start, size_t end, bool caller_save_only = false); + + bool IsCallerSave(size_t reg, bool processing_core_regs); + + // Return the maximum number of registers live at safepoints, + // based on the outgoing interference edges of safepoint nodes. + size_t ComputeMaxSafepointLiveRegisters(const ArenaVector<InterferenceNode*>& safepoints); + + // Assigns stack slots to a list of intervals, ensuring that interfering intervals are not + // assigned the same stack slot. + void ColorSpillSlots(ArenaVector<LiveInterval*>* nodes, + size_t* num_stack_slots_used); + + // Provide stack slots to nodes that need them. + void AllocateSpillSlots(const ArenaVector<InterferenceNode*>& nodes); + + // Whether iterative move coalescing should be performed. Iterative move coalescing + // improves code quality, but increases compile time. 
+ const bool iterative_move_coalescing_; + + // Live intervals, split by kind (core and floating point). + // These should not contain high intervals, as those are represented by + // the corresponding low interval throughout register allocation. + ArenaVector<LiveInterval*> core_intervals_; + ArenaVector<LiveInterval*> fp_intervals_; + + // Intervals for temporaries, saved for special handling in the resolution phase. + ArenaVector<LiveInterval*> temp_intervals_; + + // Safepoints, saved for special handling while processing instructions. + ArenaVector<HInstruction*> safepoints_; + + // Interference nodes representing specific registers. These are "pre-colored" nodes + // in the interference graph. + ArenaVector<InterferenceNode*> physical_core_nodes_; + ArenaVector<InterferenceNode*> physical_fp_nodes_; + + // Allocated stack slot counters. + size_t num_int_spill_slots_; + size_t num_double_spill_slots_; + size_t num_float_spill_slots_; + size_t num_long_spill_slots_; + size_t catch_phi_spill_slot_counter_; + + // Number of stack slots needed for the pointer to the current method. + // This is 1 for 32-bit architectures, and 2 for 64-bit architectures. + const size_t reserved_art_method_slots_; + + // Number of stack slots needed for outgoing arguments. + const size_t reserved_out_slots_; + + // The number of globally blocked core and floating point registers, such as the stack pointer. + size_t number_of_globally_blocked_core_regs_; + size_t number_of_globally_blocked_fp_regs_; + + // The maximum number of registers live at safe points. Needed by the code generator. + size_t max_safepoint_live_core_regs_; + size_t max_safepoint_live_fp_regs_; + + friend class ColoringIteration; + + DISALLOW_COPY_AND_ASSIGN(RegisterAllocatorGraphColor); +}; + +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_GRAPH_COLOR_H_ diff --git a/compiler/optimizing/register_allocator_linear_scan.cc b/compiler/optimizing/register_allocator_linear_scan.cc new file mode 100644 index 0000000000..768ed2d26a --- /dev/null +++ b/compiler/optimizing/register_allocator_linear_scan.cc @@ -0,0 +1,1225 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "register_allocator_linear_scan.h" + +#include <iostream> +#include <sstream> + +#include "base/bit_vector-inl.h" +#include "base/enums.h" +#include "code_generator.h" +#include "register_allocation_resolver.h" +#include "ssa_liveness_analysis.h" + +namespace art { + +static constexpr size_t kMaxLifetimePosition = -1; +static constexpr size_t kDefaultNumberOfSpillSlots = 4; + +// For simplicity, we implement register pairs as (reg, reg + 1). +// Note that this is a requirement for double registers on ARM, since we +// allocate SRegister. 
+static int GetHighForLowRegister(int reg) { return reg + 1; } +static bool IsLowRegister(int reg) { return (reg & 1) == 0; } +static bool IsLowOfUnalignedPairInterval(LiveInterval* low) { + return GetHighForLowRegister(low->GetRegister()) != low->GetHighInterval()->GetRegister(); +} + +RegisterAllocatorLinearScan::RegisterAllocatorLinearScan(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& liveness) + : RegisterAllocator(allocator, codegen, liveness), + unhandled_core_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + unhandled_fp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + unhandled_(nullptr), + handled_(allocator->Adapter(kArenaAllocRegisterAllocator)), + active_(allocator->Adapter(kArenaAllocRegisterAllocator)), + inactive_(allocator->Adapter(kArenaAllocRegisterAllocator)), + physical_core_register_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + physical_fp_register_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + temp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + int_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), + long_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), + float_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), + double_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), + catch_phi_spill_slots_(0), + safepoints_(allocator->Adapter(kArenaAllocRegisterAllocator)), + processing_core_registers_(false), + number_of_registers_(-1), + registers_array_(nullptr), + blocked_core_registers_(codegen->GetBlockedCoreRegisters()), + blocked_fp_registers_(codegen->GetBlockedFloatingPointRegisters()), + reserved_out_slots_(0), + maximum_number_of_live_core_registers_(0), + maximum_number_of_live_fp_registers_(0) { + temp_intervals_.reserve(4); + int_spill_slots_.reserve(kDefaultNumberOfSpillSlots); + long_spill_slots_.reserve(kDefaultNumberOfSpillSlots); + float_spill_slots_.reserve(kDefaultNumberOfSpillSlots); + double_spill_slots_.reserve(kDefaultNumberOfSpillSlots); + + codegen->SetupBlockedRegisters(); + physical_core_register_intervals_.resize(codegen->GetNumberOfCoreRegisters(), nullptr); + physical_fp_register_intervals_.resize(codegen->GetNumberOfFloatingPointRegisters(), nullptr); + // Always reserve for the current method and the graph's max out registers. + // TODO: compute it instead. + // ArtMethod* takes 2 vregs for 64 bits. 
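+ // With 4-byte vregs this amounts to 1 reserved slot on 32-bit targets and 2 on 64-bit targets, plus the graph's maximum number of outgoing vregs (illustrative note based on the computation below).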
+ size_t ptr_size = static_cast<size_t>(InstructionSetPointerSize(codegen->GetInstructionSet())); + reserved_out_slots_ = ptr_size / kVRegSize + codegen->GetGraph()->GetMaximumNumberOfOutVRegs(); +} + +static bool ShouldProcess(bool processing_core_registers, LiveInterval* interval) { + if (interval == nullptr) return false; + bool is_core_register = (interval->GetType() != Primitive::kPrimDouble) + && (interval->GetType() != Primitive::kPrimFloat); + return processing_core_registers == is_core_register; +} + +void RegisterAllocatorLinearScan::AllocateRegisters() { + AllocateRegistersInternal(); + RegisterAllocationResolver(allocator_, codegen_, liveness_) + .Resolve(maximum_number_of_live_core_registers_, + maximum_number_of_live_fp_registers_, + reserved_out_slots_, + int_spill_slots_.size(), + long_spill_slots_.size(), + float_spill_slots_.size(), + double_spill_slots_.size(), + catch_phi_spill_slots_, + temp_intervals_); + + if (kIsDebugBuild) { + processing_core_registers_ = true; + ValidateInternal(true); + processing_core_registers_ = false; + ValidateInternal(true); + // Check that the linear order is still correct with regards to lifetime positions. + // Since only parallel moves have been inserted during the register allocation, + // these checks are mostly for making sure these moves have been added correctly. + size_t current_liveness = 0; + for (HLinearOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { + HBasicBlock* block = it.Current(); + for (HInstructionIterator inst_it(block->GetPhis()); !inst_it.Done(); inst_it.Advance()) { + HInstruction* instruction = inst_it.Current(); + DCHECK_LE(current_liveness, instruction->GetLifetimePosition()); + current_liveness = instruction->GetLifetimePosition(); + } + for (HInstructionIterator inst_it(block->GetInstructions()); + !inst_it.Done(); + inst_it.Advance()) { + HInstruction* instruction = inst_it.Current(); + DCHECK_LE(current_liveness, instruction->GetLifetimePosition()) << instruction->DebugName(); + current_liveness = instruction->GetLifetimePosition(); + } + } + } +} + +void RegisterAllocatorLinearScan::BlockRegister(Location location, size_t start, size_t end) { + int reg = location.reg(); + DCHECK(location.IsRegister() || location.IsFpuRegister()); + LiveInterval* interval = location.IsRegister() + ? physical_core_register_intervals_[reg] + : physical_fp_register_intervals_[reg]; + Primitive::Type type = location.IsRegister() + ? 
Primitive::kPrimInt + : Primitive::kPrimFloat; + if (interval == nullptr) { + interval = LiveInterval::MakeFixedInterval(allocator_, reg, type); + if (location.IsRegister()) { + physical_core_register_intervals_[reg] = interval; + } else { + physical_fp_register_intervals_[reg] = interval; + } + } + DCHECK(interval->GetRegister() == reg); + interval->AddRange(start, end); +} + +void RegisterAllocatorLinearScan::BlockRegisters(size_t start, size_t end, bool caller_save_only) { + for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) { + if (!caller_save_only || !codegen_->IsCoreCalleeSaveRegister(i)) { + BlockRegister(Location::RegisterLocation(i), start, end); + } + } + for (size_t i = 0; i < codegen_->GetNumberOfFloatingPointRegisters(); ++i) { + if (!caller_save_only || !codegen_->IsFloatingPointCalleeSaveRegister(i)) { + BlockRegister(Location::FpuRegisterLocation(i), start, end); + } + } +} + +void RegisterAllocatorLinearScan::AllocateRegistersInternal() { + // Iterate post-order, to ensure the list is sorted, and the last added interval + // is the one with the lowest start position. + for (HLinearPostOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { + HBasicBlock* block = it.Current(); + for (HBackwardInstructionIterator back_it(block->GetInstructions()); !back_it.Done(); + back_it.Advance()) { + ProcessInstruction(back_it.Current()); + } + for (HInstructionIterator inst_it(block->GetPhis()); !inst_it.Done(); inst_it.Advance()) { + ProcessInstruction(inst_it.Current()); + } + + if (block->IsCatchBlock() || + (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible())) { + // By blocking all registers at the top of each catch block or irreducible loop, we force + // intervals belonging to the live-in set of the catch/header block to be spilled. + // TODO(ngeoffray): Phis in this block could be allocated in register. + size_t position = block->GetLifetimeStart(); + BlockRegisters(position, position + 1); + } + } + + number_of_registers_ = codegen_->GetNumberOfCoreRegisters(); + registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_, + kArenaAllocRegisterAllocator); + processing_core_registers_ = true; + unhandled_ = &unhandled_core_intervals_; + for (LiveInterval* fixed : physical_core_register_intervals_) { + if (fixed != nullptr) { + // Fixed interval is added to inactive_ instead of unhandled_. + // It's also the only type of inactive interval whose start position + // can be after the current interval during linear scan. + // Fixed interval is never split and never moves to unhandled_. + inactive_.push_back(fixed); + } + } + LinearScan(); + + inactive_.clear(); + active_.clear(); + handled_.clear(); + + number_of_registers_ = codegen_->GetNumberOfFloatingPointRegisters(); + registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_, + kArenaAllocRegisterAllocator); + processing_core_registers_ = false; + unhandled_ = &unhandled_fp_intervals_; + for (LiveInterval* fixed : physical_fp_register_intervals_) { + if (fixed != nullptr) { + // Fixed interval is added to inactive_ instead of unhandled_. + // It's also the only type of inactive interval whose start position + // can be after the current interval during linear scan. + // Fixed interval is never split and never moves to unhandled_. 
+ inactive_.push_back(fixed); + } + } + LinearScan(); +} + +void RegisterAllocatorLinearScan::ProcessInstruction(HInstruction* instruction) { + LocationSummary* locations = instruction->GetLocations(); + size_t position = instruction->GetLifetimePosition(); + + if (locations == nullptr) return; + + // Create synthesized intervals for temporaries. + for (size_t i = 0; i < locations->GetTempCount(); ++i) { + Location temp = locations->GetTemp(i); + if (temp.IsRegister() || temp.IsFpuRegister()) { + BlockRegister(temp, position, position + 1); + // Ensure that an explicit temporary register is marked as being allocated. + codegen_->AddAllocatedRegister(temp); + } else { + DCHECK(temp.IsUnallocated()); + switch (temp.GetPolicy()) { + case Location::kRequiresRegister: { + LiveInterval* interval = + LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimInt); + temp_intervals_.push_back(interval); + interval->AddTempUse(instruction, i); + unhandled_core_intervals_.push_back(interval); + break; + } + + case Location::kRequiresFpuRegister: { + LiveInterval* interval = + LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimDouble); + temp_intervals_.push_back(interval); + interval->AddTempUse(instruction, i); + if (codegen_->NeedsTwoRegisters(Primitive::kPrimDouble)) { + interval->AddHighInterval(/* is_temp */ true); + LiveInterval* high = interval->GetHighInterval(); + temp_intervals_.push_back(high); + unhandled_fp_intervals_.push_back(high); + } + unhandled_fp_intervals_.push_back(interval); + break; + } + + default: + LOG(FATAL) << "Unexpected policy for temporary location " + << temp.GetPolicy(); + } + } + } + + bool core_register = (instruction->GetType() != Primitive::kPrimDouble) + && (instruction->GetType() != Primitive::kPrimFloat); + + if (locations->NeedsSafepoint()) { + if (codegen_->IsLeafMethod()) { + // TODO: We do this here because we do not want the suspend check to artificially + // create live registers. We should find another place, but this is currently the + // simplest. + DCHECK(instruction->IsSuspendCheckEntry()); + instruction->GetBlock()->RemoveInstruction(instruction); + return; + } + safepoints_.push_back(instruction); + if (locations->OnlyCallsOnSlowPath()) { + // We add a synthesized range at this position to record the live registers + // at this position. Ideally, we could just update the safepoints when locations + // are updated, but we currently need to know the full stack size before updating + // locations (because of parameters and the fact that we don't have a frame pointer). + // And knowing the full stack size requires to know the maximum number of live + // registers at calls in slow paths. + // By adding the following interval in the algorithm, we can compute this + // maximum before updating locations. 
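+ // The synthesized interval below covers only this safepoint's position; when linear scan reaches it, the number of currently active intervals gives the register pressure at the safepoint.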
+ LiveInterval* interval = LiveInterval::MakeSlowPathInterval(allocator_, instruction); + interval->AddRange(position, position + 1); + AddSorted(&unhandled_core_intervals_, interval); + AddSorted(&unhandled_fp_intervals_, interval); + } + } + + if (locations->WillCall()) { + BlockRegisters(position, position + 1, /* caller_save_only */ true); + } + + for (size_t i = 0; i < locations->GetInputCount(); ++i) { + Location input = locations->InAt(i); + if (input.IsRegister() || input.IsFpuRegister()) { + BlockRegister(input, position, position + 1); + } else if (input.IsPair()) { + BlockRegister(input.ToLow(), position, position + 1); + BlockRegister(input.ToHigh(), position, position + 1); + } + } + + LiveInterval* current = instruction->GetLiveInterval(); + if (current == nullptr) return; + + ArenaVector<LiveInterval*>& unhandled = core_register + ? unhandled_core_intervals_ + : unhandled_fp_intervals_; + + DCHECK(unhandled.empty() || current->StartsBeforeOrAt(unhandled.back())); + + if (codegen_->NeedsTwoRegisters(current->GetType())) { + current->AddHighInterval(); + } + + for (size_t safepoint_index = safepoints_.size(); safepoint_index > 0; --safepoint_index) { + HInstruction* safepoint = safepoints_[safepoint_index - 1u]; + size_t safepoint_position = safepoint->GetLifetimePosition(); + + // Test that safepoints are ordered in the optimal way. + DCHECK(safepoint_index == safepoints_.size() || + safepoints_[safepoint_index]->GetLifetimePosition() < safepoint_position); + + if (safepoint_position == current->GetStart()) { + // The safepoint is for this instruction, so the location of the instruction + // does not need to be saved. + DCHECK_EQ(safepoint_index, safepoints_.size()); + DCHECK_EQ(safepoint, instruction); + continue; + } else if (current->IsDeadAt(safepoint_position)) { + break; + } else if (!current->Covers(safepoint_position)) { + // Hole in the interval. + continue; + } + current->AddSafepoint(safepoint); + } + current->ResetSearchCache(); + + // Some instructions define their output in fixed register/stack slot. We need + // to ensure we know these locations before doing register allocation. For a + // given register, we create an interval that covers these locations. The register + // will be unavailable at these locations when trying to allocate one for an + // interval. + // + // The backwards walking ensures the ranges are ordered on increasing start positions. + Location output = locations->Out(); + if (output.IsUnallocated() && output.GetPolicy() == Location::kSameAsFirstInput) { + Location first = locations->InAt(0); + if (first.IsRegister() || first.IsFpuRegister()) { + current->SetFrom(position + 1); + current->SetRegister(first.reg()); + } else if (first.IsPair()) { + current->SetFrom(position + 1); + current->SetRegister(first.low()); + LiveInterval* high = current->GetHighInterval(); + high->SetRegister(first.high()); + high->SetFrom(position + 1); + } + } else if (output.IsRegister() || output.IsFpuRegister()) { + // Shift the interval's start by one to account for the blocked register. 
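+ // For example, an instruction at lifetime position p whose output is fixed to a particular register (say EAX on x86) blocks that register for [p, p+1) and has its own interval start shifted to p+1.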
+ current->SetFrom(position + 1); + current->SetRegister(output.reg()); + BlockRegister(output, position, position + 1); + } else if (output.IsPair()) { + current->SetFrom(position + 1); + current->SetRegister(output.low()); + LiveInterval* high = current->GetHighInterval(); + high->SetRegister(output.high()); + high->SetFrom(position + 1); + BlockRegister(output.ToLow(), position, position + 1); + BlockRegister(output.ToHigh(), position, position + 1); + } else if (output.IsStackSlot() || output.IsDoubleStackSlot()) { + current->SetSpillSlot(output.GetStackIndex()); + } else { + DCHECK(output.IsUnallocated() || output.IsConstant()); + } + + if (instruction->IsPhi() && instruction->AsPhi()->IsCatchPhi()) { + AllocateSpillSlotForCatchPhi(instruction->AsPhi()); + } + + // If needed, add interval to the list of unhandled intervals. + if (current->HasSpillSlot() || instruction->IsConstant()) { + // Split just before first register use. + size_t first_register_use = current->FirstRegisterUse(); + if (first_register_use != kNoLifetime) { + LiveInterval* split = SplitBetween(current, current->GetStart(), first_register_use - 1); + // Don't add directly to `unhandled`, it needs to be sorted and the start + // of this new interval might be after intervals already in the list. + AddSorted(&unhandled, split); + } else { + // Nothing to do, we won't allocate a register for this value. + } + } else { + // Don't add directly to `unhandled`, temp or safepoint intervals + // for this instruction may have been added, and those can be + // processed first. + AddSorted(&unhandled, current); + } +} + +class AllRangesIterator : public ValueObject { + public: + explicit AllRangesIterator(LiveInterval* interval) + : current_interval_(interval), + current_range_(interval->GetFirstRange()) {} + + bool Done() const { return current_interval_ == nullptr; } + LiveRange* CurrentRange() const { return current_range_; } + LiveInterval* CurrentInterval() const { return current_interval_; } + + void Advance() { + current_range_ = current_range_->GetNext(); + if (current_range_ == nullptr) { + current_interval_ = current_interval_->GetNextSibling(); + if (current_interval_ != nullptr) { + current_range_ = current_interval_->GetFirstRange(); + } + } + } + + private: + LiveInterval* current_interval_; + LiveRange* current_range_; + + DISALLOW_COPY_AND_ASSIGN(AllRangesIterator); +}; + +bool RegisterAllocatorLinearScan::ValidateInternal(bool log_fatal_on_failure) const { + // To simplify unit testing, we eagerly create the array of intervals, and + // call the helper method. + ArenaVector<LiveInterval*> intervals(allocator_->Adapter(kArenaAllocRegisterAllocatorValidate)); + for (size_t i = 0; i < liveness_.GetNumberOfSsaValues(); ++i) { + HInstruction* instruction = liveness_.GetInstructionFromSsaIndex(i); + if (ShouldProcess(processing_core_registers_, instruction->GetLiveInterval())) { + intervals.push_back(instruction->GetLiveInterval()); + } + } + + const ArenaVector<LiveInterval*>* physical_register_intervals = processing_core_registers_ + ? 
&physical_core_register_intervals_ + : &physical_fp_register_intervals_; + for (LiveInterval* fixed : *physical_register_intervals) { + if (fixed != nullptr) { + intervals.push_back(fixed); + } + } + + for (LiveInterval* temp : temp_intervals_) { + if (ShouldProcess(processing_core_registers_, temp)) { + intervals.push_back(temp); + } + } + + return ValidateIntervals(intervals, GetNumberOfSpillSlots(), reserved_out_slots_, *codegen_, + allocator_, processing_core_registers_, log_fatal_on_failure); +} + +void RegisterAllocatorLinearScan::DumpInterval(std::ostream& stream, LiveInterval* interval) const { + interval->Dump(stream); + stream << ": "; + if (interval->HasRegister()) { + if (interval->IsFloatingPoint()) { + codegen_->DumpFloatingPointRegister(stream, interval->GetRegister()); + } else { + codegen_->DumpCoreRegister(stream, interval->GetRegister()); + } + } else { + stream << "spilled"; + } + stream << std::endl; +} + +void RegisterAllocatorLinearScan::DumpAllIntervals(std::ostream& stream) const { + stream << "inactive: " << std::endl; + for (LiveInterval* inactive_interval : inactive_) { + DumpInterval(stream, inactive_interval); + } + stream << "active: " << std::endl; + for (LiveInterval* active_interval : active_) { + DumpInterval(stream, active_interval); + } + stream << "unhandled: " << std::endl; + auto unhandled = (unhandled_ != nullptr) ? + unhandled_ : &unhandled_core_intervals_; + for (LiveInterval* unhandled_interval : *unhandled) { + DumpInterval(stream, unhandled_interval); + } + stream << "handled: " << std::endl; + for (LiveInterval* handled_interval : handled_) { + DumpInterval(stream, handled_interval); + } +} + +// By the book implementation of a linear scan register allocator. +void RegisterAllocatorLinearScan::LinearScan() { + while (!unhandled_->empty()) { + // (1) Remove interval with the lowest start position from unhandled. + LiveInterval* current = unhandled_->back(); + unhandled_->pop_back(); + + // Make sure the interval is an expected state. + DCHECK(!current->IsFixed() && !current->HasSpillSlot()); + // Make sure we are going in the right order. + DCHECK(unhandled_->empty() || unhandled_->back()->GetStart() >= current->GetStart()); + // Make sure a low interval is always with a high. + DCHECK(!current->IsLowInterval() || unhandled_->back()->IsHighInterval()); + // Make sure a high interval is always with a low. + DCHECK(current->IsLowInterval() || + unhandled_->empty() || + !unhandled_->back()->IsHighInterval()); + + size_t position = current->GetStart(); + + // Remember the inactive_ size here since the ones moved to inactive_ from + // active_ below shouldn't need to be re-checked. + size_t inactive_intervals_to_handle = inactive_.size(); + + // (2) Remove currently active intervals that are dead at this position. + // Move active intervals that have a lifetime hole at this position + // to inactive. + auto active_kept_end = std::remove_if( + active_.begin(), + active_.end(), + [this, position](LiveInterval* interval) { + if (interval->IsDeadAt(position)) { + handled_.push_back(interval); + return true; + } else if (!interval->Covers(position)) { + inactive_.push_back(interval); + return true; + } else { + return false; // Keep this interval. + } + }); + active_.erase(active_kept_end, active_.end()); + + // (3) Remove currently inactive intervals that are dead at this position. + // Move inactive intervals that cover this position to active. 
+ auto inactive_to_handle_end = inactive_.begin() + inactive_intervals_to_handle; + auto inactive_kept_end = std::remove_if( + inactive_.begin(), + inactive_to_handle_end, + [this, position](LiveInterval* interval) { + DCHECK(interval->GetStart() < position || interval->IsFixed()); + if (interval->IsDeadAt(position)) { + handled_.push_back(interval); + return true; + } else if (interval->Covers(position)) { + active_.push_back(interval); + return true; + } else { + return false; // Keep this interval. + } + }); + inactive_.erase(inactive_kept_end, inactive_to_handle_end); + + if (current->IsSlowPathSafepoint()) { + // Synthesized interval to record the maximum number of live registers + // at safepoints. No need to allocate a register for it. + if (processing_core_registers_) { + maximum_number_of_live_core_registers_ = + std::max(maximum_number_of_live_core_registers_, active_.size()); + } else { + maximum_number_of_live_fp_registers_ = + std::max(maximum_number_of_live_fp_registers_, active_.size()); + } + DCHECK(unhandled_->empty() || unhandled_->back()->GetStart() > current->GetStart()); + continue; + } + + if (current->IsHighInterval() && !current->GetLowInterval()->HasRegister()) { + DCHECK(!current->HasRegister()); + // Allocating the low part was unsuccessful. The split interval for the high part + // will be handled next (it is in the `unhandled_` list). + continue; + } + + // (4) Try to find an available register. + bool success = TryAllocateFreeReg(current); + + // (5) If no register could be found, we need to spill. + if (!success) { + success = AllocateBlockedReg(current); + } + + // (6) If the interval had a register allocated, add it to the list of active + // intervals. + if (success) { + codegen_->AddAllocatedRegister(processing_core_registers_ + ? Location::RegisterLocation(current->GetRegister()) + : Location::FpuRegisterLocation(current->GetRegister())); + active_.push_back(current); + if (current->HasHighInterval() && !current->GetHighInterval()->HasRegister()) { + current->GetHighInterval()->SetRegister(GetHighForLowRegister(current->GetRegister())); + } + } + } +} + +static void FreeIfNotCoverAt(LiveInterval* interval, size_t position, size_t* free_until) { + DCHECK(!interval->IsHighInterval()); + // Note that the same instruction may occur multiple times in the input list, + // so `free_until` may have changed already. + // Since `position` is not the current scan position, we need to use CoversSlow. + if (interval->IsDeadAt(position)) { + // Set the register to be free. Note that inactive intervals might later + // update this. + free_until[interval->GetRegister()] = kMaxLifetimePosition; + if (interval->HasHighInterval()) { + DCHECK(interval->GetHighInterval()->IsDeadAt(position)); + free_until[interval->GetHighInterval()->GetRegister()] = kMaxLifetimePosition; + } + } else if (!interval->CoversSlow(position)) { + // The interval becomes inactive at `defined_by`. We make its register + // available only until the next use strictly after `defined_by`. + free_until[interval->GetRegister()] = interval->FirstUseAfter(position); + if (interval->HasHighInterval()) { + DCHECK(!interval->GetHighInterval()->CoversSlow(position)); + free_until[interval->GetHighInterval()->GetRegister()] = free_until[interval->GetRegister()]; + } + } +} + +// Find a free register. If multiple are found, pick the register that +// is free the longest.
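+// Here, "free the longest" means the register whose `free_until` position is furthest away; kMaxLifetimePosition means the register stays free for the remainder of the scan.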
+bool RegisterAllocatorLinearScan::TryAllocateFreeReg(LiveInterval* current) { + size_t* free_until = registers_array_; + + // First set all registers to be free. + for (size_t i = 0; i < number_of_registers_; ++i) { + free_until[i] = kMaxLifetimePosition; + } + + // For each active interval, set its register to not free. + for (LiveInterval* interval : active_) { + DCHECK(interval->HasRegister()); + free_until[interval->GetRegister()] = 0; + } + + // An interval that starts an instruction (that is, it is not split) may + // re-use the registers used by the inputs of that instruction, based on the + // location summary. + HInstruction* defined_by = current->GetDefinedBy(); + if (defined_by != nullptr && !current->IsSplit()) { + LocationSummary* locations = defined_by->GetLocations(); + if (!locations->OutputCanOverlapWithInputs() && locations->Out().IsUnallocated()) { + HInputsRef inputs = defined_by->GetInputs(); + for (size_t i = 0; i < inputs.size(); ++i) { + // Take the last interval of the input. It is the location of that interval + // that will be used at `defined_by`. + LiveInterval* interval = inputs[i]->GetLiveInterval()->GetLastSibling(); + // Note that interval may not have been processed yet. + // TODO: Handle non-split intervals last in the work list. + if (locations->InAt(i).IsValid() + && interval->HasRegister() + && interval->SameRegisterKind(*current)) { + // The input must be live until the end of `defined_by`, to comply with + // the linear scan algorithm. So we use `defined_by`'s end lifetime + // position to check whether the input is dead or is inactive after + // `defined_by`. + DCHECK(interval->CoversSlow(defined_by->GetLifetimePosition())); + size_t position = defined_by->GetLifetimePosition() + 1; + FreeIfNotCoverAt(interval, position, free_until); + } + } + } + } + + // For each inactive interval, set its register to be free until + // the next intersection with `current`. + for (LiveInterval* inactive : inactive_) { + // Temp/Slow-path-safepoint interval has no holes. + DCHECK(!inactive->IsTemp() && !inactive->IsSlowPathSafepoint()); + if (!current->IsSplit() && !inactive->IsFixed()) { + // Neither current nor inactive are fixed. + // Thanks to SSA, a non-split interval starting in a hole of an + // inactive interval should never intersect with that inactive interval. + // Only if it's not fixed though, because fixed intervals don't come from SSA. + DCHECK_EQ(inactive->FirstIntersectionWith(current), kNoLifetime); + continue; + } + + DCHECK(inactive->HasRegister()); + if (free_until[inactive->GetRegister()] == 0) { + // Already used by some active interval. No need to intersect. + continue; + } + size_t next_intersection = inactive->FirstIntersectionWith(current); + if (next_intersection != kNoLifetime) { + free_until[inactive->GetRegister()] = + std::min(free_until[inactive->GetRegister()], next_intersection); + } + } + + int reg = kNoRegister; + if (current->HasRegister()) { + // Some instructions have a fixed register output. + reg = current->GetRegister(); + if (free_until[reg] == 0) { + DCHECK(current->IsHighInterval()); + // AllocateBlockedReg will spill the holder of the register. + return false; + } + } else { + DCHECK(!current->IsHighInterval()); + int hint = current->FindFirstRegisterHint(free_until, liveness_); + if ((hint != kNoRegister) + // For simplicity, if the hint we are getting for a pair cannot be used, + // we are just going to allocate a new pair.
+ && !(current->IsLowInterval() && IsBlocked(GetHighForLowRegister(hint)))) { + DCHECK(!IsBlocked(hint)); + reg = hint; + } else if (current->IsLowInterval()) { + reg = FindAvailableRegisterPair(free_until, current->GetStart()); + } else { + reg = FindAvailableRegister(free_until, current); + } + } + + DCHECK_NE(reg, kNoRegister); + // If we could not find a register, we need to spill. + if (free_until[reg] == 0) { + return false; + } + + if (current->IsLowInterval()) { + // If the high register of this interval is not available, we need to spill. + int high_reg = current->GetHighInterval()->GetRegister(); + if (high_reg == kNoRegister) { + high_reg = GetHighForLowRegister(reg); + } + if (free_until[high_reg] == 0) { + return false; + } + } + + current->SetRegister(reg); + if (!current->IsDeadAt(free_until[reg])) { + // If the register is only available for a subset of live ranges + // covered by `current`, split `current` before the position where + // the register is not available anymore. + LiveInterval* split = SplitBetween(current, current->GetStart(), free_until[reg]); + DCHECK(split != nullptr); + AddSorted(unhandled_, split); + } + return true; +} + +bool RegisterAllocatorLinearScan::IsBlocked(int reg) const { + return processing_core_registers_ + ? blocked_core_registers_[reg] + : blocked_fp_registers_[reg]; +} + +int RegisterAllocatorLinearScan::FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const { + int reg = kNoRegister; + // Pick the register pair that is used the last. + for (size_t i = 0; i < number_of_registers_; ++i) { + if (IsBlocked(i)) continue; + if (!IsLowRegister(i)) continue; + int high_register = GetHighForLowRegister(i); + if (IsBlocked(high_register)) continue; + int existing_high_register = GetHighForLowRegister(reg); + if ((reg == kNoRegister) || (next_use[i] >= next_use[reg] + && next_use[high_register] >= next_use[existing_high_register])) { + reg = i; + if (next_use[i] == kMaxLifetimePosition + && next_use[high_register] == kMaxLifetimePosition) { + break; + } + } else if (next_use[reg] <= starting_at || next_use[existing_high_register] <= starting_at) { + // If one of the current register is known to be unavailable, just unconditionally + // try a new one. + reg = i; + } + } + return reg; +} + +bool RegisterAllocatorLinearScan::IsCallerSaveRegister(int reg) const { + return processing_core_registers_ + ? !codegen_->IsCoreCalleeSaveRegister(reg) + : !codegen_->IsFloatingPointCalleeSaveRegister(reg); +} + +int RegisterAllocatorLinearScan::FindAvailableRegister(size_t* next_use, LiveInterval* current) const { + // We special case intervals that do not span a safepoint to try to find a caller-save + // register if one is available. We iterate from 0 to the number of registers, + // so if there are caller-save registers available at the end, we continue the iteration. + bool prefers_caller_save = !current->HasWillCallSafepoint(); + int reg = kNoRegister; + for (size_t i = 0; i < number_of_registers_; ++i) { + if (IsBlocked(i)) { + // Register cannot be used. Continue. + continue; + } + + // Best case: we found a register fully available. + if (next_use[i] == kMaxLifetimePosition) { + if (prefers_caller_save && !IsCallerSaveRegister(i)) { + // We can get shorter encodings on some platforms by using + // small register numbers. So only update the candidate if the previous + // one was not available for the whole method. 
+ if (reg == kNoRegister || next_use[reg] != kMaxLifetimePosition) { + reg = i; + } + // Continue the iteration in the hope of finding a caller save register. + continue; + } else { + reg = i; + // We know the register is good enough. Return it. + break; + } + } + + // If we had no register before, take this one as a reference. + if (reg == kNoRegister) { + reg = i; + continue; + } + + // Pick the register that is used the last. + if (next_use[i] > next_use[reg]) { + reg = i; + continue; + } + } + return reg; +} + +// Remove interval and its other half if any. Return iterator to the following element. +static ArenaVector<LiveInterval*>::iterator RemoveIntervalAndPotentialOtherHalf( + ArenaVector<LiveInterval*>* intervals, ArenaVector<LiveInterval*>::iterator pos) { + DCHECK(intervals->begin() <= pos && pos < intervals->end()); + LiveInterval* interval = *pos; + if (interval->IsLowInterval()) { + DCHECK(pos + 1 < intervals->end()); + DCHECK_EQ(*(pos + 1), interval->GetHighInterval()); + return intervals->erase(pos, pos + 2); + } else if (interval->IsHighInterval()) { + DCHECK(intervals->begin() < pos); + DCHECK_EQ(*(pos - 1), interval->GetLowInterval()); + return intervals->erase(pos - 1, pos + 1); + } else { + return intervals->erase(pos); + } +} + +bool RegisterAllocatorLinearScan::TrySplitNonPairOrUnalignedPairIntervalAt(size_t position, + size_t first_register_use, + size_t* next_use) { + for (auto it = active_.begin(), end = active_.end(); it != end; ++it) { + LiveInterval* active = *it; + DCHECK(active->HasRegister()); + if (active->IsFixed()) continue; + if (active->IsHighInterval()) continue; + if (first_register_use > next_use[active->GetRegister()]) continue; + + // Split the first interval found that is either: + // 1) A non-pair interval. + // 2) A pair interval whose high is not low + 1. + // 3) A pair interval whose low is not even. + if (!active->IsLowInterval() || + IsLowOfUnalignedPairInterval(active) || + !IsLowRegister(active->GetRegister())) { + LiveInterval* split = Split(active, position); + if (split != active) { + handled_.push_back(active); + } + RemoveIntervalAndPotentialOtherHalf(&active_, it); + AddSorted(unhandled_, split); + return true; + } + } + return false; +} + +// Find the register that is used the last, and spill the interval +// that holds it. If the first use of `current` is after that register +// we spill `current` instead. +bool RegisterAllocatorLinearScan::AllocateBlockedReg(LiveInterval* current) { + size_t first_register_use = current->FirstRegisterUse(); + if (current->HasRegister()) { + DCHECK(current->IsHighInterval()); + // The low interval has allocated the register for the high interval. In + // case the low interval had to split both intervals, we may end up in a + // situation where the high interval does not have a register use anymore. + // We must still proceed in order to split currently active and inactive + // uses of the high interval's register, and put the high interval in the + // active set. + DCHECK(first_register_use != kNoLifetime || (current->GetNextSibling() != nullptr)); + } else if (first_register_use == kNoLifetime) { + AllocateSpillSlotFor(current); + return false; + } + + // First set all registers as not being used. + size_t* next_use = registers_array_; + for (size_t i = 0; i < number_of_registers_; ++i) { + next_use[i] = kMaxLifetimePosition; + } + + // For each active interval, find the next use of its register after the + // start of current. 
+ for (LiveInterval* active : active_) { + DCHECK(active->HasRegister()); + if (active->IsFixed()) { + next_use[active->GetRegister()] = current->GetStart(); + } else { + size_t use = active->FirstRegisterUseAfter(current->GetStart()); + if (use != kNoLifetime) { + next_use[active->GetRegister()] = use; + } + } + } + + // For each inactive interval, find the next use of its register after the + // start of current. + for (LiveInterval* inactive : inactive_) { + // Temp/Slow-path-safepoint interval has no holes. + DCHECK(!inactive->IsTemp() && !inactive->IsSlowPathSafepoint()); + if (!current->IsSplit() && !inactive->IsFixed()) { + // Neither current nor inactive are fixed. + // Thanks to SSA, a non-split interval starting in a hole of an + // inactive interval should never intersect with that inactive interval. + // Only if it's not fixed though, because fixed intervals don't come from SSA. + DCHECK_EQ(inactive->FirstIntersectionWith(current), kNoLifetime); + continue; + } + DCHECK(inactive->HasRegister()); + size_t next_intersection = inactive->FirstIntersectionWith(current); + if (next_intersection != kNoLifetime) { + if (inactive->IsFixed()) { + next_use[inactive->GetRegister()] = + std::min(next_intersection, next_use[inactive->GetRegister()]); + } else { + size_t use = inactive->FirstUseAfter(current->GetStart()); + if (use != kNoLifetime) { + next_use[inactive->GetRegister()] = std::min(use, next_use[inactive->GetRegister()]); + } + } + } + } + + int reg = kNoRegister; + bool should_spill = false; + if (current->HasRegister()) { + DCHECK(current->IsHighInterval()); + reg = current->GetRegister(); + // When allocating the low part, we made sure the high register was available. + DCHECK_LT(first_register_use, next_use[reg]); + } else if (current->IsLowInterval()) { + reg = FindAvailableRegisterPair(next_use, first_register_use); + // We should spill if both registers are not available. + should_spill = (first_register_use >= next_use[reg]) + || (first_register_use >= next_use[GetHighForLowRegister(reg)]); + } else { + DCHECK(!current->IsHighInterval()); + reg = FindAvailableRegister(next_use, current); + should_spill = (first_register_use >= next_use[reg]); + } + + DCHECK_NE(reg, kNoRegister); + if (should_spill) { + DCHECK(!current->IsHighInterval()); + bool is_allocation_at_use_site = (current->GetStart() >= (first_register_use - 1)); + if (is_allocation_at_use_site) { + if (!current->IsLowInterval()) { + DumpInterval(std::cerr, current); + DumpAllIntervals(std::cerr); + // This situation has the potential to infinite loop, so we make it a non-debug CHECK. + HInstruction* at = liveness_.GetInstructionFromPosition(first_register_use / 2); + CHECK(false) << "There is not enough registers available for " + << current->GetParent()->GetDefinedBy()->DebugName() << " " + << current->GetParent()->GetDefinedBy()->GetId() + << " at " << first_register_use - 1 << " " + << (at == nullptr ? "" : at->DebugName()); + } + + // If we're allocating a register for `current` because the instruction at + // that position requires it, but we think we should spill, then there are + // non-pair intervals or unaligned pair intervals blocking the allocation. + // We split the first interval found, and put ourselves first in the + // `unhandled_` list. 
+ bool success = TrySplitNonPairOrUnalignedPairIntervalAt(current->GetStart(), + first_register_use, + next_use); + DCHECK(success); + LiveInterval* existing = unhandled_->back(); + DCHECK(existing->IsHighInterval()); + DCHECK_EQ(existing->GetLowInterval(), current); + unhandled_->push_back(current); + } else { + // If the first use of that instruction is after the last use of the found + // register, we split this interval just before its first register use. + AllocateSpillSlotFor(current); + LiveInterval* split = SplitBetween(current, current->GetStart(), first_register_use - 1); + DCHECK(current != split); + AddSorted(unhandled_, split); + } + return false; + } else { + // Use this register and spill the active and inactive intervals that + // have that register. + current->SetRegister(reg); + + for (auto it = active_.begin(), end = active_.end(); it != end; ++it) { + LiveInterval* active = *it; + if (active->GetRegister() == reg) { + DCHECK(!active->IsFixed()); + LiveInterval* split = Split(active, current->GetStart()); + if (split != active) { + handled_.push_back(active); + } + RemoveIntervalAndPotentialOtherHalf(&active_, it); + AddSorted(unhandled_, split); + break; + } + } + + // NOTE: Retrieve end() on each iteration because we're removing elements in the loop body. + for (auto it = inactive_.begin(); it != inactive_.end(); ) { + LiveInterval* inactive = *it; + bool erased = false; + if (inactive->GetRegister() == reg) { + if (!current->IsSplit() && !inactive->IsFixed()) { + // Neither current nor inactive are fixed. + // Thanks to SSA, a non-split interval starting in a hole of an + // inactive interval should never intersect with that inactive interval. + // Only if it's not fixed though, because fixed intervals don't come from SSA. + DCHECK_EQ(inactive->FirstIntersectionWith(current), kNoLifetime); + } else { + size_t next_intersection = inactive->FirstIntersectionWith(current); + if (next_intersection != kNoLifetime) { + if (inactive->IsFixed()) { + LiveInterval* split = Split(current, next_intersection); + DCHECK_NE(split, current); + AddSorted(unhandled_, split); + } else { + // Split at the start of `current`, which will lead to splitting + // at the end of the lifetime hole of `inactive`. + LiveInterval* split = Split(inactive, current->GetStart()); + // If it's inactive, it must start before the current interval. + DCHECK_NE(split, inactive); + it = RemoveIntervalAndPotentialOtherHalf(&inactive_, it); + erased = true; + handled_.push_back(inactive); + AddSorted(unhandled_, split); + } + } + } + } + // If we have erased the element, `it` already points to the next element. + // Otherwise we need to move to the next element. + if (!erased) { + ++it; + } + } + + return true; + } +} + +void RegisterAllocatorLinearScan::AddSorted(ArenaVector<LiveInterval*>* array, LiveInterval* interval) { + DCHECK(!interval->IsFixed() && !interval->HasSpillSlot()); + size_t insert_at = 0; + for (size_t i = array->size(); i > 0; --i) { + LiveInterval* current = (*array)[i - 1u]; + // High intervals must be processed right after their low equivalent. + if (current->StartsAfter(interval) && !current->IsHighInterval()) { + insert_at = i; + break; + } else if ((current->GetStart() == interval->GetStart()) && current->IsSlowPathSafepoint()) { + // Ensure the slow path interval is the last to be processed at its location: we want the + // interval to know all live registers at this location.
+ DCHECK(i == 1 || (*array)[i - 2u]->StartsAfter(current)); + insert_at = i; + break; + } + } + + // Insert the high interval before the low, to ensure the low is processed before. + auto insert_pos = array->begin() + insert_at; + if (interval->HasHighInterval()) { + array->insert(insert_pos, { interval->GetHighInterval(), interval }); + } else if (interval->HasLowInterval()) { + array->insert(insert_pos, { interval, interval->GetLowInterval() }); + } else { + array->insert(insert_pos, interval); + } +} + +void RegisterAllocatorLinearScan::AllocateSpillSlotFor(LiveInterval* interval) { + if (interval->IsHighInterval()) { + // The low interval already took care of allocating the spill slot. + DCHECK(!interval->GetLowInterval()->HasRegister()); + DCHECK(interval->GetLowInterval()->GetParent()->HasSpillSlot()); + return; + } + + LiveInterval* parent = interval->GetParent(); + + // An instruction gets a spill slot for its entire lifetime. If the parent + // of this interval already has a spill slot, there is nothing to do. + if (parent->HasSpillSlot()) { + return; + } + + HInstruction* defined_by = parent->GetDefinedBy(); + DCHECK(!defined_by->IsPhi() || !defined_by->AsPhi()->IsCatchPhi()); + + if (defined_by->IsParameterValue()) { + // Parameters have their own stack slot. + parent->SetSpillSlot(codegen_->GetStackSlotOfParameter(defined_by->AsParameterValue())); + return; + } + + if (defined_by->IsCurrentMethod()) { + parent->SetSpillSlot(0); + return; + } + + if (defined_by->IsConstant()) { + // Constants don't need a spill slot. + return; + } + + ArenaVector<size_t>* spill_slots = nullptr; + switch (interval->GetType()) { + case Primitive::kPrimDouble: + spill_slots = &double_spill_slots_; + break; + case Primitive::kPrimLong: + spill_slots = &long_spill_slots_; + break; + case Primitive::kPrimFloat: + spill_slots = &float_spill_slots_; + break; + case Primitive::kPrimNot: + case Primitive::kPrimInt: + case Primitive::kPrimChar: + case Primitive::kPrimByte: + case Primitive::kPrimBoolean: + case Primitive::kPrimShort: + spill_slots = &int_spill_slots_; + break; + case Primitive::kPrimVoid: + LOG(FATAL) << "Unexpected type for interval " << interval->GetType(); + } + + // Find an available spill slot. + size_t slot = 0; + for (size_t e = spill_slots->size(); slot < e; ++slot) { + if ((*spill_slots)[slot] <= parent->GetStart()) { + if (!parent->NeedsTwoSpillSlots()) { + // One spill slot is sufficient. + break; + } + if (slot == e - 1 || (*spill_slots)[slot + 1] <= parent->GetStart()) { + // Two spill slots are available. + break; + } + } + } + + size_t end = interval->GetLastSibling()->GetEnd(); + if (parent->NeedsTwoSpillSlots()) { + if (slot + 2u > spill_slots->size()) { + // We need a new spill slot. + spill_slots->resize(slot + 2u, end); + } + (*spill_slots)[slot] = end; + (*spill_slots)[slot + 1] = end; + } else { + if (slot == spill_slots->size()) { + // We need a new spill slot. + spill_slots->push_back(end); + } else { + (*spill_slots)[slot] = end; + } + } + + // Note that the exact spill slot location will be computed when we resolve, + // that is when we know the number of spill slots for each type. 
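+ // For example, an int interval and a double interval may both be given slot index 0 here, since each type's slots are laid out in a separate region of the frame during resolution.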
+ parent->SetSpillSlot(slot); +} + +void RegisterAllocatorLinearScan::AllocateSpillSlotForCatchPhi(HPhi* phi) { + LiveInterval* interval = phi->GetLiveInterval(); + + HInstruction* previous_phi = phi->GetPrevious(); + DCHECK(previous_phi == nullptr || + previous_phi->AsPhi()->GetRegNumber() <= phi->GetRegNumber()) + << "Phis expected to be sorted by vreg number, so that equivalent phis are adjacent."; + + if (phi->IsVRegEquivalentOf(previous_phi)) { + // This is an equivalent of the previous phi. We need to assign the same + // catch phi slot. + DCHECK(previous_phi->GetLiveInterval()->HasSpillSlot()); + interval->SetSpillSlot(previous_phi->GetLiveInterval()->GetSpillSlot()); + } else { + // Allocate a new spill slot for this catch phi. + // TODO: Reuse spill slots when intervals of phis from different catch + // blocks do not overlap. + interval->SetSpillSlot(catch_phi_spill_slots_); + catch_phi_spill_slots_ += interval->NeedsTwoSpillSlots() ? 2 : 1; + } +} + +} // namespace art diff --git a/compiler/optimizing/register_allocator_linear_scan.h b/compiler/optimizing/register_allocator_linear_scan.h new file mode 100644 index 0000000000..1a643a0d1a --- /dev/null +++ b/compiler/optimizing/register_allocator_linear_scan.h @@ -0,0 +1,188 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_LINEAR_SCAN_H_ +#define ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_LINEAR_SCAN_H_ + +#include "arch/instruction_set.h" +#include "base/arena_containers.h" +#include "base/macros.h" +#include "primitive.h" +#include "register_allocator.h" + +namespace art { + +class CodeGenerator; +class HBasicBlock; +class HGraph; +class HInstruction; +class HParallelMove; +class HPhi; +class LiveInterval; +class Location; +class SsaLivenessAnalysis; + +/** + * An implementation of a linear scan register allocator on an `HGraph` with SSA form. + */ +class RegisterAllocatorLinearScan : public RegisterAllocator { + public: + RegisterAllocatorLinearScan(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& analysis); + ~RegisterAllocatorLinearScan() OVERRIDE {} + + void AllocateRegisters() OVERRIDE; + + bool Validate(bool log_fatal_on_failure) OVERRIDE { + processing_core_registers_ = true; + if (!ValidateInternal(log_fatal_on_failure)) { + return false; + } + processing_core_registers_ = false; + return ValidateInternal(log_fatal_on_failure); + } + + size_t GetNumberOfSpillSlots() const { + return int_spill_slots_.size() + + long_spill_slots_.size() + + float_spill_slots_.size() + + double_spill_slots_.size() + + catch_phi_spill_slots_; + } + + private: + // Main methods of the allocator. + void LinearScan(); + bool TryAllocateFreeReg(LiveInterval* interval); + bool AllocateBlockedReg(LiveInterval* interval); + + // Add `interval` in the given sorted list. 
+ static void AddSorted(ArenaVector<LiveInterval*>* array, LiveInterval* interval); + + // Returns whether `reg` is blocked by the code generator. + bool IsBlocked(int reg) const; + + // Update the interval for the register in `location` to cover [start, end). + void BlockRegister(Location location, size_t start, size_t end); + void BlockRegisters(size_t start, size_t end, bool caller_save_only = false); + + // Allocate a spill slot for the given interval. Should be called in linear + // order of interval starting positions. + void AllocateSpillSlotFor(LiveInterval* interval); + + // Allocate a spill slot for the given catch phi. Will allocate the same slot + // for phis which share the same vreg. Must be called in reverse linear order + // of lifetime positions and ascending vreg numbers for correctness. + void AllocateSpillSlotForCatchPhi(HPhi* phi); + + // Helper methods. + void AllocateRegistersInternal(); + void ProcessInstruction(HInstruction* instruction); + bool ValidateInternal(bool log_fatal_on_failure) const; + void DumpInterval(std::ostream& stream, LiveInterval* interval) const; + void DumpAllIntervals(std::ostream& stream) const; + int FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const; + int FindAvailableRegister(size_t* next_use, LiveInterval* current) const; + bool IsCallerSaveRegister(int reg) const; + + // Try splitting an active non-pair or unaligned pair interval at the given `position`. + // Returns whether it was successful at finding such an interval. + bool TrySplitNonPairOrUnalignedPairIntervalAt(size_t position, + size_t first_register_use, + size_t* next_use); + + // List of intervals for core registers that must be processed, ordered by start + // position. Last entry is the interval that has the lowest start position. + // This list is initially populated before doing the linear scan. + ArenaVector<LiveInterval*> unhandled_core_intervals_; + + // List of intervals for floating-point registers. Same comments as above. + ArenaVector<LiveInterval*> unhandled_fp_intervals_; + + // Currently processed list of unhandled intervals. Either `unhandled_core_intervals_` + // or `unhandled_fp_intervals_`. + ArenaVector<LiveInterval*>* unhandled_; + + // List of intervals that have been processed. + ArenaVector<LiveInterval*> handled_; + + // List of intervals that are currently active when processing a new live interval. + // That is, they have a live range that spans the start of the new interval. + ArenaVector<LiveInterval*> active_; + + // List of intervals that are currently inactive when processing a new live interval. + // That is, they have a lifetime hole that spans the start of the new interval. + ArenaVector<LiveInterval*> inactive_; + + // Fixed intervals for physical registers. Such intervals cover the positions + // where an instruction requires a specific register. + ArenaVector<LiveInterval*> physical_core_register_intervals_; + ArenaVector<LiveInterval*> physical_fp_register_intervals_; + + // Intervals for temporaries. Such intervals cover the positions + // where an instruction requires a temporary. + ArenaVector<LiveInterval*> temp_intervals_; + + // The spill slots allocated for live intervals. We ensure spill slots + // are typed to avoid (1) doing moves and swaps between two different kinds + // of registers, and (2) swapping between a single stack slot and a double + // stack slot. This simplifies the parallel move resolver. 
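+ // Each entry records the lifetime end position of the last interval assigned to that slot, so the slot can be reused by an interval that starts at or after that position.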
+ ArenaVector<size_t> int_spill_slots_; + ArenaVector<size_t> long_spill_slots_; + ArenaVector<size_t> float_spill_slots_; + ArenaVector<size_t> double_spill_slots_; + + // Spill slots allocated to catch phis. This category is special-cased because + // (1) slots are allocated prior to linear scan and in reverse linear order, + // (2) equivalent phis need to share slots despite having different types. + size_t catch_phi_spill_slots_; + + // Instructions that need a safepoint. + ArenaVector<HInstruction*> safepoints_; + + // True if processing core registers. False if processing floating + // point registers. + bool processing_core_registers_; + + // Number of registers for the current register kind (core or floating point). + size_t number_of_registers_; + + // Temporary array, allocated ahead of time for simplicity. + size_t* registers_array_; + + // Blocked registers, as decided by the code generator. + bool* const blocked_core_registers_; + bool* const blocked_fp_registers_; + + // Slots reserved for out arguments. + size_t reserved_out_slots_; + + // The maximum live core registers at safepoints. + size_t maximum_number_of_live_core_registers_; + + // The maximum live FP registers at safepoints. + size_t maximum_number_of_live_fp_registers_; + + ART_FRIEND_TEST(RegisterAllocatorTest, FreeUntil); + ART_FRIEND_TEST(RegisterAllocatorTest, SpillInactive); + + DISALLOW_COPY_AND_ASSIGN(RegisterAllocatorLinearScan); +}; + +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_LINEAR_SCAN_H_ diff --git a/compiler/optimizing/register_allocator_test.cc b/compiler/optimizing/register_allocator_test.cc index a9de7c3e59..55ea99e592 100644 --- a/compiler/optimizing/register_allocator_test.cc +++ b/compiler/optimizing/register_allocator_test.cc @@ -25,17 +25,35 @@ #include "nodes.h" #include "optimizing_unit_test.h" #include "register_allocator.h" +#include "register_allocator_linear_scan.h" #include "ssa_liveness_analysis.h" #include "ssa_phi_elimination.h" namespace art { +using Strategy = RegisterAllocator::Strategy; + // Note: the register allocator tests rely on the fact that constants have live // intervals and registers get allocated to them. -class RegisterAllocatorTest : public CommonCompilerTest {}; +class RegisterAllocatorTest : public CommonCompilerTest { + protected: + // These functions need to access private variables of LocationSummary, so we declare it + // as a member of RegisterAllocatorTest, which we make a friend class. + static void SameAsFirstInputHint(Strategy strategy); + static void ExpectedInRegisterHint(Strategy strategy); +}; + +// This macro should include all register allocation strategies that should be tested. 
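+// For example, TEST_ALL_STRATEGIES(CFG1) defines the CFG1_LinearScan and CFG1_GraphColor test cases, each running CFG1() with the corresponding strategy.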
+#define TEST_ALL_STRATEGIES(test_name)\ +TEST_F(RegisterAllocatorTest, test_name##_LinearScan) {\ + test_name(Strategy::kRegisterAllocatorLinearScan);\ +}\ +TEST_F(RegisterAllocatorTest, test_name##_GraphColor) {\ + test_name(Strategy::kRegisterAllocatorGraphColor);\ +} -static bool Check(const uint16_t* data) { +static bool Check(const uint16_t* data, Strategy strategy) { ArenaPool pool; ArenaAllocator allocator(&pool); HGraph* graph = CreateCFG(&allocator, data); @@ -44,9 +62,10 @@ static bool Check(const uint16_t* data) { x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(graph, &codegen); liveness.Analyze(); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); - return register_allocator.Validate(false); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); + return register_allocator->Validate(false); } /** @@ -142,7 +161,7 @@ TEST_F(RegisterAllocatorTest, ValidateIntervals) { } } -TEST_F(RegisterAllocatorTest, CFG1) { +static void CFG1(Strategy strategy) { /* * Test the following snippet: * return 0; @@ -159,10 +178,12 @@ TEST_F(RegisterAllocatorTest, CFG1) { Instruction::CONST_4 | 0 | 0, Instruction::RETURN); - ASSERT_TRUE(Check(data)); + ASSERT_TRUE(Check(data, strategy)); } -TEST_F(RegisterAllocatorTest, Loop1) { +TEST_ALL_STRATEGIES(CFG1); + +static void Loop1(Strategy strategy) { /* * Test the following snippet: * int a = 0; @@ -198,10 +219,12 @@ TEST_F(RegisterAllocatorTest, Loop1) { Instruction::CONST_4 | 5 << 12 | 1 << 8, Instruction::RETURN | 1 << 8); - ASSERT_TRUE(Check(data)); + ASSERT_TRUE(Check(data, strategy)); } -TEST_F(RegisterAllocatorTest, Loop2) { +TEST_ALL_STRATEGIES(Loop1); + +static void Loop2(Strategy strategy) { /* * Test the following snippet: * int a = 0; @@ -247,10 +270,12 @@ TEST_F(RegisterAllocatorTest, Loop2) { Instruction::ADD_INT, 1 << 8 | 0, Instruction::RETURN | 1 << 8); - ASSERT_TRUE(Check(data)); + ASSERT_TRUE(Check(data, strategy)); } -TEST_F(RegisterAllocatorTest, Loop3) { +TEST_ALL_STRATEGIES(Loop2); + +static void Loop3(Strategy strategy) { /* * Test the following snippet: * int a = 0 @@ -295,9 +320,10 @@ TEST_F(RegisterAllocatorTest, Loop3) { x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(graph, &codegen); liveness.Analyze(); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); - ASSERT_TRUE(register_allocator.Validate(false)); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); + ASSERT_TRUE(register_allocator->Validate(false)); HBasicBlock* loop_header = graph->GetBlocks()[2]; HPhi* phi = loop_header->GetFirstPhi()->AsPhi(); @@ -313,6 +339,8 @@ TEST_F(RegisterAllocatorTest, Loop3) { ASSERT_EQ(phi_interval->GetRegister(), ret->InputAt(0)->GetLiveInterval()->GetRegister()); } +TEST_ALL_STRATEGIES(Loop3); + TEST_F(RegisterAllocatorTest, FirstRegisterUse) { const uint16_t data[] = THREE_REGISTERS_CODE_ITEM( Instruction::CONST_4 | 0 | 0, @@ -353,7 +381,7 @@ TEST_F(RegisterAllocatorTest, FirstRegisterUse) { ASSERT_EQ(new_interval->FirstRegisterUse(), last_xor->GetLifetimePosition()); } -TEST_F(RegisterAllocatorTest, DeadPhi) { +static void DeadPhi(Strategy strategy) { /* Test for a dead loop phi taking as 
back-edge input a phi that also has * this loop phi as input. Walking backwards in SsaDeadPhiElimination * does not solve the problem because the loop phi will be visited last. @@ -384,15 +412,19 @@ TEST_F(RegisterAllocatorTest, DeadPhi) { x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(graph, &codegen); liveness.Analyze(); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); - ASSERT_TRUE(register_allocator.Validate(false)); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); + ASSERT_TRUE(register_allocator->Validate(false)); } +TEST_ALL_STRATEGIES(DeadPhi); + /** * Test that the TryAllocateFreeReg method works in the presence of inactive intervals * that share the same register. It should split the interval it is currently * allocating for at the minimum lifetime position between the two inactive intervals. + * This test only applies to the linear scan allocator. */ TEST_F(RegisterAllocatorTest, FreeUntil) { const uint16_t data[] = TWO_REGISTERS_CODE_ITEM( @@ -408,7 +440,7 @@ TEST_F(RegisterAllocatorTest, FreeUntil) { x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(graph, &codegen); liveness.Analyze(); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); + RegisterAllocatorLinearScan register_allocator(&allocator, &codegen, liveness); // Add an artifical range to cover the temps that will be put in the unhandled list. LiveInterval* unhandled = graph->GetEntryBlock()->GetFirstInstruction()->GetLiveInterval(); @@ -506,15 +538,15 @@ static HGraph* BuildIfElseWithPhi(ArenaAllocator* allocator, graph->GetDexFile(), dex_cache, 0); -*input2 = new (allocator) HInstanceFieldGet(parameter, - Primitive::kPrimInt, - MemberOffset(42), - false, - kUnknownFieldIndex, - kUnknownClassDefIndex, - graph->GetDexFile(), - dex_cache, - 0); + *input2 = new (allocator) HInstanceFieldGet(parameter, + Primitive::kPrimInt, + MemberOffset(42), + false, + kUnknownFieldIndex, + kUnknownClassDefIndex, + graph->GetDexFile(), + dex_cache, + 0); then->AddInstruction(*input1); else_->AddInstruction(*input2); join->AddInstruction(new (allocator) HExit()); @@ -526,7 +558,7 @@ static HGraph* BuildIfElseWithPhi(ArenaAllocator* allocator, return graph; } -TEST_F(RegisterAllocatorTest, PhiHint) { +static void PhiHint(Strategy strategy) { ArenaPool pool; ArenaAllocator allocator(&pool); HPhi *phi; @@ -541,8 +573,9 @@ TEST_F(RegisterAllocatorTest, PhiHint) { liveness.Analyze(); // Check that the register allocator is deterministic. - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 0); ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 0); @@ -560,8 +593,9 @@ TEST_F(RegisterAllocatorTest, PhiHint) { // Set the phi to a specific register, and check that the inputs get allocated // the same register. 
phi->GetLocations()->UpdateOut(Location::RegisterLocation(2)); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 2); ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 2); @@ -579,8 +613,9 @@ TEST_F(RegisterAllocatorTest, PhiHint) { // Set input1 to a specific register, and check that the phi and other input get allocated // the same register. input1->GetLocations()->UpdateOut(Location::RegisterLocation(2)); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 2); ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 2); @@ -598,8 +633,9 @@ TEST_F(RegisterAllocatorTest, PhiHint) { // Set input2 to a specific register, and check that the phi and other input get allocated // the same register. input2->GetLocations()->UpdateOut(Location::RegisterLocation(2)); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 2); ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 2); @@ -607,6 +643,12 @@ TEST_F(RegisterAllocatorTest, PhiHint) { } } +// TODO: Enable this test for graph coloring register allocation when iterative move +// coalescing is merged. +TEST_F(RegisterAllocatorTest, PhiHint_LinearScan) { + PhiHint(Strategy::kRegisterAllocatorLinearScan); +} + static HGraph* BuildFieldReturn(ArenaAllocator* allocator, HInstruction** field, HInstruction** ret) { @@ -645,7 +687,7 @@ static HGraph* BuildFieldReturn(ArenaAllocator* allocator, return graph; } -TEST_F(RegisterAllocatorTest, ExpectedInRegisterHint) { +void RegisterAllocatorTest::ExpectedInRegisterHint(Strategy strategy) { ArenaPool pool; ArenaAllocator allocator(&pool); HInstruction *field, *ret; @@ -658,8 +700,9 @@ TEST_F(RegisterAllocatorTest, ExpectedInRegisterHint) { SsaLivenessAnalysis liveness(graph, &codegen); liveness.Analyze(); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); // Sanity check that in normal conditions, the register should be hinted to 0 (EAX). ASSERT_EQ(field->GetLiveInterval()->GetRegister(), 0); @@ -677,13 +720,20 @@ TEST_F(RegisterAllocatorTest, ExpectedInRegisterHint) { // Don't use SetInAt because we are overriding an already allocated location. 
ret->GetLocations()->inputs_[0] = Location::RegisterLocation(2); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); ASSERT_EQ(field->GetLiveInterval()->GetRegister(), 2); } } +// TODO: Enable this test for graph coloring register allocation when iterative move +// coalescing is merged. +TEST_F(RegisterAllocatorTest, ExpectedInRegisterHint_LinearScan) { + ExpectedInRegisterHint(Strategy::kRegisterAllocatorLinearScan); +} + static HGraph* BuildTwoSubs(ArenaAllocator* allocator, HInstruction** first_sub, HInstruction** second_sub) { @@ -713,7 +763,7 @@ static HGraph* BuildTwoSubs(ArenaAllocator* allocator, return graph; } -TEST_F(RegisterAllocatorTest, SameAsFirstInputHint) { +void RegisterAllocatorTest::SameAsFirstInputHint(Strategy strategy) { ArenaPool pool; ArenaAllocator allocator(&pool); HInstruction *first_sub, *second_sub; @@ -726,8 +776,9 @@ TEST_F(RegisterAllocatorTest, SameAsFirstInputHint) { SsaLivenessAnalysis liveness(graph, &codegen); liveness.Analyze(); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); // Sanity check that in normal conditions, the registers are the same. ASSERT_EQ(first_sub->GetLiveInterval()->GetRegister(), 1); @@ -748,14 +799,21 @@ TEST_F(RegisterAllocatorTest, SameAsFirstInputHint) { ASSERT_EQ(first_sub->GetLocations()->Out().GetPolicy(), Location::kSameAsFirstInput); ASSERT_EQ(second_sub->GetLocations()->Out().GetPolicy(), Location::kSameAsFirstInput); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); ASSERT_EQ(first_sub->GetLiveInterval()->GetRegister(), 2); ASSERT_EQ(second_sub->GetLiveInterval()->GetRegister(), 2); } } +// TODO: Enable this test for graph coloring register allocation when iterative move +// coalescing is merged. +TEST_F(RegisterAllocatorTest, SameAsFirstInputHint_LinearScan) { + SameAsFirstInputHint(Strategy::kRegisterAllocatorLinearScan); +} + static HGraph* BuildDiv(ArenaAllocator* allocator, HInstruction** div) { HGraph* graph = CreateGraph(allocator); @@ -782,7 +840,7 @@ static HGraph* BuildDiv(ArenaAllocator* allocator, return graph; } -TEST_F(RegisterAllocatorTest, ExpectedExactInRegisterAndSameOutputHint) { +static void ExpectedExactInRegisterAndSameOutputHint(Strategy strategy) { ArenaPool pool; ArenaAllocator allocator(&pool); HInstruction *div; @@ -795,17 +853,25 @@ TEST_F(RegisterAllocatorTest, ExpectedExactInRegisterAndSameOutputHint) { SsaLivenessAnalysis liveness(graph, &codegen); liveness.Analyze(); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); // div on x86 requires its first input in eax and the output be the same as the first input. 
ASSERT_EQ(div->GetLiveInterval()->GetRegister(), 0); } } +// TODO: Enable this test for graph coloring register allocation when iterative move +// coalescing is merged. +TEST_F(RegisterAllocatorTest, ExpectedExactInRegisterAndSameOutputHint_LinearScan) { + ExpectedExactInRegisterAndSameOutputHint(Strategy::kRegisterAllocatorLinearScan); +} + // Test a bug in the register allocator, where allocating a blocked // register would lead to spilling an inactive interval at the wrong // position. +// This test only applies to the linear scan allocator. TEST_F(RegisterAllocatorTest, SpillInactive) { ArenaPool pool; @@ -892,7 +958,7 @@ TEST_F(RegisterAllocatorTest, SpillInactive) { liveness.instructions_from_lifetime_position_.push_back(user); } - RegisterAllocator register_allocator(&allocator, &codegen, liveness); + RegisterAllocatorLinearScan register_allocator(&allocator, &codegen, liveness); register_allocator.unhandled_core_intervals_.push_back(fourth); register_allocator.unhandled_core_intervals_.push_back(third); register_allocator.unhandled_core_intervals_.push_back(second); diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc index 97f34e6c32..6effc306dc 100644 --- a/compiler/optimizing/sharpening.cc +++ b/compiler/optimizing/sharpening.cc @@ -17,6 +17,7 @@ #include "sharpening.h" #include "base/casts.h" +#include "base/enums.h" #include "class_linker.h" #include "code_generator.h" #include "driver/dex_compilation_unit.h" @@ -259,7 +260,7 @@ void HSharpening::ProcessLoadClass(HLoadClass* load_class) { load_class->SetLoadKindWithAddress(load_kind, address); break; case HLoadClass::LoadKind::kDexCachePcRelative: { - size_t pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); + PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); DexCacheArraysLayout layout(pointer_size, &dex_file); size_t element_index = layout.TypeOffset(type_index); load_class->SetLoadKindWithDexCacheReference(load_kind, dex_file, element_index); @@ -278,8 +279,7 @@ void HSharpening::ProcessLoadString(HLoadString* load_string) { const DexFile& dex_file = load_string->GetDexFile(); uint32_t string_index = load_string->GetStringIndex(); - bool is_in_dex_cache = false; - HLoadString::LoadKind desired_load_kind; + HLoadString::LoadKind desired_load_kind = HLoadString::LoadKind::kDexCacheViaMethod; uint64_t address = 0u; // String or dex cache element address. { Runtime* runtime = Runtime::Current(); @@ -295,33 +295,14 @@ void HSharpening::ProcessLoadString(HLoadString* load_string) { DCHECK(!runtime->UseJitCompilation()); mirror::String* string = class_linker->ResolveString(dex_file, string_index, dex_cache); CHECK(string != nullptr); - if (!compiler_driver_->GetSupportBootImageFixup()) { - // MIPS/MIPS64 or compiler_driver_test. Do not sharpen. - desired_load_kind = HLoadString::LoadKind::kDexCacheViaMethod; - } else { - DCHECK(ContainsElement(compiler_driver_->GetDexFilesForOatFile(), &dex_file)); - is_in_dex_cache = true; - desired_load_kind = codegen_->GetCompilerOptions().GetCompilePic() - ? HLoadString::LoadKind::kBootImageLinkTimePcRelative - : HLoadString::LoadKind::kBootImageLinkTimeAddress; - } + // TODO: In follow up CL, add PcRelative and Address back in. } else if (runtime->UseJitCompilation()) { // TODO: Make sure we don't set the "compile PIC" flag for JIT as that's bogus. 
// DCHECK(!codegen_->GetCompilerOptions().GetCompilePic()); mirror::String* string = dex_cache->GetResolvedString(string_index); - is_in_dex_cache = (string != nullptr); if (string != nullptr && runtime->GetHeap()->ObjectIsInBootImageSpace(string)) { - // TODO: Use direct pointers for all non-moving spaces, not just boot image. Bug: 29530787 desired_load_kind = HLoadString::LoadKind::kBootImageAddress; address = reinterpret_cast64<uint64_t>(string); - } else { - // Note: If the string is not in the dex cache, the instruction needs environment - // and will not be inlined across dex files. Within a dex file, the slow-path helper - // loads the correct string and inlined frames are used correctly for OOM stack trace. - // TODO: Write a test for this. Bug: 29416588 - desired_load_kind = HLoadString::LoadKind::kDexCacheAddress; - void* dex_cache_element_address = &dex_cache->GetStrings()[string_index]; - address = reinterpret_cast64<uint64_t>(dex_cache_element_address); } } else { // AOT app compilation. Try to lookup the string without allocating if not found. @@ -331,19 +312,9 @@ void HSharpening::ProcessLoadString(HLoadString* load_string) { !codegen_->GetCompilerOptions().GetCompilePic()) { desired_load_kind = HLoadString::LoadKind::kBootImageAddress; address = reinterpret_cast64<uint64_t>(string); - } else { - // Not JIT and either the string is not in boot image or we are compiling in PIC mode. - // Use PC-relative load from the dex cache if the dex file belongs - // to the oat file that we're currently compiling. - desired_load_kind = ContainsElement(compiler_driver_->GetDexFilesForOatFile(), &dex_file) - ? HLoadString::LoadKind::kDexCachePcRelative - : HLoadString::LoadKind::kDexCacheViaMethod; } } } - if (is_in_dex_cache) { - load_string->MarkInDexCache(); - } HLoadString::LoadKind load_kind = codegen_->GetSupportedLoadStringKind(desired_load_kind); switch (load_kind) { @@ -358,7 +329,7 @@ void HSharpening::ProcessLoadString(HLoadString* load_string) { load_string->SetLoadKindWithAddress(load_kind, address); break; case HLoadString::LoadKind::kDexCachePcRelative: { - size_t pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); + PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); DexCacheArraysLayout layout(pointer_size, &dex_file); size_t element_index = layout.StringOffset(string_index); load_string->SetLoadKindWithDexCacheReference(load_kind, dex_file, element_index); diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc index 7af4302884..a01e107e02 100644 --- a/compiler/optimizing/ssa_liveness_analysis.cc +++ b/compiler/optimizing/ssa_liveness_analysis.cc @@ -368,6 +368,27 @@ bool SsaLivenessAnalysis::UpdateLiveIn(const HBasicBlock& block) { return live_in->UnionIfNotIn(live_out, kill); } +void LiveInterval::DumpWithContext(std::ostream& stream, + const CodeGenerator& codegen) const { + Dump(stream); + if (IsFixed()) { + stream << ", register:" << GetRegister() << "("; + if (IsFloatingPoint()) { + codegen.DumpFloatingPointRegister(stream, GetRegister()); + } else { + codegen.DumpCoreRegister(stream, GetRegister()); + } + stream << ")"; + } else { + stream << ", spill slot:" << GetSpillSlot(); + } + stream << ", requires_register:" << (GetDefinedBy() != nullptr && RequiresRegister()); + if (GetParent()->GetDefinedBy() != nullptr) { + stream << ", defined_by:" << GetParent()->GetDefinedBy()->GetKind(); + stream << "(" << GetParent()->GetDefinedBy()->GetLifetimePosition() << ")"; + 
} +} + static int RegisterOrLowRegister(Location location) { return location.IsPair() ? location.low() : location.reg(); } diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h index dc98864d9b..92788fe6b8 100644 --- a/compiler/optimizing/ssa_liveness_analysis.h +++ b/compiler/optimizing/ssa_liveness_analysis.h @@ -150,9 +150,7 @@ class UsePosition : public ArenaObject<kArenaAllocSsaLiveness> { if (GetIsEnvironment()) return false; if (IsSynthesized()) return false; Location location = GetUser()->GetLocations()->InAt(GetInputIndex()); - return location.IsUnallocated() - && (location.GetPolicy() == Location::kRequiresRegister - || location.GetPolicy() == Location::kRequiresFpuRegister); + return location.IsUnallocated() && location.RequiresRegisterKind(); } private: @@ -481,6 +479,10 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { return last_range_->GetEnd(); } + size_t GetLength() const { + return GetEnd() - GetStart(); + } + size_t FirstRegisterUseAfter(size_t position) const { if (is_temp_) { return position == GetStart() ? position : kNoLifetime; @@ -504,10 +506,18 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { return kNoLifetime; } + // Returns the location of the first register use for this live interval, + // including a register definition if applicable. size_t FirstRegisterUse() const { return FirstRegisterUseAfter(GetStart()); } + // Whether the interval requires a register rather than a stack location. + // If needed for performance, this could be cached. + bool RequiresRegister() const { + return !HasRegister() && FirstRegisterUse() != kNoLifetime; + } + size_t FirstUseAfter(size_t position) const { if (is_temp_) { return position == GetStart() ? position : kNoLifetime; @@ -693,6 +703,10 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { stream << " is_high: " << IsHighInterval(); } + // Same as Dump, but adds context such as the instruction defining this interval, and + // the register currently assigned to this interval. + void DumpWithContext(std::ostream& stream, const CodeGenerator& codegen) const; + LiveInterval* GetNextSibling() const { return next_sibling_; } LiveInterval* GetLastSibling() { LiveInterval* result = this; @@ -871,6 +885,33 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { range_search_start_ = first_range_; } + bool DefinitionRequiresRegister() const { + DCHECK(IsParent()); + LocationSummary* locations = defined_by_->GetLocations(); + Location location = locations->Out(); + // This interval is the first interval of the instruction. If the output + // of the instruction requires a register, we return the position of that instruction + // as the first register use. 
+ if (location.IsUnallocated()) { + if ((location.GetPolicy() == Location::kRequiresRegister) + || (location.GetPolicy() == Location::kSameAsFirstInput + && (locations->InAt(0).IsRegister() + || locations->InAt(0).IsRegisterPair() + || locations->InAt(0).GetPolicy() == Location::kRequiresRegister))) { + return true; + } else if ((location.GetPolicy() == Location::kRequiresFpuRegister) + || (location.GetPolicy() == Location::kSameAsFirstInput + && (locations->InAt(0).IsFpuRegister() + || locations->InAt(0).IsFpuRegisterPair() + || locations->InAt(0).GetPolicy() == Location::kRequiresFpuRegister))) { + return true; + } + } else if (location.IsRegister() || location.IsRegisterPair()) { + return true; + } + return false; + } + private: LiveInterval(ArenaAllocator* allocator, Primitive::Type type, @@ -925,33 +966,6 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { return range; } - bool DefinitionRequiresRegister() const { - DCHECK(IsParent()); - LocationSummary* locations = defined_by_->GetLocations(); - Location location = locations->Out(); - // This interval is the first interval of the instruction. If the output - // of the instruction requires a register, we return the position of that instruction - // as the first register use. - if (location.IsUnallocated()) { - if ((location.GetPolicy() == Location::kRequiresRegister) - || (location.GetPolicy() == Location::kSameAsFirstInput - && (locations->InAt(0).IsRegister() - || locations->InAt(0).IsRegisterPair() - || locations->InAt(0).GetPolicy() == Location::kRequiresRegister))) { - return true; - } else if ((location.GetPolicy() == Location::kRequiresFpuRegister) - || (location.GetPolicy() == Location::kSameAsFirstInput - && (locations->InAt(0).IsFpuRegister() - || locations->InAt(0).IsFpuRegisterPair() - || locations->InAt(0).GetPolicy() == Location::kRequiresFpuRegister))) { - return true; - } - } else if (location.IsRegister() || location.IsRegisterPair()) { - return true; - } - return false; - } - bool IsDefiningPosition(size_t position) const { return IsParent() && (position == GetStart()); } diff --git a/compiler/optimizing/x86_memory_gen.cc b/compiler/optimizing/x86_memory_gen.cc new file mode 100644 index 0000000000..8aa315a7e3 --- /dev/null +++ b/compiler/optimizing/x86_memory_gen.cc @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "x86_memory_gen.h" +#include "code_generator.h" + +namespace art { +namespace x86 { + +/** + * Replace instructions with memory operand forms. + */ +class MemoryOperandVisitor : public HGraphVisitor { + public: + MemoryOperandVisitor(HGraph* graph, bool do_implicit_null_checks) + : HGraphVisitor(graph), + do_implicit_null_checks_(do_implicit_null_checks) {} + + private: + void VisitBoundsCheck(HBoundsCheck* check) OVERRIDE { + // Replace the length by the array itself, so that we can do compares to memory. 
+ HArrayLength* array_len = check->InputAt(1)->AsArrayLength(); + + // We only want to replace an ArrayLength. + if (array_len == nullptr) { + return; + } + + HInstruction* array = array_len->InputAt(0); + DCHECK_EQ(array->GetType(), Primitive::kPrimNot); + + // Don't apply this optimization when the array is nullptr. + if (array->IsConstant() || (array->IsNullCheck() && array->InputAt(0)->IsConstant())) { + return; + } + + // Is there a null check that could be an implicit check? + if (array->IsNullCheck() && do_implicit_null_checks_) { + // The ArrayLen may generate the implicit null check. Can the + // bounds check do so as well? + if (array_len->GetNextDisregardingMoves() != check) { + // No, it won't. Leave as is. + return; + } + } + + // Can we suppress the ArrayLength and generate at BoundCheck? + if (array_len->HasOnlyOneNonEnvironmentUse()) { + array_len->MarkEmittedAtUseSite(); + // We need the ArrayLength just before the BoundsCheck. + array_len->MoveBefore(check); + } + } + + bool do_implicit_null_checks_; +}; + +X86MemoryOperandGeneration::X86MemoryOperandGeneration(HGraph* graph, + CodeGenerator* codegen, + OptimizingCompilerStats* stats) + : HOptimization(graph, kX86MemoryOperandGenerationPassName, stats), + do_implicit_null_checks_(codegen->GetCompilerOptions().GetImplicitNullChecks()) { +} + +void X86MemoryOperandGeneration::Run() { + MemoryOperandVisitor visitor(graph_, do_implicit_null_checks_); + visitor.VisitInsertionOrder(); +} + +} // namespace x86 +} // namespace art diff --git a/compiler/optimizing/x86_memory_gen.h b/compiler/optimizing/x86_memory_gen.h new file mode 100644 index 0000000000..5f15d9f1e6 --- /dev/null +++ b/compiler/optimizing/x86_memory_gen.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_X86_MEMORY_GEN_H_ +#define ART_COMPILER_OPTIMIZING_X86_MEMORY_GEN_H_ + +#include "nodes.h" +#include "optimization.h" + +namespace art { +class CodeGenerator; + +namespace x86 { + +class X86MemoryOperandGeneration : public HOptimization { + public: + X86MemoryOperandGeneration(HGraph* graph, + CodeGenerator* codegen, + OptimizingCompilerStats* stats); + + void Run() OVERRIDE; + + static constexpr const char* kX86MemoryOperandGenerationPassName = + "x86_memory_operand_generation"; + + private: + bool do_implicit_null_checks_; +}; + +} // namespace x86 +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_X86_MEMORY_GEN_H_ |
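The TEST_ALL_STRATEGIES macro added to register_allocator_test.cc above stamps out one gtest case per register allocation strategy for each strategy-parameterized test body. For illustration (Loop1 is used here purely as an example), TEST_ALL_STRATEGIES(Loop1); expands to roughly:

  TEST_F(RegisterAllocatorTest, Loop1_LinearScan) {
    Loop1(Strategy::kRegisterAllocatorLinearScan);
  }
  TEST_F(RegisterAllocatorTest, Loop1_GraphColor) {
    Loop1(Strategy::kRegisterAllocatorGraphColor);
  }

Tests that poke at linear-scan internals (FreeUntil, SpillInactive) keep a plain TEST_F and construct RegisterAllocatorLinearScan directly, and the hint tests stay pinned to the linear scan strategy until iterative move coalescing is merged for the graph-coloring allocator, as the TODOs above note.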
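The ssa_liveness_analysis.h hunk adds LiveInterval::GetLength() and LiveInterval::RequiresRegister() and moves DefinitionRequiresRegister() into the public section, so code outside LiveInterval (for example an allocation strategy) can query them. A minimal sketch of the kind of caller this enables; the PickSpillCandidate helper and its heuristic are assumptions for illustration, not part of this change:

  #include "ssa_liveness_analysis.h"

  namespace art {

  // Illustrative only: among intervals that could not be assigned a register,
  // prefer to spill the longest one that does not strictly require a register.
  static LiveInterval* PickSpillCandidate(const ArenaVector<LiveInterval*>& unassigned) {
    LiveInterval* candidate = nullptr;
    for (LiveInterval* interval : unassigned) {
      if (interval->RequiresRegister()) {
        continue;  // Has an upcoming register use and no register yet; avoid spilling it.
      }
      if (candidate == nullptr || interval->GetLength() > candidate->GetLength()) {
        candidate = interval;  // GetLength() is simply GetEnd() - GetStart().
      }
    }
    return candidate;
  }

  }  // namespace art

LiveInterval::DumpWithContext() serves a similar purpose on the reporting side: it prints the interval together with its assigned register or spill slot and the instruction that defines it, which is useful in allocator debug or validation output.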
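The new x86_memory_gen files add an x86-specific HOptimization pass. A hedged sketch of how it might be driven; the driver function below and where it is called from are assumptions, only the constructor and Run() come from this diff:

  #include "x86_memory_gen.h"

  namespace art {

  // Illustrative driver (assumed): run the pass on an x86 graph once the
  // architecture-independent optimizations are done.
  static void RunX86MemoryOperandGeneration(HGraph* graph,
                                            CodeGenerator* codegen,
                                            OptimizingCompilerStats* stats) {
    x86::X86MemoryOperandGeneration memory_gen(graph, codegen, stats);
    memory_gen.Run();  // Marks eligible HArrayLength inputs of bounds checks as emitted at use site.
  }

  }  // namespace art

The intended code-generation effect, illustratively: when the length input of a bounds check is an HArrayLength with a single non-environment use, the length no longer has to be materialized in a register first; the back end can fold it into a memory operand of the compare, along the lines of cmp index, [array + <length offset>]. The GetNextDisregardingMoves() check in the visitor makes sure the transformation is skipped when the ArrayLength may be the instruction providing an implicit null check and the bounds check does not immediately follow it.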