Diffstat (limited to 'compiler/optimizing')
75 files changed, 8033 insertions, 3013 deletions
diff --git a/compiler/optimizing/block_builder.cc b/compiler/optimizing/block_builder.cc index 5e70a8284d..1e75f10ebe 100644 --- a/compiler/optimizing/block_builder.cc +++ b/compiler/optimizing/block_builder.cc @@ -310,16 +310,18 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { // least one predecessor is not covered by the same TryItem as the try block. // We do not split each edge separately, but rather create one boundary block // that all predecessors are relinked to. This preserves loop headers (b/23895756). - for (auto entry : try_block_info) { - HBasicBlock* try_block = graph_->GetBlocks()[entry.first]; + for (const auto& entry : try_block_info) { + uint32_t block_id = entry.first; + const DexFile::TryItem* try_item = entry.second; + HBasicBlock* try_block = graph_->GetBlocks()[block_id]; for (HBasicBlock* predecessor : try_block->GetPredecessors()) { - if (GetTryItem(predecessor, try_block_info) != entry.second) { + if (GetTryItem(predecessor, try_block_info) != try_item) { // Found a predecessor not covered by the same TryItem. Insert entering // boundary block. HTryBoundary* try_entry = new (arena_) HTryBoundary(HTryBoundary::BoundaryKind::kEntry, try_block->GetDexPc()); try_block->CreateImmediateDominator()->AddInstruction(try_entry); - LinkToCatchBlocks(try_entry, code_item_, entry.second, catch_blocks); + LinkToCatchBlocks(try_entry, code_item_, try_item, catch_blocks); break; } } @@ -327,8 +329,10 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { // Do a second pass over the try blocks and insert exit TryBoundaries where // the successor is not in the same TryItem. - for (auto entry : try_block_info) { - HBasicBlock* try_block = graph_->GetBlocks()[entry.first]; + for (const auto& entry : try_block_info) { + uint32_t block_id = entry.first; + const DexFile::TryItem* try_item = entry.second; + HBasicBlock* try_block = graph_->GetBlocks()[block_id]; // NOTE: Do not use iterators because SplitEdge would invalidate them. for (size_t i = 0, e = try_block->GetSuccessors().size(); i < e; ++i) { HBasicBlock* successor = try_block->GetSuccessors()[i]; @@ -337,7 +341,7 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { // covered by the same TryItem. Otherwise the previous pass would have // created a non-throwing boundary block. if (GetTryItem(successor, try_block_info) != nullptr) { - DCHECK_EQ(entry.second, GetTryItem(successor, try_block_info)); + DCHECK_EQ(try_item, GetTryItem(successor, try_block_info)); continue; } @@ -345,7 +349,7 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { HTryBoundary* try_exit = new (arena_) HTryBoundary(HTryBoundary::BoundaryKind::kExit, successor->GetDexPc()); graph_->SplitEdge(try_block, successor)->AddInstruction(try_exit); - LinkToCatchBlocks(try_exit, code_item_, entry.second, catch_blocks); + LinkToCatchBlocks(try_exit, code_item_, try_item, catch_blocks); } } } diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc index ed630cda91..f3ecdf036a 100644 --- a/compiler/optimizing/bounds_check_elimination.cc +++ b/compiler/optimizing/bounds_check_elimination.cc @@ -1734,8 +1734,8 @@ class BCEVisitor : public HGraphVisitor { */ void InsertPhiNodes() { // Scan all new deoptimization blocks. 
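For context on the block_builder.cc hunk above: iterating the map by value copies every std::pair, while const auto& binds to the stored entry directly, and naming entry.first/entry.second up front keeps the loop body readable. A minimal standalone sketch of the same idiom, using std::map and a placeholder TryItem type instead of ART's ArenaSafeMap and DexFile::TryItem (both substitutions are assumptions made for brevity):

#include <cstdint>
#include <map>

struct TryItem {};  // stand-in for DexFile::TryItem

void VisitTryBlocks(const std::map<uint32_t, const TryItem*>& try_block_info) {
  // A by-value loop (for (auto entry : ...)) would copy each pair;
  // const auto& reads the stored entries in place.
  for (const auto& entry : try_block_info) {
    uint32_t block_id = entry.first;
    const TryItem* try_item = entry.second;
    (void)block_id;   // the real code indexes graph_->GetBlocks() with this
    (void)try_item;   // and links catch blocks through this
  }
}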
- for (auto it1 = taken_test_loop_.begin(); it1 != taken_test_loop_.end(); ++it1) { - HBasicBlock* true_block = it1->second; + for (const auto& entry : taken_test_loop_) { + HBasicBlock* true_block = entry.second; HBasicBlock* new_preheader = true_block->GetSingleSuccessor(); // Scan all instructions in a new deoptimization block. for (HInstructionIterator it(true_block->GetInstructions()); !it.Done(); it.Advance()) { diff --git a/compiler/optimizing/bounds_check_elimination_test.cc b/compiler/optimizing/bounds_check_elimination_test.cc index cb6e14b2bd..a949c33149 100644 --- a/compiler/optimizing/bounds_check_elimination_test.cc +++ b/compiler/optimizing/bounds_check_elimination_test.cc @@ -43,7 +43,7 @@ class BoundsCheckEliminationTest : public testing::Test { void RunBCE() { graph_->BuildDominatorTree(); - InstructionSimplifier(graph_, /* codegen */ nullptr).Run(); + InstructionSimplifier(graph_, /* codegen */ nullptr, /* driver */ nullptr).Run(); SideEffectsAnalysis side_effects(graph_); side_effects.Run(); diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index 5136d7d2b8..c918ee6687 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -58,7 +58,7 @@ #include "parallel_move_resolver.h" #include "ssa_liveness_analysis.h" #include "scoped_thread_state_change-inl.h" -#include "thread-inl.h" +#include "thread-current-inl.h" #include "utils/assembler.h" namespace art { @@ -145,7 +145,7 @@ size_t CodeGenerator::GetCacheOffset(uint32_t index) { } size_t CodeGenerator::GetCachePointerOffset(uint32_t index) { - auto pointer_size = InstructionSetPointerSize(GetInstructionSet()); + PointerSize pointer_size = InstructionSetPointerSize(GetInstructionSet()); return static_cast<size_t>(pointer_size) * index; } @@ -508,7 +508,7 @@ void CodeGenerator::GenerateUnresolvedFieldAccess( void CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(HLoadClass* cls, Location runtime_type_index_location, Location runtime_return_location) { - DCHECK_EQ(cls->GetLoadKind(), HLoadClass::LoadKind::kDexCacheViaMethod); + DCHECK_EQ(cls->GetLoadKind(), HLoadClass::LoadKind::kRuntimeCall); DCHECK_EQ(cls->InputCount(), 1u); LocationSummary* locations = new (cls->GetBlock()->GetGraph()->GetArena()) LocationSummary( cls, LocationSummary::kCallOnMainOnly); @@ -518,7 +518,7 @@ void CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(HLoadClass* cls, } void CodeGenerator::GenerateLoadClassRuntimeCall(HLoadClass* cls) { - DCHECK_EQ(cls->GetLoadKind(), HLoadClass::LoadKind::kDexCacheViaMethod); + DCHECK_EQ(cls->GetLoadKind(), HLoadClass::LoadKind::kRuntimeCall); LocationSummary* locations = cls->GetLocations(); MoveConstant(locations->GetTemp(0), cls->GetTypeIndex().index_); if (cls->NeedsAccessCheck()) { @@ -557,6 +557,9 @@ void CodeGenerator::BlockIfInRegister(Location location, bool is_out) const { } void CodeGenerator::AllocateLocations(HInstruction* instruction) { + for (HEnvironment* env = instruction->GetEnvironment(); env != nullptr; env = env->GetParent()) { + env->AllocateLocations(); + } instruction->Accept(GetLocationBuilder()); DCHECK(CheckTypeConsistency(instruction)); LocationSummary* locations = instruction->GetLocations(); @@ -1400,20 +1403,6 @@ void CodeGenerator::CreateSystemArrayCopyLocationSummary(HInvoke* invoke) { locations->AddTemp(Location::RequiresRegister()); } -uint32_t CodeGenerator::GetReferenceSlowFlagOffset() const { - ScopedObjectAccess soa(Thread::Current()); - mirror::Class* klass = 
mirror::Reference::GetJavaLangRefReference(); - DCHECK(klass->IsInitialized()); - return klass->GetSlowPathFlagOffset().Uint32Value(); -} - -uint32_t CodeGenerator::GetReferenceDisableFlagOffset() const { - ScopedObjectAccess soa(Thread::Current()); - mirror::Class* klass = mirror::Reference::GetJavaLangRefReference(); - DCHECK(klass->IsInitialized()); - return klass->GetDisableIntrinsicFlagOffset().Uint32Value(); -} - void CodeGenerator::EmitJitRoots(uint8_t* code, Handle<mirror::ObjectArray<mirror::Object>> roots, const uint8_t* roots_data) { diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index ea463eeb62..c9ba5c3357 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -31,10 +31,11 @@ #include "nodes.h" #include "optimizing_compiler_stats.h" #include "read_barrier_option.h" +#include "stack.h" #include "stack_map_stream.h" #include "string_reference.h" +#include "type_reference.h" #include "utils/label.h" -#include "utils/type_reference.h" namespace art { @@ -541,7 +542,7 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { case HLoadString::LoadKind::kBssEntry: DCHECK(load->NeedsEnvironment()); return LocationSummary::kCallOnSlowPath; - case HLoadString::LoadKind::kDexCacheViaMethod: + case HLoadString::LoadKind::kRuntimeCall: DCHECK(load->NeedsEnvironment()); return LocationSummary::kCallOnMainOnly; case HLoadString::LoadKind::kJitTableAddress: @@ -572,9 +573,6 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { virtual void GenerateNop() = 0; - uint32_t GetReferenceSlowFlagOffset() const; - uint32_t GetReferenceDisableFlagOffset() const; - static QuickEntrypointEnum GetArrayAllocationEntrypoint(Handle<mirror::Class> array_klass); protected: @@ -842,7 +840,7 @@ class SlowPathGenerator { const uint32_t dex_pc = instruction->GetDexPc(); auto iter = slow_path_map_.find(dex_pc); if (iter != slow_path_map_.end()) { - auto candidates = iter->second; + const ArenaVector<std::pair<InstructionType*, SlowPathCode*>>& candidates = iter->second; for (const auto& it : candidates) { InstructionType* other_instruction = it.first; SlowPathCodeType* other_slow_path = down_cast<SlowPathCodeType*>(it.second); diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index 419b2afe91..914ae177c4 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -16,6 +16,7 @@ #include "code_generator_arm.h" +#include "arch/arm/asm_support_arm.h" #include "arch/arm/instruction_set_features_arm.h" #include "art_method.h" #include "code_generator_utils.h" @@ -25,6 +26,7 @@ #include "gc/accounting/card_table.h" #include "intrinsics.h" #include "intrinsics_arm.h" +#include "linker/arm/relative_patcher_thumb2.h" #include "mirror/array-inl.h" #include "mirror/class-inl.h" #include "thread.h" @@ -60,10 +62,45 @@ static constexpr DRegister DTMP = D31; static constexpr uint32_t kPackedSwitchCompareJumpThreshold = 7; +// Reference load (except object array loads) is using LDR Rt, [Rn, #offset] which can handle +// offset < 4KiB. For offsets >= 4KiB, the load shall be emitted as two or more instructions. +// For the Baker read barrier implementation using link-generated thunks we need to split +// the offset explicitly. +constexpr uint32_t kReferenceLoadMinFarOffset = 4 * KB; + +// Flags controlling the use of link-time generated thunks for Baker read barriers. 
+constexpr bool kBakerReadBarrierLinkTimeThunksEnableForFields = true; +constexpr bool kBakerReadBarrierLinkTimeThunksEnableForArrays = true; +constexpr bool kBakerReadBarrierLinkTimeThunksEnableForGcRoots = true; + +// The reserved entrypoint register for link-time generated thunks. +const Register kBakerCcEntrypointRegister = R4; + // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. #define __ down_cast<ArmAssembler*>(codegen->GetAssembler())-> // NOLINT #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, x).Int32Value() +static inline void CheckLastTempIsBakerCcEntrypointRegister(HInstruction* instruction) { + DCHECK_EQ(static_cast<uint32_t>(kBakerCcEntrypointRegister), + linker::Thumb2RelativePatcher::kBakerCcEntrypointRegister); + DCHECK_NE(instruction->GetLocations()->GetTempCount(), 0u); + DCHECK_EQ(kBakerCcEntrypointRegister, + instruction->GetLocations()->GetTemp( + instruction->GetLocations()->GetTempCount() - 1u).AsRegister<Register>()); +} + +static inline void EmitPlaceholderBne(CodeGeneratorARM* codegen, Label* bne_label) { + ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(codegen->GetAssembler())); + __ BindTrackedLabel(bne_label); + Label placeholder_label; + __ b(&placeholder_label, NE); // Placeholder, patched at link-time. + __ Bind(&placeholder_label); +} + +static inline bool CanEmitNarrowLdr(Register rt, Register rn, uint32_t offset) { + return ArmAssembler::IsLowRegister(rt) && ArmAssembler::IsLowRegister(rn) && offset < 32u; +} + static constexpr int kRegListThreshold = 4; // SaveLiveRegisters and RestoreLiveRegisters from SlowPathCodeARM operate on sets of S registers, @@ -824,7 +861,7 @@ class LoadReferenceWithBakerReadBarrierSlowPathARM : public ReadBarrierMarkSlowP // Baker's read barriers, we need to perform the load of // mirror::Object::monitor_ *before* the original reference load. // This load-load ordering is required by the read barrier. - // The fast path/slow path (for Baker's algorithm) should look like: + // The slow path (for Baker's algorithm) should look like: // // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); // lfence; // Load fence or artificial data dependency to prevent load-load reordering @@ -959,6 +996,18 @@ class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM __ Bind(GetEntryLabel()); + // The implementation is similar to LoadReferenceWithBakerReadBarrierSlowPathARM's: + // + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // old_ref = ref; + // ref = entrypoint(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. 
+ // compareAndSwapObject(obj, field_offset, old_ref, ref); + // } + // /* int32_t */ monitor = obj->monitor_ uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); __ LoadFromOffset(kLoadWord, temp1_, obj_, monitor_offset); @@ -1607,6 +1656,34 @@ static void GenerateVcmp(HInstruction* instruction, CodeGeneratorARM* codegen) { } } +static int64_t AdjustConstantForCondition(int64_t value, + IfCondition* condition, + IfCondition* opposite) { + if (value == 1) { + if (*condition == kCondB) { + value = 0; + *condition = kCondEQ; + *opposite = kCondNE; + } else if (*condition == kCondAE) { + value = 0; + *condition = kCondNE; + *opposite = kCondEQ; + } + } else if (value == -1) { + if (*condition == kCondGT) { + value = 0; + *condition = kCondGE; + *opposite = kCondLT; + } else if (*condition == kCondLE) { + value = 0; + *condition = kCondLT; + *opposite = kCondGE; + } + } + + return value; +} + static std::pair<Condition, Condition> GenerateLongTestConstant(HCondition* condition, bool invert, CodeGeneratorARM* codegen) { @@ -1620,7 +1697,7 @@ static std::pair<Condition, Condition> GenerateLongTestConstant(HCondition* cond std::swap(cond, opposite); } - std::pair<Condition, Condition> ret; + std::pair<Condition, Condition> ret(EQ, NE); const Location left = locations->InAt(0); const Location right = locations->InAt(1); @@ -1628,7 +1705,38 @@ static std::pair<Condition, Condition> GenerateLongTestConstant(HCondition* cond const Register left_high = left.AsRegisterPairHigh<Register>(); const Register left_low = left.AsRegisterPairLow<Register>(); - int64_t value = right.GetConstant()->AsLongConstant()->GetValue(); + int64_t value = AdjustConstantForCondition(right.GetConstant()->AsLongConstant()->GetValue(), + &cond, + &opposite); + + // Comparisons against 0 are common enough to deserve special attention. + if (value == 0) { + switch (cond) { + case kCondNE: + // x > 0 iff x != 0 when the comparison is unsigned. + case kCondA: + ret = std::make_pair(NE, EQ); + FALLTHROUGH_INTENDED; + case kCondEQ: + // x <= 0 iff x == 0 when the comparison is unsigned. + case kCondBE: + __ orrs(IP, left_low, ShifterOperand(left_high)); + return ret; + case kCondLT: + case kCondGE: + __ cmp(left_high, ShifterOperand(0)); + return std::make_pair(ARMCondition(cond), ARMCondition(opposite)); + // Trivially true or false. 
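The AdjustConstantForCondition() helper added above rewrites unsigned comparisons against 1 and signed comparisons against -1 into equivalent comparisons against 0, which lets the zero-test fast paths that follow it apply more often. A self-contained sketch of the same rewrite rules; the enum and function names below are placeholders, not ART's:

#include <cstdint>

enum Cond { kEQ, kNE, kLT, kGE, kGT, kLE, kB, kAE };  // kB/kAE: unsigned below / above-or-equal

// Returns the possibly adjusted constant and updates *cond in place.
int64_t AdjustForZeroTest(int64_t value, Cond* cond) {
  if (value == 1) {
    if (*cond == kB)  { *cond = kEQ; return 0; }  // x <u 1   <=>  x == 0
    if (*cond == kAE) { *cond = kNE; return 0; }  // x >=u 1  <=>  x != 0
  } else if (value == -1) {
    if (*cond == kGT) { *cond = kGE; return 0; }  // x > -1   <=>  x >= 0
    if (*cond == kLE) { *cond = kLT; return 0; }  // x <= -1  <=>  x < 0
  }
  return value;  // no rewrite applies
}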
+ case kCondB: + ret = std::make_pair(NE, EQ); + FALLTHROUGH_INTENDED; + case kCondAE: + __ cmp(left_low, ShifterOperand(left_low)); + return ret; + default: + break; + } + } switch (cond) { case kCondEQ: @@ -1788,10 +1896,14 @@ static std::pair<Condition, Condition> GenerateTest(HCondition* condition, static bool CanGenerateTest(HCondition* condition, ArmAssembler* assembler) { if (condition->GetLeft()->GetType() == Primitive::kPrimLong) { const LocationSummary* const locations = condition->GetLocations(); - const IfCondition c = condition->GetCondition(); if (locations->InAt(1).IsConstant()) { - const int64_t value = locations->InAt(1).GetConstant()->AsLongConstant()->GetValue(); + IfCondition c = condition->GetCondition(); + IfCondition opposite = condition->GetOppositeCondition(); + const int64_t value = AdjustConstantForCondition( + Int64FromConstant(locations->InAt(1).GetConstant()), + &c, + &opposite); ShifterOperand so; if (c < kCondLT || c > kCondGE) { @@ -1799,9 +1911,11 @@ static bool CanGenerateTest(HCondition* condition, ArmAssembler* assembler) { // we check that the least significant half of the first input to be compared // is in a low register (the other half is read outside an IT block), and // the constant fits in an 8-bit unsigned integer, so that a 16-bit CMP - // encoding can be used. - if (!ArmAssembler::IsLowRegister(locations->InAt(0).AsRegisterPairLow<Register>()) || - !IsUint<8>(Low32Bits(value))) { + // encoding can be used; 0 is always handled, no matter what registers are + // used by the first input. + if (value != 0 && + (!ArmAssembler::IsLowRegister(locations->InAt(0).AsRegisterPairLow<Register>()) || + !IsUint<8>(Low32Bits(value)))) { return false; } } else if (c == kCondLE || c == kCondGT) { @@ -1828,6 +1942,329 @@ static bool CanGenerateTest(HCondition* condition, ArmAssembler* assembler) { return true; } +static void GenerateConditionGeneric(HCondition* cond, CodeGeneratorARM* codegen) { + DCHECK(CanGenerateTest(cond, codegen->GetAssembler())); + + const Register out = cond->GetLocations()->Out().AsRegister<Register>(); + const auto condition = GenerateTest(cond, false, codegen); + + __ mov(out, ShifterOperand(0), AL, kCcKeep); + + if (ArmAssembler::IsLowRegister(out)) { + __ it(condition.first); + __ mov(out, ShifterOperand(1), condition.first); + } else { + Label done_label; + Label* const final_label = codegen->GetFinalLabel(cond, &done_label); + + __ b(final_label, condition.second); + __ LoadImmediate(out, 1); + + if (done_label.IsLinked()) { + __ Bind(&done_label); + } + } +} + +static void GenerateEqualLong(HCondition* cond, CodeGeneratorARM* codegen) { + DCHECK_EQ(cond->GetLeft()->GetType(), Primitive::kPrimLong); + + const LocationSummary* const locations = cond->GetLocations(); + IfCondition condition = cond->GetCondition(); + const Register out = locations->Out().AsRegister<Register>(); + const Location left = locations->InAt(0); + const Location right = locations->InAt(1); + Register left_high = left.AsRegisterPairHigh<Register>(); + Register left_low = left.AsRegisterPairLow<Register>(); + + if (right.IsConstant()) { + IfCondition opposite = cond->GetOppositeCondition(); + const int64_t value = AdjustConstantForCondition(Int64FromConstant(right.GetConstant()), + &condition, + &opposite); + int32_t value_high = -High32Bits(value); + int32_t value_low = -Low32Bits(value); + + // The output uses Location::kNoOutputOverlap. 
+ if (out == left_high) { + std::swap(left_low, left_high); + std::swap(value_low, value_high); + } + + __ AddConstant(out, left_low, value_low); + __ AddConstant(IP, left_high, value_high); + } else { + DCHECK(right.IsRegisterPair()); + __ sub(IP, left_high, ShifterOperand(right.AsRegisterPairHigh<Register>())); + __ sub(out, left_low, ShifterOperand(right.AsRegisterPairLow<Register>())); + } + + // Need to check after calling AdjustConstantForCondition(). + DCHECK(condition == kCondEQ || condition == kCondNE) << condition; + + if (condition == kCondNE && ArmAssembler::IsLowRegister(out)) { + __ orrs(out, out, ShifterOperand(IP)); + __ it(NE); + __ mov(out, ShifterOperand(1), NE); + } else { + __ orr(out, out, ShifterOperand(IP)); + codegen->GenerateConditionWithZero(condition, out, out, IP); + } +} + +static void GenerateLongComparesAndJumps(HCondition* cond, + Label* true_label, + Label* false_label, + CodeGeneratorARM* codegen) { + LocationSummary* locations = cond->GetLocations(); + Location left = locations->InAt(0); + Location right = locations->InAt(1); + IfCondition if_cond = cond->GetCondition(); + + Register left_high = left.AsRegisterPairHigh<Register>(); + Register left_low = left.AsRegisterPairLow<Register>(); + IfCondition true_high_cond = if_cond; + IfCondition false_high_cond = cond->GetOppositeCondition(); + Condition final_condition = ARMUnsignedCondition(if_cond); // unsigned on lower part + + // Set the conditions for the test, remembering that == needs to be + // decided using the low words. + switch (if_cond) { + case kCondEQ: + case kCondNE: + // Nothing to do. + break; + case kCondLT: + false_high_cond = kCondGT; + break; + case kCondLE: + true_high_cond = kCondLT; + break; + case kCondGT: + false_high_cond = kCondLT; + break; + case kCondGE: + true_high_cond = kCondGT; + break; + case kCondB: + false_high_cond = kCondA; + break; + case kCondBE: + true_high_cond = kCondB; + break; + case kCondA: + false_high_cond = kCondB; + break; + case kCondAE: + true_high_cond = kCondA; + break; + } + if (right.IsConstant()) { + int64_t value = right.GetConstant()->AsLongConstant()->GetValue(); + int32_t val_low = Low32Bits(value); + int32_t val_high = High32Bits(value); + + __ CmpConstant(left_high, val_high); + if (if_cond == kCondNE) { + __ b(true_label, ARMCondition(true_high_cond)); + } else if (if_cond == kCondEQ) { + __ b(false_label, ARMCondition(false_high_cond)); + } else { + __ b(true_label, ARMCondition(true_high_cond)); + __ b(false_label, ARMCondition(false_high_cond)); + } + // Must be equal high, so compare the lows. + __ CmpConstant(left_low, val_low); + } else { + Register right_high = right.AsRegisterPairHigh<Register>(); + Register right_low = right.AsRegisterPairLow<Register>(); + + __ cmp(left_high, ShifterOperand(right_high)); + if (if_cond == kCondNE) { + __ b(true_label, ARMCondition(true_high_cond)); + } else if (if_cond == kCondEQ) { + __ b(false_label, ARMCondition(false_high_cond)); + } else { + __ b(true_label, ARMCondition(true_high_cond)); + __ b(false_label, ARMCondition(false_high_cond)); + } + // Must be equal high, so compare the lows. + __ cmp(left_low, ShifterOperand(right_low)); + } + // The last comparison might be unsigned. 
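GenerateLongComparesAndJumps() above decides a 64-bit comparison with 32-bit instructions: the high words are compared first, and only when they are equal does the verdict fall through to an unsigned comparison of the low words (hence the "last comparison might be unsigned" note). A plain C++ sketch of that decomposition for signed less-than; the function and variable names are illustrative only:

#include <cstdint>

// Signed 64-bit a < b, decided with 32-bit halves the way the ARM code splits it.
bool LessThan64ViaHalves(int64_t a, int64_t b) {
  int32_t a_high = static_cast<int32_t>(a >> 32);
  int32_t b_high = static_cast<int32_t>(b >> 32);
  if (a_high != b_high) {
    return a_high < b_high;  // decided by the signed high words
  }
  uint32_t a_low = static_cast<uint32_t>(a);
  uint32_t b_low = static_cast<uint32_t>(b);
  return a_low < b_low;      // equal highs: the low-word compare is unsigned
}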
+ // TODO: optimize cases where this is always true/false + __ b(true_label, final_condition); +} + +static void GenerateConditionLong(HCondition* cond, CodeGeneratorARM* codegen) { + DCHECK_EQ(cond->GetLeft()->GetType(), Primitive::kPrimLong); + + const LocationSummary* const locations = cond->GetLocations(); + IfCondition condition = cond->GetCondition(); + const Register out = locations->Out().AsRegister<Register>(); + const Location left = locations->InAt(0); + const Location right = locations->InAt(1); + + if (right.IsConstant()) { + IfCondition opposite = cond->GetOppositeCondition(); + + // Comparisons against 0 are common enough to deserve special attention. + if (AdjustConstantForCondition(Int64FromConstant(right.GetConstant()), + &condition, + &opposite) == 0) { + switch (condition) { + case kCondNE: + case kCondA: + if (ArmAssembler::IsLowRegister(out)) { + // We only care if both input registers are 0 or not. + __ orrs(out, + left.AsRegisterPairLow<Register>(), + ShifterOperand(left.AsRegisterPairHigh<Register>())); + __ it(NE); + __ mov(out, ShifterOperand(1), NE); + return; + } + + FALLTHROUGH_INTENDED; + case kCondEQ: + case kCondBE: + // We only care if both input registers are 0 or not. + __ orr(out, + left.AsRegisterPairLow<Register>(), + ShifterOperand(left.AsRegisterPairHigh<Register>())); + codegen->GenerateConditionWithZero(condition, out, out); + return; + case kCondLT: + case kCondGE: + // We only care about the sign bit. + FALLTHROUGH_INTENDED; + case kCondAE: + case kCondB: + codegen->GenerateConditionWithZero(condition, out, left.AsRegisterPairHigh<Register>()); + return; + case kCondLE: + case kCondGT: + default: + break; + } + } + } + + if ((condition == kCondEQ || condition == kCondNE) && + // If `out` is a low register, then the GenerateConditionGeneric() + // function generates a shorter code sequence that is still branchless. + (!ArmAssembler::IsLowRegister(out) || !CanGenerateTest(cond, codegen->GetAssembler()))) { + GenerateEqualLong(cond, codegen); + return; + } + + if (CanGenerateTest(cond, codegen->GetAssembler())) { + GenerateConditionGeneric(cond, codegen); + return; + } + + // Convert the jumps into the result. + Label done_label; + Label* const final_label = codegen->GetFinalLabel(cond, &done_label); + Label true_label, false_label; + + GenerateLongComparesAndJumps(cond, &true_label, &false_label, codegen); + + // False case: result = 0. + __ Bind(&false_label); + __ mov(out, ShifterOperand(0)); + __ b(final_label); + + // True case: result = 1. 
+ __ Bind(&true_label); + __ mov(out, ShifterOperand(1)); + + if (done_label.IsLinked()) { + __ Bind(&done_label); + } +} + +static void GenerateConditionIntegralOrNonPrimitive(HCondition* cond, CodeGeneratorARM* codegen) { + const Primitive::Type type = cond->GetLeft()->GetType(); + + DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type; + + if (type == Primitive::kPrimLong) { + GenerateConditionLong(cond, codegen); + return; + } + + const LocationSummary* const locations = cond->GetLocations(); + IfCondition condition = cond->GetCondition(); + Register in = locations->InAt(0).AsRegister<Register>(); + const Register out = locations->Out().AsRegister<Register>(); + const Location right = cond->GetLocations()->InAt(1); + int64_t value; + + if (right.IsConstant()) { + IfCondition opposite = cond->GetOppositeCondition(); + + value = AdjustConstantForCondition(Int64FromConstant(right.GetConstant()), + &condition, + &opposite); + + // Comparisons against 0 are common enough to deserve special attention. + if (value == 0) { + switch (condition) { + case kCondNE: + case kCondA: + if (ArmAssembler::IsLowRegister(out) && out == in) { + __ cmp(out, ShifterOperand(0)); + __ it(NE); + __ mov(out, ShifterOperand(1), NE); + return; + } + + FALLTHROUGH_INTENDED; + case kCondEQ: + case kCondBE: + case kCondLT: + case kCondGE: + case kCondAE: + case kCondB: + codegen->GenerateConditionWithZero(condition, out, in); + return; + case kCondLE: + case kCondGT: + default: + break; + } + } + } + + if (condition == kCondEQ || condition == kCondNE) { + ShifterOperand operand; + + if (right.IsConstant()) { + operand = ShifterOperand(value); + } else if (out == right.AsRegister<Register>()) { + // Avoid 32-bit instructions if possible. + operand = ShifterOperand(in); + in = right.AsRegister<Register>(); + } else { + operand = ShifterOperand(right.AsRegister<Register>()); + } + + if (condition == kCondNE && ArmAssembler::IsLowRegister(out)) { + __ subs(out, in, operand); + __ it(NE); + __ mov(out, ShifterOperand(1), NE); + } else { + __ sub(out, in, operand); + codegen->GenerateConditionWithZero(condition, out, out); + } + + return; + } + + GenerateConditionGeneric(cond, codegen); +} + static bool CanEncodeConstantAs8BitImmediate(HConstant* constant) { const Primitive::Type type = constant->GetType(); bool ret = false; @@ -1960,13 +2397,11 @@ CodeGeneratorARM::CodeGeneratorARM(HGraph* graph, uint32_literals_(std::less<uint32_t>(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_string_patches_(StringReferenceValueComparator(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_type_patches_(TypeReferenceValueComparator(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + pc_relative_method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + baker_read_barrier_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(StringReferenceValueComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_class_patches_(TypeReferenceValueComparator(), @@ -2433,89 +2868,6 @@ void 
LocationsBuilderARM::VisitExit(HExit* exit) { void InstructionCodeGeneratorARM::VisitExit(HExit* exit ATTRIBUTE_UNUSED) { } -void InstructionCodeGeneratorARM::GenerateLongComparesAndJumps(HCondition* cond, - Label* true_label, - Label* false_label) { - LocationSummary* locations = cond->GetLocations(); - Location left = locations->InAt(0); - Location right = locations->InAt(1); - IfCondition if_cond = cond->GetCondition(); - - Register left_high = left.AsRegisterPairHigh<Register>(); - Register left_low = left.AsRegisterPairLow<Register>(); - IfCondition true_high_cond = if_cond; - IfCondition false_high_cond = cond->GetOppositeCondition(); - Condition final_condition = ARMUnsignedCondition(if_cond); // unsigned on lower part - - // Set the conditions for the test, remembering that == needs to be - // decided using the low words. - switch (if_cond) { - case kCondEQ: - case kCondNE: - // Nothing to do. - break; - case kCondLT: - false_high_cond = kCondGT; - break; - case kCondLE: - true_high_cond = kCondLT; - break; - case kCondGT: - false_high_cond = kCondLT; - break; - case kCondGE: - true_high_cond = kCondGT; - break; - case kCondB: - false_high_cond = kCondA; - break; - case kCondBE: - true_high_cond = kCondB; - break; - case kCondA: - false_high_cond = kCondB; - break; - case kCondAE: - true_high_cond = kCondA; - break; - } - if (right.IsConstant()) { - int64_t value = right.GetConstant()->AsLongConstant()->GetValue(); - int32_t val_low = Low32Bits(value); - int32_t val_high = High32Bits(value); - - __ CmpConstant(left_high, val_high); - if (if_cond == kCondNE) { - __ b(true_label, ARMCondition(true_high_cond)); - } else if (if_cond == kCondEQ) { - __ b(false_label, ARMCondition(false_high_cond)); - } else { - __ b(true_label, ARMCondition(true_high_cond)); - __ b(false_label, ARMCondition(false_high_cond)); - } - // Must be equal high, so compare the lows. - __ CmpConstant(left_low, val_low); - } else { - Register right_high = right.AsRegisterPairHigh<Register>(); - Register right_low = right.AsRegisterPairLow<Register>(); - - __ cmp(left_high, ShifterOperand(right_high)); - if (if_cond == kCondNE) { - __ b(true_label, ARMCondition(true_high_cond)); - } else if (if_cond == kCondEQ) { - __ b(false_label, ARMCondition(false_high_cond)); - } else { - __ b(true_label, ARMCondition(true_high_cond)); - __ b(false_label, ARMCondition(false_high_cond)); - } - // Must be equal high, so compare the lows. - __ cmp(left_low, ShifterOperand(right_low)); - } - // The last comparison might be unsigned. - // TODO: optimize cases where this is always true/false - __ b(true_label, final_condition); -} - void InstructionCodeGeneratorARM::GenerateCompareTestAndBranch(HCondition* condition, Label* true_target_in, Label* false_target_in) { @@ -2557,7 +2909,7 @@ void InstructionCodeGeneratorARM::GenerateCompareTestAndBranch(HCondition* condi Label* false_target = false_target_in == nullptr ? &fallthrough_target : false_target_in; DCHECK_EQ(condition->InputAt(0)->GetType(), Primitive::kPrimLong); - GenerateLongComparesAndJumps(condition, true_target, false_target); + GenerateLongComparesAndJumps(condition, true_target, false_target, codegen_); if (false_target != &fallthrough_target) { __ b(false_target); @@ -2872,6 +3224,80 @@ void CodeGeneratorARM::GenerateNop() { __ nop(); } +// `temp` is an extra temporary register that is used for some conditions; +// callers may not specify it, in which case the method will use a scratch +// register instead. 
+void CodeGeneratorARM::GenerateConditionWithZero(IfCondition condition, + Register out, + Register in, + Register temp) { + switch (condition) { + case kCondEQ: + // x <= 0 iff x == 0 when the comparison is unsigned. + case kCondBE: + if (temp == kNoRegister || (ArmAssembler::IsLowRegister(out) && out != in)) { + temp = out; + } + + // Avoid 32-bit instructions if possible; note that `in` and `temp` must be + // different as well. + if (ArmAssembler::IsLowRegister(in) && ArmAssembler::IsLowRegister(temp) && in != temp) { + // temp = - in; only 0 sets the carry flag. + __ rsbs(temp, in, ShifterOperand(0)); + + if (out == in) { + std::swap(in, temp); + } + + // out = - in + in + carry = carry + __ adc(out, temp, ShifterOperand(in)); + } else { + // If `in` is 0, then it has 32 leading zeros, and less than that otherwise. + __ clz(out, in); + // Any number less than 32 logically shifted right by 5 bits results in 0; + // the same operation on 32 yields 1. + __ Lsr(out, out, 5); + } + + break; + case kCondNE: + // x > 0 iff x != 0 when the comparison is unsigned. + case kCondA: + if (out == in) { + if (temp == kNoRegister || in == temp) { + temp = IP; + } + } else if (temp == kNoRegister || !ArmAssembler::IsLowRegister(temp)) { + temp = out; + } + + // temp = in - 1; only 0 does not set the carry flag. + __ subs(temp, in, ShifterOperand(1)); + // out = in + ~temp + carry = in + (-(in - 1) - 1) + carry = in - in + 1 - 1 + carry = carry + __ sbc(out, in, ShifterOperand(temp)); + break; + case kCondGE: + __ mvn(out, ShifterOperand(in)); + in = out; + FALLTHROUGH_INTENDED; + case kCondLT: + // We only care about the sign bit. + __ Lsr(out, in, 31); + break; + case kCondAE: + // Trivially true. + __ mov(out, ShifterOperand(1)); + break; + case kCondB: + // Trivially false. + __ mov(out, ShifterOperand(0)); + break; + default: + LOG(FATAL) << "Unexpected condition " << condition; + UNREACHABLE(); + } +} + void LocationsBuilderARM::HandleCondition(HCondition* cond) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cond, LocationSummary::kNoCall); @@ -2908,48 +3334,48 @@ void InstructionCodeGeneratorARM::HandleCondition(HCondition* cond) { return; } - const Register out = cond->GetLocations()->Out().AsRegister<Register>(); - - if (ArmAssembler::IsLowRegister(out) && CanGenerateTest(cond, codegen_->GetAssembler())) { - const auto condition = GenerateTest(cond, false, codegen_); + const Primitive::Type type = cond->GetLeft()->GetType(); - __ it(condition.first); - __ mov(out, ShifterOperand(1), condition.first); - __ it(condition.second); - __ mov(out, ShifterOperand(0), condition.second); + if (Primitive::IsFloatingPointType(type)) { + GenerateConditionGeneric(cond, codegen_); return; } - // Convert the jumps into the result. - Label done_label; - Label* const final_label = codegen_->GetFinalLabel(cond, &done_label); + DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type; - if (cond->InputAt(0)->GetType() == Primitive::kPrimLong) { - Label true_label, false_label; + const IfCondition condition = cond->GetCondition(); - GenerateLongComparesAndJumps(cond, &true_label, &false_label); + // A condition with only one boolean input, or two boolean inputs without being equality or + // inequality results from transformations done by the instruction simplifier, and is handled + // as a regular condition with integral inputs. 
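GenerateConditionWithZero() above materializes a boolean from a comparison against zero without branching. Two of its tricks are restated in portable C++ here so the arithmetic can be checked; Clz32() below is a plain-loop stand-in for the ARM CLZ instruction (which returns 32 for a zero input), and all names are illustrative:

#include <cstdint>

// Count leading zeros of a 32-bit value; returns 32 for 0, like ARM CLZ.
uint32_t Clz32(uint32_t x) {
  uint32_t n = 0;
  while (n < 32 && (x & 0x80000000u) == 0) { x <<= 1; ++n; }
  return n;
}

// (x == 0) via the CLZ/LSR #5 sequence: only 0 has 32 leading zeros,
// and only the value 32 survives a logical shift right by 5.
uint32_t IsZero(uint32_t x) { return Clz32(x) >> 5; }

// (x < 0) via LSR #31: the result is just the sign bit.
uint32_t IsNegative(int32_t x) { return static_cast<uint32_t>(x) >> 31; }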
+ if (type == Primitive::kPrimBoolean && + cond->GetRight()->GetType() == Primitive::kPrimBoolean && + (condition == kCondEQ || condition == kCondNE)) { + const LocationSummary* const locations = cond->GetLocations(); + Register left = locations->InAt(0).AsRegister<Register>(); + const Register out = locations->Out().AsRegister<Register>(); + const Location right_loc = locations->InAt(1); - // False case: result = 0. - __ Bind(&false_label); - __ LoadImmediate(out, 0); - __ b(final_label); + // The constant case is handled by the instruction simplifier. + DCHECK(!right_loc.IsConstant()); - // True case: result = 1. - __ Bind(&true_label); - __ LoadImmediate(out, 1); - } else { - DCHECK(CanGenerateTest(cond, codegen_->GetAssembler())); + Register right = right_loc.AsRegister<Register>(); - const auto condition = GenerateTest(cond, false, codegen_); + // Avoid 32-bit instructions if possible. + if (out == right) { + std::swap(left, right); + } - __ mov(out, ShifterOperand(0), AL, kCcKeep); - __ b(final_label, condition.second); - __ LoadImmediate(out, 1); - } + __ eor(out, left, ShifterOperand(right)); - if (done_label.IsLinked()) { - __ Bind(&done_label); + if (condition == kCondEQ) { + __ eor(out, out, ShifterOperand(1)); + } + + return; } + + GenerateConditionIntegralOrNonPrimitive(cond, codegen_); } void LocationsBuilderARM::VisitEqual(HEqual* comp) { @@ -3082,6 +3508,15 @@ void InstructionCodeGeneratorARM::VisitDoubleConstant(HDoubleConstant* constant // Will be generated at use site. } +void LocationsBuilderARM::VisitConstructorFence(HConstructorFence* constructor_fence) { + constructor_fence->SetLocations(nullptr); +} + +void InstructionCodeGeneratorARM::VisitConstructorFence( + HConstructorFence* constructor_fence ATTRIBUTE_UNUSED) { + codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); +} + void LocationsBuilderARM::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) { memory_barrier->SetLocations(nullptr); } @@ -5287,7 +5722,18 @@ void LocationsBuilderARM::HandleFieldGet(HInstruction* instruction, const FieldI } else if (object_field_get_with_read_barrier && kUseBakerReadBarrier) { // We need a temporary register for the read barrier marking slow // path in CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier. - locations->AddTemp(Location::RequiresRegister()); + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !Runtime::Current()->UseJitCompilation()) { + // If link-time thunks for the Baker read barrier are enabled, for AOT + // loads we need a temporary only if the offset is too big. + if (field_info.GetFieldOffset().Uint32Value() >= kReferenceLoadMinFarOffset) { + locations->AddTemp(Location::RequiresRegister()); + } + // And we always need the reserved entrypoint register. + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister)); + } else { + locations->AddTemp(Location::RequiresRegister()); + } } } @@ -5753,11 +6199,35 @@ void LocationsBuilderARM::VisitArrayGet(HArrayGet* instruction) { Location::RequiresRegister(), object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap); } - // We need a temporary register for the read barrier marking slow - // path in CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier. - // Also need for String compression feature. 
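One detail of the HandleCondition() change above: when both inputs are known booleans (0 or 1), equality needs no compare at all. XOR of the two values is already the "not equal" bit, and XOR with 1 flips it into "equal", which is what the emitted eor sequence relies on. A tiny sketch with illustrative names:

#include <cstdint>

// Valid only for inputs restricted to 0 or 1, as the boolean fast path guarantees.
uint32_t BoolNotEqual(uint32_t a, uint32_t b) { return a ^ b; }         // 1 iff a != b
uint32_t BoolEqual(uint32_t a, uint32_t b)    { return (a ^ b) ^ 1u; }  // 1 iff a == b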
- if ((object_array_get_with_read_barrier && kUseBakerReadBarrier) - || (mirror::kUseStringCompression && instruction->IsStringCharAt())) { + if (object_array_get_with_read_barrier && kUseBakerReadBarrier) { + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier. + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !Runtime::Current()->UseJitCompilation() && + instruction->GetIndex()->IsConstant()) { + // Array loads with constant index are treated as field loads. + // If link-time thunks for the Baker read barrier are enabled, for AOT + // constant index loads we need a temporary only if the offset is too big. + uint32_t offset = CodeGenerator::GetArrayDataOffset(instruction); + uint32_t index = instruction->GetIndex()->AsIntConstant()->GetValue(); + offset += index << Primitive::ComponentSizeShift(Primitive::kPrimNot); + if (offset >= kReferenceLoadMinFarOffset) { + locations->AddTemp(Location::RequiresRegister()); + } + // And we always need the reserved entrypoint register. + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister)); + } else if (kBakerReadBarrierLinkTimeThunksEnableForArrays && + !Runtime::Current()->UseJitCompilation() && + !instruction->GetIndex()->IsConstant()) { + // We need a non-scratch temporary for the array data pointer. + locations->AddTemp(Location::RequiresRegister()); + // And we always need the reserved entrypoint register. + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister)); + } else { + locations->AddTemp(Location::RequiresRegister()); + } + } else if (mirror::kUseStringCompression && instruction->IsStringCharAt()) { + // Also need a temporary for String compression feature. locations->AddTemp(Location::RequiresRegister()); } } @@ -5869,8 +6339,20 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { Location temp = locations->GetTemp(0); // Note that a potential implicit null check is handled in this // CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier call. - codegen_->GenerateArrayLoadWithBakerReadBarrier( - instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ true); + DCHECK(!instruction->CanDoImplicitNullCheckOn(instruction->InputAt(0))); + if (index.IsConstant()) { + // Array load with a constant index can be treated as a field load. 
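As the VisitArrayGet() changes above note, an array load with a constant index can be handled like a field load: the element's byte offset is fully known at compile time, so it folds into the data offset. A small sketch of that folding; the 12-byte data offset and shift of 2 used in the example are assumptions for a 32-bit reference array layout, not values taken from this diff:

#include <cstdint>

// Byte offset of element `index` in an array whose payload starts at `data_offset`
// and whose components are (1 << component_size_shift) bytes wide.
constexpr uint32_t ConstantIndexElementOffset(uint32_t data_offset,
                                              uint32_t index,
                                              uint32_t component_size_shift) {
  return data_offset + (index << component_size_shift);
}

// Example: element 3 of a reference array lives at 12 + (3 << 2) = 24 bytes.
static_assert(ConstantIndexElementOffset(12u, 3u, 2u) == 24u, "folded offset");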
+ data_offset += helpers::Int32ConstantFrom(index) << Primitive::ComponentSizeShift(type); + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out_loc, + obj, + data_offset, + locations->GetTemp(0), + /* needs_null_check */ false); + } else { + codegen_->GenerateArrayLoadWithBakerReadBarrier( + instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ false); + } } else { Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { @@ -6275,6 +6757,15 @@ void InstructionCodeGeneratorARM::VisitIntermediateAddress(HIntermediateAddress* } } +void LocationsBuilderARM::VisitIntermediateAddressIndex(HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + void LocationsBuilderARM::VisitBoundsCheck(HBoundsCheck* instruction) { RegisterSet caller_saves = RegisterSet::Empty(); InvokeRuntimeCallingConvention calling_convention; @@ -6645,21 +7136,15 @@ HLoadClass::LoadKind CodeGeneratorARM::GetSupportedLoadClassKind( UNREACHABLE(); case HLoadClass::LoadKind::kReferrersClass: break; - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: - DCHECK(!GetCompilerOptions().GetCompilePic()); - break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: - DCHECK(GetCompilerOptions().GetCompilePic()); - break; - case HLoadClass::LoadKind::kBootImageAddress: - break; case HLoadClass::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; case HLoadClass::LoadKind::kJitTableAddress: DCHECK(Runtime::Current()->UseJitCompilation()); break; - case HLoadClass::LoadKind::kDexCacheViaMethod: + case HLoadClass::LoadKind::kBootImageAddress: + case HLoadClass::LoadKind::kRuntimeCall: break; } return desired_class_load_kind; @@ -6667,7 +7152,7 @@ HLoadClass::LoadKind CodeGeneratorARM::GetSupportedLoadClassKind( void LocationsBuilderARM::VisitLoadClass(HLoadClass* cls) { HLoadClass::LoadKind load_kind = cls->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadClass::LoadKind::kRuntimeCall) { InvokeRuntimeCallingConvention calling_convention; CodeGenerator::CreateLoadClassRuntimeCallLocationSummary( cls, @@ -6707,13 +7192,20 @@ void LocationsBuilderARM::VisitLoadClass(HLoadClass* cls) { // For non-Baker read barrier we have a temp-clobbering call. } } + if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) { + if (load_kind == HLoadClass::LoadKind::kBssEntry || + (load_kind == HLoadClass::LoadKind::kReferrersClass && + !Runtime::Current()->UseJitCompilation())) { + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister)); + } + } } // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not // move. 
void InstructionCodeGeneratorARM::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFETY_ANALYSIS { HLoadClass::LoadKind load_kind = cls->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadClass::LoadKind::kRuntimeCall) { codegen_->GenerateLoadClassRuntimeCall(cls); return; } @@ -6740,13 +7232,6 @@ void InstructionCodeGeneratorARM::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFE read_barrier_option); break; } - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: { - DCHECK(codegen_->GetCompilerOptions().IsBootImage()); - DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); - __ LoadLiteral(out, codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(), - cls->GetTypeIndex())); - break; - } case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: { DCHECK(codegen_->GetCompilerOptions().IsBootImage()); DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); @@ -6792,7 +7277,7 @@ void InstructionCodeGeneratorARM::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFE GenerateGcRootFieldLoad(cls, out_loc, out, /* offset */ 0, read_barrier_option); break; } - case HLoadClass::LoadKind::kDexCacheViaMethod: + case HLoadClass::LoadKind::kRuntimeCall: case HLoadClass::LoadKind::kInvalid: LOG(FATAL) << "UNREACHABLE"; UNREACHABLE(); @@ -6846,21 +7331,15 @@ void InstructionCodeGeneratorARM::GenerateClassInitializationCheck( HLoadString::LoadKind CodeGeneratorARM::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { switch (desired_string_load_kind) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: - DCHECK(!GetCompilerOptions().GetCompilePic()); - break; case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - DCHECK(GetCompilerOptions().GetCompilePic()); - break; - case HLoadString::LoadKind::kBootImageAddress: - break; case HLoadString::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; case HLoadString::LoadKind::kJitTableAddress: DCHECK(Runtime::Current()->UseJitCompilation()); break; - case HLoadString::LoadKind::kDexCacheViaMethod: + case HLoadString::LoadKind::kBootImageAddress: + case HLoadString::LoadKind::kRuntimeCall: break; } return desired_string_load_kind; @@ -6870,7 +7349,7 @@ void LocationsBuilderARM::VisitLoadString(HLoadString* load) { LocationSummary::CallKind call_kind = CodeGenerator::GetLoadStringCallKind(load); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind); HLoadString::LoadKind load_kind = load->GetLoadKind(); - if (load_kind == HLoadString::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadString::LoadKind::kRuntimeCall) { locations->SetOut(Location::RegisterLocation(R0)); } else { locations->SetOut(Location::RequiresRegister()); @@ -6886,6 +7365,9 @@ void LocationsBuilderARM::VisitLoadString(HLoadString* load) { // TODO: Add GetReturnLocation() to the calling convention so that we can DCHECK() // that the the kPrimNot result register is the same as the first argument register. locations->SetCustomSlowPathCallerSaves(caller_saves); + if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) { + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister)); + } } else { // For non-Baker read barrier we have a temp-clobbering call. 
} @@ -6902,12 +7384,6 @@ void InstructionCodeGeneratorARM::VisitLoadString(HLoadString* load) NO_THREAD_S HLoadString::LoadKind load_kind = load->GetLoadKind(); switch (load_kind) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: { - DCHECK(codegen_->GetCompilerOptions().IsBootImage()); - __ LoadLiteral(out, codegen_->DeduplicateBootImageStringLiteral(load->GetDexFile(), - load->GetStringIndex())); - return; // No dex cache slow path. - } case HLoadString::LoadKind::kBootImageLinkTimePcRelative: { DCHECK(codegen_->GetCompilerOptions().IsBootImage()); CodeGeneratorARM::PcRelativePatchInfo* labels = @@ -6960,7 +7436,7 @@ void InstructionCodeGeneratorARM::VisitLoadString(HLoadString* load) NO_THREAD_S } // TODO: Consider re-adding the compiler code to do string dex cache lookup again. - DCHECK(load_kind == HLoadString::LoadKind::kDexCacheViaMethod); + DCHECK(load_kind == HLoadString::LoadKind::kRuntimeCall); InvokeRuntimeCallingConvention calling_convention; DCHECK_EQ(calling_convention.GetRegisterAt(0), out); __ LoadImmediate(calling_convention.GetRegisterAt(0), load->GetStringIndex().index_); @@ -7056,6 +7532,9 @@ void LocationsBuilderARM::VisitInstanceOf(HInstanceOf* instruction) { // Note that TypeCheckSlowPathARM uses this register too. locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + codegen_->MaybeAddBakerCcEntrypointTempForFields(locations); + } } void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { @@ -7929,48 +8408,96 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct if (kUseBakerReadBarrier) { // Fast path implementation of art::ReadBarrier::BarrierForRoot when // Baker's read barrier are used. - // - // Note that we do not actually check the value of - // `GetIsGcMarking()` to decide whether to mark the loaded GC - // root or not. Instead, we load into `temp` the read barrier - // mark entry point corresponding to register `root`. If `temp` - // is null, it means that `GetIsGcMarking()` is false, and vice - // versa. - // - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. - // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() - // // Slow path. - // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. - // } - - // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. - Location temp = Location::RegisterLocation(LR); - SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM( - instruction, root, /* entrypoint */ temp); - codegen_->AddSlowPath(slow_path); + if (kBakerReadBarrierLinkTimeThunksEnableForGcRoots && + !Runtime::Current()->UseJitCompilation()) { + // Note that we do not actually check the value of `GetIsGcMarking()` + // to decide whether to mark the loaded GC root or not. Instead, we + // load into `temp` (actually kBakerCcEntrypointRegister) the read + // barrier mark introspection entrypoint. If `temp` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. + // + // We use link-time generated thunks for the slow path. That thunk + // checks the reference and jumps to the entrypoint if needed. 
+ // + // temp = Thread::Current()->pReadBarrierMarkIntrospection + // lr = &return_address; + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { + // goto gc_root_thunk<root_reg>(lr) + // } + // return_address: + + CheckLastTempIsBakerCcEntrypointRegister(instruction); + bool narrow = CanEmitNarrowLdr(root_reg, obj, offset); + uint32_t custom_data = + linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, narrow); + Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data); + + // entrypoint_reg = + // Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection. + DCHECK_EQ(IP, 12); + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP); + __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset); + + Label return_address; + __ AdrCode(LR, &return_address); + __ CmpConstant(kBakerCcEntrypointRegister, 0); + // Currently the offset is always within range. If that changes, + // we shall have to split the load the same way as for fields. + DCHECK_LT(offset, kReferenceLoadMinFarOffset); + DCHECK(!down_cast<Thumb2Assembler*>(GetAssembler())->IsForced32Bit()); + ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()), !narrow); + int old_position = GetAssembler()->GetBuffer()->GetPosition(); + __ LoadFromOffset(kLoadWord, root_reg, obj, offset); + EmitPlaceholderBne(codegen_, bne_label); + __ Bind(&return_address); + DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(), + narrow ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET + : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET); + } else { + // Note that we do not actually check the value of + // `GetIsGcMarking()` to decide whether to mark the loaded GC + // root or not. Instead, we load into `temp` the read barrier + // mark entry point corresponding to register `root`. If `temp` + // is null, it means that `GetIsGcMarking()` is false, and vice + // versa. + // + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. + // } + + // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. + Location temp = Location::RegisterLocation(LR); + SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM( + instruction, root, /* entrypoint */ temp); + codegen_->AddSlowPath(slow_path); - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); - // Loading the entrypoint does not require a load acquire since it is only changed when - // threads are suspended or running a checkpoint. - __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset); + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. 
+ __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset); - // /* GcRoot<mirror::Object> */ root = *(obj + offset) - __ LoadFromOffset(kLoadWord, root_reg, obj, offset); - static_assert( - sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), - "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " - "have different sizes."); - static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), - "art::mirror::CompressedReference<mirror::Object> and int32_t " - "have different sizes."); - - // The entrypoint is null when the GC is not marking, this prevents one load compared to - // checking GetIsGcMarking. - __ CompareAndBranchIfNonZero(temp.AsRegister<Register>(), slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + __ LoadFromOffset(kLoadWord, root_reg, obj, offset); + static_assert( + sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), + "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " + "have different sizes."); + static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::CompressedReference<mirror::Object> and int32_t " + "have different sizes."); + + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ CompareAndBranchIfNonZero(temp.AsRegister<Register>(), slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); + } } else { // GC root loaded through a slow path for read barriers other // than Baker's. @@ -7988,6 +8515,16 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct } } +void CodeGeneratorARM::MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + if (kBakerReadBarrierLinkTimeThunksEnableForFields) { + if (!Runtime::Current()->UseJitCompilation()) { + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister)); + } + } +} + void CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, Register obj, @@ -7997,6 +8534,76 @@ void CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instr DCHECK(kEmitCompilerReadBarrier); DCHECK(kUseBakerReadBarrier); + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !Runtime::Current()->UseJitCompilation()) { + // Note that we do not actually check the value of `GetIsGcMarking()` + // to decide whether to mark the loaded reference or not. Instead, we + // load into `temp` (actually kBakerCcEntrypointRegister) the read + // barrier mark introspection entrypoint. If `temp` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. + // + // We use link-time generated thunks for the slow path. That thunk checks + // the holder and jumps to the entrypoint if needed. If the holder is not + // gray, it creates a fake dependency and returns to the LDR instruction. + // + // temp = Thread::Current()->pReadBarrierMarkIntrospection + // lr = &gray_return_address; + // if (temp != nullptr) { + // goto field_thunk<holder_reg, base_reg>(lr) + // } + // not_gray_return_address: + // // Original reference load. If the offset is too large to fit + // // into LDR, we use an adjusted base register here. 
+ // HeapReference<mirror::Object> reference = *(obj+offset); + // gray_return_address: + + DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>)); + Register ref_reg = ref.AsRegister<Register>(); + bool narrow = CanEmitNarrowLdr(ref_reg, obj, offset); + Register base = obj; + if (offset >= kReferenceLoadMinFarOffset) { + base = temp.AsRegister<Register>(); + DCHECK_NE(base, kBakerCcEntrypointRegister); + static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2."); + __ AddConstant(base, obj, offset & ~(kReferenceLoadMinFarOffset - 1u)); + offset &= (kReferenceLoadMinFarOffset - 1u); + // Use narrow LDR only for small offsets. Generating narrow encoding LDR for the large + // offsets with `(offset & (kReferenceLoadMinFarOffset - 1u)) < 32u` would most likely + // increase the overall code size when taking the generated thunks into account. + DCHECK(!narrow); + } + CheckLastTempIsBakerCcEntrypointRegister(instruction); + uint32_t custom_data = + linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base, obj, narrow); + Label* bne_label = NewBakerReadBarrierPatch(custom_data); + + // entrypoint_reg = + // Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection. + DCHECK_EQ(IP, 12); + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP); + __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset); + + Label return_address; + __ AdrCode(LR, &return_address); + __ CmpConstant(kBakerCcEntrypointRegister, 0); + EmitPlaceholderBne(this, bne_label); + DCHECK_LT(offset, kReferenceLoadMinFarOffset); + DCHECK(!down_cast<Thumb2Assembler*>(GetAssembler())->IsForced32Bit()); + ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()), !narrow); + int old_position = GetAssembler()->GetBuffer()->GetPosition(); + __ LoadFromOffset(kLoadWord, ref_reg, base, offset); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } + GetAssembler()->MaybeUnpoisonHeapReference(ref_reg); + __ Bind(&return_address); + DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(), + narrow ? BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET + : BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET); + return; + } + // /* HeapReference<Object> */ ref = *(obj + offset) Location no_index = Location::NoLocation(); ScaleFactor no_scale_factor = TIMES_1; @@ -8017,9 +8624,67 @@ void CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instr static_assert( sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + ScaleFactor scale_factor = TIMES_4; + + if (kBakerReadBarrierLinkTimeThunksEnableForArrays && + !Runtime::Current()->UseJitCompilation()) { + // Note that we do not actually check the value of `GetIsGcMarking()` + // to decide whether to mark the loaded reference or not. Instead, we + // load into `temp` (actually kBakerCcEntrypointRegister) the read + // barrier mark introspection entrypoint. If `temp` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. + // + // We use link-time generated thunks for the slow path. That thunk checks + // the holder and jumps to the entrypoint if needed. If the holder is not + // gray, it creates a fake dependency and returns to the LDR instruction. 
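
The far-offset handling above splits a field offset that does not fit the LDR immediate into an aligned adjustment, folded into a fresh base register, plus a small residual. A minimal sketch of that arithmetic, assuming the 16 KB kReferenceLoadMinFarOffset value shown for the arm64 code generator (SplitOffset and SplitFarOffset are illustrative names):

    #include <cassert>
    #include <cstdint>

    constexpr uint32_t kReferenceLoadMinFarOffset = 16 * 1024;
    static_assert((kReferenceLoadMinFarOffset & (kReferenceLoadMinFarOffset - 1u)) == 0u,
                  "Expecting a power of 2.");

    struct SplitOffset {
      uint32_t base_adjustment;  // Added to the holder: base = obj + base_adjustment.
      uint32_t residual;         // Encoded directly in the LDR: ref = *(base + residual).
    };

    inline SplitOffset SplitFarOffset(uint32_t offset) {
      SplitOffset s;
      s.base_adjustment = offset & ~(kReferenceLoadMinFarOffset - 1u);
      s.residual = offset & (kReferenceLoadMinFarOffset - 1u);
      assert(s.base_adjustment + s.residual == offset);
      return s;
    }
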
+ // + // temp = Thread::Current()->pReadBarrierMarkIntrospection + // lr = &gray_return_address; + // if (temp != nullptr) { + // goto field_thunk<holder_reg, base_reg>(lr) + // } + // not_gray_return_address: + // // Original reference load. If the offset is too large to fit + // // into LDR, we use an adjusted base register here. + // HeapReference<mirror::Object> reference = data[index]; + // gray_return_address: + + DCHECK(index.IsValid()); + Register index_reg = index.AsRegister<Register>(); + Register ref_reg = ref.AsRegister<Register>(); + Register data_reg = temp.AsRegister<Register>(); + DCHECK_NE(data_reg, kBakerCcEntrypointRegister); + + CheckLastTempIsBakerCcEntrypointRegister(instruction); + uint32_t custom_data = + linker::Thumb2RelativePatcher::EncodeBakerReadBarrierArrayData(data_reg); + Label* bne_label = NewBakerReadBarrierPatch(custom_data); + + // entrypoint_reg = + // Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection. + DCHECK_EQ(IP, 12); + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP); + __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset); + __ AddConstant(data_reg, obj, data_offset); + + Label return_address; + __ AdrCode(LR, &return_address); + __ CmpConstant(kBakerCcEntrypointRegister, 0); + EmitPlaceholderBne(this, bne_label); + ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler())); + int old_position = GetAssembler()->GetBuffer()->GetPosition(); + __ ldr(ref_reg, Address(data_reg, index_reg, LSL, scale_factor)); + DCHECK(!needs_null_check); // The thunk cannot handle the null check. + GetAssembler()->MaybeUnpoisonHeapReference(ref_reg); + __ Bind(&return_address); + DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(), + BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET); + return; + } + // /* HeapReference<Object> */ ref = // *(obj + data_offset + index * sizeof(HeapReference<Object>)) - ScaleFactor scale_factor = TIMES_4; GenerateReferenceLoadWithBakerReadBarrier( instruction, ref, obj, data_offset, index, scale_factor, temp, needs_null_check); } @@ -8031,9 +8696,7 @@ void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i Location index, ScaleFactor scale_factor, Location temp, - bool needs_null_check, - bool always_update_field, - Register* temp2) { + bool needs_null_check) { DCHECK(kEmitCompilerReadBarrier); DCHECK(kUseBakerReadBarrier); @@ -8044,6 +8707,73 @@ void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // not. // // Note that we do not actually check the value of `GetIsGcMarking()`; + // instead, we load into `temp2` the read barrier mark entry point + // corresponding to register `ref`. If `temp2` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. + // + // temp2 = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp2 != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = temp2(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // } + // } else { + // HeapReference<mirror::Object> ref = *src; // Original reference load. 
+ // } + + Register temp_reg = temp.AsRegister<Register>(); + + // Slow path marking the object `ref` when the GC is marking. The + // entrypoint will already be loaded in `temp2`. + Location temp2 = Location::RegisterLocation(LR); + SlowPathCodeARM* slow_path = + new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM( + instruction, + ref, + obj, + offset, + index, + scale_factor, + needs_null_check, + temp_reg, + /* entrypoint */ temp2); + AddSlowPath(slow_path); + + // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ LoadFromOffset(kLoadWord, temp2.AsRegister<Register>(), TR, entry_point_offset); + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ CompareAndBranchIfNonZero(temp2.AsRegister<Register>(), slow_path->GetEntryLabel()); + // Fast path: the GC is not marking: just load the reference. + GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorARM::UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + Location field_offset, + Location temp, + bool needs_null_check, + Register temp2) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // Query `art::Thread::Current()->GetIsGcMarking()` to decide + // whether we need to enter the slow path to update the reference + // field within `obj`. Then, in the slow path, check the gray bit + // in the lock word of the reference's holder (`obj`) to decide + // whether to mark `ref` and update the field or not. + // + // Note that we do not actually check the value of `GetIsGcMarking()`; // instead, we load into `temp3` the read barrier mark entry point // corresponding to register `ref`. If `temp3` is null, it means // that `GetIsGcMarking()` is false, and vice versa. @@ -8056,52 +8786,30 @@ void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // HeapReference<mirror::Object> ref = *src; // Original reference load. // bool is_gray = (rb_state == ReadBarrier::GrayState()); // if (is_gray) { + // old_ref = ref; // ref = temp3(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // compareAndSwapObject(obj, field_offset, old_ref, ref); // } - // } else { - // HeapReference<mirror::Object> ref = *src; // Original reference load. // } Register temp_reg = temp.AsRegister<Register>(); - // Slow path marking the object `ref` when the GC is marking. The - // entrypoint will already be loaded in `temp3`. + // Slow path updating the object reference at address `obj + + // field_offset` when the GC is marking. The entrypoint will already + // be loaded in `temp3`. Location temp3 = Location::RegisterLocation(LR); - SlowPathCodeARM* slow_path; - if (always_update_field) { - DCHECK(temp2 != nullptr); - // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM only - // supports address of the form `obj + field_offset`, where `obj` - // is a register and `field_offset` is a register pair (of which - // only the lower half is used). Thus `offset` and `scale_factor` - // above are expected to be null in this code path. 
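
The comments above describe the field-updating variant: if the holder is gray, mark the loaded reference and compare-and-swap it back into the field so the field ends up pointing at the to-space object. A conceptual stand-in for that slow path, not ART code (UpdateFieldIfGray and its parameters are invented; the lock-word gray test and the load fence are simplified away):

    #include <atomic>

    using Ref = void*;             // Stand-in for a heap reference.
    using MarkFn = Ref (*)(Ref);   // Stand-in for ReadBarrier::Mark.

    void UpdateFieldIfGray(bool holder_is_gray,      // Derived from obj's lock word.
                           std::atomic<Ref>* field,  // The field at obj + field_offset.
                           MarkFn mark) {
      Ref old_ref = field->load(std::memory_order_acquire);  // Original reference load.
      if (holder_is_gray) {
        Ref marked = mark(old_ref);                  // ref = ReadBarrier::Mark(ref).
        if (marked != old_ref) {
          // compareAndSwapObject(obj, field_offset, old_ref, marked): only swap if
          // the field still holds the reference that was marked.
          field->compare_exchange_strong(old_ref, marked);
        }
      }
    }
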
- DCHECK_EQ(offset, 0u); - DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1); - Location field_offset = index; - slow_path = - new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM( - instruction, - ref, - obj, - offset, - /* index */ field_offset, - scale_factor, - needs_null_check, - temp_reg, - *temp2, - /* entrypoint */ temp3); - } else { - slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM( - instruction, - ref, - obj, - offset, - index, - scale_factor, - needs_null_check, - temp_reg, - /* entrypoint */ temp3); - } + SlowPathCodeARM* slow_path = + new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM( + instruction, + ref, + obj, + /* offset */ 0u, + /* index */ field_offset, + /* scale_factor */ ScaleFactor::TIMES_1, + needs_null_check, + temp_reg, + temp2, + /* entrypoint */ temp3); AddSlowPath(slow_path); // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() @@ -8113,8 +8821,8 @@ void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // The entrypoint is null when the GC is not marking, this prevents one load compared to // checking GetIsGcMarking. __ CompareAndBranchIfNonZero(temp3.AsRegister<Register>(), slow_path->GetEntryLabel()); - // Fast path: just load the reference. - GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check); + // Fast path: the GC is not marking: nothing to do (the field is + // up-to-date, and we don't need to load the reference). __ Bind(slow_path->GetExitLabel()); } @@ -8245,7 +8953,8 @@ Register CodeGeneratorARM::GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOr // save one load. However, since this is just an intrinsic slow path we prefer this // simple and more robust approach rather that trying to determine if that's the case. SlowPathCode* slow_path = GetCurrentSlowPath(); - if (slow_path != nullptr && slow_path->IsCoreRegisterSaved(location.AsRegister<Register>())) { + DCHECK(slow_path != nullptr); // For intrinsified invokes the call is emitted on the slow path. + if (slow_path->IsCoreRegisterSaved(location.AsRegister<Register>())) { int stack_offset = slow_path->GetStackOffsetOfCoreRegister(location.AsRegister<Register>()); __ LoadFromOffset(kLoadWord, temp, SP, stack_offset); return temp; @@ -8253,8 +8962,7 @@ Register CodeGeneratorARM::GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOr return location.AsRegister<Register>(); } -Location CodeGeneratorARM::GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke, - Location temp) { +void CodeGeneratorARM::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) { Location callee_method = temp; // For all kinds except kRecursive, callee will be in temp. 
switch (invoke->GetMethodLoadKind()) { case HInvokeStaticOrDirect::MethodLoadKind::kStringInit: { @@ -8267,6 +8975,18 @@ Location CodeGeneratorARM::GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticO case HInvokeStaticOrDirect::MethodLoadKind::kRecursive: callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex()); break; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: { + DCHECK(GetCompilerOptions().IsBootImage()); + Register temp_reg = temp.AsRegister<Register>(); + PcRelativePatchInfo* labels = NewPcRelativeMethodPatch(invoke->GetTargetMethod()); + __ BindTrackedLabel(&labels->movw_label); + __ movw(temp_reg, /* placeholder */ 0u); + __ BindTrackedLabel(&labels->movt_label); + __ movt(temp_reg, /* placeholder */ 0u); + __ BindTrackedLabel(&labels->add_pc_label); + __ add(temp_reg, temp_reg, ShifterOperand(PC)); + break; + } case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: __ LoadImmediate(temp.AsRegister<Register>(), invoke->GetMethodAddress()); break; @@ -8303,11 +9023,6 @@ Location CodeGeneratorARM::GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticO break; } } - return callee_method; -} - -void CodeGeneratorARM::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) { - Location callee_method = GenerateCalleeMethodStaticOrDirectCall(invoke, temp); switch (invoke->GetCodePtrLocation()) { case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf: @@ -8359,9 +9074,11 @@ void CodeGeneratorARM::GenerateVirtualCall(HInvokeVirtual* invoke, Location temp __ blx(LR); } -CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativeStringPatch( - const DexFile& dex_file, dex::StringIndex string_index) { - return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_); +CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativeMethodPatch( + MethodReference target_method) { + return NewPcRelativePatch(*target_method.dex_file, + target_method.dex_method_index, + &pc_relative_method_patches_); } CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativeTypePatch( @@ -8374,6 +9091,11 @@ CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewTypeBssEntryPatch( return NewPcRelativePatch(dex_file, type_index.index_, &type_bss_entry_patches_); } +CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativeStringPatch( + const DexFile& dex_file, dex::StringIndex string_index) { + return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_); +} + CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativeDexCacheArrayPatch( const DexFile& dex_file, uint32_t element_offset) { return NewPcRelativePatch(dex_file, element_offset, &pc_relative_dex_cache_patches_); @@ -8385,18 +9107,9 @@ CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativePatch( return &patches->back(); } -Literal* CodeGeneratorARM::DeduplicateBootImageStringLiteral(const DexFile& dex_file, - dex::StringIndex string_index) { - return boot_image_string_patches_.GetOrCreate( - StringReference(&dex_file, string_index), - [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); }); -} - -Literal* CodeGeneratorARM::DeduplicateBootImageTypeLiteral(const DexFile& dex_file, - dex::TypeIndex type_index) { - return boot_image_type_patches_.GetOrCreate( - TypeReference(&dex_file, type_index), - [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); }); +Label* CodeGeneratorARM::NewBakerReadBarrierPatch(uint32_t custom_data) { + 
baker_read_barrier_patches_.emplace_back(custom_data); + return &baker_read_barrier_patches_.back().label; } Literal* CodeGeneratorARM::DeduplicateBootImageAddressLiteral(uint32_t address) { @@ -8447,43 +9160,32 @@ void CodeGeneratorARM::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patche DCHECK(linker_patches->empty()); size_t size = /* MOVW+MOVT for each entry */ 2u * pc_relative_dex_cache_patches_.size() + - boot_image_string_patches_.size() + - /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() + - boot_image_type_patches_.size() + + /* MOVW+MOVT for each entry */ 2u * pc_relative_method_patches_.size() + /* MOVW+MOVT for each entry */ 2u * pc_relative_type_patches_.size() + - /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size(); + /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size() + + /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() + + baker_read_barrier_patches_.size(); linker_patches->reserve(size); EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_, linker_patches); - for (const auto& entry : boot_image_string_patches_) { - const StringReference& target_string = entry.first; - Literal* literal = entry.second; - DCHECK(literal->GetLabel()->IsBound()); - uint32_t literal_offset = literal->GetLabel()->Position(); - linker_patches->push_back(LinkerPatch::StringPatch(literal_offset, - target_string.dex_file, - target_string.string_index.index_)); - } - if (!GetCompilerOptions().IsBootImage()) { - DCHECK(pc_relative_type_patches_.empty()); - EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_, + if (GetCompilerOptions().IsBootImage()) { + EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(pc_relative_method_patches_, linker_patches); - } else { EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(pc_relative_type_patches_, linker_patches); EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(pc_relative_string_patches_, linker_patches); + } else { + DCHECK(pc_relative_method_patches_.empty()); + DCHECK(pc_relative_type_patches_.empty()); + EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_, + linker_patches); } EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_, linker_patches); - for (const auto& entry : boot_image_type_patches_) { - const TypeReference& target_type = entry.first; - Literal* literal = entry.second; - DCHECK(literal->GetLabel()->IsBound()); - uint32_t literal_offset = literal->GetLabel()->Position(); - linker_patches->push_back(LinkerPatch::TypePatch(literal_offset, - target_type.dex_file, - target_type.type_index.index_)); + for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) { + linker_patches->push_back(LinkerPatch::BakerReadBarrierBranchPatch(info.label.Position(), + info.custom_data)); } DCHECK_EQ(size, linker_patches->size()); } @@ -8494,13 +9196,6 @@ Literal* CodeGeneratorARM::DeduplicateUint32Literal(uint32_t value, Uint32ToLite [this, value]() { return __ NewLiteral<uint32_t>(value); }); } -Literal* CodeGeneratorARM::DeduplicateMethodLiteral(MethodReference target_method, - MethodToLiteralMap* map) { - return map->GetOrCreate( - target_method, - [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); }); -} - void LocationsBuilderARM::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr, 
LocationSummary::kNoCall); @@ -8716,14 +9411,20 @@ static void PatchJitRootUse(uint8_t* code, void CodeGeneratorARM::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } } diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index 86f2f21df7..5f37d3bff1 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -24,8 +24,8 @@ #include "nodes.h" #include "string_reference.h" #include "parallel_move_resolver.h" +#include "type_reference.h" #include "utils/arm/assembler_thumb2.h" -#include "utils/type_reference.h" namespace art { namespace arm { @@ -299,7 +299,6 @@ class InstructionCodeGeneratorARM : public InstructionCodeGenerator { void GenerateCompareTestAndBranch(HCondition* condition, Label* true_target, Label* false_target); - void GenerateLongComparesAndJumps(HCondition* cond, Label* true_label, Label* false_label); void DivRemOneOrMinusOne(HBinaryOperation* instruction); void DivRemByPowerOfTwo(HBinaryOperation* instruction); void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction); @@ -456,7 +455,6 @@ class CodeGeneratorARM : public CodeGenerator { const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info, HInvokeStaticOrDirect* invoke) OVERRIDE; - Location GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp); void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) OVERRIDE; void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE; @@ -482,15 +480,18 @@ class CodeGeneratorARM : public CodeGenerator { Label add_pc_label; }; - PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file, - dex::StringIndex string_index); + PcRelativePatchInfo* NewPcRelativeMethodPatch(MethodReference target_method); PcRelativePatchInfo* NewPcRelativeTypePatch(const DexFile& dex_file, dex::TypeIndex type_index); PcRelativePatchInfo* NewTypeBssEntryPatch(const DexFile& dex_file, dex::TypeIndex type_index); + PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file, + dex::StringIndex string_index); PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, uint32_t element_offset); - Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file, - dex::StringIndex string_index); - Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, dex::TypeIndex type_index); + + // Add a new baker read barrier patch and return the label to be bound + // before the BNE instruction. 
+ Label* NewBakerReadBarrierPatch(uint32_t custom_data); + Literal* DeduplicateBootImageAddressLiteral(uint32_t address); Literal* DeduplicateJitStringLiteral(const DexFile& dex_file, dex::StringIndex string_index, @@ -503,6 +504,10 @@ class CodeGeneratorARM : public CodeGenerator { void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE; + // Maybe add the reserved entrypoint register as a temporary for field load. This temp + // is added only for AOT compilation if link-time generated thunks for fields are enabled. + void MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations); + // Fast path implementation of ReadBarrier::Barrier for a heap // reference field load when Baker's read barriers are used. void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, @@ -526,11 +531,6 @@ class CodeGeneratorARM : public CodeGenerator { // Load the object reference located at the address // `obj + offset + (index << scale_factor)`, held by object `obj`, into // `ref`, and mark it if needed. - // - // If `always_update_field` is true, the value of the reference is - // atomically updated in the holder (`obj`). This operation - // requires an extra temporary register, which must be provided as a - // non-null pointer (`temp2`). void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, Register obj, @@ -538,9 +538,27 @@ class CodeGeneratorARM : public CodeGenerator { Location index, ScaleFactor scale_factor, Location temp, - bool needs_null_check, - bool always_update_field = false, - Register* temp2 = nullptr); + bool needs_null_check); + + // Generate code checking whether the reference field at the + // address `obj + field_offset`, held by object `obj`, needs to be + // marked, and if so, marking it and updating the field within `obj` + // with the marked value. + // + // This routine is used for the implementation of the + // UnsafeCASObject intrinsic with Baker read barriers. + // + // This method has a structure similar to + // GenerateReferenceLoadWithBakerReadBarrier, but note that argument + // `ref` is only used as a temporary here, and thus its value should not + // be used afterwards. + void UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + Location field_offset, + Location temp, + bool needs_null_check, + Register temp2); // Generate a heap reference load (with no read barrier). void GenerateRawReferenceLoad(HInstruction* instruction, @@ -604,11 +622,18 @@ class CodeGeneratorARM : public CodeGenerator { void GenerateImplicitNullCheck(HNullCheck* instruction) OVERRIDE; void GenerateExplicitNullCheck(HNullCheck* instruction) OVERRIDE; + // `temp` is an extra temporary register that is used for some conditions; + // callers may not specify it, in which case the method will use a scratch + // register instead.
+ void GenerateConditionWithZero(IfCondition condition, + Register out, + Register in, + Register temp = kNoRegister); + private: Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, Register temp); using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, Literal*>; - using MethodToLiteralMap = ArenaSafeMap<MethodReference, Literal*, MethodReferenceComparator>; using StringToLiteralMap = ArenaSafeMap<StringReference, Literal*, StringReferenceValueComparator>; @@ -616,8 +641,14 @@ class CodeGeneratorARM : public CodeGenerator { Literal*, TypeReferenceValueComparator>; + struct BakerReadBarrierPatchInfo { + explicit BakerReadBarrierPatchInfo(uint32_t data) : label(), custom_data(data) { } + + Label label; + uint32_t custom_data; + }; + Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map); - Literal* DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map); PcRelativePatchInfo* NewPcRelativePatch(const DexFile& dex_file, uint32_t offset_or_index, ArenaDeque<PcRelativePatchInfo>* patches); @@ -638,16 +669,16 @@ class CodeGeneratorARM : public CodeGenerator { Uint32ToLiteralMap uint32_literals_; // PC-relative patch info for each HArmDexCacheArraysBase. ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_; - // Deduplication map for boot string literals for kBootImageLinkTimeAddress. - StringToLiteralMap boot_image_string_patches_; - // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC). - ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_; - // Deduplication map for boot type literals for kBootImageLinkTimeAddress. - TypeToLiteralMap boot_image_type_patches_; + // PC-relative method patch info for kBootImageLinkTimePcRelative. + ArenaDeque<PcRelativePatchInfo> pc_relative_method_patches_; // PC-relative type patch info for kBootImageLinkTimePcRelative. ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; + // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC). + ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_; + // Baker read barrier patch info. + ArenaDeque<BakerReadBarrierPatchInfo> baker_read_barrier_patches_; // Patches for string literals in JIT compiled code. StringToLiteralMap jit_string_patches_; diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index da146d72cd..f2b312362f 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -29,6 +29,7 @@ #include "linker/arm64/relative_patcher_arm64.h" #include "mirror/array-inl.h" #include "mirror/class-inl.h" +#include "lock_word.h" #include "offsets.h" #include "thread.h" #include "utils/arm64/assembler_arm64.h" @@ -91,6 +92,7 @@ constexpr uint32_t kReferenceLoadMinFarOffset = 16 * KB; // Flags controlling the use of link-time generated thunks for Baker read barriers. 
constexpr bool kBakerReadBarrierLinkTimeThunksEnableForFields = true; +constexpr bool kBakerReadBarrierLinkTimeThunksEnableForArrays = true; constexpr bool kBakerReadBarrierLinkTimeThunksEnableForGcRoots = true; // Some instructions have special requirements for a temporary, for example @@ -855,7 +857,7 @@ class LoadReferenceWithBakerReadBarrierSlowPathARM64 : public ReadBarrierMarkSlo // Baker's read barriers, we need to perform the load of // mirror::Object::monitor_ *before* the original reference load. // This load-load ordering is required by the read barrier. - // The fast path/slow path (for Baker's algorithm) should look like: + // The slow path (for Baker's algorithm) should look like: // // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); // lfence; // Load fence or artificial data dependency to prevent load-load reordering @@ -1006,6 +1008,18 @@ class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64 __ Bind(GetEntryLabel()); + // The implementation is similar to LoadReferenceWithBakerReadBarrierSlowPathARM64's: + // + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // old_ref = ref; + // ref = entrypoint(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // compareAndSwapObject(obj, field_offset, old_ref, ref); + // } + // /* int32_t */ monitor = obj->monitor_ uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); __ Ldr(temp_, HeapOperand(obj_, monitor_offset)); @@ -1436,13 +1450,10 @@ CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph, uint64_literals_(std::less<uint64_t>(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_string_patches_(StringReferenceValueComparator(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_type_patches_(TypeReferenceValueComparator(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + pc_relative_method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), baker_read_barrier_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(StringReferenceValueComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), @@ -1502,7 +1513,7 @@ Location ParallelMoveResolverARM64::AllocateScratchLocationFor(Location::Kind ki if (kind == Location::kRegister) { scratch = LocationFrom(vixl_temps_.AcquireX()); } else { - DCHECK(kind == Location::kFpuRegister); + DCHECK_EQ(kind, Location::kFpuRegister); scratch = LocationFrom(codegen_->GetGraph()->HasSIMD() ? vixl_temps_.AcquireVRegisterOfSize(kQRegSize) : vixl_temps_.AcquireD()); @@ -1730,9 +1741,9 @@ static bool CoherentConstantAndType(Location constant, Primitive::Type type) { (cst->IsDoubleConstant() && type == Primitive::kPrimDouble); } -// Allocate a scratch register from the VIXL pool, querying first into -// the floating-point register pool, and then the the core register -// pool. 
This is essentially a reimplementation of +// Allocate a scratch register from the VIXL pool, querying first +// the floating-point register pool, and then the core register +// pool. This is essentially a reimplementation of // vixl::aarch64::UseScratchRegisterScope::AcquireCPURegisterOfSize // using a different allocation strategy. static CPURegister AcquireFPOrCoreCPURegisterOfSize(vixl::aarch64::MacroAssembler* masm, @@ -1880,7 +1891,7 @@ void CodeGeneratorARM64::MoveLocation(Location destination, // ask for a scratch register of any type (core or FP). // // Also, we start by asking for a FP scratch register first, as the - // demand of scratch core registers is higher. This is why we + // demand of scratch core registers is higher. This is why we // use AcquireFPOrCoreCPURegisterOfSize instead of // UseScratchRegisterScope::AcquireCPURegisterOfSize, which // allocates core scratch registers first. @@ -2648,6 +2659,38 @@ void InstructionCodeGeneratorARM64::VisitIntermediateAddress(HIntermediateAddres Operand(InputOperandAt(instruction, 1))); } +void LocationsBuilderARM64::VisitIntermediateAddressIndex(HIntermediateAddressIndex* instruction) { + LocationSummary* locations = + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); + + HIntConstant* shift = instruction->GetShift()->AsIntConstant(); + + locations->SetInAt(0, Location::RequiresRegister()); + // For byte case we don't need to shift the index variable so we can encode the data offset into + // ADD instruction. For other cases we prefer the data_offset to be in register; that will hoist + // data offset constant generation out of the loop and reduce the critical path length in the + // loop. + locations->SetInAt(1, shift->GetValue() == 0 + ? Location::ConstantLocation(instruction->GetOffset()->AsIntConstant()) + : Location::RequiresRegister()); + locations->SetInAt(2, Location::ConstantLocation(shift)); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); +} + +void InstructionCodeGeneratorARM64::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + Register index_reg = InputRegisterAt(instruction, 0); + uint32_t shift = Int64ConstantFrom(instruction->GetLocations()->InAt(2)); + uint32_t offset = instruction->GetOffset()->AsIntConstant()->GetValue(); + + if (shift == 0) { + __ Add(OutputRegister(instruction), index_reg, offset); + } else { + Register offset_reg = InputRegisterAt(instruction, 1); + __ Add(OutputRegister(instruction), offset_reg, Operand(index_reg, LSL, shift)); + } +} + void LocationsBuilderARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr, LocationSummary::kNoCall); @@ -2764,6 +2807,7 @@ void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) { // Object ArrayGet with Baker's read barrier case. // Note that a potential implicit null check is handled in the // CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier call. + DCHECK(!instruction->CanDoImplicitNullCheckOn(instruction->InputAt(0))); if (index.IsConstant()) { // Array load with a constant index can be treated as a field load. 
offset += Int64ConstantFrom(index) << Primitive::ComponentSizeShift(type); @@ -2774,12 +2818,12 @@ void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) { obj.W(), offset, maybe_temp, - /* needs_null_check */ true, + /* needs_null_check */ false, /* use_load_acquire */ false); } else { Register temp = WRegisterFrom(locations->GetTemp(0)); codegen_->GenerateArrayLoadWithBakerReadBarrier( - instruction, out, obj.W(), offset, index, temp, /* needs_null_check */ true); + instruction, out, obj.W(), offset, index, temp, /* needs_null_check */ false); } } else { // General case. @@ -4446,8 +4490,7 @@ HInvokeStaticOrDirect::DispatchInfo CodeGeneratorARM64::GetSupportedInvokeStatic return desired_dispatch_info; } -Location CodeGeneratorARM64::GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke, - Location temp) { +void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) { // Make sure that ArtMethod* is passed in kArtMethodRegister as per the calling convention. Location callee_method = temp; // For all kinds except kRecursive, callee will be in temp. switch (invoke->GetMethodLoadKind()) { @@ -4461,6 +4504,17 @@ Location CodeGeneratorARM64::GenerateCalleeMethodStaticOrDirectCall(HInvokeStati case HInvokeStaticOrDirect::MethodLoadKind::kRecursive: callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex()); break; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: { + DCHECK(GetCompilerOptions().IsBootImage()); + // Add ADRP with its PC-relative method patch. + vixl::aarch64::Label* adrp_label = NewPcRelativeMethodPatch(invoke->GetTargetMethod()); + EmitAdrpPlaceholder(adrp_label, XRegisterFrom(temp)); + // Add ADD with its PC-relative method patch. + vixl::aarch64::Label* add_label = + NewPcRelativeMethodPatch(invoke->GetTargetMethod(), adrp_label); + EmitAddPlaceholder(add_label, XRegisterFrom(temp), XRegisterFrom(temp)); + break; + } case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: // Load method address from literal pool. __ Ldr(XRegisterFrom(temp), DeduplicateUint64Literal(invoke->GetMethodAddress())); @@ -4501,12 +4555,6 @@ Location CodeGeneratorARM64::GenerateCalleeMethodStaticOrDirectCall(HInvokeStati break; } } - return callee_method; -} - -void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) { - // All registers are assumed to be correctly set up. 
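
The kBootImageLinkTimePcRelative case added above emits an ADRP+ADD pair against placeholder labels that the linker later patches. As a purely illustrative model of standard AArch64 ADRP/ADD(lo12) semantics (MaterializeAdrpAdd and the example addresses are invented), the patched pair reconstructs the absolute target address like this:

    #include <cstdint>

    constexpr uint64_t kPageMask = ~UINT64_C(0xfff);

    constexpr uint64_t MaterializeAdrpAdd(uint64_t adrp_pc, uint64_t target) {
      // ADRP: reg = page(adrp_pc) + patched page delta;
      // ADD:  reg += patched low 12 bits of the target.
      return (adrp_pc & kPageMask)
           + ((target & kPageMask) - (adrp_pc & kPageMask))
           + (target & UINT64_C(0xfff));
    }

    static_assert(MaterializeAdrpAdd(/* adrp_pc */ 0x10002004u, /* target */ 0x76543abcu)
                      == 0x76543abcu,
                  "page of the ADRP + page delta + low 12 bits == target");
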
- Location callee_method = GenerateCalleeMethodStaticOrDirectCall(invoke, temp); switch (invoke->GetCodePtrLocation()) { case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf: @@ -4584,12 +4632,13 @@ void InstructionCodeGeneratorARM64::VisitInvokePolymorphic(HInvokePolymorphic* i codegen_->GenerateInvokePolymorphicCall(invoke); } -vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeStringPatch( - const DexFile& dex_file, - dex::StringIndex string_index, +vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeMethodPatch( + MethodReference target_method, vixl::aarch64::Label* adrp_label) { - return - NewPcRelativePatch(dex_file, string_index.index_, adrp_label, &pc_relative_string_patches_); + return NewPcRelativePatch(*target_method.dex_file, + target_method.dex_method_index, + adrp_label, + &pc_relative_method_patches_); } vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeTypePatch( @@ -4606,6 +4655,14 @@ vixl::aarch64::Label* CodeGeneratorARM64::NewBssEntryTypePatch( return NewPcRelativePatch(dex_file, type_index.index_, adrp_label, &type_bss_entry_patches_); } +vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeStringPatch( + const DexFile& dex_file, + dex::StringIndex string_index, + vixl::aarch64::Label* adrp_label) { + return + NewPcRelativePatch(dex_file, string_index.index_, adrp_label, &pc_relative_string_patches_); +} + vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeDexCacheArrayPatch( const DexFile& dex_file, uint32_t element_offset, @@ -4632,20 +4689,6 @@ vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativePatch( return label; } -vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageStringLiteral( - const DexFile& dex_file, dex::StringIndex string_index) { - return boot_image_string_patches_.GetOrCreate( - StringReference(&dex_file, string_index), - [this]() { return __ CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u); }); -} - -vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageTypeLiteral( - const DexFile& dex_file, dex::TypeIndex type_index) { - return boot_image_type_patches_.GetOrCreate( - TypeReference(&dex_file, type_index), - [this]() { return __ CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u); }); -} - vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageAddressLiteral( uint64_t address) { return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_); @@ -4712,11 +4755,10 @@ void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patc DCHECK(linker_patches->empty()); size_t size = pc_relative_dex_cache_patches_.size() + - boot_image_string_patches_.size() + - pc_relative_string_patches_.size() + - boot_image_type_patches_.size() + + pc_relative_method_patches_.size() + pc_relative_type_patches_.size() + type_bss_entry_patches_.size() + + pc_relative_string_patches_.size() + baker_read_barrier_patches_.size(); linker_patches->reserve(size); for (const PcRelativePatchInfo& info : pc_relative_dex_cache_patches_) { @@ -4725,32 +4767,21 @@ void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patc info.pc_insn_label->GetLocation(), info.offset_or_index)); } - for (const auto& entry : boot_image_string_patches_) { - const StringReference& target_string = entry.first; - vixl::aarch64::Literal<uint32_t>* literal = entry.second; - linker_patches->push_back(LinkerPatch::StringPatch(literal->GetOffset(), - target_string.dex_file, - target_string.string_index.index_)); - } - if 
(!GetCompilerOptions().IsBootImage()) { - DCHECK(pc_relative_type_patches_.empty()); - EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_, + if (GetCompilerOptions().IsBootImage()) { + EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(pc_relative_method_patches_, linker_patches); - } else { EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(pc_relative_type_patches_, linker_patches); EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(pc_relative_string_patches_, linker_patches); + } else { + DCHECK(pc_relative_method_patches_.empty()); + DCHECK(pc_relative_type_patches_.empty()); + EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_, + linker_patches); } EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_, linker_patches); - for (const auto& entry : boot_image_type_patches_) { - const TypeReference& target_type = entry.first; - vixl::aarch64::Literal<uint32_t>* literal = entry.second; - linker_patches->push_back(LinkerPatch::TypePatch(literal->GetOffset(), - target_type.dex_file, - target_type.type_index.index_)); - } for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) { linker_patches->push_back(LinkerPatch::BakerReadBarrierBranchPatch(info.label.GetLocation(), info.custom_data)); @@ -4771,14 +4802,6 @@ vixl::aarch64::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateUint64Literal(u [this, value]() { return __ CreateLiteralDestroyedWithPool<uint64_t>(value); }); } -vixl::aarch64::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodLiteral( - MethodReference target_method, - MethodToLiteralMap* map) { - return map->GetOrCreate( - target_method, - [this]() { return __ CreateLiteralDestroyedWithPool<uint64_t>(/* placeholder */ 0u); }); -} - void InstructionCodeGeneratorARM64::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) { // Explicit clinit checks triggered by static invokes must have been pruned by // art::PrepareForRegisterAllocation. @@ -4818,21 +4841,15 @@ HLoadClass::LoadKind CodeGeneratorARM64::GetSupportedLoadClassKind( UNREACHABLE(); case HLoadClass::LoadKind::kReferrersClass: break; - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: - DCHECK(!GetCompilerOptions().GetCompilePic()); - break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: - DCHECK(GetCompilerOptions().GetCompilePic()); - break; - case HLoadClass::LoadKind::kBootImageAddress: - break; case HLoadClass::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; case HLoadClass::LoadKind::kJitTableAddress: DCHECK(Runtime::Current()->UseJitCompilation()); break; - case HLoadClass::LoadKind::kDexCacheViaMethod: + case HLoadClass::LoadKind::kBootImageAddress: + case HLoadClass::LoadKind::kRuntimeCall: break; } return desired_class_load_kind; @@ -4840,7 +4857,7 @@ HLoadClass::LoadKind CodeGeneratorARM64::GetSupportedLoadClassKind( void LocationsBuilderARM64::VisitLoadClass(HLoadClass* cls) { HLoadClass::LoadKind load_kind = cls->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadClass::LoadKind::kRuntimeCall) { InvokeRuntimeCallingConvention calling_convention; CodeGenerator::CreateLoadClassRuntimeCallLocationSummary( cls, @@ -4885,7 +4902,7 @@ void LocationsBuilderARM64::VisitLoadClass(HLoadClass* cls) { // move. 
void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFETY_ANALYSIS { HLoadClass::LoadKind load_kind = cls->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadClass::LoadKind::kRuntimeCall) { codegen_->GenerateLoadClassRuntimeCall(cls); return; } @@ -4914,11 +4931,6 @@ void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) NO_THREAD_SA read_barrier_option); break; } - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: - DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); - __ Ldr(out, codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(), - cls->GetTypeIndex())); - break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: { DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); // Add ADRP with its PC-relative type patch. @@ -4972,7 +4984,7 @@ void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) NO_THREAD_SA read_barrier_option); break; } - case HLoadClass::LoadKind::kDexCacheViaMethod: + case HLoadClass::LoadKind::kRuntimeCall: case HLoadClass::LoadKind::kInvalid: LOG(FATAL) << "UNREACHABLE"; UNREACHABLE(); @@ -5020,21 +5032,15 @@ void InstructionCodeGeneratorARM64::VisitClearException(HClearException* clear A HLoadString::LoadKind CodeGeneratorARM64::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { switch (desired_string_load_kind) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: - DCHECK(!GetCompilerOptions().GetCompilePic()); - break; case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - DCHECK(GetCompilerOptions().GetCompilePic()); - break; - case HLoadString::LoadKind::kBootImageAddress: - break; case HLoadString::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; case HLoadString::LoadKind::kJitTableAddress: DCHECK(Runtime::Current()->UseJitCompilation()); break; - case HLoadString::LoadKind::kDexCacheViaMethod: + case HLoadString::LoadKind::kBootImageAddress: + case HLoadString::LoadKind::kRuntimeCall: break; } return desired_string_load_kind; @@ -5043,7 +5049,7 @@ HLoadString::LoadKind CodeGeneratorARM64::GetSupportedLoadStringKind( void LocationsBuilderARM64::VisitLoadString(HLoadString* load) { LocationSummary::CallKind call_kind = CodeGenerator::GetLoadStringCallKind(load); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind); - if (load->GetLoadKind() == HLoadString::LoadKind::kDexCacheViaMethod) { + if (load->GetLoadKind() == HLoadString::LoadKind::kRuntimeCall) { InvokeRuntimeCallingConvention calling_convention; locations->SetOut(calling_convention.GetReturnLocation(load->GetType())); } else { @@ -5073,10 +5079,6 @@ void InstructionCodeGeneratorARM64::VisitLoadString(HLoadString* load) NO_THREAD Location out_loc = load->GetLocations()->Out(); switch (load->GetLoadKind()) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: - __ Ldr(out, codegen_->DeduplicateBootImageStringLiteral(load->GetDexFile(), - load->GetStringIndex())); - return; // No dex cache slow path. case HLoadString::LoadKind::kBootImageLinkTimePcRelative: { // Add ADRP with its PC-relative String patch. 
const DexFile& dex_file = load->GetDexFile(); @@ -5478,6 +5480,15 @@ void InstructionCodeGeneratorARM64::VisitRem(HRem* rem) { } } +void LocationsBuilderARM64::VisitConstructorFence(HConstructorFence* constructor_fence) { + constructor_fence->SetLocations(nullptr); +} + +void InstructionCodeGeneratorARM64::VisitConstructorFence( + HConstructorFence* constructor_fence ATTRIBUTE_UNUSED) { + codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); +} + void LocationsBuilderARM64::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) { memory_barrier->SetLocations(nullptr); } @@ -5929,9 +5940,9 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad( !Runtime::Current()->UseJitCompilation()) { // Note that we do not actually check the value of `GetIsGcMarking()` // to decide whether to mark the loaded GC root or not. Instead, we - // load into `temp` the read barrier mark introspection entrypoint. - // If `temp` is null, it means that `GetIsGcMarking()` is false, and - // vice versa. + // load into `temp` (actually IP1) the read barrier mark introspection + // entrypoint. If `temp` is null, it means that `GetIsGcMarking()` is + // false, and vice versa. // // We use link-time generated thunks for the slow path. That thunk // checks the reference and jumps to the entrypoint if needed. @@ -6055,24 +6066,24 @@ void CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier(HInstruction* ins !use_load_acquire && !Runtime::Current()->UseJitCompilation()) { // Note that we do not actually check the value of `GetIsGcMarking()` - // to decide whether to mark the loaded GC root or not. Instead, we - // load into `temp` the read barrier mark introspection entrypoint. - // If `temp` is null, it means that `GetIsGcMarking()` is false, and - // vice versa. + // to decide whether to mark the loaded reference or not. Instead, we + // load into `temp` (actually IP1) the read barrier mark introspection + // entrypoint. If `temp` is null, it means that `GetIsGcMarking()` is + // false, and vice versa. // // We use link-time generated thunks for the slow path. That thunk checks // the holder and jumps to the entrypoint if needed. If the holder is not // gray, it creates a fake dependency and returns to the LDR instruction. // // temp = Thread::Current()->pReadBarrierMarkIntrospection - // lr = &return_address; + // lr = &gray_return_address; // if (temp != nullptr) { // goto field_thunk<holder_reg, base_reg>(lr) // } // not_gray_return_address: // // Original reference load. If the offset is too large to fit // // into LDR, we use an adjusted base register here. - // GcRoot<mirror::Object> root = *(obj+offset); + // HeapReference<mirror::Object> reference = *(obj+offset); // gray_return_address: DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>)); @@ -6142,16 +6153,74 @@ void CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier(HInstruction* ins DCHECK(kEmitCompilerReadBarrier); DCHECK(kUseBakerReadBarrier); + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + size_t scale_factor = Primitive::ComponentSizeShift(Primitive::kPrimNot); + + if (kBakerReadBarrierLinkTimeThunksEnableForArrays && + !Runtime::Current()->UseJitCompilation()) { + // Note that we do not actually check the value of `GetIsGcMarking()` + // to decide whether to mark the loaded reference or not. 
Instead, we + // load into `temp` (actually IP1) the read barrier mark introspection + // entrypoint. If `temp` is null, it means that `GetIsGcMarking()` is + // false, and vice versa. + // + // We use link-time generated thunks for the slow path. That thunk checks + // the holder and jumps to the entrypoint if needed. If the holder is not + // gray, it creates a fake dependency and returns to the LDR instruction. + // + // temp = Thread::Current()->pReadBarrierMarkIntrospection + // lr = &gray_return_address; + // if (temp != nullptr) { + // goto field_thunk<holder_reg, base_reg>(lr) + // } + // not_gray_return_address: + // // Original reference load. If the offset is too large to fit + // // into LDR, we use an adjusted base register here. + // HeapReference<mirror::Object> reference = data[index]; + // gray_return_address: + + DCHECK(index.IsValid()); + Register index_reg = RegisterFrom(index, Primitive::kPrimInt); + Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot); + + UseScratchRegisterScope temps(GetVIXLAssembler()); + DCHECK(temps.IsAvailable(ip0)); + DCHECK(temps.IsAvailable(ip1)); + temps.Exclude(ip0, ip1); + uint32_t custom_data = + linker::Arm64RelativePatcher::EncodeBakerReadBarrierArrayData(temp.GetCode()); + vixl::aarch64::Label* cbnz_label = NewBakerReadBarrierPatch(custom_data); + + // ip1 = Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection. + DCHECK_EQ(ip0.GetCode(), 16u); + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ip0.GetCode()); + __ Ldr(ip1, MemOperand(tr, entry_point_offset)); + __ Add(temp.X(), obj.X(), Operand(data_offset)); + EmissionCheckScope guard(GetVIXLAssembler(), + (kPoisonHeapReferences ? 4u : 3u) * vixl::aarch64::kInstructionSize); + vixl::aarch64::Label return_address; + __ adr(lr, &return_address); + __ Bind(cbnz_label); + __ cbnz(ip1, static_cast<int64_t>(0)); // Placeholder, patched at link-time. + static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4), + "Array LDR must be 1 instruction (4B) before the return address label; " + " 2 instructions (8B) for heap poisoning."); + __ ldr(ref_reg, MemOperand(temp.X(), index_reg.X(), LSL, scale_factor)); + DCHECK(!needs_null_check); // The thunk cannot handle the null check. + GetAssembler()->MaybeUnpoisonHeapReference(ref_reg); + __ Bind(&return_address); + return; + } + // Array cells are never volatile variables, therefore array loads // never use Load-Acquire instructions on ARM64. const bool use_load_acquire = false; - static_assert( - sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), - "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); // /* HeapReference<Object> */ ref = // *(obj + data_offset + index * sizeof(HeapReference<Object>)) - size_t scale_factor = Primitive::ComponentSizeShift(Primitive::kPrimNot); GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, @@ -6171,8 +6240,7 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* size_t scale_factor, Register temp, bool needs_null_check, - bool use_load_acquire, - bool always_update_field) { + bool use_load_acquire) { DCHECK(kEmitCompilerReadBarrier); DCHECK(kUseBakerReadBarrier); // If we are emitting an array load, we should not be using a @@ -6209,41 +6277,18 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* // entrypoint will already be loaded in `temp2`. 
Register temp2 = lr; Location temp2_loc = LocationFrom(temp2); - SlowPathCodeARM64* slow_path; - if (always_update_field) { - // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64 - // only supports address of the form `obj + field_offset`, where - // `obj` is a register and `field_offset` is a register. Thus - // `offset` and `scale_factor` above are expected to be null in - // this code path. - DCHECK_EQ(offset, 0u); - DCHECK_EQ(scale_factor, 0u); /* "times 1" */ - Location field_offset = index; - slow_path = - new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64( - instruction, - ref, - obj, - offset, - /* index */ field_offset, - scale_factor, - needs_null_check, - use_load_acquire, - temp, - /* entrypoint */ temp2_loc); - } else { - slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM64( - instruction, - ref, - obj, - offset, - index, - scale_factor, - needs_null_check, - use_load_acquire, - temp, - /* entrypoint */ temp2_loc); - } + SlowPathCodeARM64* slow_path = + new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM64( + instruction, + ref, + obj, + offset, + index, + scale_factor, + needs_null_check, + use_load_acquire, + temp, + /* entrypoint */ temp2_loc); AddSlowPath(slow_path); // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() @@ -6255,12 +6300,83 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* // The entrypoint is null when the GC is not marking, this prevents one load compared to // checking GetIsGcMarking. __ Cbnz(temp2, slow_path->GetEntryLabel()); - // Fast path: just load the reference. + // Fast path: the GC is not marking: just load the reference. GenerateRawReferenceLoad( instruction, ref, obj, offset, index, scale_factor, needs_null_check, use_load_acquire); __ Bind(slow_path->GetExitLabel()); } +void CodeGeneratorARM64::UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + Location field_offset, + Register temp, + bool needs_null_check, + bool use_load_acquire) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + // If we are emitting an array load, we should not be using a + // Load Acquire instruction. In other words: + // `instruction->IsArrayGet()` => `!use_load_acquire`. + DCHECK(!instruction->IsArrayGet() || !use_load_acquire); + + // Query `art::Thread::Current()->GetIsGcMarking()` to decide + // whether we need to enter the slow path to update the reference + // field within `obj`. Then, in the slow path, check the gray bit + // in the lock word of the reference's holder (`obj`) to decide + // whether to mark `ref` and update the field or not. + // + // Note that we do not actually check the value of `GetIsGcMarking()`; + // instead, we load into `temp2` the read barrier mark entry point + // corresponding to register `ref`. If `temp2` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. + // + // temp2 = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp2 != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *(obj + field_offset); // Reference load. 
+ // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // old_ref = ref; + // ref = temp2(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // compareAndSwapObject(obj, field_offset, old_ref, ref); + // } + // } + + // Slow path updating the object reference at address `obj + field_offset` + // when the GC is marking. The entrypoint will already be loaded in `temp2`. + Register temp2 = lr; + Location temp2_loc = LocationFrom(temp2); + SlowPathCodeARM64* slow_path = + new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64( + instruction, + ref, + obj, + /* offset */ 0u, + /* index */ field_offset, + /* scale_factor */ 0u /* "times 1" */, + needs_null_check, + use_load_acquire, + temp, + /* entrypoint */ temp2_loc); + AddSlowPath(slow_path); + + // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ Ldr(temp2, MemOperand(tr, entry_point_offset)); + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ Cbnz(temp2, slow_path->GetEntryLabel()); + // Fast path: the GC is not marking: nothing to do (the field is + // up-to-date, and we don't need to load the reference). + __ Bind(slow_path->GetExitLabel()); +} + void CodeGeneratorARM64::GenerateRawReferenceLoad(HInstruction* instruction, Location ref, Register obj, @@ -6436,14 +6552,20 @@ static void PatchJitRootUse(uint8_t* code, void CodeGeneratorARM64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + vixl::aarch64::Literal<uint32_t>* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + vixl::aarch64::Literal<uint32_t>* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } } diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 332ab49153..747fc9f0b1 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -25,8 +25,8 @@ #include "nodes.h" #include "parallel_move_resolver.h" #include "string_reference.h" +#include "type_reference.h" #include "utils/arm64/assembler_arm64.h" -#include "utils/type_reference.h" // TODO(VIXL): Make VIXL compile with -Wshadow. 
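The pseudocode above spells out what the new UpdateReferenceFieldWithBakerReadBarrier slow path does: if the holder is gray, mark the loaded reference and publish the marked value back into the field with a compare-and-swap. A standalone sketch of that contract using std::atomic (plain C++; Object, IsGray, Mark and UpdateFieldSlowPath are placeholders, not ART types):

#include <atomic>
#include <iostream>

struct Object {
  std::atomic<Object*> field{nullptr};
};

// Stand-ins: IsGray would read the lock word's read barrier state, Mark would
// return the to-space copy of `ref`. Both are trivialized here.
bool IsGray(const Object* holder) { return holder != nullptr; }
Object* Mark(Object* ref) { return ref; }

// Models the slow path only; the fast path emitted above does nothing at all,
// since the field is already up to date when the GC is not marking.
void UpdateFieldSlowPath(Object* holder, std::atomic<Object*>* field) {
  // The real code loads holder->monitor_ first and orders it before the
  // reference load (lfence or an address dependency); that ordering is
  // collapsed here.
  bool is_gray = IsGray(holder);
  Object* old_ref = field->load(std::memory_order_acquire);  // Reference load.
  if (is_gray) {
    Object* marked = Mark(old_ref);  // ref = ReadBarrier::Mark(ref)
    // compareAndSwapObject(obj, field_offset, old_ref, ref): publish only if
    // the field still holds the value that was marked.
    field->compare_exchange_strong(old_ref, marked);
  }
}

int main() {
  Object holder;
  Object referent;
  holder.field.store(&referent);
  UpdateFieldSlowPath(&holder, &holder.field);
  std::cout << (holder.field.load() == &referent) << '\n';  // 1
  return 0;
}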
#pragma GCC diagnostic push @@ -318,12 +318,13 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { void GenerateDivRemIntegral(HBinaryOperation* instruction); void HandleGoto(HInstruction* got, HBasicBlock* successor); - vixl::aarch64::MemOperand CreateVecMemRegisters( + vixl::aarch64::MemOperand VecAddress( HVecMemoryOperation* instruction, - Location* reg_loc, - bool is_load, // This function may acquire a scratch register. - vixl::aarch64::UseScratchRegisterScope* temps_scope); + vixl::aarch64::UseScratchRegisterScope* temps_scope, + size_t size, + bool is_string_char_at, + /*out*/ vixl::aarch64::Register* scratch); Arm64Assembler* const assembler_; CodeGeneratorARM64* const codegen_; @@ -539,7 +540,6 @@ class CodeGeneratorARM64 : public CodeGenerator { const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info, HInvokeStaticOrDirect* invoke) OVERRIDE; - Location GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp); void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) OVERRIDE; void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE; @@ -548,12 +548,11 @@ class CodeGeneratorARM64 : public CodeGenerator { UNIMPLEMENTED(FATAL); } - // Add a new PC-relative string patch for an instruction and return the label + // Add a new PC-relative method patch for an instruction and return the label // to be bound before the instruction. The instruction will be either the // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing // to the associated ADRP patch label). - vixl::aarch64::Label* NewPcRelativeStringPatch(const DexFile& dex_file, - dex::StringIndex string_index, + vixl::aarch64::Label* NewPcRelativeMethodPatch(MethodReference target_method, vixl::aarch64::Label* adrp_label = nullptr); // Add a new PC-relative type patch for an instruction and return the label @@ -572,6 +571,14 @@ class CodeGeneratorARM64 : public CodeGenerator { dex::TypeIndex type_index, vixl::aarch64::Label* adrp_label = nullptr); + // Add a new PC-relative string patch for an instruction and return the label + // to be bound before the instruction. The instruction will be either the + // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing + // to the associated ADRP patch label). + vixl::aarch64::Label* NewPcRelativeStringPatch(const DexFile& dex_file, + dex::StringIndex string_index, + vixl::aarch64::Label* adrp_label = nullptr); + // Add a new PC-relative dex cache array patch for an instruction and return // the label to be bound before the instruction. The instruction will be // either the ADRP (pass `adrp_label = null`) or the LDR (pass `adrp_label` @@ -585,11 +592,6 @@ class CodeGeneratorARM64 : public CodeGenerator { // before the CBNZ instruction. 
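Each of the NewPcRelative*Patch comments above refers to the same AArch64 idiom: an ADRP that forms the 4 KiB page of the target relative to the current PC and an ADD (or LDR) carrying the low 12 bits, with both immediates filled in at link time; passing adrp_label to the second call is what ties the two patches together. A standalone model of the address arithmetic (plain C++; AdrpAddTarget and the example addresses are made up):

#include <cassert>
#include <cstdint>

// ADRP Xd, target ; ADD Xd, Xd, #:lo12:target, modelled as plain arithmetic.
uint64_t AdrpAddTarget(uint64_t pc, uint64_t target) {
  // Immediate the linker encodes into the ADRP: the signed page delta.
  int64_t page_delta = static_cast<int64_t>(target >> 12) - static_cast<int64_t>(pc >> 12);
  // ADRP: current page base plus the page delta (modular arithmetic is fine here).
  uint64_t page = (pc & ~UINT64_C(0xfff)) + (static_cast<uint64_t>(page_delta) << 12);
  // ADD with the low 12 bits of the target.
  return page + (target & UINT64_C(0xfff));
}

int main() {
  assert(AdrpAddTarget(0x0000007012345678u, 0x00000070deadbeefu) == 0x00000070deadbeefu);
  assert(AdrpAddTarget(0x00000070deadbeefu, 0x0000007012345678u) == 0x0000007012345678u);
  return 0;
}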
vixl::aarch64::Label* NewBakerReadBarrierPatch(uint32_t custom_data); - vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageStringLiteral( - const DexFile& dex_file, - dex::StringIndex string_index); - vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, - dex::TypeIndex type_index); vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageAddressLiteral(uint64_t address); vixl::aarch64::Literal<uint32_t>* DeduplicateJitStringLiteral(const DexFile& dex_file, dex::StringIndex string_index, @@ -634,9 +636,6 @@ class CodeGeneratorARM64 : public CodeGenerator { // Load the object reference located at the address // `obj + offset + (index << scale_factor)`, held by object `obj`, into // `ref`, and mark it if needed. - // - // If `always_update_field` is true, the value of the reference is - // atomically updated in the holder (`obj`). void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, vixl::aarch64::Register obj, @@ -645,8 +644,27 @@ class CodeGeneratorARM64 : public CodeGenerator { size_t scale_factor, vixl::aarch64::Register temp, bool needs_null_check, - bool use_load_acquire, - bool always_update_field = false); + bool use_load_acquire); + + // Generate code checking whether the the reference field at the + // address `obj + field_offset`, held by object `obj`, needs to be + // marked, and if so, marking it and updating the field within `obj` + // with the marked value. + // + // This routine is used for the implementation of the + // UnsafeCASObject intrinsic with Baker read barriers. + // + // This method has a structure similar to + // GenerateReferenceLoadWithBakerReadBarrier, but note that argument + // `ref` is only as a temporary here, and thus its value should not + // be used afterwards. + void UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction, + Location ref, + vixl::aarch64::Register obj, + Location field_offset, + vixl::aarch64::Register temp, + bool needs_null_check, + bool use_load_acquire); // Generate a heap reference load (with no read barrier). void GenerateRawReferenceLoad(HInstruction* instruction, @@ -714,9 +732,6 @@ class CodeGeneratorARM64 : public CodeGenerator { private: using Uint64ToLiteralMap = ArenaSafeMap<uint64_t, vixl::aarch64::Literal<uint64_t>*>; using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, vixl::aarch64::Literal<uint32_t>*>; - using MethodToLiteralMap = ArenaSafeMap<MethodReference, - vixl::aarch64::Literal<uint64_t>*, - MethodReferenceComparator>; using StringToLiteralMap = ArenaSafeMap<StringReference, vixl::aarch64::Literal<uint32_t>*, StringReferenceValueComparator>; @@ -727,8 +742,6 @@ class CodeGeneratorARM64 : public CodeGenerator { vixl::aarch64::Literal<uint32_t>* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map); vixl::aarch64::Literal<uint64_t>* DeduplicateUint64Literal(uint64_t value); - vixl::aarch64::Literal<uint64_t>* DeduplicateMethodLiteral(MethodReference target_method, - MethodToLiteralMap* map); // The PcRelativePatchInfo is used for PC-relative addressing of dex cache arrays // and boot image strings/types. The only difference is the interpretation of the @@ -780,16 +793,14 @@ class CodeGeneratorARM64 : public CodeGenerator { Uint64ToLiteralMap uint64_literals_; // PC-relative DexCache access info. ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_; - // Deduplication map for boot string literals for kBootImageLinkTimeAddress. 
- StringToLiteralMap boot_image_string_patches_; - // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC). - ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_; - // Deduplication map for boot type literals for kBootImageLinkTimeAddress. - TypeToLiteralMap boot_image_type_patches_; + // PC-relative method patch info for kBootImageLinkTimePcRelative. + ArenaDeque<PcRelativePatchInfo> pc_relative_method_patches_; // PC-relative type patch info for kBootImageLinkTimePcRelative. ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; + // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC). + ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_; // Baker read barrier patch info. ArenaDeque<BakerReadBarrierPatchInfo> baker_read_barrier_patches_; diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index c37cc52a2b..93cbc3b17c 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -16,6 +16,7 @@ #include "code_generator_arm_vixl.h" +#include "arch/arm/asm_support_arm.h" #include "arch/arm/instruction_set_features_arm.h" #include "art_method.h" #include "code_generator_utils.h" @@ -24,6 +25,7 @@ #include "entrypoints/quick/quick_entrypoints.h" #include "gc/accounting/card_table.h" #include "intrinsics_arm_vixl.h" +#include "linker/arm/relative_patcher_thumb2.h" #include "mirror/array-inl.h" #include "mirror/class-inl.h" #include "thread.h" @@ -77,6 +79,20 @@ static constexpr size_t kArmBitsPerWord = kArmWordSize * kBitsPerByte; static constexpr int kCurrentMethodStackOffset = 0; static constexpr uint32_t kPackedSwitchCompareJumpThreshold = 7; +// Reference load (except object array loads) is using LDR Rt, [Rn, #offset] which can handle +// offset < 4KiB. For offsets >= 4KiB, the load shall be emitted as two or more instructions. +// For the Baker read barrier implementation using link-generated thunks we need to split +// the offset explicitly. +constexpr uint32_t kReferenceLoadMinFarOffset = 4 * KB; + +// Flags controlling the use of link-time generated thunks for Baker read barriers. +constexpr bool kBakerReadBarrierLinkTimeThunksEnableForFields = true; +constexpr bool kBakerReadBarrierLinkTimeThunksEnableForArrays = true; +constexpr bool kBakerReadBarrierLinkTimeThunksEnableForGcRoots = true; + +// The reserved entrypoint register for link-time generated thunks. +const vixl32::Register kBakerCcEntrypointRegister = r4; + #ifdef __ #error "ARM Codegen VIXL macro-assembler macro already defined." #endif @@ -88,6 +104,60 @@ static constexpr uint32_t kPackedSwitchCompareJumpThreshold = 7; // Marker that code is yet to be, and must, be implemented. 
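kReferenceLoadMinFarOffset above is what GenerateFieldLoadWithBakerReadBarrier later in this file uses to split an offset that a single LDR immediate cannot encode: the aligned upper part goes into an ADD that forms an adjusted base register, the remainder stays as the LDR offset. A standalone restatement of that split (plain C++; SplitOffset and Split are illustrative names, not ART code):

#include <cstdint>

constexpr uint32_t kReferenceLoadMinFarOffset = 4u * 1024u;  // Same value as above.
static_assert((kReferenceLoadMinFarOffset & (kReferenceLoadMinFarOffset - 1u)) == 0u,
              "Expecting a power of 2.");

struct SplitOffset {
  uint32_t base_adjustment;  // Added to the holder: ADD base, obj, #base_adjustment.
  uint32_t ldr_offset;       // Remaining immediate for the LDR itself.
};

constexpr SplitOffset Split(uint32_t offset) {
  return { offset & ~(kReferenceLoadMinFarOffset - 1u),
           offset & (kReferenceLoadMinFarOffset - 1u) };
}

int main() {
  static_assert(Split(0x123u).base_adjustment == 0u, "near offsets need no extra ADD");
  static_assert(Split(0x1234u).base_adjustment == 0x1000u, "");
  static_assert(Split(0x1234u).ldr_offset == 0x234u, "");
  static_assert(Split(0x1234u).base_adjustment + Split(0x1234u).ldr_offset == 0x1234u, "");
  return 0;
}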
#define TODO_VIXL32(level) LOG(level) << __PRETTY_FUNCTION__ << " unimplemented " +static inline void ExcludeIPAndBakerCcEntrypointRegister(UseScratchRegisterScope* temps, + HInstruction* instruction) { + DCHECK(temps->IsAvailable(ip)); + temps->Exclude(ip); + DCHECK(!temps->IsAvailable(kBakerCcEntrypointRegister)); + DCHECK_EQ(kBakerCcEntrypointRegister.GetCode(), + linker::Thumb2RelativePatcher::kBakerCcEntrypointRegister); + DCHECK_NE(instruction->GetLocations()->GetTempCount(), 0u); + DCHECK(RegisterFrom(instruction->GetLocations()->GetTemp( + instruction->GetLocations()->GetTempCount() - 1u)).Is(kBakerCcEntrypointRegister)); +} + +static inline void EmitPlaceholderBne(CodeGeneratorARMVIXL* codegen, vixl32::Label* patch_label) { + ExactAssemblyScope eas(codegen->GetVIXLAssembler(), kMaxInstructionSizeInBytes); + __ bind(patch_label); + vixl32::Label placeholder_label; + __ b(ne, EncodingSize(Wide), &placeholder_label); // Placeholder, patched at link-time. + __ bind(&placeholder_label); +} + +static inline bool CanEmitNarrowLdr(vixl32::Register rt, vixl32::Register rn, uint32_t offset) { + return rt.IsLow() && rn.IsLow() && offset < 32u; +} + +class EmitAdrCode { + public: + EmitAdrCode(ArmVIXLMacroAssembler* assembler, vixl32::Register rd, vixl32::Label* label) + : assembler_(assembler), rd_(rd), label_(label) { + ExactAssemblyScope aas(assembler, kMaxInstructionSizeInBytes); + adr_location_ = assembler->GetCursorOffset(); + assembler->adr(EncodingSize(Wide), rd, label); + } + + ~EmitAdrCode() { + DCHECK(label_->IsBound()); + // The ADR emitted by the assembler does not set the Thumb mode bit we need. + // TODO: Maybe extend VIXL to allow ADR for return address? + uint8_t* raw_adr = assembler_->GetBuffer()->GetOffsetAddress<uint8_t*>(adr_location_); + // Expecting ADR encoding T3 with `(offset & 1) == 0`. + DCHECK_EQ(raw_adr[1] & 0xfbu, 0xf2u); // Check bits 24-31, except 26. + DCHECK_EQ(raw_adr[0] & 0xffu, 0x0fu); // Check bits 16-23. + DCHECK_EQ(raw_adr[3] & 0x8fu, rd_.GetCode()); // Check bits 8-11 and 15. + DCHECK_EQ(raw_adr[2] & 0x01u, 0x00u); // Check bit 0, i.e. the `offset & 1`. + // Add the Thumb mode bit. + raw_adr[2] |= 0x01u; + } + + private: + ArmVIXLMacroAssembler* const assembler_; + vixl32::Register rd_; + vixl32::Label* const label_; + int32_t adr_location_; +}; + // SaveLiveRegisters and RestoreLiveRegisters from SlowPathCodeARM operate on sets of S registers, // for each live D registers they treat two corresponding S registers as live ones. // @@ -851,7 +921,7 @@ class LoadReferenceWithBakerReadBarrierSlowPathARMVIXL : public ReadBarrierMarkS // Baker's read barriers, we need to perform the load of // mirror::Object::monitor_ *before* the original reference load. // This load-load ordering is required by the read barrier. - // The fast path/slow path (for Baker's algorithm) should look like: + // The slow path (for Baker's algorithm) should look like: // // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); // lfence; // Load fence or artificial data dependency to prevent load-load reordering @@ -993,6 +1063,18 @@ class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL __ Bind(GetEntryLabel()); + // The implementation is similar to LoadReferenceWithBakerReadBarrierSlowPathARMVIXL's: + // + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. 
+ // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // old_ref = ref; + // ref = entrypoint(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // compareAndSwapObject(obj, field_offset, old_ref, ref); + // } + CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen); // /* int32_t */ monitor = obj->monitor_ @@ -1693,6 +1775,34 @@ static void GenerateVcmp(HInstruction* instruction, CodeGeneratorARMVIXL* codege } } +static int64_t AdjustConstantForCondition(int64_t value, + IfCondition* condition, + IfCondition* opposite) { + if (value == 1) { + if (*condition == kCondB) { + value = 0; + *condition = kCondEQ; + *opposite = kCondNE; + } else if (*condition == kCondAE) { + value = 0; + *condition = kCondNE; + *opposite = kCondEQ; + } + } else if (value == -1) { + if (*condition == kCondGT) { + value = 0; + *condition = kCondGE; + *opposite = kCondLT; + } else if (*condition == kCondLE) { + value = 0; + *condition = kCondLT; + *opposite = kCondGE; + } + } + + return value; +} + static std::pair<vixl32::Condition, vixl32::Condition> GenerateLongTestConstant( HCondition* condition, bool invert, @@ -1715,7 +1825,37 @@ static std::pair<vixl32::Condition, vixl32::Condition> GenerateLongTestConstant( const vixl32::Register left_high = HighRegisterFrom(left); const vixl32::Register left_low = LowRegisterFrom(left); - int64_t value = Int64ConstantFrom(right); + int64_t value = AdjustConstantForCondition(Int64ConstantFrom(right), &cond, &opposite); + UseScratchRegisterScope temps(codegen->GetVIXLAssembler()); + + // Comparisons against 0 are common enough to deserve special attention. + if (value == 0) { + switch (cond) { + case kCondNE: + // x > 0 iff x != 0 when the comparison is unsigned. + case kCondA: + ret = std::make_pair(ne, eq); + FALLTHROUGH_INTENDED; + case kCondEQ: + // x <= 0 iff x == 0 when the comparison is unsigned. + case kCondBE: + __ Orrs(temps.Acquire(), left_low, left_high); + return ret; + case kCondLT: + case kCondGE: + __ Cmp(left_high, 0); + return std::make_pair(ARMCondition(cond), ARMCondition(opposite)); + // Trivially true or false. 
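AdjustConstantForCondition above rewrites comparisons against 1 and -1 into comparisons against 0, which is what lets the zero-comparison fast paths added in this patch kick in. A standalone spot check of the four equivalences it relies on (plain C++, no ART types):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const std::vector<int64_t> samples = {INT64_MIN, -2, -1, 0, 1, 2, INT64_MAX};
  for (int64_t x : samples) {
    uint64_t ux = static_cast<uint64_t>(x);
    assert((ux < 1u) == (x == 0));   // kCondB,  constant 1  ->  kCondEQ, constant 0
    assert((ux >= 1u) == (x != 0));  // kCondAE, constant 1  ->  kCondNE, constant 0
    assert((x > -1) == (x >= 0));    // kCondGT, constant -1 ->  kCondGE, constant 0
    assert((x <= -1) == (x < 0));    // kCondLE, constant -1 ->  kCondLT, constant 0
  }
  return 0;
}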
+ case kCondB: + ret = std::make_pair(ne, eq); + FALLTHROUGH_INTENDED; + case kCondAE: + __ Cmp(left_low, left_low); + return ret; + default: + break; + } + } switch (cond) { case kCondEQ: @@ -1760,8 +1900,6 @@ static std::pair<vixl32::Condition, vixl32::Condition> GenerateLongTestConstant( FALLTHROUGH_INTENDED; case kCondGE: case kCondLT: { - UseScratchRegisterScope temps(codegen->GetVIXLAssembler()); - __ Cmp(left_low, Low32Bits(value)); __ Sbcs(temps.Acquire(), left_high, High32Bits(value)); ret = std::make_pair(ARMCondition(cond), ARMCondition(opposite)); @@ -1879,18 +2017,22 @@ static std::pair<vixl32::Condition, vixl32::Condition> GenerateTest(HCondition* static bool CanGenerateTest(HCondition* condition, ArmVIXLAssembler* assembler) { if (condition->GetLeft()->GetType() == Primitive::kPrimLong) { const LocationSummary* const locations = condition->GetLocations(); - const IfCondition c = condition->GetCondition(); if (locations->InAt(1).IsConstant()) { - const int64_t value = Int64ConstantFrom(locations->InAt(1)); + IfCondition c = condition->GetCondition(); + IfCondition opposite = condition->GetOppositeCondition(); + const int64_t value = + AdjustConstantForCondition(Int64ConstantFrom(locations->InAt(1)), &c, &opposite); if (c < kCondLT || c > kCondGE) { // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8, // we check that the least significant half of the first input to be compared // is in a low register (the other half is read outside an IT block), and // the constant fits in an 8-bit unsigned integer, so that a 16-bit CMP - // encoding can be used. - if (!LowRegisterFrom(locations->InAt(0)).IsLow() || !IsUint<8>(Low32Bits(value))) { + // encoding can be used; 0 is always handled, no matter what registers are + // used by the first input. + if (value != 0 && + (!LowRegisterFrom(locations->InAt(0)).IsLow() || !IsUint<8>(Low32Bits(value)))) { return false; } // TODO(VIXL): The rest of the checks are there to keep the backend in sync with @@ -1909,6 +2051,354 @@ static bool CanGenerateTest(HCondition* condition, ArmVIXLAssembler* assembler) return true; } +static void GenerateConditionGeneric(HCondition* cond, CodeGeneratorARMVIXL* codegen) { + DCHECK(CanGenerateTest(cond, codegen->GetAssembler())); + + const vixl32::Register out = OutputRegister(cond); + const auto condition = GenerateTest(cond, false, codegen); + + __ Mov(LeaveFlags, out, 0); + + if (out.IsLow()) { + // We use the scope because of the IT block that follows. 
+ ExactAssemblyScope guard(codegen->GetVIXLAssembler(), + 2 * vixl32::k16BitT32InstructionSizeInBytes, + CodeBufferCheckScope::kExactSize); + + __ it(condition.first); + __ mov(condition.first, out, 1); + } else { + vixl32::Label done_label; + vixl32::Label* const final_label = codegen->GetFinalLabel(cond, &done_label); + + __ B(condition.second, final_label, /* far_target */ false); + __ Mov(out, 1); + + if (done_label.IsReferenced()) { + __ Bind(&done_label); + } + } +} + +static void GenerateEqualLong(HCondition* cond, CodeGeneratorARMVIXL* codegen) { + DCHECK_EQ(cond->GetLeft()->GetType(), Primitive::kPrimLong); + + const LocationSummary* const locations = cond->GetLocations(); + IfCondition condition = cond->GetCondition(); + const vixl32::Register out = OutputRegister(cond); + const Location left = locations->InAt(0); + const Location right = locations->InAt(1); + vixl32::Register left_high = HighRegisterFrom(left); + vixl32::Register left_low = LowRegisterFrom(left); + vixl32::Register temp; + UseScratchRegisterScope temps(codegen->GetVIXLAssembler()); + + if (right.IsConstant()) { + IfCondition opposite = cond->GetOppositeCondition(); + const int64_t value = AdjustConstantForCondition(Int64ConstantFrom(right), + &condition, + &opposite); + Operand right_high = High32Bits(value); + Operand right_low = Low32Bits(value); + + // The output uses Location::kNoOutputOverlap. + if (out.Is(left_high)) { + std::swap(left_low, left_high); + std::swap(right_low, right_high); + } + + __ Sub(out, left_low, right_low); + temp = temps.Acquire(); + __ Sub(temp, left_high, right_high); + } else { + DCHECK(right.IsRegisterPair()); + temp = temps.Acquire(); + __ Sub(temp, left_high, HighRegisterFrom(right)); + __ Sub(out, left_low, LowRegisterFrom(right)); + } + + // Need to check after calling AdjustConstantForCondition(). + DCHECK(condition == kCondEQ || condition == kCondNE) << condition; + + if (condition == kCondNE && out.IsLow()) { + __ Orrs(out, out, temp); + + // We use the scope because of the IT block that follows. + ExactAssemblyScope guard(codegen->GetVIXLAssembler(), + 2 * vixl32::k16BitT32InstructionSizeInBytes, + CodeBufferCheckScope::kExactSize); + + __ it(ne); + __ mov(ne, out, 1); + } else { + __ Orr(out, out, temp); + codegen->GenerateConditionWithZero(condition, out, out, temp); + } +} + +static void GenerateLongComparesAndJumps(HCondition* cond, + vixl32::Label* true_label, + vixl32::Label* false_label, + CodeGeneratorARMVIXL* codegen, + bool is_far_target = true) { + LocationSummary* locations = cond->GetLocations(); + Location left = locations->InAt(0); + Location right = locations->InAt(1); + IfCondition if_cond = cond->GetCondition(); + + vixl32::Register left_high = HighRegisterFrom(left); + vixl32::Register left_low = LowRegisterFrom(left); + IfCondition true_high_cond = if_cond; + IfCondition false_high_cond = cond->GetOppositeCondition(); + vixl32::Condition final_condition = ARMUnsignedCondition(if_cond); // unsigned on lower part + + // Set the conditions for the test, remembering that == needs to be + // decided using the low words. + switch (if_cond) { + case kCondEQ: + case kCondNE: + // Nothing to do. 
+ break; + case kCondLT: + false_high_cond = kCondGT; + break; + case kCondLE: + true_high_cond = kCondLT; + break; + case kCondGT: + false_high_cond = kCondLT; + break; + case kCondGE: + true_high_cond = kCondGT; + break; + case kCondB: + false_high_cond = kCondA; + break; + case kCondBE: + true_high_cond = kCondB; + break; + case kCondA: + false_high_cond = kCondB; + break; + case kCondAE: + true_high_cond = kCondA; + break; + } + if (right.IsConstant()) { + int64_t value = Int64ConstantFrom(right); + int32_t val_low = Low32Bits(value); + int32_t val_high = High32Bits(value); + + __ Cmp(left_high, val_high); + if (if_cond == kCondNE) { + __ B(ARMCondition(true_high_cond), true_label, is_far_target); + } else if (if_cond == kCondEQ) { + __ B(ARMCondition(false_high_cond), false_label, is_far_target); + } else { + __ B(ARMCondition(true_high_cond), true_label, is_far_target); + __ B(ARMCondition(false_high_cond), false_label, is_far_target); + } + // Must be equal high, so compare the lows. + __ Cmp(left_low, val_low); + } else { + vixl32::Register right_high = HighRegisterFrom(right); + vixl32::Register right_low = LowRegisterFrom(right); + + __ Cmp(left_high, right_high); + if (if_cond == kCondNE) { + __ B(ARMCondition(true_high_cond), true_label, is_far_target); + } else if (if_cond == kCondEQ) { + __ B(ARMCondition(false_high_cond), false_label, is_far_target); + } else { + __ B(ARMCondition(true_high_cond), true_label, is_far_target); + __ B(ARMCondition(false_high_cond), false_label, is_far_target); + } + // Must be equal high, so compare the lows. + __ Cmp(left_low, right_low); + } + // The last comparison might be unsigned. + // TODO: optimize cases where this is always true/false + __ B(final_condition, true_label, is_far_target); +} + +static void GenerateConditionLong(HCondition* cond, CodeGeneratorARMVIXL* codegen) { + DCHECK_EQ(cond->GetLeft()->GetType(), Primitive::kPrimLong); + + const LocationSummary* const locations = cond->GetLocations(); + IfCondition condition = cond->GetCondition(); + const vixl32::Register out = OutputRegister(cond); + const Location left = locations->InAt(0); + const Location right = locations->InAt(1); + + if (right.IsConstant()) { + IfCondition opposite = cond->GetOppositeCondition(); + + // Comparisons against 0 are common enough to deserve special attention. + if (AdjustConstantForCondition(Int64ConstantFrom(right), &condition, &opposite) == 0) { + switch (condition) { + case kCondNE: + case kCondA: + if (out.IsLow()) { + // We only care if both input registers are 0 or not. + __ Orrs(out, LowRegisterFrom(left), HighRegisterFrom(left)); + + // We use the scope because of the IT block that follows. + ExactAssemblyScope guard(codegen->GetVIXLAssembler(), + 2 * vixl32::k16BitT32InstructionSizeInBytes, + CodeBufferCheckScope::kExactSize); + + __ it(ne); + __ mov(ne, out, 1); + return; + } + + FALLTHROUGH_INTENDED; + case kCondEQ: + case kCondBE: + // We only care if both input registers are 0 or not. + __ Orr(out, LowRegisterFrom(left), HighRegisterFrom(left)); + codegen->GenerateConditionWithZero(condition, out, out); + return; + case kCondLT: + case kCondGE: + // We only care about the sign bit. 
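GenerateLongComparesAndJumps above decides a 64-bit comparison with at most two 32-bit compares: the high words under the (possibly adjusted) signed condition, then the low words under an unsigned condition only when the high words are equal. A standalone model of the signed less-than case (plain C++; LessThan64ViaHalves is an illustrative name, not the VIXL API):

#include <cassert>
#include <cstdint>
#include <vector>

bool LessThan64ViaHalves(int64_t a, int64_t b) {
  int32_t a_high = static_cast<int32_t>(static_cast<uint64_t>(a) >> 32);
  int32_t b_high = static_cast<int32_t>(static_cast<uint64_t>(b) >> 32);
  uint32_t a_low = static_cast<uint32_t>(static_cast<uint64_t>(a));
  uint32_t b_low = static_cast<uint32_t>(static_cast<uint64_t>(b));
  if (a_high < b_high) return true;   // B(ARMCondition(true_high_cond), true_label)
  if (a_high > b_high) return false;  // B(ARMCondition(false_high_cond), false_label)
  return a_low < b_low;               // Must be equal high, so compare the lows, unsigned.
}

int main() {
  const std::vector<int64_t> samples = {
      INT64_MIN, INT64_MIN + 1, -4294967296LL, -1, 0, 1, 4294967296LL, INT64_MAX};
  for (int64_t a : samples) {
    for (int64_t b : samples) {
      assert(LessThan64ViaHalves(a, b) == (a < b));
    }
  }
  return 0;
}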
+ FALLTHROUGH_INTENDED; + case kCondAE: + case kCondB: + codegen->GenerateConditionWithZero(condition, out, HighRegisterFrom(left)); + return; + case kCondLE: + case kCondGT: + default: + break; + } + } + } + + if ((condition == kCondEQ || condition == kCondNE) && + // If `out` is a low register, then the GenerateConditionGeneric() + // function generates a shorter code sequence that is still branchless. + (!out.IsLow() || !CanGenerateTest(cond, codegen->GetAssembler()))) { + GenerateEqualLong(cond, codegen); + return; + } + + if (CanGenerateTest(cond, codegen->GetAssembler())) { + GenerateConditionGeneric(cond, codegen); + return; + } + + // Convert the jumps into the result. + vixl32::Label done_label; + vixl32::Label* const final_label = codegen->GetFinalLabel(cond, &done_label); + vixl32::Label true_label, false_label; + + GenerateLongComparesAndJumps(cond, &true_label, &false_label, codegen, /* is_far_target */ false); + + // False case: result = 0. + __ Bind(&false_label); + __ Mov(out, 0); + __ B(final_label); + + // True case: result = 1. + __ Bind(&true_label); + __ Mov(out, 1); + + if (done_label.IsReferenced()) { + __ Bind(&done_label); + } +} + +static void GenerateConditionIntegralOrNonPrimitive(HCondition* cond, CodeGeneratorARMVIXL* codegen) { + const Primitive::Type type = cond->GetLeft()->GetType(); + + DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type; + + if (type == Primitive::kPrimLong) { + GenerateConditionLong(cond, codegen); + return; + } + + IfCondition condition = cond->GetCondition(); + vixl32::Register in = InputRegisterAt(cond, 0); + const vixl32::Register out = OutputRegister(cond); + const Location right = cond->GetLocations()->InAt(1); + int64_t value; + + if (right.IsConstant()) { + IfCondition opposite = cond->GetOppositeCondition(); + + value = AdjustConstantForCondition(Int64ConstantFrom(right), &condition, &opposite); + + // Comparisons against 0 are common enough to deserve special attention. + if (value == 0) { + switch (condition) { + case kCondNE: + case kCondA: + if (out.IsLow() && out.Is(in)) { + __ Cmp(out, 0); + + // We use the scope because of the IT block that follows. + ExactAssemblyScope guard(codegen->GetVIXLAssembler(), + 2 * vixl32::k16BitT32InstructionSizeInBytes, + CodeBufferCheckScope::kExactSize); + + __ it(ne); + __ mov(ne, out, 1); + return; + } + + FALLTHROUGH_INTENDED; + case kCondEQ: + case kCondBE: + case kCondLT: + case kCondGE: + case kCondAE: + case kCondB: + codegen->GenerateConditionWithZero(condition, out, in); + return; + case kCondLE: + case kCondGT: + default: + break; + } + } + } + + if (condition == kCondEQ || condition == kCondNE) { + Operand operand(0); + + if (right.IsConstant()) { + operand = Operand::From(value); + } else if (out.Is(RegisterFrom(right))) { + // Avoid 32-bit instructions if possible. + operand = InputOperandAt(cond, 0); + in = RegisterFrom(right); + } else { + operand = InputOperandAt(cond, 1); + } + + if (condition == kCondNE && out.IsLow()) { + __ Subs(out, in, operand); + + // We use the scope because of the IT block that follows. 
+ ExactAssemblyScope guard(codegen->GetVIXLAssembler(), + 2 * vixl32::k16BitT32InstructionSizeInBytes, + CodeBufferCheckScope::kExactSize); + + __ it(ne); + __ mov(ne, out, 1); + } else { + __ Sub(out, in, operand); + codegen->GenerateConditionWithZero(condition, out, out); + } + + return; + } + + GenerateConditionGeneric(cond, codegen); +} + static bool CanEncodeConstantAs8BitImmediate(HConstant* constant) { const Primitive::Type type = constant->GetType(); bool ret = false; @@ -2011,13 +2501,11 @@ CodeGeneratorARMVIXL::CodeGeneratorARMVIXL(HGraph* graph, uint32_literals_(std::less<uint32_t>(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_string_patches_(StringReferenceValueComparator(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_type_patches_(TypeReferenceValueComparator(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + pc_relative_method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + baker_read_barrier_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(StringReferenceValueComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_class_patches_(TypeReferenceValueComparator(), @@ -2468,92 +2956,10 @@ void LocationsBuilderARMVIXL::VisitExit(HExit* exit) { void InstructionCodeGeneratorARMVIXL::VisitExit(HExit* exit ATTRIBUTE_UNUSED) { } -void InstructionCodeGeneratorARMVIXL::GenerateLongComparesAndJumps(HCondition* cond, - vixl32::Label* true_label, - vixl32::Label* false_label) { - LocationSummary* locations = cond->GetLocations(); - Location left = locations->InAt(0); - Location right = locations->InAt(1); - IfCondition if_cond = cond->GetCondition(); - - vixl32::Register left_high = HighRegisterFrom(left); - vixl32::Register left_low = LowRegisterFrom(left); - IfCondition true_high_cond = if_cond; - IfCondition false_high_cond = cond->GetOppositeCondition(); - vixl32::Condition final_condition = ARMUnsignedCondition(if_cond); // unsigned on lower part - - // Set the conditions for the test, remembering that == needs to be - // decided using the low words. - switch (if_cond) { - case kCondEQ: - case kCondNE: - // Nothing to do. 
- break; - case kCondLT: - false_high_cond = kCondGT; - break; - case kCondLE: - true_high_cond = kCondLT; - break; - case kCondGT: - false_high_cond = kCondLT; - break; - case kCondGE: - true_high_cond = kCondGT; - break; - case kCondB: - false_high_cond = kCondA; - break; - case kCondBE: - true_high_cond = kCondB; - break; - case kCondA: - false_high_cond = kCondB; - break; - case kCondAE: - true_high_cond = kCondA; - break; - } - if (right.IsConstant()) { - int64_t value = Int64ConstantFrom(right); - int32_t val_low = Low32Bits(value); - int32_t val_high = High32Bits(value); - - __ Cmp(left_high, val_high); - if (if_cond == kCondNE) { - __ B(ARMCondition(true_high_cond), true_label); - } else if (if_cond == kCondEQ) { - __ B(ARMCondition(false_high_cond), false_label); - } else { - __ B(ARMCondition(true_high_cond), true_label); - __ B(ARMCondition(false_high_cond), false_label); - } - // Must be equal high, so compare the lows. - __ Cmp(left_low, val_low); - } else { - vixl32::Register right_high = HighRegisterFrom(right); - vixl32::Register right_low = LowRegisterFrom(right); - - __ Cmp(left_high, right_high); - if (if_cond == kCondNE) { - __ B(ARMCondition(true_high_cond), true_label); - } else if (if_cond == kCondEQ) { - __ B(ARMCondition(false_high_cond), false_label); - } else { - __ B(ARMCondition(true_high_cond), true_label); - __ B(ARMCondition(false_high_cond), false_label); - } - // Must be equal high, so compare the lows. - __ Cmp(left_low, right_low); - } - // The last comparison might be unsigned. - // TODO: optimize cases where this is always true/false - __ B(final_condition, true_label); -} - void InstructionCodeGeneratorARMVIXL::GenerateCompareTestAndBranch(HCondition* condition, vixl32::Label* true_target_in, - vixl32::Label* false_target_in) { + vixl32::Label* false_target_in, + bool is_far_target) { if (CanGenerateTest(condition, codegen_->GetAssembler())) { vixl32::Label* non_fallthrough_target; bool invert; @@ -2575,7 +2981,7 @@ void InstructionCodeGeneratorARMVIXL::GenerateCompareTestAndBranch(HCondition* c const auto cond = GenerateTest(condition, invert, codegen_); - __ B(cond.first, non_fallthrough_target); + __ B(cond.first, non_fallthrough_target, is_far_target); if (emit_both_branches) { // No target falls through, we need to branch. @@ -2592,7 +2998,7 @@ void InstructionCodeGeneratorARMVIXL::GenerateCompareTestAndBranch(HCondition* c vixl32::Label* false_target = (false_target_in == nullptr) ? &fallthrough : false_target_in; DCHECK_EQ(condition->InputAt(0)->GetType(), Primitive::kPrimLong); - GenerateLongComparesAndJumps(condition, true_target, false_target); + GenerateLongComparesAndJumps(condition, true_target, false_target, codegen_, is_far_target); if (false_target != &fallthrough) { __ B(false_target); @@ -2660,7 +3066,7 @@ void InstructionCodeGeneratorARMVIXL::GenerateTestAndBranch(HInstruction* instru // the HCondition, generate the comparison directly. 
Primitive::Type type = condition->InputAt(0)->GetType(); if (type == Primitive::kPrimLong || Primitive::IsFloatingPointType(type)) { - GenerateCompareTestAndBranch(condition, true_target, false_target); + GenerateCompareTestAndBranch(condition, true_target, false_target, far_target); return; } @@ -2679,14 +3085,14 @@ void InstructionCodeGeneratorARMVIXL::GenerateTestAndBranch(HInstruction* instru if (right.IsImmediate() && right.GetImmediate() == 0 && (arm_cond.Is(ne) || arm_cond.Is(eq))) { if (arm_cond.Is(eq)) { - __ CompareAndBranchIfZero(left, non_fallthrough_target); + __ CompareAndBranchIfZero(left, non_fallthrough_target, far_target); } else { DCHECK(arm_cond.Is(ne)); - __ CompareAndBranchIfNonZero(left, non_fallthrough_target); + __ CompareAndBranchIfNonZero(left, non_fallthrough_target, far_target); } } else { __ Cmp(left, right); - __ B(arm_cond, non_fallthrough_target); + __ B(arm_cond, non_fallthrough_target, far_target); } } @@ -2903,6 +3309,83 @@ void CodeGeneratorARMVIXL::GenerateNop() { __ Nop(); } +// `temp` is an extra temporary register that is used for some conditions; +// callers may not specify it, in which case the method will use a scratch +// register instead. +void CodeGeneratorARMVIXL::GenerateConditionWithZero(IfCondition condition, + vixl32::Register out, + vixl32::Register in, + vixl32::Register temp) { + switch (condition) { + case kCondEQ: + // x <= 0 iff x == 0 when the comparison is unsigned. + case kCondBE: + if (!temp.IsValid() || (out.IsLow() && !out.Is(in))) { + temp = out; + } + + // Avoid 32-bit instructions if possible; note that `in` and `temp` must be + // different as well. + if (in.IsLow() && temp.IsLow() && !in.Is(temp)) { + // temp = - in; only 0 sets the carry flag. + __ Rsbs(temp, in, 0); + + if (out.Is(in)) { + std::swap(in, temp); + } + + // out = - in + in + carry = carry + __ Adc(out, temp, in); + } else { + // If `in` is 0, then it has 32 leading zeros, and less than that otherwise. + __ Clz(out, in); + // Any number less than 32 logically shifted right by 5 bits results in 0; + // the same operation on 32 yields 1. + __ Lsr(out, out, 5); + } + + break; + case kCondNE: + // x > 0 iff x != 0 when the comparison is unsigned. + case kCondA: { + UseScratchRegisterScope temps(GetVIXLAssembler()); + + if (out.Is(in)) { + if (!temp.IsValid() || in.Is(temp)) { + temp = temps.Acquire(); + } + } else if (!temp.IsValid() || !temp.IsLow()) { + temp = out; + } + + // temp = in - 1; only 0 does not set the carry flag. + __ Subs(temp, in, 1); + // out = in + ~temp + carry = in + (-(in - 1) - 1) + carry = in - in + 1 - 1 + carry = carry + __ Sbc(out, in, temp); + break; + } + case kCondGE: + __ Mvn(out, in); + in = out; + FALLTHROUGH_INTENDED; + case kCondLT: + // We only care about the sign bit. + __ Lsr(out, in, 31); + break; + case kCondAE: + // Trivially true. + __ Mov(out, 1); + break; + case kCondB: + // Trivially false. 
+ __ Mov(out, 0); + break; + default: + LOG(FATAL) << "Unexpected condition " << condition; + UNREACHABLE(); + } +} + void LocationsBuilderARMVIXL::HandleCondition(HCondition* cond) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cond, LocationSummary::kNoCall); @@ -2939,52 +3422,47 @@ void InstructionCodeGeneratorARMVIXL::HandleCondition(HCondition* cond) { return; } - const vixl32::Register out = OutputRegister(cond); + const Primitive::Type type = cond->GetLeft()->GetType(); - if (out.IsLow() && CanGenerateTest(cond, codegen_->GetAssembler())) { - const auto condition = GenerateTest(cond, false, codegen_); - // We use the scope because of the IT block that follows. - ExactAssemblyScope guard(GetVIXLAssembler(), - 4 * vixl32::k16BitT32InstructionSizeInBytes, - CodeBufferCheckScope::kExactSize); - - __ it(condition.first); - __ mov(condition.first, out, 1); - __ it(condition.second); - __ mov(condition.second, out, 0); + if (Primitive::IsFloatingPointType(type)) { + GenerateConditionGeneric(cond, codegen_); return; } - // Convert the jumps into the result. - vixl32::Label done_label; - vixl32::Label* const final_label = codegen_->GetFinalLabel(cond, &done_label); + DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type; - if (cond->InputAt(0)->GetType() == Primitive::kPrimLong) { - vixl32::Label true_label, false_label; + const IfCondition condition = cond->GetCondition(); - GenerateLongComparesAndJumps(cond, &true_label, &false_label); + // A condition with only one boolean input, or two boolean inputs without being equality or + // inequality results from transformations done by the instruction simplifier, and is handled + // as a regular condition with integral inputs. + if (type == Primitive::kPrimBoolean && + cond->GetRight()->GetType() == Primitive::kPrimBoolean && + (condition == kCondEQ || condition == kCondNE)) { + vixl32::Register left = InputRegisterAt(cond, 0); + const vixl32::Register out = OutputRegister(cond); + const Location right_loc = cond->GetLocations()->InAt(1); - // False case: result = 0. - __ Bind(&false_label); - __ Mov(out, 0); - __ B(final_label); + // The constant case is handled by the instruction simplifier. + DCHECK(!right_loc.IsConstant()); - // True case: result = 1. - __ Bind(&true_label); - __ Mov(out, 1); - } else { - DCHECK(CanGenerateTest(cond, codegen_->GetAssembler())); + vixl32::Register right = RegisterFrom(right_loc); - const auto condition = GenerateTest(cond, false, codegen_); + // Avoid 32-bit instructions if possible. + if (out.Is(right)) { + std::swap(left, right); + } - __ Mov(LeaveFlags, out, 0); - __ B(condition.second, final_label, /* far_target */ false); - __ Mov(out, 1); - } + __ Eor(out, left, right); - if (done_label.IsReferenced()) { - __ Bind(&done_label); + if (condition == kCondEQ) { + __ Eor(out, out, 1); + } + + return; } + + GenerateConditionIntegralOrNonPrimitive(cond, codegen_); } void LocationsBuilderARMVIXL::VisitEqual(HEqual* comp) { @@ -3119,6 +3597,15 @@ void InstructionCodeGeneratorARMVIXL::VisitDoubleConstant( // Will be generated at use site. 
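GenerateConditionWithZero above, together with the boolean EOR path in HandleCondition, materializes a 0/1 result without branching by leaning on a few ARM flag and bit identities. A standalone check of those identities, with the subtraction carry (set means no borrow) and the CLZ-of-zero behaviour modelled explicitly (plain C++; Subs, Adc, Sbc and Clz are simplified models, not the VIXL API):

#include <cassert>
#include <cstdint>
#include <vector>

struct SubResult { uint32_t value; uint32_t carry; };

// ARM SUBS/RSBS set the carry flag iff the subtraction does not borrow,
// i.e. iff the minuend is >= the subtrahend as unsigned values.
SubResult Subs(uint32_t a, uint32_t b) { return { a - b, a >= b ? 1u : 0u }; }
uint32_t Adc(uint32_t a, uint32_t b, uint32_t carry) { return a + b + carry; }
uint32_t Sbc(uint32_t a, uint32_t b, uint32_t carry) { return a - b - (1u - carry); }

// ARM CLZ returns 32 for an input of 0 (unlike __builtin_clz, which is
// undefined there), which is what the LSR #5 trick depends on.
uint32_t Clz(uint32_t x) {
  uint32_t n = 0;
  while (n < 32u && (x & (1u << (31u - n))) == 0u) ++n;
  return n;
}

int main() {
  const std::vector<uint32_t> samples = {0u, 1u, 2u, 0x7fffffffu, 0x80000000u, 0xffffffffu};
  for (uint32_t in : samples) {
    // kCondEQ/kCondBE, low registers: RSBS temp, in, #0; ADC out, temp, in.
    SubResult rsbs = Subs(0u, in);
    assert(Adc(rsbs.value, in, rsbs.carry) == (in == 0u ? 1u : 0u));
    // kCondEQ/kCondBE, general case: CLZ out, in; LSR out, out, #5.
    assert((Clz(in) >> 5) == (in == 0u ? 1u : 0u));
    // kCondNE/kCondA: SUBS temp, in, #1; SBC out, in, temp.
    SubResult subs = Subs(in, 1u);
    assert(Sbc(in, subs.value, subs.carry) == (in != 0u ? 1u : 0u));
    // kCondLT: only the sign bit matters (LSR out, in, #31).
    assert((in >> 31) == (static_cast<int32_t>(in) < 0 ? 1u : 0u));
    // kCondGE: MVN out, in; LSR out, out, #31.
    assert((static_cast<uint32_t>(~in) >> 31) == (static_cast<int32_t>(in) >= 0 ? 1u : 0u));
  }
  // HandleCondition's boolean inputs: NE is a single EOR, EQ is EOR then EOR #1.
  const uint32_t bools[] = {0u, 1u};
  for (uint32_t a : bools) {
    for (uint32_t b : bools) {
      assert((a ^ b) == (a != b ? 1u : 0u));
      assert(((a ^ b) ^ 1u) == (a == b ? 1u : 0u));
    }
  }
  return 0;
}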
} +void LocationsBuilderARMVIXL::VisitConstructorFence(HConstructorFence* constructor_fence) { + constructor_fence->SetLocations(nullptr); +} + +void InstructionCodeGeneratorARMVIXL::VisitConstructorFence( + HConstructorFence* constructor_fence ATTRIBUTE_UNUSED) { + codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); +} + void LocationsBuilderARMVIXL::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) { memory_barrier->SetLocations(nullptr); } @@ -5296,7 +5783,18 @@ void LocationsBuilderARMVIXL::HandleFieldGet(HInstruction* instruction, } else if (object_field_get_with_read_barrier && kUseBakerReadBarrier) { // We need a temporary register for the read barrier marking slow // path in CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier. - locations->AddTemp(Location::RequiresRegister()); + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !Runtime::Current()->UseJitCompilation()) { + // If link-time thunks for the Baker read barrier are enabled, for AOT + // loads we need a temporary only if the offset is too big. + if (field_info.GetFieldOffset().Uint32Value() >= kReferenceLoadMinFarOffset) { + locations->AddTemp(Location::RequiresRegister()); + } + // And we always need the reserved entrypoint register. + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode())); + } else { + locations->AddTemp(Location::RequiresRegister()); + } } } @@ -5763,11 +6261,35 @@ void LocationsBuilderARMVIXL::VisitArrayGet(HArrayGet* instruction) { Location::RequiresRegister(), object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap); } - // We need a temporary register for the read barrier marking slow - // path in CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier. - // Also need for String compression feature. - if ((object_array_get_with_read_barrier && kUseBakerReadBarrier) - || (mirror::kUseStringCompression && instruction->IsStringCharAt())) { + if (object_array_get_with_read_barrier && kUseBakerReadBarrier) { + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier. + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !Runtime::Current()->UseJitCompilation() && + instruction->GetIndex()->IsConstant()) { + // Array loads with constant index are treated as field loads. + // If link-time thunks for the Baker read barrier are enabled, for AOT + // constant index loads we need a temporary only if the offset is too big. + uint32_t offset = CodeGenerator::GetArrayDataOffset(instruction); + uint32_t index = instruction->GetIndex()->AsIntConstant()->GetValue(); + offset += index << Primitive::ComponentSizeShift(Primitive::kPrimNot); + if (offset >= kReferenceLoadMinFarOffset) { + locations->AddTemp(Location::RequiresRegister()); + } + // And we always need the reserved entrypoint register. + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode())); + } else if (kBakerReadBarrierLinkTimeThunksEnableForArrays && + !Runtime::Current()->UseJitCompilation() && + !instruction->GetIndex()->IsConstant()) { + // We need a non-scratch temporary for the array data pointer. + locations->AddTemp(Location::RequiresRegister()); + // And we always need the reserved entrypoint register. 
+ locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode())); + } else { + locations->AddTemp(Location::RequiresRegister()); + } + } else if (mirror::kUseStringCompression && instruction->IsStringCharAt()) { + // Also need a temporary for String compression feature. locations->AddTemp(Location::RequiresRegister()); } } @@ -5878,8 +6400,20 @@ void InstructionCodeGeneratorARMVIXL::VisitArrayGet(HArrayGet* instruction) { Location temp = locations->GetTemp(0); // Note that a potential implicit null check is handled in this // CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier call. - codegen_->GenerateArrayLoadWithBakerReadBarrier( - instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ true); + DCHECK(!instruction->CanDoImplicitNullCheckOn(instruction->InputAt(0))); + if (index.IsConstant()) { + // Array load with a constant index can be treated as a field load. + data_offset += Int32ConstantFrom(index) << Primitive::ComponentSizeShift(type); + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out_loc, + obj, + data_offset, + locations->GetTemp(0), + /* needs_null_check */ false); + } else { + codegen_->GenerateArrayLoadWithBakerReadBarrier( + instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ false); + } } else { vixl32::Register out = OutputRegister(instruction); if (index.IsConstant()) { @@ -6315,6 +6849,16 @@ void InstructionCodeGeneratorARMVIXL::VisitIntermediateAddress(HIntermediateAddr } } +void LocationsBuilderARMVIXL::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + void LocationsBuilderARMVIXL::VisitBoundsCheck(HBoundsCheck* instruction) { RegisterSet caller_saves = RegisterSet::Empty(); InvokeRuntimeCallingConventionARMVIXL calling_convention; @@ -6707,21 +7251,15 @@ HLoadClass::LoadKind CodeGeneratorARMVIXL::GetSupportedLoadClassKind( UNREACHABLE(); case HLoadClass::LoadKind::kReferrersClass: break; - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: - DCHECK(!GetCompilerOptions().GetCompilePic()); - break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: - DCHECK(GetCompilerOptions().GetCompilePic()); - break; - case HLoadClass::LoadKind::kBootImageAddress: - break; case HLoadClass::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; case HLoadClass::LoadKind::kJitTableAddress: DCHECK(Runtime::Current()->UseJitCompilation()); break; - case HLoadClass::LoadKind::kDexCacheViaMethod: + case HLoadClass::LoadKind::kBootImageAddress: + case HLoadClass::LoadKind::kRuntimeCall: break; } return desired_class_load_kind; @@ -6729,7 +7267,7 @@ HLoadClass::LoadKind CodeGeneratorARMVIXL::GetSupportedLoadClassKind( void LocationsBuilderARMVIXL::VisitLoadClass(HLoadClass* cls) { HLoadClass::LoadKind load_kind = cls->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadClass::LoadKind::kRuntimeCall) { InvokeRuntimeCallingConventionARMVIXL calling_convention; CodeGenerator::CreateLoadClassRuntimeCallLocationSummary( cls, @@ -6769,13 +7307,20 @@ void LocationsBuilderARMVIXL::VisitLoadClass(HLoadClass* cls) { // For non-Baker read barrier we have a temp-clobbering call. 
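The VisitArrayGet changes above fold a constant index into the data offset so the access can take the field-load path, and keep an extra temporary only when that folded offset reaches kReferenceLoadMinFarOffset. A standalone restatement of the offset arithmetic (plain C++; the 12-byte data offset and the 2-bit component shift for heap references are assumptions about the usual 32-bit object layout, not values quoted from this diff):

#include <cstdint>

// Assumed layout: 8-byte object header plus a 4-byte length word before the
// elements, and 4-byte compressed heap references (hence a shift of 2).
constexpr uint32_t kObjectArrayDataOffset = 12u;
constexpr uint32_t kHeapReferenceSizeShift = 2u;
constexpr uint32_t kReferenceLoadMinFarOffset = 4u * 1024u;  // Matches the constant above.

// data_offset += index << Primitive::ComponentSizeShift(kPrimNot), as in the patch.
constexpr uint32_t ConstantIndexElementOffset(uint32_t index) {
  return kObjectArrayDataOffset + (index << kHeapReferenceSizeShift);
}

int main() {
  static_assert(ConstantIndexElementOffset(0) == 12u, "first element sits right after the header");
  static_assert(ConstantIndexElementOffset(3) == 24u, "");
  // Large constant indices push the folded offset past kReferenceLoadMinFarOffset,
  // which is exactly when the patch still reserves a temporary register.
  static_assert(ConstantIndexElementOffset(1020) < kReferenceLoadMinFarOffset, "");
  static_assert(ConstantIndexElementOffset(1021) >= kReferenceLoadMinFarOffset, "");
  return 0;
}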
} } + if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) { + if (load_kind == HLoadClass::LoadKind::kBssEntry || + (load_kind == HLoadClass::LoadKind::kReferrersClass && + !Runtime::Current()->UseJitCompilation())) { + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode())); + } + } } // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not // move. void InstructionCodeGeneratorARMVIXL::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFETY_ANALYSIS { HLoadClass::LoadKind load_kind = cls->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadClass::LoadKind::kRuntimeCall) { codegen_->GenerateLoadClassRuntimeCall(cls); return; } @@ -6802,13 +7347,6 @@ void InstructionCodeGeneratorARMVIXL::VisitLoadClass(HLoadClass* cls) NO_THREAD_ read_barrier_option); break; } - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: { - DCHECK(codegen_->GetCompilerOptions().IsBootImage()); - DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); - __ Ldr(out, codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(), - cls->GetTypeIndex())); - break; - } case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: { DCHECK(codegen_->GetCompilerOptions().IsBootImage()); DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); @@ -6844,7 +7382,7 @@ void InstructionCodeGeneratorARMVIXL::VisitLoadClass(HLoadClass* cls) NO_THREAD_ GenerateGcRootFieldLoad(cls, out_loc, out, /* offset */ 0, read_barrier_option); break; } - case HLoadClass::LoadKind::kDexCacheViaMethod: + case HLoadClass::LoadKind::kRuntimeCall: case HLoadClass::LoadKind::kInvalid: LOG(FATAL) << "UNREACHABLE"; UNREACHABLE(); @@ -6905,21 +7443,15 @@ void InstructionCodeGeneratorARMVIXL::GenerateClassInitializationCheck( HLoadString::LoadKind CodeGeneratorARMVIXL::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { switch (desired_string_load_kind) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: - DCHECK(!GetCompilerOptions().GetCompilePic()); - break; case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - DCHECK(GetCompilerOptions().GetCompilePic()); - break; - case HLoadString::LoadKind::kBootImageAddress: - break; case HLoadString::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; case HLoadString::LoadKind::kJitTableAddress: DCHECK(Runtime::Current()->UseJitCompilation()); break; - case HLoadString::LoadKind::kDexCacheViaMethod: + case HLoadString::LoadKind::kBootImageAddress: + case HLoadString::LoadKind::kRuntimeCall: break; } return desired_string_load_kind; @@ -6929,7 +7461,7 @@ void LocationsBuilderARMVIXL::VisitLoadString(HLoadString* load) { LocationSummary::CallKind call_kind = CodeGenerator::GetLoadStringCallKind(load); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind); HLoadString::LoadKind load_kind = load->GetLoadKind(); - if (load_kind == HLoadString::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadString::LoadKind::kRuntimeCall) { locations->SetOut(LocationFrom(r0)); } else { locations->SetOut(Location::RequiresRegister()); @@ -6945,6 +7477,9 @@ void LocationsBuilderARMVIXL::VisitLoadString(HLoadString* load) { // TODO: Add GetReturnLocation() to the calling convention so that we can DCHECK() // that the the kPrimNot result register is the same as the first argument register. 
locations->SetCustomSlowPathCallerSaves(caller_saves); + if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) { + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode())); + } } else { // For non-Baker read barrier we have a temp-clobbering call. } @@ -6961,11 +7496,6 @@ void InstructionCodeGeneratorARMVIXL::VisitLoadString(HLoadString* load) NO_THRE HLoadString::LoadKind load_kind = load->GetLoadKind(); switch (load_kind) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: { - __ Ldr(out, codegen_->DeduplicateBootImageStringLiteral(load->GetDexFile(), - load->GetStringIndex())); - return; // No dex cache slow path. - } case HLoadString::LoadKind::kBootImageLinkTimePcRelative: { DCHECK(codegen_->GetCompilerOptions().IsBootImage()); CodeGeneratorARMVIXL::PcRelativePatchInfo* labels = @@ -7009,7 +7539,7 @@ void InstructionCodeGeneratorARMVIXL::VisitLoadString(HLoadString* load) NO_THRE } // TODO: Re-add the compiler code to do string dex cache lookup again. - DCHECK_EQ(load->GetLoadKind(), HLoadString::LoadKind::kDexCacheViaMethod); + DCHECK_EQ(load->GetLoadKind(), HLoadString::LoadKind::kRuntimeCall); InvokeRuntimeCallingConventionARMVIXL calling_convention; __ Mov(calling_convention.GetRegisterAt(0), load->GetStringIndex().index_); codegen_->InvokeRuntime(kQuickResolveString, load, load->GetDexPc()); @@ -7107,6 +7637,9 @@ void LocationsBuilderARMVIXL::VisitInstanceOf(HInstanceOf* instruction) { // Note that TypeCheckSlowPathARM uses this register too. locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + codegen_->MaybeAddBakerCcEntrypointTempForFields(locations); + } } void InstructionCodeGeneratorARMVIXL::VisitInstanceOf(HInstanceOf* instruction) { @@ -8005,48 +8538,98 @@ void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad( if (kUseBakerReadBarrier) { // Fast path implementation of art::ReadBarrier::BarrierForRoot when // Baker's read barrier are used. - // - // Note that we do not actually check the value of - // `GetIsGcMarking()` to decide whether to mark the loaded GC - // root or not. Instead, we load into `temp` the read barrier - // mark entry point corresponding to register `root`. If `temp` - // is null, it means that `GetIsGcMarking()` is false, and vice - // versa. - // - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. - // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() - // // Slow path. - // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. - // } - - // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. - Location temp = LocationFrom(lr); - SlowPathCodeARMVIXL* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL( - instruction, root, /* entrypoint */ temp); - codegen_->AddSlowPath(slow_path); + if (kBakerReadBarrierLinkTimeThunksEnableForGcRoots && + !Runtime::Current()->UseJitCompilation()) { + // Note that we do not actually check the value of `GetIsGcMarking()` + // to decide whether to mark the loaded GC root or not. Instead, we + // load into `temp` (actually kBakerCcEntrypointRegister) the read + // barrier mark introspection entrypoint. If `temp` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. 
+ // + // We use link-time generated thunks for the slow path. That thunk + // checks the reference and jumps to the entrypoint if needed. + // + // temp = Thread::Current()->pReadBarrierMarkIntrospection + // lr = &return_address; + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { + // goto gc_root_thunk<root_reg>(lr) + // } + // return_address: - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); - // Loading the entrypoint does not require a load acquire since it is only changed when - // threads are suspended or running a checkpoint. - GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset); + UseScratchRegisterScope temps(GetVIXLAssembler()); + ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction); + bool narrow = CanEmitNarrowLdr(root_reg, obj, offset); + uint32_t custom_data = linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData( + root_reg.GetCode(), narrow); + vixl32::Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data); + + // entrypoint_reg = + // Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection. + DCHECK_EQ(ip.GetCode(), 12u); + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode()); + __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset)); + + vixl::EmissionCheckScope guard(GetVIXLAssembler(), + 4 * vixl32::kMaxInstructionSizeInBytes); + vixl32::Label return_address; + EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address); + __ cmp(kBakerCcEntrypointRegister, Operand(0)); + // Currently the offset is always within range. If that changes, + // we shall have to split the load the same way as for fields. + DCHECK_LT(offset, kReferenceLoadMinFarOffset); + ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset(); + __ ldr(EncodingSize(narrow ? Narrow : Wide), root_reg, MemOperand(obj, offset)); + EmitPlaceholderBne(codegen_, bne_label); + __ Bind(&return_address); + DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(), + narrow ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET + : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET); + } else { + // Note that we do not actually check the value of + // `GetIsGcMarking()` to decide whether to mark the loaded GC + // root or not. Instead, we load into `temp` the read barrier + // mark entry point corresponding to register `root`. If `temp` + // is null, it means that `GetIsGcMarking()` is false, and vice + // versa. + // + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. + // } + + // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. 
+ Location temp = LocationFrom(lr); + SlowPathCodeARMVIXL* slow_path = + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL( + instruction, root, /* entrypoint */ temp); + codegen_->AddSlowPath(slow_path); - // /* GcRoot<mirror::Object> */ root = *(obj + offset) - GetAssembler()->LoadFromOffset(kLoadWord, root_reg, obj, offset); - static_assert( - sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), - "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " - "have different sizes."); - static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), - "art::mirror::CompressedReference<mirror::Object> and int32_t " - "have different sizes."); - - // The entrypoint is null when the GC is not marking, this prevents one load compared to - // checking GetIsGcMarking. - __ CompareAndBranchIfNonZero(RegisterFrom(temp), slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset); + + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + GetAssembler()->LoadFromOffset(kLoadWord, root_reg, obj, offset); + static_assert( + sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), + "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " + "have different sizes."); + static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::CompressedReference<mirror::Object> and int32_t " + "have different sizes."); + + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ CompareAndBranchIfNonZero(RegisterFrom(temp), slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); + } } else { // GC root loaded through a slow path for read barriers other // than Baker's. @@ -8064,6 +8647,16 @@ void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad( } } +void CodeGeneratorARMVIXL::MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + if (kBakerReadBarrierLinkTimeThunksEnableForFields) { + if (!Runtime::Current()->UseJitCompilation()) { + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode())); + } + } +} + void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, vixl32::Register obj, @@ -8073,6 +8666,85 @@ void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* i DCHECK(kEmitCompilerReadBarrier); DCHECK(kUseBakerReadBarrier); + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !Runtime::Current()->UseJitCompilation()) { + // Note that we do not actually check the value of `GetIsGcMarking()` + // to decide whether to mark the loaded reference or not. Instead, we + // load into `temp` (actually kBakerCcEntrypointRegister) the read + // barrier mark introspection entrypoint. If `temp` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. + // + // We use link-time generated thunks for the slow path. 
That thunk checks + // the holder and jumps to the entrypoint if needed. If the holder is not + // gray, it creates a fake dependency and returns to the LDR instruction. + // + // temp = Thread::Current()->pReadBarrierMarkIntrospection + // lr = &gray_return_address; + // if (temp != nullptr) { + // goto field_thunk<holder_reg, base_reg>(lr) + // } + // not_gray_return_address: + // // Original reference load. If the offset is too large to fit + // // into LDR, we use an adjusted base register here. + // HeapReference<mirror::Object> reference = *(obj+offset); + // gray_return_address: + + DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>)); + vixl32::Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot); + bool narrow = CanEmitNarrowLdr(ref_reg, obj, offset); + vixl32::Register base = obj; + if (offset >= kReferenceLoadMinFarOffset) { + base = RegisterFrom(temp); + DCHECK(!base.Is(kBakerCcEntrypointRegister)); + static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2."); + __ Add(base, obj, Operand(offset & ~(kReferenceLoadMinFarOffset - 1u))); + offset &= (kReferenceLoadMinFarOffset - 1u); + // Use narrow LDR only for small offsets. Generating narrow encoding LDR for the large + // offsets with `(offset & (kReferenceLoadMinFarOffset - 1u)) < 32u` would most likely + // increase the overall code size when taking the generated thunks into account. + DCHECK(!narrow); + } + UseScratchRegisterScope temps(GetVIXLAssembler()); + ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction); + uint32_t custom_data = linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData( + base.GetCode(), obj.GetCode(), narrow); + vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data); + + // entrypoint_reg = + // Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection. + DCHECK_EQ(ip.GetCode(), 12u); + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode()); + __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset)); + + vixl::EmissionCheckScope guard( + GetVIXLAssembler(), + (kPoisonHeapReferences ? 5u : 4u) * vixl32::kMaxInstructionSizeInBytes); + vixl32::Label return_address; + EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address); + __ cmp(kBakerCcEntrypointRegister, Operand(0)); + EmitPlaceholderBne(this, bne_label); + ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset(); + __ ldr(EncodingSize(narrow ? Narrow : Wide), ref_reg, MemOperand(base, offset)); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } + // Note: We need a specific width for the unpoisoning NEG. + if (kPoisonHeapReferences) { + if (narrow) { + // The only 16-bit encoding is T1 which sets flags outside IT block (i.e. RSBS, not RSB). + __ rsbs(EncodingSize(Narrow), ref_reg, ref_reg, Operand(0)); + } else { + __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0)); + } + } + __ Bind(&return_address); + DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(), + narrow ? 
BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET + : BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET); + return; + } + // /* HeapReference<Object> */ ref = *(obj + offset) Location no_index = Location::NoLocation(); ScaleFactor no_scale_factor = TIMES_1; @@ -8093,9 +8765,73 @@ void CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier(HInstruction* i static_assert( sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + ScaleFactor scale_factor = TIMES_4; + + if (kBakerReadBarrierLinkTimeThunksEnableForArrays && + !Runtime::Current()->UseJitCompilation()) { + // Note that we do not actually check the value of `GetIsGcMarking()` + // to decide whether to mark the loaded reference or not. Instead, we + // load into `temp` (actually kBakerCcEntrypointRegister) the read + // barrier mark introspection entrypoint. If `temp` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. + // + // We use link-time generated thunks for the slow path. That thunk checks + // the holder and jumps to the entrypoint if needed. If the holder is not + // gray, it creates a fake dependency and returns to the LDR instruction. + // + // temp = Thread::Current()->pReadBarrierMarkIntrospection + // lr = &gray_return_address; + // if (temp != nullptr) { + // goto field_thunk<holder_reg, base_reg>(lr) + // } + // not_gray_return_address: + // // Original reference load. If the offset is too large to fit + // // into LDR, we use an adjusted base register here. + // HeapReference<mirror::Object> reference = data[index]; + // gray_return_address: + + DCHECK(index.IsValid()); + vixl32::Register index_reg = RegisterFrom(index, Primitive::kPrimInt); + vixl32::Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot); + vixl32::Register data_reg = RegisterFrom(temp, Primitive::kPrimInt); // Raw pointer. + DCHECK(!data_reg.Is(kBakerCcEntrypointRegister)); + + UseScratchRegisterScope temps(GetVIXLAssembler()); + ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction); + uint32_t custom_data = + linker::Thumb2RelativePatcher::EncodeBakerReadBarrierArrayData(data_reg.GetCode()); + vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data); + + // entrypoint_reg = + // Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection. + DCHECK_EQ(ip.GetCode(), 12u); + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode()); + __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset)); + __ Add(data_reg, obj, Operand(data_offset)); + + vixl::EmissionCheckScope guard( + GetVIXLAssembler(), + (kPoisonHeapReferences ? 5u : 4u) * vixl32::kMaxInstructionSizeInBytes); + vixl32::Label return_address; + EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address); + __ cmp(kBakerCcEntrypointRegister, Operand(0)); + EmitPlaceholderBne(this, bne_label); + ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset(); + __ ldr(ref_reg, MemOperand(data_reg, index_reg, vixl32::LSL, scale_factor)); + DCHECK(!needs_null_check); // The thunk cannot handle the null check. + // Note: We need a Wide NEG for the unpoisoning. 
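// Illustrative sketch (not part of the patch): a minimal model of the far-offset split done in
// GenerateFieldLoadWithBakerReadBarrier above. When `offset` is too large for the LDR used by
// the field thunk, the aligned upper bits are folded into an adjusted base register and only
// the low bits remain as the LDR offset. kReferenceLoadMinFarOffset is defined elsewhere in
// this file; 0x1000 below is a stand-in whose only relevant property is being a power of two,
// as the static_assert in the patch requires.
#include <cstdint>

constexpr uint32_t kMinFarOffsetModel = 0x1000u;  // Stand-in for kReferenceLoadMinFarOffset.

constexpr uint32_t BaseDelta(uint32_t offset) { return offset & ~(kMinFarOffsetModel - 1u); }
constexpr uint32_t LdrOffset(uint32_t offset) { return offset & (kMinFarOffsetModel - 1u); }

// The adjusted base plus the remaining LDR offset reconstructs the original address, and the
// remaining offset always stays below the threshold, so it fits the load instruction.
static_assert(BaseDelta(0x1234u) + LdrOffset(0x1234u) == 0x1234u, "split is lossless");
static_assert(LdrOffset(0x1234u) < kMinFarOffsetModel, "remaining offset stays in range");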
+ if (kPoisonHeapReferences) { + __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0)); + } + __ Bind(&return_address); + DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(), + BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET); + return; + } + // /* HeapReference<Object> */ ref = // *(obj + data_offset + index * sizeof(HeapReference<Object>)) - ScaleFactor scale_factor = TIMES_4; GenerateReferenceLoadWithBakerReadBarrier( instruction, ref, obj, data_offset, index, scale_factor, temp, needs_null_check); } @@ -8107,9 +8843,7 @@ void CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier(HInstructio Location index, ScaleFactor scale_factor, Location temp, - bool needs_null_check, - bool always_update_field, - vixl32::Register* temp2) { + bool needs_null_check) { DCHECK(kEmitCompilerReadBarrier); DCHECK(kUseBakerReadBarrier); @@ -8120,6 +8854,73 @@ void CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier(HInstructio // not. // // Note that we do not actually check the value of `GetIsGcMarking()`; + // instead, we load into `temp2` the read barrier mark entry point + // corresponding to register `ref`. If `temp2` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. + // + // temp2 = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp2 != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = temp2(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // } + // } else { + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // } + + vixl32::Register temp_reg = RegisterFrom(temp); + + // Slow path marking the object `ref` when the GC is marking. The + // entrypoint will already be loaded in `temp2`. + Location temp2 = LocationFrom(lr); + SlowPathCodeARMVIXL* slow_path = + new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARMVIXL( + instruction, + ref, + obj, + offset, + index, + scale_factor, + needs_null_check, + temp_reg, + /* entrypoint */ temp2); + AddSlowPath(slow_path); + + // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp2), tr, entry_point_offset); + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ CompareAndBranchIfNonZero(RegisterFrom(temp2), slow_path->GetEntryLabel()); + // Fast path: the GC is not marking: just load the reference. 
+ GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorARMVIXL::UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction, + Location ref, + vixl32::Register obj, + Location field_offset, + Location temp, + bool needs_null_check, + vixl32::Register temp2) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // Query `art::Thread::Current()->GetIsGcMarking()` to decide + // whether we need to enter the slow path to update the reference + // field within `obj`. Then, in the slow path, check the gray bit + // in the lock word of the reference's holder (`obj`) to decide + // whether to mark `ref` and update the field or not. + // + // Note that we do not actually check the value of `GetIsGcMarking()`; // instead, we load into `temp3` the read barrier mark entry point // corresponding to register `ref`. If `temp3` is null, it means // that `GetIsGcMarking()` is false, and vice versa. @@ -8129,55 +8930,32 @@ void CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier(HInstructio // // Slow path. // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); // lfence; // Load fence or artificial data dependency to prevent load-load reordering - // HeapReference<mirror::Object> ref = *src; // Original reference load. + // HeapReference<mirror::Object> ref = *(obj + field_offset); // Reference load. // bool is_gray = (rb_state == ReadBarrier::GrayState()); // if (is_gray) { + // old_ref = ref; // ref = temp3(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // compareAndSwapObject(obj, field_offset, old_ref, ref); // } - // } else { - // HeapReference<mirror::Object> ref = *src; // Original reference load. // } vixl32::Register temp_reg = RegisterFrom(temp); - // Slow path marking the object `ref` when the GC is marking. The - // entrypoint will already be loaded in `temp3`. + // Slow path updating the object reference at address `obj + field_offset` + // when the GC is marking. The entrypoint will already be loaded in `temp3`. Location temp3 = LocationFrom(lr); - SlowPathCodeARMVIXL* slow_path; - if (always_update_field) { - DCHECK(temp2 != nullptr); - // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL - // only supports address of the form `obj + field_offset`, where - // `obj` is a register and `field_offset` is a register pair (of - // which only the lower half is used). Thus `offset` and - // `scale_factor` above are expected to be null in this code path. 
- DCHECK_EQ(offset, 0u); - DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1); - Location field_offset = index; - slow_path = - new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL( - instruction, - ref, - obj, - offset, - /* index */ field_offset, - scale_factor, - needs_null_check, - temp_reg, - *temp2, - /* entrypoint */ temp3); - } else { - slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARMVIXL( - instruction, - ref, - obj, - offset, - index, - scale_factor, - needs_null_check, - temp_reg, - /* entrypoint */ temp3); - } + SlowPathCodeARMVIXL* slow_path = + new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL( + instruction, + ref, + obj, + /* offset */ 0u, + /* index */ field_offset, + /* scale_factor */ ScaleFactor::TIMES_1, + needs_null_check, + temp_reg, + temp2, + /* entrypoint */ temp3); AddSlowPath(slow_path); // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() @@ -8189,8 +8967,8 @@ void CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier(HInstructio // The entrypoint is null when the GC is not marking, this prevents one load compared to // checking GetIsGcMarking. __ CompareAndBranchIfNonZero(RegisterFrom(temp3), slow_path->GetEntryLabel()); - // Fast path: just load the reference. - GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check); + // Fast path: the GC is not marking: nothing to do (the field is + // up-to-date, and we don't need to load the reference). __ Bind(slow_path->GetExitLabel()); } @@ -8348,7 +9126,7 @@ vixl32::Register CodeGeneratorARMVIXL::GetInvokeStaticOrDirectExtraParameter( return RegisterFrom(location); } -Location CodeGeneratorARMVIXL::GenerateCalleeMethodStaticOrDirectCall( +void CodeGeneratorARMVIXL::GenerateStaticOrDirectCall( HInvokeStaticOrDirect* invoke, Location temp) { Location callee_method = temp; // For all kinds except kRecursive, callee will be in temp. 
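// Illustrative sketch (not part of the patch): the kBootImageLinkTimePcRelative case added to
// the switch below resolves the method pointer through EmitMovwMovtPlaceholder, i.e. a
// MOVW/MOVT immediate pair plus an ADD against PC whose 16-bit halves the linker patches later
// (see PcRelativePatchInfo and its labels in the header). The helpers below only model how a
// 32-bit offset splits into the two 16-bit immediates and recombines; the PC bias itself is
// the linker's concern and is not shown.
#include <cstdint>

constexpr uint16_t MovwImm(uint32_t offset) { return static_cast<uint16_t>(offset & 0xffffu); }
constexpr uint16_t MovtImm(uint32_t offset) { return static_cast<uint16_t>(offset >> 16); }

// MOVT provides the high half, MOVW the low half; together they rebuild the full offset.
static_assert(((static_cast<uint32_t>(MovtImm(0x00c0ffeeu)) << 16) | MovwImm(0x00c0ffeeu)) ==
                  0x00c0ffeeu,
              "MOVW/MOVT halves recombine into the full 32-bit offset");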
switch (invoke->GetMethodLoadKind()) { @@ -8362,6 +9140,13 @@ Location CodeGeneratorARMVIXL::GenerateCalleeMethodStaticOrDirectCall( case HInvokeStaticOrDirect::MethodLoadKind::kRecursive: callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex()); break; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: { + DCHECK(GetCompilerOptions().IsBootImage()); + PcRelativePatchInfo* labels = NewPcRelativeMethodPatch(invoke->GetTargetMethod()); + vixl32::Register temp_reg = RegisterFrom(temp); + EmitMovwMovtPlaceholder(labels, temp_reg); + break; + } case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: __ Mov(RegisterFrom(temp), Operand::From(invoke->GetMethodAddress())); break; @@ -8399,12 +9184,6 @@ Location CodeGeneratorARMVIXL::GenerateCalleeMethodStaticOrDirectCall( break; } } - return callee_method; -} - -void CodeGeneratorARMVIXL::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, - Location temp) { - Location callee_method = GenerateCalleeMethodStaticOrDirectCall(invoke, temp); switch (invoke->GetCodePtrLocation()) { case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf: @@ -8478,9 +9257,11 @@ void CodeGeneratorARMVIXL::GenerateVirtualCall(HInvokeVirtual* invoke, Location __ blx(lr); } -CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewPcRelativeStringPatch( - const DexFile& dex_file, dex::StringIndex string_index) { - return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_); +CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewPcRelativeMethodPatch( + MethodReference target_method) { + return NewPcRelativePatch(*target_method.dex_file, + target_method.dex_method_index, + &pc_relative_method_patches_); } CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewPcRelativeTypePatch( @@ -8493,6 +9274,11 @@ CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewTypeBssEntry return NewPcRelativePatch(dex_file, type_index.index_, &type_bss_entry_patches_); } +CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewPcRelativeStringPatch( + const DexFile& dex_file, dex::StringIndex string_index) { + return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_); +} + CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewPcRelativeDexCacheArrayPatch( const DexFile& dex_file, uint32_t element_offset) { return NewPcRelativePatch(dex_file, element_offset, &pc_relative_dex_cache_patches_); @@ -8504,34 +9290,15 @@ CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewPcRelativePa return &patches->back(); } -VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateBootImageStringLiteral( - const DexFile& dex_file, - dex::StringIndex string_index) { - return boot_image_string_patches_.GetOrCreate( - StringReference(&dex_file, string_index), - [this]() { - return GetAssembler()->CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u); - }); -} - -VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateBootImageTypeLiteral( - const DexFile& dex_file, - dex::TypeIndex type_index) { - return boot_image_type_patches_.GetOrCreate( - TypeReference(&dex_file, type_index), - [this]() { - return GetAssembler()->CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u); - }); +vixl::aarch32::Label* CodeGeneratorARMVIXL::NewBakerReadBarrierPatch(uint32_t custom_data) { + baker_read_barrier_patches_.emplace_back(custom_data); + return &baker_read_barrier_patches_.back().label; } VIXLUInt32Literal* 
CodeGeneratorARMVIXL::DeduplicateBootImageAddressLiteral(uint32_t address) { return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_); } -VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateDexCacheAddressLiteral(uint32_t address) { - return DeduplicateUint32Literal(address, &uint32_literals_); -} - VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateJitStringLiteral( const DexFile& dex_file, dex::StringIndex string_index, @@ -8581,43 +9348,32 @@ void CodeGeneratorARMVIXL::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_pa DCHECK(linker_patches->empty()); size_t size = /* MOVW+MOVT for each entry */ 2u * pc_relative_dex_cache_patches_.size() + - boot_image_string_patches_.size() + - /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() + - boot_image_type_patches_.size() + + /* MOVW+MOVT for each entry */ 2u * pc_relative_method_patches_.size() + /* MOVW+MOVT for each entry */ 2u * pc_relative_type_patches_.size() + - /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size(); + /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size() + + /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() + + baker_read_barrier_patches_.size(); linker_patches->reserve(size); EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_, linker_patches); - for (const auto& entry : boot_image_string_patches_) { - const StringReference& target_string = entry.first; - VIXLUInt32Literal* literal = entry.second; - DCHECK(literal->IsBound()); - uint32_t literal_offset = literal->GetLocation(); - linker_patches->push_back(LinkerPatch::StringPatch(literal_offset, - target_string.dex_file, - target_string.string_index.index_)); - } - if (!GetCompilerOptions().IsBootImage()) { - DCHECK(pc_relative_type_patches_.empty()); - EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_, + if (GetCompilerOptions().IsBootImage()) { + EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(pc_relative_method_patches_, linker_patches); - } else { EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(pc_relative_type_patches_, linker_patches); EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(pc_relative_string_patches_, linker_patches); + } else { + DCHECK(pc_relative_method_patches_.empty()); + DCHECK(pc_relative_type_patches_.empty()); + EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_, + linker_patches); } EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_, linker_patches); - for (const auto& entry : boot_image_type_patches_) { - const TypeReference& target_type = entry.first; - VIXLUInt32Literal* literal = entry.second; - DCHECK(literal->IsBound()); - uint32_t literal_offset = literal->GetLocation(); - linker_patches->push_back(LinkerPatch::TypePatch(literal_offset, - target_type.dex_file, - target_type.type_index.index_)); + for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) { + linker_patches->push_back(LinkerPatch::BakerReadBarrierBranchPatch(info.label.GetLocation(), + info.custom_data)); } DCHECK_EQ(size, linker_patches->size()); } @@ -8632,16 +9388,6 @@ VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateUint32Literal( }); } -VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateMethodLiteral( - MethodReference target_method, - MethodToLiteralMap* map) { - return map->GetOrCreate( - target_method, - [this]() { - return 
GetAssembler()->CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u); - }); -} - void LocationsBuilderARMVIXL::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr, LocationSummary::kNoCall); @@ -8855,14 +9601,20 @@ static void PatchJitRootUse(uint8_t* code, void CodeGeneratorARMVIXL::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + VIXLUInt32Literal* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + VIXLUInt32Literal* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } } diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h index 1e9669dc38..f6e4de33a8 100644 --- a/compiler/optimizing/code_generator_arm_vixl.h +++ b/compiler/optimizing/code_generator_arm_vixl.h @@ -24,8 +24,8 @@ #include "nodes.h" #include "string_reference.h" #include "parallel_move_resolver.h" +#include "type_reference.h" #include "utils/arm/assembler_arm_vixl.h" -#include "utils/type_reference.h" // TODO(VIXL): make vixl clean wrt -Wshadow. 
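// Illustrative sketch (not part of the patch): the Deduplicate*Literal helpers above rely on
// the GetOrCreate(key, factory) pattern of ArenaSafeMap, which returns the literal already
// recorded for a key or invokes the factory to create and insert a new placeholder literal.
// A minimal stand-in using std::map instead of the arena-backed map:
#include <map>

template <typename Key, typename Value, typename Factory>
Value* GetOrCreateModel(std::map<Key, Value*>* map, const Key& key, Factory&& factory) {
  auto it = map->find(key);
  if (it != map->end()) {
    return it->second;  // Reuse the literal created for this key earlier.
  }
  Value* value = factory();  // E.g. a placeholder literal patched at link/JIT time.
  (*map)[key] = value;
  return value;
}
// Usage mirrors the call sites above, e.g.:
//   literal = GetOrCreateModel(&uint32_literals, value, [&]() { return NewPlaceholder(); });
// where `uint32_literals` and `NewPlaceholder` are hypothetical stand-ins.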
#pragma GCC diagnostic push @@ -400,10 +400,8 @@ class InstructionCodeGeneratorARMVIXL : public InstructionCodeGenerator { bool far_target = true); void GenerateCompareTestAndBranch(HCondition* condition, vixl::aarch32::Label* true_target, - vixl::aarch32::Label* false_target); - void GenerateLongComparesAndJumps(HCondition* cond, - vixl::aarch32::Label* true_label, - vixl::aarch32::Label* false_label); + vixl::aarch32::Label* false_target, + bool is_far_target = true); void DivRemOneOrMinusOne(HBinaryOperation* instruction); void DivRemByPowerOfTwo(HBinaryOperation* instruction); void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction); @@ -540,7 +538,6 @@ class CodeGeneratorARMVIXL : public CodeGenerator { const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info, HInvokeStaticOrDirect* invoke) OVERRIDE; - Location GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp); void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) OVERRIDE; void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE; @@ -566,18 +563,19 @@ class CodeGeneratorARMVIXL : public CodeGenerator { vixl::aarch32::Label add_pc_label; }; - PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file, - dex::StringIndex string_index); + PcRelativePatchInfo* NewPcRelativeMethodPatch(MethodReference target_method); PcRelativePatchInfo* NewPcRelativeTypePatch(const DexFile& dex_file, dex::TypeIndex type_index); PcRelativePatchInfo* NewTypeBssEntryPatch(const DexFile& dex_file, dex::TypeIndex type_index); + PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file, + dex::StringIndex string_index); PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, uint32_t element_offset); - VIXLUInt32Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file, - dex::StringIndex string_index); - VIXLUInt32Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, - dex::TypeIndex type_index); + + // Add a new baker read barrier patch and return the label to be bound + // before the BNE instruction. + vixl::aarch32::Label* NewBakerReadBarrierPatch(uint32_t custom_data); + VIXLUInt32Literal* DeduplicateBootImageAddressLiteral(uint32_t address); - VIXLUInt32Literal* DeduplicateDexCacheAddressLiteral(uint32_t address); VIXLUInt32Literal* DeduplicateJitStringLiteral(const DexFile& dex_file, dex::StringIndex string_index, Handle<mirror::String> handle); @@ -589,6 +587,10 @@ class CodeGeneratorARMVIXL : public CodeGenerator { void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE; + // Maybe add the reserved entrypoint register as a temporary for field load. This temp + // is added only for AOT compilation if link-time generated thunks for fields are enabled. + void MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations); + // Fast path implementation of ReadBarrier::Barrier for a heap // reference field load when Baker's read barriers are used. void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, @@ -612,11 +614,6 @@ class CodeGeneratorARMVIXL : public CodeGenerator { // Load the object reference located at the address // `obj + offset + (index << scale_factor)`, held by object `obj`, into // `ref`, and mark it if needed. - // - // If `always_update_field` is true, the value of the reference is - // atomically updated in the holder (`obj`). This operation - // requires an extra temporary register, which must be provided as a - // non-null pointer (`temp2`). 
void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, vixl::aarch32::Register obj, @@ -624,9 +621,27 @@ class CodeGeneratorARMVIXL : public CodeGenerator { Location index, ScaleFactor scale_factor, Location temp, - bool needs_null_check, - bool always_update_field = false, - vixl::aarch32::Register* temp2 = nullptr); + bool needs_null_check); + + // Generate code checking whether the reference field at the + // address `obj + field_offset`, held by object `obj`, needs to be + // marked, and if so, marking it and updating the field within `obj` + // with the marked value. + // + // This routine is used for the implementation of the + // UnsafeCASObject intrinsic with Baker read barriers. + // + // This method has a structure similar to + // GenerateReferenceLoadWithBakerReadBarrier, but note that argument + // `ref` is only used as a temporary here, and thus its value should not + // be used afterwards. + void UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction, + Location ref, + vixl::aarch32::Register obj, + Location field_offset, + Location temp, + bool needs_null_check, + vixl::aarch32::Register temp2); // Generate a heap reference load (with no read barrier). void GenerateRawReferenceLoad(HInstruction* instruction, @@ -699,13 +714,19 @@ class CodeGeneratorARMVIXL : public CodeGenerator { void EmitMovwMovtPlaceholder(CodeGeneratorARMVIXL::PcRelativePatchInfo* labels, vixl::aarch32::Register out); + // `temp` is an extra temporary register that is used for some conditions; + // callers may not specify it, in which case the method will use a scratch + // register instead. + void GenerateConditionWithZero(IfCondition condition, + vixl::aarch32::Register out, + vixl::aarch32::Register in, + vixl::aarch32::Register temp = vixl32::Register()); + private: vixl::aarch32::Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, vixl::aarch32::Register temp); using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, VIXLUInt32Literal*>; - using MethodToLiteralMap = - ArenaSafeMap<MethodReference, VIXLUInt32Literal*, MethodReferenceComparator>; using StringToLiteralMap = ArenaSafeMap<StringReference, VIXLUInt32Literal*, StringReferenceValueComparator>; @@ -713,9 +734,14 @@ class CodeGeneratorARMVIXL : public CodeGenerator { VIXLUInt32Literal*, TypeReferenceValueComparator>; + struct BakerReadBarrierPatchInfo { + explicit BakerReadBarrierPatchInfo(uint32_t data) : label(), custom_data(data) { } + + vixl::aarch32::Label label; + uint32_t custom_data; + }; + VIXLUInt32Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map); - VIXLUInt32Literal* DeduplicateMethodLiteral(MethodReference target_method, - MethodToLiteralMap* map); PcRelativePatchInfo* NewPcRelativePatch(const DexFile& dex_file, uint32_t offset_or_index, ArenaDeque<PcRelativePatchInfo>* patches); @@ -740,16 +766,16 @@ class CodeGeneratorARMVIXL : public CodeGenerator { Uint32ToLiteralMap uint32_literals_; // PC-relative patch info for each HArmDexCacheArraysBase. ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_; - // Deduplication map for boot string literals for kBootImageLinkTimeAddress. - StringToLiteralMap boot_image_string_patches_; - // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC). - ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_; - // Deduplication map for boot type literals for kBootImageLinkTimeAddress.
- TypeToLiteralMap boot_image_type_patches_; + // PC-relative method patch info for kBootImageLinkTimePcRelative. + ArenaDeque<PcRelativePatchInfo> pc_relative_method_patches_; // PC-relative type patch info for kBootImageLinkTimePcRelative. ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; + // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC). + ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_; + // Baker read barrier patch info. + ArenaDeque<BakerReadBarrierPatchInfo> baker_read_barrier_patches_; // Patches for string literals in JIT compiled code. StringToLiteralMap jit_string_patches_; diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index b7e602420c..951d75a708 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -219,15 +219,33 @@ class LoadClassSlowPathMIPS : public SlowPathCodeMIPS { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); + Location out = locations->Out(); CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); - + const bool isR6 = mips_codegen->GetInstructionSetFeatures().IsR6(); + const bool r2_baker_or_no_read_barriers = !isR6 && (!kUseReadBarrier || kUseBakerReadBarrier); + InvokeRuntimeCallingConvention calling_convention; + DCHECK_EQ(instruction_->IsLoadClass(), cls_ == instruction_); + const bool is_load_class_bss_entry = + (cls_ == instruction_) && (cls_->GetLoadKind() == HLoadClass::LoadKind::kBssEntry); __ Bind(GetEntryLabel()); SaveLiveRegisters(codegen, locations); - InvokeRuntimeCallingConvention calling_convention; + // For HLoadClass/kBssEntry/kSaveEverything, make sure we preserve the address of the entry. + Register entry_address = kNoRegister; + if (is_load_class_bss_entry && r2_baker_or_no_read_barriers) { + Register temp = locations->GetTemp(0).AsRegister<Register>(); + bool temp_is_a0 = (temp == calling_convention.GetRegisterAt(0)); + // In the unlucky case that `temp` is A0, we preserve the address in `out` across the + // kSaveEverything call. + entry_address = temp_is_a0 ? out.AsRegister<Register>() : temp; + DCHECK_NE(entry_address, calling_convention.GetRegisterAt(0)); + if (temp_is_a0) { + __ Move(entry_address, temp); + } + } + dex::TypeIndex type_index = cls_->GetTypeIndex(); __ LoadConst32(calling_convention.GetRegisterAt(0), type_index.index_); - QuickEntrypointEnum entrypoint = do_clinit_ ? kQuickInitializeStaticStorage : kQuickInitializeType; mips_codegen->InvokeRuntime(entrypoint, instruction_, dex_pc_, this); @@ -237,25 +255,27 @@ class LoadClassSlowPathMIPS : public SlowPathCodeMIPS { CheckEntrypointTypes<kQuickInitializeType, void*, uint32_t>(); } + // For HLoadClass/kBssEntry, store the resolved class to the BSS entry. + if (is_load_class_bss_entry && r2_baker_or_no_read_barriers) { + // The class entry address was preserved in `entry_address` thanks to kSaveEverything. + __ StoreToOffset(kStoreWord, calling_convention.GetRegisterAt(0), entry_address, 0); + } + // Move the class to the desired location. 
- Location out = locations->Out(); if (out.IsValid()) { DCHECK(out.IsRegister() && !locations->GetLiveRegisters()->ContainsCoreRegister(out.reg())); Primitive::Type type = instruction_->GetType(); - mips_codegen->MoveLocation(out, calling_convention.GetReturnLocation(type), type); + mips_codegen->MoveLocation(out, + Location::RegisterLocation(calling_convention.GetRegisterAt(0)), + type); } - RestoreLiveRegisters(codegen, locations); - // For HLoadClass/kBssEntry, store the resolved Class to the BSS entry. - DCHECK_EQ(instruction_->IsLoadClass(), cls_ == instruction_); - if (cls_ == instruction_ && cls_->GetLoadKind() == HLoadClass::LoadKind::kBssEntry) { - DCHECK(out.IsValid()); - // TODO: Change art_quick_initialize_type/art_quick_initialize_static_storage to - // kSaveEverything and use a temporary for the .bss entry address in the fast path, - // so that we can avoid another calculation here. - bool isR6 = mips_codegen->GetInstructionSetFeatures().IsR6(); + + // For HLoadClass/kBssEntry, store the resolved class to the BSS entry. + if (is_load_class_bss_entry && !r2_baker_or_no_read_barriers) { + // For non-Baker read barriers (or on R6), we need to re-calculate the address of + // the class entry. Register base = isR6 ? ZERO : locations->InAt(0).AsRegister<Register>(); - DCHECK_NE(out.AsRegister<Register>(), AT); CodeGeneratorMIPS::PcRelativePatchInfo* info = mips_codegen->NewTypeBssEntryPatch(cls_->GetDexFile(), type_index); bool reordering = __ SetReorder(false); @@ -286,40 +306,62 @@ class LoadStringSlowPathMIPS : public SlowPathCodeMIPS { explicit LoadStringSlowPathMIPS(HLoadString* instruction) : SlowPathCodeMIPS(instruction) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + DCHECK(instruction_->IsLoadString()); + DCHECK_EQ(instruction_->AsLoadString()->GetLoadKind(), HLoadString::LoadKind::kBssEntry); LocationSummary* locations = instruction_->GetLocations(); DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(locations->Out().reg())); + HLoadString* load = instruction_->AsLoadString(); + const dex::StringIndex string_index = load->GetStringIndex(); + Register out = locations->Out().AsRegister<Register>(); CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); - + const bool isR6 = mips_codegen->GetInstructionSetFeatures().IsR6(); + const bool r2_baker_or_no_read_barriers = !isR6 && (!kUseReadBarrier || kUseBakerReadBarrier); + InvokeRuntimeCallingConvention calling_convention; __ Bind(GetEntryLabel()); SaveLiveRegisters(codegen, locations); - InvokeRuntimeCallingConvention calling_convention; - HLoadString* load = instruction_->AsLoadString(); - const dex::StringIndex string_index = load->GetStringIndex(); + // For HLoadString/kBssEntry/kSaveEverything, make sure we preserve the address of the entry. + Register entry_address = kNoRegister; + if (r2_baker_or_no_read_barriers) { + Register temp = locations->GetTemp(0).AsRegister<Register>(); + bool temp_is_a0 = (temp == calling_convention.GetRegisterAt(0)); + // In the unlucky case that `temp` is A0, we preserve the address in `out` across the + // kSaveEverything call. + entry_address = temp_is_a0 ? 
out : temp; + DCHECK_NE(entry_address, calling_convention.GetRegisterAt(0)); + if (temp_is_a0) { + __ Move(entry_address, temp); + } + } + __ LoadConst32(calling_convention.GetRegisterAt(0), string_index.index_); mips_codegen->InvokeRuntime(kQuickResolveString, instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>(); + + // Store the resolved string to the BSS entry. + if (r2_baker_or_no_read_barriers) { + // The string entry address was preserved in `entry_address` thanks to kSaveEverything. + __ StoreToOffset(kStoreWord, calling_convention.GetRegisterAt(0), entry_address, 0); + } + Primitive::Type type = instruction_->GetType(); mips_codegen->MoveLocation(locations->Out(), - calling_convention.GetReturnLocation(type), + Location::RegisterLocation(calling_convention.GetRegisterAt(0)), type); - RestoreLiveRegisters(codegen, locations); - // Store the resolved String to the BSS entry. - // TODO: Change art_quick_resolve_string to kSaveEverything and use a temporary for the - // .bss entry address in the fast path, so that we can avoid another calculation here. - bool isR6 = mips_codegen->GetInstructionSetFeatures().IsR6(); - Register base = isR6 ? ZERO : locations->InAt(0).AsRegister<Register>(); - Register out = locations->Out().AsRegister<Register>(); - DCHECK_NE(out, AT); - CodeGeneratorMIPS::PcRelativePatchInfo* info = - mips_codegen->NewPcRelativeStringPatch(load->GetDexFile(), string_index); - bool reordering = __ SetReorder(false); - mips_codegen->EmitPcRelativeAddressPlaceholderHigh(info, TMP, base); - __ StoreToOffset(kStoreWord, out, TMP, /* placeholder */ 0x5678); - __ SetReorder(reordering); - + // Store the resolved string to the BSS entry. + if (!r2_baker_or_no_read_barriers) { + // For non-Baker read barriers (or on R6), we need to re-calculate the address of + // the string entry. + Register base = isR6 ? 
ZERO : locations->InAt(0).AsRegister<Register>(); + CodeGeneratorMIPS::PcRelativePatchInfo* info = + mips_codegen->NewPcRelativeStringPatch(load->GetDexFile(), string_index); + bool reordering = __ SetReorder(false); + mips_codegen->EmitPcRelativeAddressPlaceholderHigh(info, TMP, base); + __ StoreToOffset(kStoreWord, out, TMP, /* placeholder */ 0x5678); + __ SetReorder(reordering); + } __ B(GetExitLabel()); } @@ -1019,13 +1061,10 @@ CodeGeneratorMIPS::CodeGeneratorMIPS(HGraph* graph, uint32_literals_(std::less<uint32_t>(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_string_patches_(StringReferenceValueComparator(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_type_patches_(TypeReferenceValueComparator(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + pc_relative_method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_class_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), clobbered_ra_(false) { @@ -1564,50 +1603,36 @@ void CodeGeneratorMIPS::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patch DCHECK(linker_patches->empty()); size_t size = pc_relative_dex_cache_patches_.size() + - pc_relative_string_patches_.size() + + pc_relative_method_patches_.size() + pc_relative_type_patches_.size() + type_bss_entry_patches_.size() + - boot_image_string_patches_.size() + - boot_image_type_patches_.size(); + pc_relative_string_patches_.size(); linker_patches->reserve(size); EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_, linker_patches); - if (!GetCompilerOptions().IsBootImage()) { - DCHECK(pc_relative_type_patches_.empty()); - EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_, + if (GetCompilerOptions().IsBootImage()) { + EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(pc_relative_method_patches_, linker_patches); - } else { EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(pc_relative_type_patches_, linker_patches); EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(pc_relative_string_patches_, linker_patches); + } else { + DCHECK(pc_relative_method_patches_.empty()); + DCHECK(pc_relative_type_patches_.empty()); + EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_, + linker_patches); } EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_, linker_patches); - for (const auto& entry : boot_image_string_patches_) { - const StringReference& target_string = entry.first; - Literal* literal = entry.second; - DCHECK(literal->GetLabel()->IsBound()); - uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel()); - linker_patches->push_back(LinkerPatch::StringPatch(literal_offset, - target_string.dex_file, - target_string.string_index.index_)); - } - for (const auto& entry : boot_image_type_patches_) { - const TypeReference& target_type = entry.first; - Literal* literal = entry.second; - DCHECK(literal->GetLabel()->IsBound()); - uint32_t 
literal_offset = __ GetLabelLocation(literal->GetLabel()); - linker_patches->push_back(LinkerPatch::TypePatch(literal_offset, - target_type.dex_file, - target_type.type_index.index_)); - } DCHECK_EQ(size, linker_patches->size()); } -CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeStringPatch( - const DexFile& dex_file, dex::StringIndex string_index) { - return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_); +CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeMethodPatch( + MethodReference target_method) { + return NewPcRelativePatch(*target_method.dex_file, + target_method.dex_method_index, + &pc_relative_method_patches_); } CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeTypePatch( @@ -1620,6 +1645,11 @@ CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewTypeBssEntryPatch( return NewPcRelativePatch(dex_file, type_index.index_, &type_bss_entry_patches_); } +CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeStringPatch( + const DexFile& dex_file, dex::StringIndex string_index) { + return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_); +} + CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeDexCacheArrayPatch( const DexFile& dex_file, uint32_t element_offset) { return NewPcRelativePatch(dex_file, element_offset, &pc_relative_dex_cache_patches_); @@ -1637,27 +1667,6 @@ Literal* CodeGeneratorMIPS::DeduplicateUint32Literal(uint32_t value, Uint32ToLit [this, value]() { return __ NewLiteral<uint32_t>(value); }); } -Literal* CodeGeneratorMIPS::DeduplicateMethodLiteral(MethodReference target_method, - MethodToLiteralMap* map) { - return map->GetOrCreate( - target_method, - [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); }); -} - -Literal* CodeGeneratorMIPS::DeduplicateBootImageStringLiteral(const DexFile& dex_file, - dex::StringIndex string_index) { - return boot_image_string_patches_.GetOrCreate( - StringReference(&dex_file, string_index), - [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); }); -} - -Literal* CodeGeneratorMIPS::DeduplicateBootImageTypeLiteral(const DexFile& dex_file, - dex::TypeIndex type_index) { - return boot_image_type_patches_.GetOrCreate( - TypeReference(&dex_file, type_index), - [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); }); -} - Literal* CodeGeneratorMIPS::DeduplicateBootImageAddressLiteral(uint32_t address) { return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_); } @@ -1665,6 +1674,7 @@ Literal* CodeGeneratorMIPS::DeduplicateBootImageAddressLiteral(uint32_t address) void CodeGeneratorMIPS::EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info, Register out, Register base) { + DCHECK_NE(out, base); if (GetInstructionSetFeatures().IsR6()) { DCHECK_EQ(base, ZERO); __ Bind(&info->high_label); @@ -1724,31 +1734,32 @@ void CodeGeneratorMIPS::PatchJitRootUse(uint8_t* code, DCHECK_EQ(code[literal_offset + 1], 0x12); DCHECK_EQ((code[literal_offset + 2] & 0xE0), 0x00); DCHECK_EQ(code[literal_offset + 3], 0x3C); - // lw reg, reg, addr32_low + // instr reg, reg, addr32_low DCHECK_EQ(code[literal_offset + 4], 0x78); DCHECK_EQ(code[literal_offset + 5], 0x56); - DCHECK_EQ((code[literal_offset + 7] & 0xFC), 0x8C); - addr32 += (addr32 & 0x8000) << 1; // Account for sign extension in "lw reg, reg, addr32_low". 
+ addr32 += (addr32 & 0x8000) << 1; // Account for sign extension in "instr reg, reg, addr32_low". // lui reg, addr32_high code[literal_offset + 0] = static_cast<uint8_t>(addr32 >> 16); code[literal_offset + 1] = static_cast<uint8_t>(addr32 >> 24); - // lw reg, reg, addr32_low + // instr reg, reg, addr32_low code[literal_offset + 4] = static_cast<uint8_t>(addr32 >> 0); code[literal_offset + 5] = static_cast<uint8_t>(addr32 >> 8); } void CodeGeneratorMIPS::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const JitPatchInfo& info : jit_string_patches_) { - const auto& it = jit_string_roots_.find(StringReference(&info.target_dex_file, - dex::StringIndex(info.index))); + const auto it = jit_string_roots_.find(StringReference(&info.target_dex_file, + dex::StringIndex(info.index))); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } for (const JitPatchInfo& info : jit_class_patches_) { - const auto& it = jit_class_roots_.find(TypeReference(&info.target_dex_file, - dex::TypeIndex(info.index))); + const auto it = jit_class_roots_.find(TypeReference(&info.target_dex_file, + dex::TypeIndex(info.index))); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } } @@ -2441,6 +2452,9 @@ void LocationsBuilderMIPS::VisitArrayGet(HArrayGet* instruction) { object_array_get_with_read_barrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall); + if (object_array_get_with_read_barrier && kUseBakerReadBarrier) { + locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + } locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); if (Primitive::IsFloatingPointType(type)) { @@ -3443,8 +3457,6 @@ void InstructionCodeGeneratorMIPS::HandleCondition(HCondition* instruction) { Primitive::Type type = instruction->InputAt(0)->GetType(); LocationSummary* locations = instruction->GetLocations(); - Register dst = locations->Out().AsRegister<Register>(); - MipsLabel true_label; switch (type) { default: @@ -3453,27 +3465,14 @@ void InstructionCodeGeneratorMIPS::HandleCondition(HCondition* instruction) { return; case Primitive::kPrimLong: - // TODO: don't use branches. - GenerateLongCompareAndBranch(instruction->GetCondition(), locations, &true_label); - break; + GenerateLongCompare(instruction->GetCondition(), locations); + return; case Primitive::kPrimFloat: case Primitive::kPrimDouble: GenerateFpCompare(instruction->GetCondition(), instruction->IsGtBias(), type, locations); return; } - - // Convert the branches into the result. - MipsLabel done; - - // False case: result = 0. - __ LoadConst32(dst, 0); - __ B(&done); - - // True case: result = 1. 
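// Illustrative sketch (not part of the patch): a worked example of the sign-extension
// adjustment in PatchJitRootUse above. The low 16 bits of the address are consumed by an
// instruction that sign-extends its immediate, so when bit 15 is set the paired "lui"
// immediate must be one higher; adding (addr32 & 0x8000) << 1 (exactly 0x10000 in that case)
// achieves this before the address is split. The address below is arbitrary, chosen only so
// that bit 15 of its low half is set.
#include <cstdint>

constexpr uint32_t kOriginal = 0x12349abcu;                               // Low half 0x9abc, bit 15 set.
constexpr uint32_t kAdjusted = kOriginal + ((kOriginal & 0x8000u) << 1);  // 0x12359abc.
constexpr uint32_t kLuiImm = kAdjusted >> 16;                             // 0x1235 goes into "lui".
constexpr uint32_t kLowImm = kAdjusted & 0xffffu;                         // 0x9abc goes into the low-half instruction.

// "lui" materializes kLuiImm << 16; the second instruction adds its sign-extended immediate,
// and the pair reconstructs the original address (unsigned arithmetic wraps modulo 2^32).
constexpr uint32_t kSignExtendedLow = (kLowImm & 0x8000u) != 0u ? (kLowImm | 0xffff0000u) : kLowImm;
static_assert((kLuiImm << 16) + kSignExtendedLow == kOriginal,
              "adjusted high half plus sign-extended low half yields the original address");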
- __ Bind(&true_label); - __ LoadConst32(dst, 1); - __ Bind(&done); } void InstructionCodeGeneratorMIPS::DivRemOneOrMinusOne(HBinaryOperation* instruction) { @@ -4243,6 +4242,221 @@ void InstructionCodeGeneratorMIPS::GenerateIntCompareAndBranch(IfCondition cond, } } +void InstructionCodeGeneratorMIPS::GenerateLongCompare(IfCondition cond, + LocationSummary* locations) { + Register dst = locations->Out().AsRegister<Register>(); + Register lhs_high = locations->InAt(0).AsRegisterPairHigh<Register>(); + Register lhs_low = locations->InAt(0).AsRegisterPairLow<Register>(); + Location rhs_location = locations->InAt(1); + Register rhs_high = ZERO; + Register rhs_low = ZERO; + int64_t imm = 0; + uint32_t imm_high = 0; + uint32_t imm_low = 0; + bool use_imm = rhs_location.IsConstant(); + if (use_imm) { + imm = rhs_location.GetConstant()->AsLongConstant()->GetValue(); + imm_high = High32Bits(imm); + imm_low = Low32Bits(imm); + } else { + rhs_high = rhs_location.AsRegisterPairHigh<Register>(); + rhs_low = rhs_location.AsRegisterPairLow<Register>(); + } + if (use_imm && imm == 0) { + switch (cond) { + case kCondEQ: + case kCondBE: // <= 0 if zero + __ Or(dst, lhs_high, lhs_low); + __ Sltiu(dst, dst, 1); + break; + case kCondNE: + case kCondA: // > 0 if non-zero + __ Or(dst, lhs_high, lhs_low); + __ Sltu(dst, ZERO, dst); + break; + case kCondLT: + __ Slt(dst, lhs_high, ZERO); + break; + case kCondGE: + __ Slt(dst, lhs_high, ZERO); + __ Xori(dst, dst, 1); + break; + case kCondLE: + __ Or(TMP, lhs_high, lhs_low); + __ Sra(AT, lhs_high, 31); + __ Sltu(dst, AT, TMP); + __ Xori(dst, dst, 1); + break; + case kCondGT: + __ Or(TMP, lhs_high, lhs_low); + __ Sra(AT, lhs_high, 31); + __ Sltu(dst, AT, TMP); + break; + case kCondB: // always false + __ Andi(dst, dst, 0); + break; + case kCondAE: // always true + __ Ori(dst, ZERO, 1); + break; + } + } else if (use_imm) { + // TODO: more efficient comparison with constants without loading them into TMP/AT. 
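// Illustrative sketch (not part of the patch): a reference model of the identity that
// GenerateLongCompare implements with Slt/Sltu/Or sequences instead of branches. For the
// signed "less than" of a 64-bit value held in a high/low register pair, the result is true
// when the signed compare of the high words says so, or when the high words are equal and the
// unsigned compare of the low words says so; the other conditions follow the same scheme with
// operands or signedness swapped.
#include <cstdint>

constexpr bool LongLessThan(int32_t lhs_high, uint32_t lhs_low,
                            int32_t rhs_high, uint32_t rhs_low) {
  // Branch-free form of: (lhs_high < rhs_high) || (lhs_high == rhs_high && lhs_low < rhs_low).
  return (lhs_high < rhs_high) || (!(rhs_high < lhs_high) && (lhs_low < rhs_low));
}

static_assert(LongLessThan(-1, 0xffffffffu, 0, 0u), "-1 < 0");
static_assert(!LongLessThan(0, 1u, 0, 0u), "1 is not less than 0");
static_assert(LongLessThan(5, 1u, 5, 2u), "equal high words fall back to the unsigned low words");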
+ switch (cond) { + case kCondEQ: + __ LoadConst32(TMP, imm_high); + __ Xor(TMP, TMP, lhs_high); + __ LoadConst32(AT, imm_low); + __ Xor(AT, AT, lhs_low); + __ Or(dst, TMP, AT); + __ Sltiu(dst, dst, 1); + break; + case kCondNE: + __ LoadConst32(TMP, imm_high); + __ Xor(TMP, TMP, lhs_high); + __ LoadConst32(AT, imm_low); + __ Xor(AT, AT, lhs_low); + __ Or(dst, TMP, AT); + __ Sltu(dst, ZERO, dst); + break; + case kCondLT: + case kCondGE: + if (dst == lhs_low) { + __ LoadConst32(TMP, imm_low); + __ Sltu(dst, lhs_low, TMP); + } + __ LoadConst32(TMP, imm_high); + __ Slt(AT, lhs_high, TMP); + __ Slt(TMP, TMP, lhs_high); + if (dst != lhs_low) { + __ LoadConst32(dst, imm_low); + __ Sltu(dst, lhs_low, dst); + } + __ Slt(dst, TMP, dst); + __ Or(dst, dst, AT); + if (cond == kCondGE) { + __ Xori(dst, dst, 1); + } + break; + case kCondGT: + case kCondLE: + if (dst == lhs_low) { + __ LoadConst32(TMP, imm_low); + __ Sltu(dst, TMP, lhs_low); + } + __ LoadConst32(TMP, imm_high); + __ Slt(AT, TMP, lhs_high); + __ Slt(TMP, lhs_high, TMP); + if (dst != lhs_low) { + __ LoadConst32(dst, imm_low); + __ Sltu(dst, dst, lhs_low); + } + __ Slt(dst, TMP, dst); + __ Or(dst, dst, AT); + if (cond == kCondLE) { + __ Xori(dst, dst, 1); + } + break; + case kCondB: + case kCondAE: + if (dst == lhs_low) { + __ LoadConst32(TMP, imm_low); + __ Sltu(dst, lhs_low, TMP); + } + __ LoadConst32(TMP, imm_high); + __ Sltu(AT, lhs_high, TMP); + __ Sltu(TMP, TMP, lhs_high); + if (dst != lhs_low) { + __ LoadConst32(dst, imm_low); + __ Sltu(dst, lhs_low, dst); + } + __ Slt(dst, TMP, dst); + __ Or(dst, dst, AT); + if (cond == kCondAE) { + __ Xori(dst, dst, 1); + } + break; + case kCondA: + case kCondBE: + if (dst == lhs_low) { + __ LoadConst32(TMP, imm_low); + __ Sltu(dst, TMP, lhs_low); + } + __ LoadConst32(TMP, imm_high); + __ Sltu(AT, TMP, lhs_high); + __ Sltu(TMP, lhs_high, TMP); + if (dst != lhs_low) { + __ LoadConst32(dst, imm_low); + __ Sltu(dst, dst, lhs_low); + } + __ Slt(dst, TMP, dst); + __ Or(dst, dst, AT); + if (cond == kCondBE) { + __ Xori(dst, dst, 1); + } + break; + } + } else { + switch (cond) { + case kCondEQ: + __ Xor(TMP, lhs_high, rhs_high); + __ Xor(AT, lhs_low, rhs_low); + __ Or(dst, TMP, AT); + __ Sltiu(dst, dst, 1); + break; + case kCondNE: + __ Xor(TMP, lhs_high, rhs_high); + __ Xor(AT, lhs_low, rhs_low); + __ Or(dst, TMP, AT); + __ Sltu(dst, ZERO, dst); + break; + case kCondLT: + case kCondGE: + __ Slt(TMP, rhs_high, lhs_high); + __ Sltu(AT, lhs_low, rhs_low); + __ Slt(TMP, TMP, AT); + __ Slt(AT, lhs_high, rhs_high); + __ Or(dst, AT, TMP); + if (cond == kCondGE) { + __ Xori(dst, dst, 1); + } + break; + case kCondGT: + case kCondLE: + __ Slt(TMP, lhs_high, rhs_high); + __ Sltu(AT, rhs_low, lhs_low); + __ Slt(TMP, TMP, AT); + __ Slt(AT, rhs_high, lhs_high); + __ Or(dst, AT, TMP); + if (cond == kCondLE) { + __ Xori(dst, dst, 1); + } + break; + case kCondB: + case kCondAE: + __ Sltu(TMP, rhs_high, lhs_high); + __ Sltu(AT, lhs_low, rhs_low); + __ Slt(TMP, TMP, AT); + __ Sltu(AT, lhs_high, rhs_high); + __ Or(dst, AT, TMP); + if (cond == kCondAE) { + __ Xori(dst, dst, 1); + } + break; + case kCondA: + case kCondBE: + __ Sltu(TMP, lhs_high, rhs_high); + __ Sltu(AT, rhs_low, lhs_low); + __ Slt(TMP, TMP, AT); + __ Sltu(AT, rhs_high, lhs_high); + __ Or(dst, AT, TMP); + if (cond == kCondBE) { + __ Xori(dst, dst, 1); + } + break; + } + } +} + void InstructionCodeGeneratorMIPS::GenerateLongCompareAndBranch(IfCondition cond, LocationSummary* locations, MipsLabel* label) { @@ -5775,6 +5989,9 @@ void 
LocationsBuilderMIPS::HandleFieldGet(HInstruction* instruction, const Field ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall)); + if (object_field_get_with_read_barrier && kUseBakerReadBarrier) { + locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + } locations->SetInAt(0, Location::RequiresRegister()); if (generate_volatile) { InvokeRuntimeCallingConvention calling_convention; @@ -6453,6 +6670,7 @@ void CodeGeneratorMIPS::GenerateReadBarrierForRootSlow(HInstruction* instruction void LocationsBuilderMIPS::VisitInstanceOf(HInstanceOf* instruction) { LocationSummary::CallKind call_kind = LocationSummary::kNoCall; TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); + bool baker_read_barrier_slow_path = false; switch (type_check_kind) { case TypeCheckKind::kExactCheck: case TypeCheckKind::kAbstractClassCheck: @@ -6460,6 +6678,7 @@ void LocationsBuilderMIPS::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kArrayObjectCheck: call_kind = kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; + baker_read_barrier_slow_path = kUseBakerReadBarrier; break; case TypeCheckKind::kArrayCheck: case TypeCheckKind::kUnresolvedCheck: @@ -6469,6 +6688,9 @@ void LocationsBuilderMIPS::VisitInstanceOf(HInstanceOf* instruction) { } LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind); + if (baker_read_barrier_slow_path) { + locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + } locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RequiresRegister()); // The output does overlap inputs. @@ -6738,7 +6960,7 @@ void LocationsBuilderMIPS::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invo DCHECK(!invoke->IsStaticWithExplicitClinitCheck()); bool is_r6 = codegen_->GetInstructionSetFeatures().IsR6(); - bool has_extra_input = invoke->HasPcRelativeDexCache() && !is_r6; + bool has_extra_input = invoke->HasPcRelativeMethodLoadKind() && !is_r6; IntrinsicLocationsBuilderMIPS intrinsic(codegen_); if (intrinsic.TryDispatch(invoke)) { @@ -6784,27 +7006,22 @@ HLoadString::LoadKind CodeGeneratorMIPS::GetSupportedLoadStringKind( bool is_r6 = GetInstructionSetFeatures().IsR6(); bool fallback_load = has_irreducible_loops && !is_r6; switch (desired_string_load_kind) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: - DCHECK(!GetCompilerOptions().GetCompilePic()); - break; case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - DCHECK(GetCompilerOptions().GetCompilePic()); - break; - case HLoadString::LoadKind::kBootImageAddress: - break; case HLoadString::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; + case HLoadString::LoadKind::kBootImageAddress: + break; case HLoadString::LoadKind::kJitTableAddress: DCHECK(Runtime::Current()->UseJitCompilation()); fallback_load = false; break; - case HLoadString::LoadKind::kDexCacheViaMethod: + case HLoadString::LoadKind::kRuntimeCall: fallback_load = false; break; } if (fallback_load) { - desired_string_load_kind = HLoadString::LoadKind::kDexCacheViaMethod; + desired_string_load_kind = HLoadString::LoadKind::kRuntimeCall; } return desired_string_load_kind; } @@ -6823,27 +7040,22 @@ HLoadClass::LoadKind CodeGeneratorMIPS::GetSupportedLoadClassKind( case HLoadClass::LoadKind::kReferrersClass: fallback_load = false; break; - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: - 
DCHECK(!GetCompilerOptions().GetCompilePic()); - break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: - DCHECK(GetCompilerOptions().GetCompilePic()); - break; - case HLoadClass::LoadKind::kBootImageAddress: - break; case HLoadClass::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; + case HLoadClass::LoadKind::kBootImageAddress: + break; case HLoadClass::LoadKind::kJitTableAddress: DCHECK(Runtime::Current()->UseJitCompilation()); fallback_load = false; break; - case HLoadClass::LoadKind::kDexCacheViaMethod: + case HLoadClass::LoadKind::kRuntimeCall: fallback_load = false; break; } if (fallback_load) { - desired_class_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod; + desired_class_load_kind = HLoadClass::LoadKind::kRuntimeCall; } return desired_class_load_kind; } @@ -6885,6 +7097,7 @@ HInvokeStaticOrDirect::DispatchInfo CodeGeneratorMIPS::GetSupportedInvokeStaticO bool is_r6 = GetInstructionSetFeatures().IsR6(); bool fallback_load = has_irreducible_loops && !is_r6; switch (dispatch_info.method_load_kind) { + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative: break; default: @@ -6904,7 +7117,7 @@ void CodeGeneratorMIPS::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke HInvokeStaticOrDirect::MethodLoadKind method_load_kind = invoke->GetMethodLoadKind(); HInvokeStaticOrDirect::CodePtrLocation code_ptr_location = invoke->GetCodePtrLocation(); bool is_r6 = GetInstructionSetFeatures().IsR6(); - Register base_reg = (invoke->HasPcRelativeDexCache() && !is_r6) + Register base_reg = (invoke->HasPcRelativeMethodLoadKind() && !is_r6) ? GetInvokeStaticOrDirectExtraParameter(invoke, temp.AsRegister<Register>()) : ZERO; @@ -6922,6 +7135,16 @@ void CodeGeneratorMIPS::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke case HInvokeStaticOrDirect::MethodLoadKind::kRecursive: callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex()); break; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: { + DCHECK(GetCompilerOptions().IsBootImage()); + PcRelativePatchInfo* info = NewPcRelativeMethodPatch(invoke->GetTargetMethod()); + bool reordering = __ SetReorder(false); + Register temp_reg = temp.AsRegister<Register>(); + EmitPcRelativeAddressPlaceholderHigh(info, TMP, base_reg); + __ Addiu(temp_reg, TMP, /* placeholder */ 0x5678); + __ SetReorder(reordering); + break; + } case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: __ LoadConst32(temp.AsRegister<Register>(), invoke->GetMethodAddress()); break; @@ -7054,28 +7277,28 @@ void InstructionCodeGeneratorMIPS::VisitInvokeVirtual(HInvokeVirtual* invoke) { void LocationsBuilderMIPS::VisitLoadClass(HLoadClass* cls) { HLoadClass::LoadKind load_kind = cls->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadClass::LoadKind::kRuntimeCall) { InvokeRuntimeCallingConvention calling_convention; - CodeGenerator::CreateLoadClassRuntimeCallLocationSummary( - cls, - Location::RegisterLocation(calling_convention.GetRegisterAt(0)), - calling_convention.GetReturnLocation(Primitive::kPrimNot)); + Location loc = Location::RegisterLocation(calling_convention.GetRegisterAt(0)); + CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(cls, loc, loc); return; } DCHECK(!cls->NeedsAccessCheck()); - + const bool isR6 = codegen_->GetInstructionSetFeatures().IsR6(); const bool requires_read_barrier = kEmitCompilerReadBarrier && !cls->IsInBootImage(); 
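(Aside on the high/low placeholder pair used by the PC-relative paths above — EmitPcRelativeAddressPlaceholderHigh followed by an Addiu/Daddiu with the 0x5678 placeholder: assuming the usual MIPS %hi/%lo convention, which is an assumption about what the later patching step writes and is not shown in this diff, a 32-bit offset is split so that the shifted high half plus the sign-extended low half reassemble the original value. A hypothetical sketch:)

#include <cstdint>

struct HiLo {
  uint16_t hi;
  uint16_t lo;
};

// Hypothetical illustration of the %hi/%lo split: 'lo' is the low 16 bits
// (later sign-extended by addiu/daddiu), and 'hi' is rounded up by 0x8000 so
// that (hi << 16) + sign_extend(lo) == offset for any 32-bit offset.
HiLo SplitPcRelativeOffset(uint32_t offset) {
  HiLo parts;
  parts.lo = static_cast<uint16_t>(offset & 0xFFFFu);
  parts.hi = static_cast<uint16_t>((offset + 0x8000u) >> 16);
  return parts;
}

// Reassembly the way the instruction pair computes it: the "high" instruction
// contributes hi << 16, and the addiu/daddiu adds the sign-extended low half.
uint32_t Reassemble(const HiLo& parts) {
  uint32_t lo_sext = static_cast<uint32_t>(static_cast<int32_t>(static_cast<int16_t>(parts.lo)));
  return (static_cast<uint32_t>(parts.hi) << 16) + lo_sext;
}

(The 0x5678 operands in the hunks above are only placeholders; the real low halves are filled in when the PcRelativePatchInfo entries are resolved, a step that lies outside these hunks.)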
LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || requires_read_barrier) ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cls, call_kind); + if (kUseBakerReadBarrier && requires_read_barrier && !cls->NeedsEnvironment()) { + locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + } switch (load_kind) { // We need an extra register for PC-relative literals on R2. - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: case HLoadClass::LoadKind::kBootImageAddress: case HLoadClass::LoadKind::kBssEntry: - if (codegen_->GetInstructionSetFeatures().IsR6()) { + if (isR6) { break; } FALLTHROUGH_INTENDED; @@ -7086,13 +7309,29 @@ void LocationsBuilderMIPS::VisitLoadClass(HLoadClass* cls) { break; } locations->SetOut(Location::RequiresRegister()); + if (load_kind == HLoadClass::LoadKind::kBssEntry) { + if (!kUseReadBarrier || kUseBakerReadBarrier) { + // Rely on the type resolution or initialization and marking to save everything we need. + // Request a temp to hold the BSS entry location for the slow path on R2 + // (no benefit for R6). + if (!isR6) { + locations->AddTemp(Location::RequiresRegister()); + } + RegisterSet caller_saves = RegisterSet::Empty(); + InvokeRuntimeCallingConvention calling_convention; + caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); + locations->SetCustomSlowPathCallerSaves(caller_saves); + } else { + // For non-Baker read barriers we have a temp-clobbering call. + } + } } // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not // move. void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFETY_ANALYSIS { HLoadClass::LoadKind load_kind = cls->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadClass::LoadKind::kRuntimeCall) { codegen_->GenerateLoadClassRuntimeCall(cls); return; } @@ -7105,14 +7344,13 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF bool isR6 = codegen_->GetInstructionSetFeatures().IsR6(); switch (load_kind) { // We need an extra register for PC-relative literals on R2. - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: case HLoadClass::LoadKind::kBootImageAddress: case HLoadClass::LoadKind::kBssEntry: base_or_current_method_reg = isR6 ? 
ZERO : locations->InAt(0).AsRegister<Register>(); break; case HLoadClass::LoadKind::kReferrersClass: - case HLoadClass::LoadKind::kDexCacheViaMethod: + case HLoadClass::LoadKind::kRuntimeCall: base_or_current_method_reg = locations->InAt(0).AsRegister<Register>(); break; default: @@ -7136,14 +7374,6 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF read_barrier_option); break; } - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: - DCHECK(codegen_->GetCompilerOptions().IsBootImage()); - DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); - __ LoadLiteral(out, - base_or_current_method_reg, - codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(), - cls->GetTypeIndex())); - break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: { DCHECK(codegen_->GetCompilerOptions().IsBootImage()); DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); @@ -7168,10 +7398,22 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF case HLoadClass::LoadKind::kBssEntry: { CodeGeneratorMIPS::PcRelativePatchInfo* info = codegen_->NewTypeBssEntryPatch(cls->GetDexFile(), cls->GetTypeIndex()); - bool reordering = __ SetReorder(false); - codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg); - GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678, read_barrier_option); - __ SetReorder(reordering); + constexpr bool non_baker_read_barrier = kUseReadBarrier && !kUseBakerReadBarrier; + if (isR6 || non_baker_read_barrier) { + bool reordering = __ SetReorder(false); + codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg); + GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678, read_barrier_option); + __ SetReorder(reordering); + } else { + // On R2 save the BSS entry address in a temporary register instead of + // recalculating it in the slow path. + Register temp = locations->GetTemp(0).AsRegister<Register>(); + bool reordering = __ SetReorder(false); + codegen_->EmitPcRelativeAddressPlaceholderHigh(info, temp, base_or_current_method_reg); + __ Addiu(temp, temp, /* placeholder */ 0x5678); + __ SetReorder(reordering); + GenerateGcRootFieldLoad(cls, out_loc, temp, /* offset */ 0, read_barrier_option); + } generate_null_check = true; break; } @@ -7186,7 +7428,7 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF __ SetReorder(reordering); break; } - case HLoadClass::LoadKind::kDexCacheViaMethod: + case HLoadClass::LoadKind::kRuntimeCall: case HLoadClass::LoadKind::kInvalid: LOG(FATAL) << "UNREACHABLE"; UNREACHABLE(); @@ -7235,28 +7477,44 @@ void LocationsBuilderMIPS::VisitLoadString(HLoadString* load) { LocationSummary::CallKind call_kind = CodeGenerator::GetLoadStringCallKind(load); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind); HLoadString::LoadKind load_kind = load->GetLoadKind(); + const bool isR6 = codegen_->GetInstructionSetFeatures().IsR6(); switch (load_kind) { // We need an extra register for PC-relative literals on R2. - case HLoadString::LoadKind::kBootImageLinkTimeAddress: case HLoadString::LoadKind::kBootImageAddress: case HLoadString::LoadKind::kBootImageLinkTimePcRelative: case HLoadString::LoadKind::kBssEntry: - if (codegen_->GetInstructionSetFeatures().IsR6()) { + if (isR6) { break; } FALLTHROUGH_INTENDED; // We need an extra register for PC-relative dex cache accesses. 
- case HLoadString::LoadKind::kDexCacheViaMethod: + case HLoadString::LoadKind::kRuntimeCall: locations->SetInAt(0, Location::RequiresRegister()); break; default: break; } - if (load_kind == HLoadString::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadString::LoadKind::kRuntimeCall) { InvokeRuntimeCallingConvention calling_convention; - locations->SetOut(calling_convention.GetReturnLocation(load->GetType())); + locations->SetOut(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } else { locations->SetOut(Location::RequiresRegister()); + if (load_kind == HLoadString::LoadKind::kBssEntry) { + if (!kUseReadBarrier || kUseBakerReadBarrier) { + // Rely on the pResolveString and marking to save everything we need. + // Request a temp to hold the BSS entry location for the slow path on R2 + // (no benefit for R6). + if (!isR6) { + locations->AddTemp(Location::RequiresRegister()); + } + RegisterSet caller_saves = RegisterSet::Empty(); + InvokeRuntimeCallingConvention calling_convention; + caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); + locations->SetCustomSlowPathCallerSaves(caller_saves); + } else { + // For non-Baker read barriers we have a temp-clobbering call. + } + } } } @@ -7271,7 +7529,6 @@ void InstructionCodeGeneratorMIPS::VisitLoadString(HLoadString* load) NO_THREAD_ bool isR6 = codegen_->GetInstructionSetFeatures().IsR6(); switch (load_kind) { // We need an extra register for PC-relative literals on R2. - case HLoadString::LoadKind::kBootImageLinkTimeAddress: case HLoadString::LoadKind::kBootImageAddress: case HLoadString::LoadKind::kBootImageLinkTimePcRelative: case HLoadString::LoadKind::kBssEntry: @@ -7283,13 +7540,6 @@ void InstructionCodeGeneratorMIPS::VisitLoadString(HLoadString* load) NO_THREAD_ } switch (load_kind) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: - DCHECK(codegen_->GetCompilerOptions().IsBootImage()); - __ LoadLiteral(out, - base_or_current_method_reg, - codegen_->DeduplicateBootImageStringLiteral(load->GetDexFile(), - load->GetStringIndex())); - return; // No dex cache slow path. case HLoadString::LoadKind::kBootImageLinkTimePcRelative: { DCHECK(codegen_->GetCompilerOptions().IsBootImage()); CodeGeneratorMIPS::PcRelativePatchInfo* info = @@ -7313,14 +7563,26 @@ void InstructionCodeGeneratorMIPS::VisitLoadString(HLoadString* load) NO_THREAD_ DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); CodeGeneratorMIPS::PcRelativePatchInfo* info = codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex()); - bool reordering = __ SetReorder(false); - codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg); - GenerateGcRootFieldLoad(load, - out_loc, - out, - /* placeholder */ 0x5678, - kCompilerReadBarrierOption); - __ SetReorder(reordering); + constexpr bool non_baker_read_barrier = kUseReadBarrier && !kUseBakerReadBarrier; + if (isR6 || non_baker_read_barrier) { + bool reordering = __ SetReorder(false); + codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg); + GenerateGcRootFieldLoad(load, + out_loc, + out, + /* placeholder */ 0x5678, + kCompilerReadBarrierOption); + __ SetReorder(reordering); + } else { + // On R2 save the BSS entry address in a temporary register instead of + // recalculating it in the slow path. 
+ Register temp = locations->GetTemp(0).AsRegister<Register>(); + bool reordering = __ SetReorder(false); + codegen_->EmitPcRelativeAddressPlaceholderHigh(info, temp, base_or_current_method_reg); + __ Addiu(temp, temp, /* placeholder */ 0x5678); + __ SetReorder(reordering); + GenerateGcRootFieldLoad(load, out_loc, temp, /* offset */ 0, kCompilerReadBarrierOption); + } SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS(load); codegen_->AddSlowPath(slow_path); __ Beqz(out, slow_path->GetEntryLabel()); @@ -7348,8 +7610,9 @@ void InstructionCodeGeneratorMIPS::VisitLoadString(HLoadString* load) NO_THREAD_ } // TODO: Re-add the compiler code to do string dex cache lookup again. - DCHECK(load_kind == HLoadString::LoadKind::kDexCacheViaMethod); + DCHECK(load_kind == HLoadString::LoadKind::kRuntimeCall); InvokeRuntimeCallingConvention calling_convention; + DCHECK_EQ(calling_convention.GetRegisterAt(0), out); __ LoadConst32(calling_convention.GetRegisterAt(0), load->GetStringIndex().index_); codegen_->InvokeRuntime(kQuickResolveString, load, load->GetDexPc()); CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>(); @@ -7774,6 +8037,15 @@ void InstructionCodeGeneratorMIPS::VisitRem(HRem* instruction) { } } +void LocationsBuilderMIPS::VisitConstructorFence(HConstructorFence* constructor_fence) { + constructor_fence->SetLocations(nullptr); +} + +void InstructionCodeGeneratorMIPS::VisitConstructorFence( + HConstructorFence* constructor_fence ATTRIBUTE_UNUSED) { + GenerateMemoryBarrier(MemBarrierKind::kStoreStore); +} + void LocationsBuilderMIPS::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) { memory_barrier->SetLocations(nullptr); } @@ -8093,6 +8365,23 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi } } else if (Primitive::IsIntegralType(result_type) && Primitive::IsFloatingPointType(input_type)) { CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong); + + // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum + // value of the output type if the input is outside of the range after the truncation or + // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct + // results. This matches the desired float/double-to-int/long conversion exactly. + // + // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive + // value when the input is either a NaN or is outside of the range of the output type + // after the truncation. IOW, the three special cases (NaN, too small, too big) produce + // the same result. + // + // The code takes care of the different behaviors by first comparing the input to the + // minimum output value (-2**63 for truncating to long, -2**31 for truncating to int). + // If the input is greater than or equal to the minimum, it proceeds to the truncate + // instruction, which will handle such an input the same way irrespective of NAN2008. + // Otherwise the input is compared to itself to determine whether it is a NaN or not + // in order to return either zero or the minimum value. if (result_type == Primitive::kPrimLong) { if (isR6) { // trunc.l.s/trunc.l.d requires MIPSR2+ with FR=1.
MIPS32R6 is implemented as a secondary @@ -8100,62 +8389,6 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi FRegister src = locations->InAt(0).AsFpuRegister<FRegister>(); Register dst_high = locations->Out().AsRegisterPairHigh<Register>(); Register dst_low = locations->Out().AsRegisterPairLow<Register>(); - MipsLabel truncate; - MipsLabel done; - - // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive - // value when the input is either a NaN or is outside of the range of the output type - // after the truncation. IOW, the three special cases (NaN, too small, too big) produce - // the same result. - // - // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum - // value of the output type if the input is outside of the range after the truncation or - // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct - // results. This matches the desired float/double-to-int/long conversion exactly. - // - // So, NAN2008 affects handling of negative values and NaNs by the truncate instruction. - // - // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate - // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6, - // even though it must be NAN2008=1 on R6. - // - // The code takes care of the different behaviors by first comparing the input to the - // minimum output value (-2**-63 for truncating to long, -2**-31 for truncating to int). - // If the input is greater than or equal to the minimum, it procedes to the truncate - // instruction, which will handle such an input the same way irrespective of NAN2008. - // Otherwise the input is compared to itself to determine whether it is a NaN or not - // in order to return either zero or the minimum value. - // - // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the - // truncate instruction for MIPS64R6. - if (input_type == Primitive::kPrimFloat) { - uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int64_t>::min()); - __ LoadConst32(TMP, min_val); - __ Mtc1(TMP, FTMP); - __ CmpLeS(FTMP, FTMP, src); - } else { - uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int64_t>::min()); - __ LoadConst32(TMP, High32Bits(min_val)); - __ Mtc1(ZERO, FTMP); - __ Mthc1(TMP, FTMP); - __ CmpLeD(FTMP, FTMP, src); - } - - __ Bc1nez(FTMP, &truncate); - - if (input_type == Primitive::kPrimFloat) { - __ CmpEqS(FTMP, src, src); - } else { - __ CmpEqD(FTMP, src, src); - } - __ Move(dst_low, ZERO); - __ LoadConst32(dst_high, std::numeric_limits<int32_t>::min()); - __ Mfc1(TMP, FTMP); - __ And(dst_high, dst_high, TMP); - - __ B(&done); - - __ Bind(&truncate); if (input_type == Primitive::kPrimFloat) { __ TruncLS(FTMP, src); @@ -8164,8 +8397,6 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi } __ Mfc1(dst_low, FTMP); __ Mfhc1(dst_high, FTMP); - - __ Bind(&done); } else { QuickEntrypointEnum entrypoint = (input_type == Primitive::kPrimFloat) ? kQuickF2l : kQuickD2l; @@ -8182,43 +8413,19 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi MipsLabel truncate; MipsLabel done; - // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate - // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6, - // even though it must be NAN2008=1 on R6. 
- // - // For details see the large comment above for the truncation of float/double to long on R6. - // - // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the - // truncate instruction for MIPS64R6. - if (input_type == Primitive::kPrimFloat) { - uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min()); - __ LoadConst32(TMP, min_val); - __ Mtc1(TMP, FTMP); - } else { - uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min()); - __ LoadConst32(TMP, High32Bits(min_val)); - __ Mtc1(ZERO, FTMP); - __ MoveToFpuHigh(TMP, FTMP); - } - - if (isR6) { + if (!isR6) { if (input_type == Primitive::kPrimFloat) { - __ CmpLeS(FTMP, FTMP, src); + uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min()); + __ LoadConst32(TMP, min_val); + __ Mtc1(TMP, FTMP); } else { - __ CmpLeD(FTMP, FTMP, src); + uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min()); + __ LoadConst32(TMP, High32Bits(min_val)); + __ Mtc1(ZERO, FTMP); + __ MoveToFpuHigh(TMP, FTMP); } - __ Bc1nez(FTMP, &truncate); if (input_type == Primitive::kPrimFloat) { - __ CmpEqS(FTMP, src, src); - } else { - __ CmpEqD(FTMP, src, src); - } - __ LoadConst32(dst, std::numeric_limits<int32_t>::min()); - __ Mfc1(TMP, FTMP); - __ And(dst, dst, TMP); - } else { - if (input_type == Primitive::kPrimFloat) { __ ColeS(0, FTMP, src); } else { __ ColeD(0, FTMP, src); @@ -8232,11 +8439,11 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi } __ LoadConst32(dst, std::numeric_limits<int32_t>::min()); __ Movf(dst, ZERO, 0); - } - __ B(&done); + __ B(&done); - __ Bind(&truncate); + __ Bind(&truncate); + } if (input_type == Primitive::kPrimFloat) { __ TruncWS(FTMP, src); @@ -8245,7 +8452,9 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi } __ Mfc1(dst, FTMP); - __ Bind(&done); + if (!isR6) { + __ Bind(&done); + } } } else if (Primitive::IsFloatingPointType(result_type) && Primitive::IsFloatingPointType(input_type)) { diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h index 3875c4bdba..736b5070d9 100644 --- a/compiler/optimizing/code_generator_mips.h +++ b/compiler/optimizing/code_generator_mips.h @@ -23,8 +23,8 @@ #include "nodes.h" #include "parallel_move_resolver.h" #include "string_reference.h" +#include "type_reference.h" #include "utils/mips/assembler_mips.h" -#include "utils/type_reference.h" namespace art { namespace mips { @@ -229,9 +229,10 @@ class InstructionCodeGeneratorMIPS : public InstructionCodeGenerator { // We switch to the table-based method starting with 7 cases. 
static constexpr uint32_t kPackedSwitchJumpTableThreshold = 6; + void GenerateMemoryBarrier(MemBarrierKind kind); + private: void GenerateClassInitializationCheck(SlowPathCodeMIPS* slow_path, Register class_reg); - void GenerateMemoryBarrier(MemBarrierKind kind); void GenerateSuspendCheck(HSuspendCheck* check, HBasicBlock* successor); void HandleBinaryOp(HBinaryOperation* operation); void HandleCondition(HCondition* instruction); @@ -294,6 +295,7 @@ class InstructionCodeGeneratorMIPS : public InstructionCodeGenerator { void GenerateIntCompareAndBranch(IfCondition cond, LocationSummary* locations, MipsLabel* label); + void GenerateLongCompare(IfCondition cond, LocationSummary* locations); void GenerateLongCompareAndBranch(IfCondition cond, LocationSummary* locations, MipsLabel* label); @@ -580,15 +582,13 @@ class CodeGeneratorMIPS : public CodeGenerator { MipsLabel pc_rel_label; }; - PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file, - dex::StringIndex string_index); + PcRelativePatchInfo* NewPcRelativeMethodPatch(MethodReference target_method); PcRelativePatchInfo* NewPcRelativeTypePatch(const DexFile& dex_file, dex::TypeIndex type_index); PcRelativePatchInfo* NewTypeBssEntryPatch(const DexFile& dex_file, dex::TypeIndex type_index); + PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file, + dex::StringIndex string_index); PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, uint32_t element_offset); - Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file, - dex::StringIndex string_index); - Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, dex::TypeIndex type_index); Literal* DeduplicateBootImageAddressLiteral(uint32_t address); void EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info, Register out, Register base); @@ -622,16 +622,8 @@ class CodeGeneratorMIPS : public CodeGenerator { Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, Register temp); using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, Literal*>; - using MethodToLiteralMap = ArenaSafeMap<MethodReference, Literal*, MethodReferenceComparator>; - using BootStringToLiteralMap = ArenaSafeMap<StringReference, - Literal*, - StringReferenceValueComparator>; - using BootTypeToLiteralMap = ArenaSafeMap<TypeReference, - Literal*, - TypeReferenceValueComparator>; Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map); - Literal* DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map); PcRelativePatchInfo* NewPcRelativePatch(const DexFile& dex_file, uint32_t offset_or_index, ArenaDeque<PcRelativePatchInfo>* patches); @@ -653,16 +645,15 @@ class CodeGeneratorMIPS : public CodeGenerator { Uint32ToLiteralMap uint32_literals_; // PC-relative patch info for each HMipsDexCacheArraysBase. ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_; - // Deduplication map for boot string literals for kBootImageLinkTimeAddress. - BootStringToLiteralMap boot_image_string_patches_; - // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC). - ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_; - // Deduplication map for boot type literals for kBootImageLinkTimeAddress. - BootTypeToLiteralMap boot_image_type_patches_; + // PC-relative method patch info for kBootImageLinkTimePcRelative. + ArenaDeque<PcRelativePatchInfo> pc_relative_method_patches_; // PC-relative type patch info for kBootImageLinkTimePcRelative. 
ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; + // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC). + ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_; + // Patches for string root accesses in JIT compiled code. ArenaDeque<JitPatchInfo> jit_string_patches_; // Patches for class root accesses in JIT compiled code. diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index 0459a033f8..6026814f04 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -141,7 +141,8 @@ class BoundsCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { class DivZeroCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { public: - explicit DivZeroCheckSlowPathMIPS64(HDivZeroCheck* instruction) : SlowPathCodeMIPS64(instruction) {} + explicit DivZeroCheckSlowPathMIPS64(HDivZeroCheck* instruction) + : SlowPathCodeMIPS64(instruction) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); @@ -192,7 +193,9 @@ class LoadClassSlowPathMIPS64 : public SlowPathCodeMIPS64 { if (out.IsValid()) { DCHECK(out.IsRegister() && !locations->GetLiveRegisters()->ContainsCoreRegister(out.reg())); Primitive::Type type = instruction_->GetType(); - mips64_codegen->MoveLocation(out, calling_convention.GetReturnLocation(type), type); + mips64_codegen->MoveLocation(out, + Location::RegisterLocation(calling_convention.GetRegisterAt(0)), + type); } RestoreLiveRegisters(codegen, locations); @@ -200,10 +203,6 @@ class LoadClassSlowPathMIPS64 : public SlowPathCodeMIPS64 { DCHECK_EQ(instruction_->IsLoadClass(), cls_ == instruction_); if (cls_ == instruction_ && cls_->GetLoadKind() == HLoadClass::LoadKind::kBssEntry) { DCHECK(out.IsValid()); - // TODO: Change art_quick_initialize_type/art_quick_initialize_static_storage to - // kSaveEverything and use a temporary for the .bss entry address in the fast path, - // so that we can avoid another calculation here. - DCHECK_NE(out.AsRegister<GpuRegister>(), AT); CodeGeneratorMIPS64::PcRelativePatchInfo* info = mips64_codegen->NewTypeBssEntryPatch(cls_->GetDexFile(), type_index); mips64_codegen->EmitPcRelativeAddressPlaceholderHigh(info, AT); @@ -250,16 +249,13 @@ class LoadStringSlowPathMIPS64 : public SlowPathCodeMIPS64 { CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>(); Primitive::Type type = instruction_->GetType(); mips64_codegen->MoveLocation(locations->Out(), - calling_convention.GetReturnLocation(type), + Location::RegisterLocation(calling_convention.GetRegisterAt(0)), type); RestoreLiveRegisters(codegen, locations); // Store the resolved String to the BSS entry. - // TODO: Change art_quick_resolve_string to kSaveEverything and use a temporary for the - // .bss entry address in the fast path, so that we can avoid another calculation here. 
GpuRegister out = locations->Out().AsRegister<GpuRegister>(); - DCHECK_NE(out, AT); CodeGeneratorMIPS64::PcRelativePatchInfo* info = mips64_codegen->NewPcRelativeStringPatch(load->GetDexFile(), string_index); mips64_codegen->EmitPcRelativeAddressPlaceholderHigh(info, AT); @@ -306,10 +302,13 @@ class SuspendCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { : SlowPathCodeMIPS64(instruction), successor_(successor) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); // Only saves live vector registers for SIMD. mips64_codegen->InvokeRuntime(kQuickTestSuspend, instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); + RestoreLiveRegisters(codegen, locations); // Only restores live vector registers for SIMD. if (successor_ == nullptr) { __ Bc(GetReturnLabel()); } else { @@ -952,20 +951,17 @@ CodeGeneratorMIPS64::CodeGeneratorMIPS64(HGraph* graph, location_builder_(graph, this), instruction_visitor_(graph, this), move_resolver_(graph->GetArena(), this), - assembler_(graph->GetArena()), + assembler_(graph->GetArena(), &isa_features), isa_features_(isa_features), uint32_literals_(std::less<uint32_t>(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), uint64_literals_(std::less<uint64_t>(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_string_patches_(StringReferenceValueComparator(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_type_patches_(TypeReferenceValueComparator(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + pc_relative_method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(StringReferenceValueComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_class_patches_(TypeReferenceValueComparator(), @@ -1445,50 +1441,36 @@ void CodeGeneratorMIPS64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_pat DCHECK(linker_patches->empty()); size_t size = pc_relative_dex_cache_patches_.size() + - pc_relative_string_patches_.size() + + pc_relative_method_patches_.size() + pc_relative_type_patches_.size() + type_bss_entry_patches_.size() + - boot_image_string_patches_.size() + - boot_image_type_patches_.size(); + pc_relative_string_patches_.size(); linker_patches->reserve(size); EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_, linker_patches); - if (!GetCompilerOptions().IsBootImage()) { - DCHECK(pc_relative_type_patches_.empty()); - EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_, + if (GetCompilerOptions().IsBootImage()) { + EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(pc_relative_method_patches_, linker_patches); - } else { EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(pc_relative_type_patches_, linker_patches); EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(pc_relative_string_patches_, 
linker_patches); + } else { + DCHECK(pc_relative_method_patches_.empty()); + DCHECK(pc_relative_type_patches_.empty()); + EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(pc_relative_string_patches_, + linker_patches); } EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_, linker_patches); - for (const auto& entry : boot_image_string_patches_) { - const StringReference& target_string = entry.first; - Literal* literal = entry.second; - DCHECK(literal->GetLabel()->IsBound()); - uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel()); - linker_patches->push_back(LinkerPatch::StringPatch(literal_offset, - target_string.dex_file, - target_string.string_index.index_)); - } - for (const auto& entry : boot_image_type_patches_) { - const TypeReference& target_type = entry.first; - Literal* literal = entry.second; - DCHECK(literal->GetLabel()->IsBound()); - uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel()); - linker_patches->push_back(LinkerPatch::TypePatch(literal_offset, - target_type.dex_file, - target_type.type_index.index_)); - } DCHECK_EQ(size, linker_patches->size()); } -CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewPcRelativeStringPatch( - const DexFile& dex_file, dex::StringIndex string_index) { - return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_); +CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewPcRelativeMethodPatch( + MethodReference target_method) { + return NewPcRelativePatch(*target_method.dex_file, + target_method.dex_method_index, + &pc_relative_method_patches_); } CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewPcRelativeTypePatch( @@ -1501,6 +1483,11 @@ CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewTypeBssEntryPa return NewPcRelativePatch(dex_file, type_index.index_, &type_bss_entry_patches_); } +CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewPcRelativeStringPatch( + const DexFile& dex_file, dex::StringIndex string_index) { + return NewPcRelativePatch(dex_file, string_index.index_, &pc_relative_string_patches_); +} + CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewPcRelativeDexCacheArrayPatch( const DexFile& dex_file, uint32_t element_offset) { return NewPcRelativePatch(dex_file, element_offset, &pc_relative_dex_cache_patches_); @@ -1524,27 +1511,6 @@ Literal* CodeGeneratorMIPS64::DeduplicateUint64Literal(uint64_t value) { [this, value]() { return __ NewLiteral<uint64_t>(value); }); } -Literal* CodeGeneratorMIPS64::DeduplicateMethodLiteral(MethodReference target_method, - MethodToLiteralMap* map) { - return map->GetOrCreate( - target_method, - [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); }); -} - -Literal* CodeGeneratorMIPS64::DeduplicateBootImageStringLiteral(const DexFile& dex_file, - dex::StringIndex string_index) { - return boot_image_string_patches_.GetOrCreate( - StringReference(&dex_file, string_index), - [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); }); -} - -Literal* CodeGeneratorMIPS64::DeduplicateBootImageTypeLiteral(const DexFile& dex_file, - dex::TypeIndex type_index) { - return boot_image_type_patches_.GetOrCreate( - TypeReference(&dex_file, type_index), - [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); }); -} - Literal* CodeGeneratorMIPS64::DeduplicateBootImageAddressLiteral(uint64_t address) { return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_); } @@ 
-1590,14 +1556,20 @@ void CodeGeneratorMIPS64::PatchJitRootUse(uint8_t* code, void CodeGeneratorMIPS64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } } @@ -1645,13 +1617,19 @@ size_t CodeGeneratorMIPS64::RestoreCoreRegister(size_t stack_index, uint32_t reg } size_t CodeGeneratorMIPS64::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) { - __ StoreFpuToOffset(kStoreDoubleword, FpuRegister(reg_id), SP, stack_index); - return kMips64DoublewordSize; + __ StoreFpuToOffset(GetGraph()->HasSIMD() ? kStoreQuadword : kStoreDoubleword, + FpuRegister(reg_id), + SP, + stack_index); + return GetFloatingPointSpillSlotSize(); } size_t CodeGeneratorMIPS64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) { - __ LoadFpuFromOffset(kLoadDoubleword, FpuRegister(reg_id), SP, stack_index); - return kMips64DoublewordSize; + __ LoadFpuFromOffset(GetGraph()->HasSIMD() ? kLoadQuadword : kLoadDoubleword, + FpuRegister(reg_id), + SP, + stack_index); + return GetFloatingPointSpillSlotSize(); } void CodeGeneratorMIPS64::DumpCoreRegister(std::ostream& stream, int reg) const { @@ -1991,6 +1969,9 @@ void LocationsBuilderMIPS64::VisitArrayGet(HArrayGet* instruction) { object_array_get_with_read_barrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall); + if (object_array_get_with_read_barrier && kUseBakerReadBarrier) { + locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + } locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); if (Primitive::IsFloatingPointType(type)) { @@ -3990,6 +3971,9 @@ void LocationsBuilderMIPS64::HandleFieldGet(HInstruction* instruction, object_field_get_with_read_barrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall); + if (object_field_get_with_read_barrier && kUseBakerReadBarrier) { + locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. 
+ } locations->SetInAt(0, Location::RequiresRegister()); if (Primitive::IsFloatingPointType(instruction->GetType())) { locations->SetOut(Location::RequiresFpuRegister()); @@ -4552,6 +4536,7 @@ void CodeGeneratorMIPS64::GenerateReadBarrierForRootSlow(HInstruction* instructi void LocationsBuilderMIPS64::VisitInstanceOf(HInstanceOf* instruction) { LocationSummary::CallKind call_kind = LocationSummary::kNoCall; TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); + bool baker_read_barrier_slow_path = false; switch (type_check_kind) { case TypeCheckKind::kExactCheck: case TypeCheckKind::kAbstractClassCheck: @@ -4559,6 +4544,7 @@ void LocationsBuilderMIPS64::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kArrayObjectCheck: call_kind = kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; + baker_read_barrier_slow_path = kUseBakerReadBarrier; break; case TypeCheckKind::kArrayCheck: case TypeCheckKind::kUnresolvedCheck: @@ -4568,6 +4554,9 @@ void LocationsBuilderMIPS64::VisitInstanceOf(HInstanceOf* instruction) { } LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind); + if (baker_read_barrier_slow_path) { + locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + } locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RequiresRegister()); // The output does overlap inputs. @@ -4876,25 +4865,19 @@ HLoadString::LoadKind CodeGeneratorMIPS64::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { bool fallback_load = false; switch (desired_string_load_kind) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: - DCHECK(!GetCompilerOptions().GetCompilePic()); - break; case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - DCHECK(GetCompilerOptions().GetCompilePic()); - break; - case HLoadString::LoadKind::kBootImageAddress: - break; case HLoadString::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; - case HLoadString::LoadKind::kDexCacheViaMethod: - break; case HLoadString::LoadKind::kJitTableAddress: DCHECK(Runtime::Current()->UseJitCompilation()); break; + case HLoadString::LoadKind::kBootImageAddress: + case HLoadString::LoadKind::kRuntimeCall: + break; } if (fallback_load) { - desired_string_load_kind = HLoadString::LoadKind::kDexCacheViaMethod; + desired_string_load_kind = HLoadString::LoadKind::kRuntimeCall; } return desired_string_load_kind; } @@ -4908,25 +4891,19 @@ HLoadClass::LoadKind CodeGeneratorMIPS64::GetSupportedLoadClassKind( UNREACHABLE(); case HLoadClass::LoadKind::kReferrersClass: break; - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: - DCHECK(!GetCompilerOptions().GetCompilePic()); - break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: - DCHECK(GetCompilerOptions().GetCompilePic()); - break; - case HLoadClass::LoadKind::kBootImageAddress: - break; case HLoadClass::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; case HLoadClass::LoadKind::kJitTableAddress: DCHECK(Runtime::Current()->UseJitCompilation()); break; - case HLoadClass::LoadKind::kDexCacheViaMethod: + case HLoadClass::LoadKind::kBootImageAddress: + case HLoadClass::LoadKind::kRuntimeCall: break; } if (fallback_load) { - desired_class_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod; + desired_class_load_kind = HLoadClass::LoadKind::kRuntimeCall; } return desired_class_load_kind; } @@ -4958,6 +4935,14 @@ void 
CodeGeneratorMIPS64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invo case HInvokeStaticOrDirect::MethodLoadKind::kRecursive: callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex()); break; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: { + DCHECK(GetCompilerOptions().IsBootImage()); + CodeGeneratorMIPS64::PcRelativePatchInfo* info = + NewPcRelativeMethodPatch(invoke->GetTargetMethod()); + EmitPcRelativeAddressPlaceholderHigh(info, AT); + __ Daddiu(temp.AsRegister<GpuRegister>(), AT, /* placeholder */ 0x5678); + break; + } case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: __ LoadLiteral(temp.AsRegister<GpuRegister>(), kLoadDoubleword, @@ -5083,12 +5068,10 @@ void InstructionCodeGeneratorMIPS64::VisitInvokeVirtual(HInvokeVirtual* invoke) void LocationsBuilderMIPS64::VisitLoadClass(HLoadClass* cls) { HLoadClass::LoadKind load_kind = cls->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadClass::LoadKind::kRuntimeCall) { InvokeRuntimeCallingConvention calling_convention; - CodeGenerator::CreateLoadClassRuntimeCallLocationSummary( - cls, - Location::RegisterLocation(calling_convention.GetRegisterAt(0)), - calling_convention.GetReturnLocation(Primitive::kPrimNot)); + Location loc = Location::RegisterLocation(calling_convention.GetRegisterAt(0)); + CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(cls, loc, loc); return; } DCHECK(!cls->NeedsAccessCheck()); @@ -5098,17 +5081,31 @@ void LocationsBuilderMIPS64::VisitLoadClass(HLoadClass* cls) { ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cls, call_kind); + if (kUseBakerReadBarrier && requires_read_barrier && !cls->NeedsEnvironment()) { + locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + } if (load_kind == HLoadClass::LoadKind::kReferrersClass) { locations->SetInAt(0, Location::RequiresRegister()); } locations->SetOut(Location::RequiresRegister()); + if (load_kind == HLoadClass::LoadKind::kBssEntry) { + if (!kUseReadBarrier || kUseBakerReadBarrier) { + // Rely on the type resolution or initialization and marking to save everything we need. + RegisterSet caller_saves = RegisterSet::Empty(); + InvokeRuntimeCallingConvention calling_convention; + caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); + locations->SetCustomSlowPathCallerSaves(caller_saves); + } else { + // For non-Baker read barrier we have a temp-clobbering call. + } + } } // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not // move. 
void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFETY_ANALYSIS { HLoadClass::LoadKind load_kind = cls->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadClass::LoadKind::kRuntimeCall) { codegen_->GenerateLoadClassRuntimeCall(cls); return; } @@ -5119,7 +5116,7 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S GpuRegister out = out_loc.AsRegister<GpuRegister>(); GpuRegister current_method_reg = ZERO; if (load_kind == HLoadClass::LoadKind::kReferrersClass || - load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + load_kind == HLoadClass::LoadKind::kRuntimeCall) { current_method_reg = locations->InAt(0).AsRegister<GpuRegister>(); } @@ -5138,14 +5135,6 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S ArtMethod::DeclaringClassOffset().Int32Value(), read_barrier_option); break; - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: - DCHECK(codegen_->GetCompilerOptions().IsBootImage()); - DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); - __ LoadLiteral(out, - kLoadUnsignedWord, - codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(), - cls->GetTypeIndex())); - break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: { DCHECK(codegen_->GetCompilerOptions().IsBootImage()); DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); @@ -5181,7 +5170,7 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S cls->GetClass())); GenerateGcRootFieldLoad(cls, out_loc, out, 0, read_barrier_option); break; - case HLoadClass::LoadKind::kDexCacheViaMethod: + case HLoadClass::LoadKind::kRuntimeCall: case HLoadClass::LoadKind::kInvalid: LOG(FATAL) << "UNREACHABLE"; UNREACHABLE(); @@ -5230,11 +5219,22 @@ void LocationsBuilderMIPS64::VisitLoadString(HLoadString* load) { HLoadString::LoadKind load_kind = load->GetLoadKind(); LocationSummary::CallKind call_kind = CodeGenerator::GetLoadStringCallKind(load); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind); - if (load_kind == HLoadString::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadString::LoadKind::kRuntimeCall) { InvokeRuntimeCallingConvention calling_convention; - locations->SetOut(calling_convention.GetReturnLocation(load->GetType())); + locations->SetOut(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } else { locations->SetOut(Location::RequiresRegister()); + if (load_kind == HLoadString::LoadKind::kBssEntry) { + if (!kUseReadBarrier || kUseBakerReadBarrier) { + // Rely on the pResolveString and marking to save everything we need. + RegisterSet caller_saves = RegisterSet::Empty(); + InvokeRuntimeCallingConvention calling_convention; + caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); + locations->SetCustomSlowPathCallerSaves(caller_saves); + } else { + // For non-Baker read barrier we have a temp-clobbering call. + } + } } } @@ -5247,13 +5247,6 @@ void InstructionCodeGeneratorMIPS64::VisitLoadString(HLoadString* load) NO_THREA GpuRegister out = out_loc.AsRegister<GpuRegister>(); switch (load_kind) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: - DCHECK(codegen_->GetCompilerOptions().IsBootImage()); - __ LoadLiteral(out, - kLoadUnsignedWord, - codegen_->DeduplicateBootImageStringLiteral(load->GetDexFile(), - load->GetStringIndex())); - return; // No dex cache slow path. 
case HLoadString::LoadKind::kBootImageLinkTimePcRelative: { DCHECK(codegen_->GetCompilerOptions().IsBootImage()); CodeGeneratorMIPS64::PcRelativePatchInfo* info = @@ -5300,8 +5293,9 @@ void InstructionCodeGeneratorMIPS64::VisitLoadString(HLoadString* load) NO_THREA } // TODO: Re-add the compiler code to do string dex cache lookup again. - DCHECK(load_kind == HLoadString::LoadKind::kDexCacheViaMethod); + DCHECK(load_kind == HLoadString::LoadKind::kRuntimeCall); InvokeRuntimeCallingConvention calling_convention; + DCHECK_EQ(calling_convention.GetRegisterAt(0), out); __ LoadConst32(calling_convention.GetRegisterAt(0), load->GetStringIndex().index_); codegen_->InvokeRuntime(kQuickResolveString, load, load->GetDexPc()); CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>(); @@ -5661,6 +5655,15 @@ void InstructionCodeGeneratorMIPS64::VisitRem(HRem* instruction) { } } +void LocationsBuilderMIPS64::VisitConstructorFence(HConstructorFence* constructor_fence) { + constructor_fence->SetLocations(nullptr); +} + +void InstructionCodeGeneratorMIPS64::VisitConstructorFence( + HConstructorFence* constructor_fence ATTRIBUTE_UNUSED) { + GenerateMemoryBarrier(MemBarrierKind::kStoreStore); +} + void LocationsBuilderMIPS64::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) { memory_barrier->SetLocations(nullptr); } @@ -5806,7 +5809,11 @@ void InstructionCodeGeneratorMIPS64::VisitUnresolvedStaticFieldSet( void LocationsBuilderMIPS64::VisitSuspendCheck(HSuspendCheck* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnSlowPath); - locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + // In suspend check slow path, usually there are no caller-save registers at all. + // If SIMD instructions are present, however, we force spilling all live SIMD + // registers in full width (since the runtime only saves/restores lower part). + locations->SetCustomSlowPathCallerSaves( + GetGraph()->HasSIMD() ? RegisterSet::AllFpu() : RegisterSet::Empty()); } void InstructionCodeGeneratorMIPS64::VisitSuspendCheck(HSuspendCheck* instruction) { @@ -5933,68 +5940,6 @@ void InstructionCodeGeneratorMIPS64::VisitTypeConversion(HTypeConversion* conver CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong); GpuRegister dst = locations->Out().AsRegister<GpuRegister>(); FpuRegister src = locations->InAt(0).AsFpuRegister<FpuRegister>(); - Mips64Label truncate; - Mips64Label done; - - // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive - // value when the input is either a NaN or is outside of the range of the output type - // after the truncation. IOW, the three special cases (NaN, too small, too big) produce - // the same result. - // - // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum - // value of the output type if the input is outside of the range after the truncation or - // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct - // results. This matches the desired float/double-to-int/long conversion exactly. - // - // So, NAN2008 affects handling of negative values and NaNs by the truncate instruction. - // - // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate - // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6, - // even though it must be NAN2008=1 on R6. 
- // - // The code takes care of the different behaviors by first comparing the input to the - // minimum output value (-2**-63 for truncating to long, -2**-31 for truncating to int). - // If the input is greater than or equal to the minimum, it procedes to the truncate - // instruction, which will handle such an input the same way irrespective of NAN2008. - // Otherwise the input is compared to itself to determine whether it is a NaN or not - // in order to return either zero or the minimum value. - // - // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the - // truncate instruction for MIPS64R6. - if (input_type == Primitive::kPrimFloat) { - uint32_t min_val = (result_type == Primitive::kPrimLong) - ? bit_cast<uint32_t, float>(std::numeric_limits<int64_t>::min()) - : bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min()); - __ LoadConst32(TMP, min_val); - __ Mtc1(TMP, FTMP); - __ CmpLeS(FTMP, FTMP, src); - } else { - uint64_t min_val = (result_type == Primitive::kPrimLong) - ? bit_cast<uint64_t, double>(std::numeric_limits<int64_t>::min()) - : bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min()); - __ LoadConst64(TMP, min_val); - __ Dmtc1(TMP, FTMP); - __ CmpLeD(FTMP, FTMP, src); - } - - __ Bc1nez(FTMP, &truncate); - - if (input_type == Primitive::kPrimFloat) { - __ CmpEqS(FTMP, src, src); - } else { - __ CmpEqD(FTMP, src, src); - } - if (result_type == Primitive::kPrimLong) { - __ LoadConst64(dst, std::numeric_limits<int64_t>::min()); - } else { - __ LoadConst32(dst, std::numeric_limits<int32_t>::min()); - } - __ Mfc1(TMP, FTMP); - __ And(dst, dst, TMP); - - __ Bc(&done); - - __ Bind(&truncate); if (result_type == Primitive::kPrimLong) { if (input_type == Primitive::kPrimFloat) { @@ -6011,8 +5956,6 @@ void InstructionCodeGeneratorMIPS64::VisitTypeConversion(HTypeConversion* conver } __ Mfc1(dst, FTMP); } - - __ Bind(&done); } else if (Primitive::IsFloatingPointType(result_type) && Primitive::IsFloatingPointType(input_type)) { FpuRegister dst = locations->Out().AsFpuRegister<FpuRegister>(); diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h index fd1a174608..9c6b6f62cb 100644 --- a/compiler/optimizing/code_generator_mips64.h +++ b/compiler/optimizing/code_generator_mips64.h @@ -21,8 +21,8 @@ #include "driver/compiler_options.h" #include "nodes.h" #include "parallel_move_resolver.h" +#include "type_reference.h" #include "utils/mips64/assembler_mips64.h" -#include "utils/type_reference.h" namespace art { namespace mips64 { @@ -226,9 +226,10 @@ class InstructionCodeGeneratorMIPS64 : public InstructionCodeGenerator { // We switch to the table-based method starting with 7 cases. 
static constexpr uint32_t kPackedSwitchJumpTableThreshold = 6; + void GenerateMemoryBarrier(MemBarrierKind kind); + private: void GenerateClassInitializationCheck(SlowPathCodeMIPS64* slow_path, GpuRegister class_reg); - void GenerateMemoryBarrier(MemBarrierKind kind); void GenerateSuspendCheck(HSuspendCheck* check, HBasicBlock* successor); void HandleBinaryOp(HBinaryOperation* operation); void HandleCondition(HCondition* instruction); @@ -313,6 +314,9 @@ class InstructionCodeGeneratorMIPS64 : public InstructionCodeGenerator { uint32_t num_entries, HBasicBlock* switch_block, HBasicBlock* default_block); + int32_t VecAddress(LocationSummary* locations, + size_t size, + /* out */ GpuRegister* adjusted_base); Mips64Assembler* const assembler_; CodeGeneratorMIPS64* const codegen_; @@ -335,7 +339,11 @@ class CodeGeneratorMIPS64 : public CodeGenerator { size_t GetWordSize() const OVERRIDE { return kMips64DoublewordSize; } - size_t GetFloatingPointSpillSlotSize() const OVERRIDE { return kMips64DoublewordSize; } + size_t GetFloatingPointSpillSlotSize() const OVERRIDE { + return GetGraph()->HasSIMD() + ? 2 * kMips64DoublewordSize // 16 bytes for each spill. + : 1 * kMips64DoublewordSize; // 8 bytes for each spill. + } uintptr_t GetAddressOf(HBasicBlock* block) OVERRIDE { return assembler_.GetLabelLocation(GetLabelOf(block)); @@ -540,17 +548,15 @@ class CodeGeneratorMIPS64 : public CodeGenerator { Mips64Label pc_rel_label; }; - PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file, - dex::StringIndex string_index); + PcRelativePatchInfo* NewPcRelativeMethodPatch(MethodReference target_method); PcRelativePatchInfo* NewPcRelativeTypePatch(const DexFile& dex_file, dex::TypeIndex type_index); PcRelativePatchInfo* NewTypeBssEntryPatch(const DexFile& dex_file, dex::TypeIndex type_index); + PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file, + dex::StringIndex string_index); PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, uint32_t element_offset); PcRelativePatchInfo* NewPcRelativeCallPatch(const DexFile& dex_file, uint32_t method_index); - Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file, - dex::StringIndex string_index); - Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, dex::TypeIndex type_index); Literal* DeduplicateBootImageAddressLiteral(uint64_t address); void EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info, GpuRegister out); @@ -569,23 +575,15 @@ class CodeGeneratorMIPS64 : public CodeGenerator { private: using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, Literal*>; using Uint64ToLiteralMap = ArenaSafeMap<uint64_t, Literal*>; - using MethodToLiteralMap = ArenaSafeMap<MethodReference, Literal*, MethodReferenceComparator>; using StringToLiteralMap = ArenaSafeMap<StringReference, Literal*, StringReferenceValueComparator>; using TypeToLiteralMap = ArenaSafeMap<TypeReference, Literal*, TypeReferenceValueComparator>; - using BootStringToLiteralMap = ArenaSafeMap<StringReference, - Literal*, - StringReferenceValueComparator>; - using BootTypeToLiteralMap = ArenaSafeMap<TypeReference, - Literal*, - TypeReferenceValueComparator>; Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map); Literal* DeduplicateUint64Literal(uint64_t value); - Literal* DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map); PcRelativePatchInfo* NewPcRelativePatch(const DexFile& dex_file, uint32_t offset_or_index, @@ -611,16 +609,15 @@ class CodeGeneratorMIPS64 : public 
CodeGenerator { Uint64ToLiteralMap uint64_literals_; // PC-relative patch info. ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_; - // Deduplication map for boot string literals for kBootImageLinkTimeAddress. - BootStringToLiteralMap boot_image_string_patches_; - // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC). - ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_; - // Deduplication map for boot type literals for kBootImageLinkTimeAddress. - BootTypeToLiteralMap boot_image_type_patches_; + // PC-relative method patch info for kBootImageLinkTimePcRelative. + ArenaDeque<PcRelativePatchInfo> pc_relative_method_patches_; // PC-relative type patch info for kBootImageLinkTimePcRelative. ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; + // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC). + ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_; + // Patches for string root accesses in JIT compiled code. StringToLiteralMap jit_string_patches_; // Patches for class root accesses in JIT compiled code. diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc index 93befa439c..a41adca02c 100644 --- a/compiler/optimizing/code_generator_vector_arm64.cc +++ b/compiler/optimizing/code_generator_vector_arm64.cc @@ -22,6 +22,7 @@ using namespace vixl::aarch64; // NOLINT(build/namespaces) namespace art { namespace arm64 { +using helpers::DRegisterFrom; using helpers::VRegisterFrom; using helpers::HeapOperand; using helpers::InputRegisterAt; @@ -467,7 +468,50 @@ void LocationsBuilderARM64::VisitVecMin(HVecMin* instruction) { } void InstructionCodeGeneratorARM64::VisitVecMin(HVecMin* instruction) { - LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ Umin(dst.V16B(), lhs.V16B(), rhs.V16B()); + } else { + __ Smin(dst.V16B(), lhs.V16B(), rhs.V16B()); + } + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ Umin(dst.V8H(), lhs.V8H(), rhs.V8H()); + } else { + __ Smin(dst.V8H(), lhs.V8H(), rhs.V8H()); + } + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ Umin(dst.V4S(), lhs.V4S(), rhs.V4S()); + } else { + __ Smin(dst.V4S(), lhs.V4S(), rhs.V4S()); + } + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + DCHECK(!instruction->IsUnsigned()); + __ Fmin(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + DCHECK(!instruction->IsUnsigned()); + __ Fmin(dst.V2D(), lhs.V2D(), rhs.V2D()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderARM64::VisitVecMax(HVecMax* instruction) { @@ -475,7 +519,50 @@ void LocationsBuilderARM64::VisitVecMax(HVecMax* instruction) { } void 
InstructionCodeGeneratorARM64::VisitVecMax(HVecMax* instruction) { - LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ Umax(dst.V16B(), lhs.V16B(), rhs.V16B()); + } else { + __ Smax(dst.V16B(), lhs.V16B(), rhs.V16B()); + } + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ Umax(dst.V8H(), lhs.V8H(), rhs.V8H()); + } else { + __ Smax(dst.V8H(), lhs.V8H(), rhs.V8H()); + } + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ Umax(dst.V4S(), lhs.V4S(), rhs.V4S()); + } else { + __ Smax(dst.V4S(), lhs.V4S(), rhs.V4S()); + } + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + DCHECK(!instruction->IsUnsigned()); + __ Fmax(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + DCHECK(!instruction->IsUnsigned()); + __ Fmax(dst.V2D(), lhs.V2D(), rhs.V2D()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderARM64::VisitVecAnd(HVecAnd* instruction) { @@ -771,20 +858,28 @@ static void CreateVecMemLocations(ArenaAllocator* arena, } } -// Helper to set up registers and address for vector memory operations. -MemOperand InstructionCodeGeneratorARM64::CreateVecMemRegisters( +// Helper to set up locations for vector memory operations. Returns the memory operand and, +// if used, sets the output parameter scratch to a temporary register used in this operand, +// so that the client can release it right after the memory operand use. +MemOperand InstructionCodeGeneratorARM64::VecAddress( HVecMemoryOperation* instruction, - Location* reg_loc, - bool is_load, - UseScratchRegisterScope* temps_scope) { + UseScratchRegisterScope* temps_scope, + size_t size, + bool is_string_char_at, + /*out*/ Register* scratch) { LocationSummary* locations = instruction->GetLocations(); Register base = InputRegisterAt(instruction, 0); - Location index = locations->InAt(1); - *reg_loc = is_load ? locations->Out() : locations->InAt(2); - Primitive::Type packed_type = instruction->GetPackedType(); - uint32_t offset = mirror::Array::DataOffset(Primitive::ComponentSize(packed_type)).Uint32Value(); - size_t shift = Primitive::ComponentSizeShift(packed_type); + if (instruction->InputAt(1)->IsIntermediateAddressIndex()) { + DCHECK(!is_string_char_at); + return MemOperand(base.X(), InputRegisterAt(instruction, 1).X()); + } + + Location index = locations->InAt(1); + uint32_t offset = is_string_char_at + ? mirror::String::ValueOffset().Uint32Value() + : mirror::Array::DataOffset(size).Uint32Value(); + size_t shift = ComponentSizeShiftWidth(size); // HIntermediateAddress optimization is only applied for scalar ArrayGet and ArraySet. 
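The Smin/Umin and Smax/Umax selections above differ only in how the lane bits are interpreted; the same bit pattern can be the smaller or the larger value depending on signedness. A one-lane sketch of that distinction in plain C++ (illustrative only, not NEON code):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  // 0xFF is 255 when read as an unsigned lane but -1 when read as a signed lane.
  uint8_t a = 0x01;
  uint8_t b = 0xFF;
  uint8_t umin = std::min<uint8_t>(a, b);                   // 0x01
  int8_t  smin = std::min<int8_t>(static_cast<int8_t>(a),
                                  static_cast<int8_t>(b));  // -1 (0xFF)
  std::printf("umin=0x%02x smin=0x%02x\n", umin, static_cast<uint8_t>(smin));
  return 0;
}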
DCHECK(!instruction->InputAt(0)->IsIntermediateAddress()); @@ -793,10 +888,9 @@ MemOperand InstructionCodeGeneratorARM64::CreateVecMemRegisters( offset += Int64ConstantFrom(index) << shift; return HeapOperand(base, offset); } else { - Register temp = temps_scope->AcquireSameSizeAs(base); - __ Add(temp, base, Operand(WRegisterFrom(index), LSL, shift)); - - return HeapOperand(temp, offset); + *scratch = temps_scope->AcquireSameSizeAs(base); + __ Add(*scratch, base, Operand(WRegisterFrom(index), LSL, shift)); + return HeapOperand(*scratch, offset); } } @@ -805,15 +899,43 @@ void LocationsBuilderARM64::VisitVecLoad(HVecLoad* instruction) { } void InstructionCodeGeneratorARM64::VisitVecLoad(HVecLoad* instruction) { - Location reg_loc = Location::NoLocation(); + LocationSummary* locations = instruction->GetLocations(); + size_t size = Primitive::ComponentSize(instruction->GetPackedType()); + VRegister reg = VRegisterFrom(locations->Out()); UseScratchRegisterScope temps(GetVIXLAssembler()); - MemOperand mem = CreateVecMemRegisters(instruction, ®_loc, /*is_load*/ true, &temps); - VRegister reg = VRegisterFrom(reg_loc); + Register scratch; switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + DCHECK_EQ(8u, instruction->GetVectorLength()); + // Special handling of compressed/uncompressed string load. + if (mirror::kUseStringCompression && instruction->IsStringCharAt()) { + vixl::aarch64::Label uncompressed_load, done; + // Test compression bit. + static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u, + "Expecting 0=compressed, 1=uncompressed"); + uint32_t count_offset = mirror::String::CountOffset().Uint32Value(); + Register length = temps.AcquireW(); + __ Ldr(length, HeapOperand(InputRegisterAt(instruction, 0), count_offset)); + __ Tbnz(length.W(), 0, &uncompressed_load); + temps.Release(length); // no longer needed + // Zero extend 8 compressed bytes into 8 chars. + __ Ldr(DRegisterFrom(locations->Out()).V8B(), + VecAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch)); + __ Uxtl(reg.V8H(), reg.V8B()); + __ B(&done); + if (scratch.IsValid()) { + temps.Release(scratch); // if used, no longer needed + } + // Load 8 direct uncompressed chars. 
+ __ Bind(&uncompressed_load); + __ Ldr(reg, VecAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch)); + __ Bind(&done); + return; + } + FALLTHROUGH_INTENDED; case Primitive::kPrimBoolean: case Primitive::kPrimByte: - case Primitive::kPrimChar: case Primitive::kPrimShort: case Primitive::kPrimInt: case Primitive::kPrimFloat: @@ -821,7 +943,7 @@ void InstructionCodeGeneratorARM64::VisitVecLoad(HVecLoad* instruction) { case Primitive::kPrimDouble: DCHECK_LE(2u, instruction->GetVectorLength()); DCHECK_LE(instruction->GetVectorLength(), 16u); - __ Ldr(reg, mem); + __ Ldr(reg, VecAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch)); break; default: LOG(FATAL) << "Unsupported SIMD type"; @@ -834,10 +956,11 @@ void LocationsBuilderARM64::VisitVecStore(HVecStore* instruction) { } void InstructionCodeGeneratorARM64::VisitVecStore(HVecStore* instruction) { - Location reg_loc = Location::NoLocation(); + LocationSummary* locations = instruction->GetLocations(); + size_t size = Primitive::ComponentSize(instruction->GetPackedType()); + VRegister reg = VRegisterFrom(locations->InAt(2)); UseScratchRegisterScope temps(GetVIXLAssembler()); - MemOperand mem = CreateVecMemRegisters(instruction, ®_loc, /*is_load*/ false, &temps); - VRegister reg = VRegisterFrom(reg_loc); + Register scratch; switch (instruction->GetPackedType()) { case Primitive::kPrimBoolean: @@ -850,7 +973,7 @@ void InstructionCodeGeneratorARM64::VisitVecStore(HVecStore* instruction) { case Primitive::kPrimDouble: DCHECK_LE(2u, instruction->GetVectorLength()); DCHECK_LE(instruction->GetVectorLength(), 16u); - __ Str(reg, mem); + __ Str(reg, VecAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; default: LOG(FATAL) << "Unsupported SIMD type"; diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc index 50b95c17cb..af9e89e791 100644 --- a/compiler/optimizing/code_generator_vector_mips64.cc +++ b/compiler/optimizing/code_generator_vector_mips64.cc @@ -15,6 +15,7 @@ */ #include "code_generator_mips64.h" +#include "mirror/array-inl.h" namespace art { namespace mips64 { @@ -22,12 +23,72 @@ namespace mips64 { // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
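The compressed-string fast path above keys off the low bit of the String count field (0 = compressed, 1 = uncompressed, per the static_assert) and zero-extends eight Latin-1 bytes into eight UTF-16 chars. A scalar sketch of those two steps, using hypothetical helper names:

#include <cstdint>

// Low bit of the count field selects the representation (see static_assert above).
inline bool IsCompressedString(uint32_t count_field) {
  return (count_field & 1u) == 0u;
}

// What the vector UXTL (or an interleave with zero) does per lane:
// widen each compressed byte to a 16-bit char by zero extension.
inline void WidenCompressedChars(const uint8_t* src, uint16_t* dst, int n) {
  for (int i = 0; i < n; ++i) {
    dst[i] = src[i];
  }
}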
#define __ down_cast<Mips64Assembler*>(GetAssembler())-> // NOLINT +VectorRegister VectorRegisterFrom(Location location) { + DCHECK(location.IsFpuRegister()); + return static_cast<VectorRegister>(location.AsFpuRegister<FpuRegister>()); +} + void LocationsBuilderMIPS64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void InstructionCodeGeneratorMIPS64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ FillB(dst, locations->InAt(0).AsRegister<GpuRegister>()); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ FillH(dst, locations->InAt(0).AsRegister<GpuRegister>()); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ FillW(dst, locations->InAt(0).AsRegister<GpuRegister>()); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ FillD(dst, locations->InAt(0).AsRegister<GpuRegister>()); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ ReplicateFPToVectorRegister(dst, + locations->InAt(0).AsFpuRegister<FpuRegister>(), + /* is_double */ false); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ ReplicateFPToVectorRegister(dst, + locations->InAt(0).AsFpuRegister<FpuRegister>(), + /* is_double */ true); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) { @@ -51,13 +112,23 @@ static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* in LocationSummary* locations = new (arena) LocationSummary(instruction); switch (instruction->GetPackedType()) { case Primitive::kPrimBoolean: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), + instruction->IsVecNot() ? Location::kOutputOverlap + : Location::kNoOutputOverlap); + break; case Primitive::kPrimByte: case Primitive::kPrimChar: case Primitive::kPrimShort: case Primitive::kPrimInt: + case Primitive::kPrimLong: case Primitive::kPrimFloat: case Primitive::kPrimDouble: - DCHECK(locations); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), + (instruction->IsVecNeg() || instruction->IsVecAbs()) + ? 
Location::kOutputOverlap + : Location::kNoOutputOverlap); break; default: LOG(FATAL) << "Unsupported SIMD type"; @@ -70,7 +141,18 @@ void LocationsBuilderMIPS64::VisitVecCnv(HVecCnv* instruction) { } void InstructionCodeGeneratorMIPS64::VisitVecCnv(HVecCnv* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister src = VectorRegisterFrom(locations->InAt(0)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + Primitive::Type from = instruction->GetInputType(); + Primitive::Type to = instruction->GetResultType(); + if (from == Primitive::kPrimInt && to == Primitive::kPrimFloat) { + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Ffint_sW(dst, src); + } else { + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecNeg(HVecNeg* instruction) { @@ -78,7 +160,45 @@ void LocationsBuilderMIPS64::VisitVecNeg(HVecNeg* instruction) { } void InstructionCodeGeneratorMIPS64::VisitVecNeg(HVecNeg* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister src = VectorRegisterFrom(locations->InAt(0)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ FillB(dst, ZERO); + __ SubvB(dst, dst, src); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ FillH(dst, ZERO); + __ SubvH(dst, dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ FillW(dst, ZERO); + __ SubvW(dst, dst, src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ FillD(dst, ZERO); + __ SubvD(dst, dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ FillW(dst, ZERO); + __ FsubW(dst, dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ FillD(dst, ZERO); + __ FsubD(dst, dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecAbs(HVecAbs* instruction) { @@ -86,7 +206,47 @@ void LocationsBuilderMIPS64::VisitVecAbs(HVecAbs* instruction) { } void InstructionCodeGeneratorMIPS64::VisitVecAbs(HVecAbs* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister src = VectorRegisterFrom(locations->InAt(0)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ FillB(dst, ZERO); // all zeroes + __ Add_aB(dst, dst, src); // dst = abs(0) + abs(src) + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ FillH(dst, ZERO); // all zeroes + __ Add_aH(dst, dst, src); // dst = abs(0) + abs(src) + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ FillW(dst, ZERO); // all zeroes + __ Add_aW(dst, dst, src); // dst = abs(0) + abs(src) + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ FillD(dst, ZERO); // all zeroes + __ Add_aD(dst, dst, src); // dst = abs(0) + abs(src) + break; + case 
Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ LdiW(dst, -1); // all ones + __ SrliW(dst, dst, 1); + __ AndV(dst, dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ LdiD(dst, -1); // all ones + __ SrliD(dst, dst, 1); + __ AndV(dst, dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecNot(HVecNot* instruction) { @@ -94,7 +254,30 @@ void LocationsBuilderMIPS64::VisitVecNot(HVecNot* instruction) { } void InstructionCodeGeneratorMIPS64::VisitVecNot(HVecNot* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister src = VectorRegisterFrom(locations->InAt(0)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: // special case boolean-not + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ LdiB(dst, 1); + __ XorV(dst, dst, src); + break; + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ NorV(dst, src, src); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } // Helper to set up locations for vector binary operations. @@ -106,9 +289,12 @@ static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* case Primitive::kPrimChar: case Primitive::kPrimShort: case Primitive::kPrimInt: + case Primitive::kPrimLong: case Primitive::kPrimFloat: case Primitive::kPrimDouble: - DCHECK(locations); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); break; default: LOG(FATAL) << "Unsupported SIMD type"; @@ -121,7 +307,40 @@ void LocationsBuilderMIPS64::VisitVecAdd(HVecAdd* instruction) { } void InstructionCodeGeneratorMIPS64::VisitVecAdd(HVecAdd* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister lhs = VectorRegisterFrom(locations->InAt(0)); + VectorRegister rhs = VectorRegisterFrom(locations->InAt(1)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ AddvB(dst, lhs, rhs); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ AddvH(dst, lhs, rhs); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ AddvW(dst, lhs, rhs); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ AddvD(dst, lhs, rhs); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ FaddW(dst, lhs, rhs); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ FaddD(dst, lhs, rhs); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { @@ -129,7 +348,40 @@ void 
LocationsBuilderMIPS64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { } void InstructionCodeGeneratorMIPS64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister lhs = VectorRegisterFrom(locations->InAt(0)); + VectorRegister rhs = VectorRegisterFrom(locations->InAt(1)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + instruction->IsRounded() + ? __ Aver_uB(dst, lhs, rhs) + : __ Ave_uB(dst, lhs, rhs); + } else { + instruction->IsRounded() + ? __ Aver_sB(dst, lhs, rhs) + : __ Ave_sB(dst, lhs, rhs); + } + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + instruction->IsRounded() + ? __ Aver_uH(dst, lhs, rhs) + : __ Ave_uH(dst, lhs, rhs); + } else { + instruction->IsRounded() + ? __ Aver_sH(dst, lhs, rhs) + : __ Ave_sH(dst, lhs, rhs); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecSub(HVecSub* instruction) { @@ -137,7 +389,40 @@ void LocationsBuilderMIPS64::VisitVecSub(HVecSub* instruction) { } void InstructionCodeGeneratorMIPS64::VisitVecSub(HVecSub* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister lhs = VectorRegisterFrom(locations->InAt(0)); + VectorRegister rhs = VectorRegisterFrom(locations->InAt(1)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ SubvB(dst, lhs, rhs); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ SubvH(dst, lhs, rhs); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ SubvW(dst, lhs, rhs); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ SubvD(dst, lhs, rhs); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ FsubW(dst, lhs, rhs); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ FsubD(dst, lhs, rhs); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecMul(HVecMul* instruction) { @@ -145,7 +430,40 @@ void LocationsBuilderMIPS64::VisitVecMul(HVecMul* instruction) { } void InstructionCodeGeneratorMIPS64::VisitVecMul(HVecMul* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister lhs = VectorRegisterFrom(locations->InAt(0)); + VectorRegister rhs = VectorRegisterFrom(locations->InAt(1)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ MulvB(dst, lhs, rhs); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ MulvH(dst, lhs, rhs); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ MulvW(dst, lhs, rhs); + break; + case 
Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ MulvD(dst, lhs, rhs); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ FmulW(dst, lhs, rhs); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ FmulD(dst, lhs, rhs); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecDiv(HVecDiv* instruction) { @@ -153,7 +471,23 @@ void LocationsBuilderMIPS64::VisitVecDiv(HVecDiv* instruction) { } void InstructionCodeGeneratorMIPS64::VisitVecDiv(HVecDiv* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister lhs = VectorRegisterFrom(locations->InAt(0)); + VectorRegister rhs = VectorRegisterFrom(locations->InAt(1)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ FdivW(dst, lhs, rhs); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ FdivD(dst, lhs, rhs); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecMin(HVecMin* instruction) { @@ -177,7 +511,27 @@ void LocationsBuilderMIPS64::VisitVecAnd(HVecAnd* instruction) { } void InstructionCodeGeneratorMIPS64::VisitVecAnd(HVecAnd* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister lhs = VectorRegisterFrom(locations->InAt(0)); + VectorRegister rhs = VectorRegisterFrom(locations->InAt(1)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ AndV(dst, lhs, rhs); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecAndNot(HVecAndNot* instruction) { @@ -193,7 +547,27 @@ void LocationsBuilderMIPS64::VisitVecOr(HVecOr* instruction) { } void InstructionCodeGeneratorMIPS64::VisitVecOr(HVecOr* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister lhs = VectorRegisterFrom(locations->InAt(0)); + VectorRegister rhs = VectorRegisterFrom(locations->InAt(1)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ OrV(dst, lhs, rhs); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecXor(HVecXor* instruction) { @@ -201,7 +575,27 @@ void LocationsBuilderMIPS64::VisitVecXor(HVecXor* instruction) { } void 
InstructionCodeGeneratorMIPS64::VisitVecXor(HVecXor* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister lhs = VectorRegisterFrom(locations->InAt(0)); + VectorRegister rhs = VectorRegisterFrom(locations->InAt(1)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ XorV(dst, lhs, rhs); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } // Helper to set up locations for vector shift operations. @@ -213,7 +607,9 @@ static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* case Primitive::kPrimShort: case Primitive::kPrimInt: case Primitive::kPrimLong: - DCHECK(locations); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant())); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); break; default: LOG(FATAL) << "Unsupported SIMD type"; @@ -226,7 +622,32 @@ void LocationsBuilderMIPS64::VisitVecShl(HVecShl* instruction) { } void InstructionCodeGeneratorMIPS64::VisitVecShl(HVecShl* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister lhs = VectorRegisterFrom(locations->InAt(0)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ SlliB(dst, lhs, value); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ SlliH(dst, lhs, value); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ SlliW(dst, lhs, value); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ SlliD(dst, lhs, value); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecShr(HVecShr* instruction) { @@ -234,7 +655,32 @@ void LocationsBuilderMIPS64::VisitVecShr(HVecShr* instruction) { } void InstructionCodeGeneratorMIPS64::VisitVecShr(HVecShr* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister lhs = VectorRegisterFrom(locations->InAt(0)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ SraiB(dst, lhs, value); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ SraiH(dst, lhs, value); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ SraiW(dst, lhs, value); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, 
instruction->GetVectorLength()); + __ SraiD(dst, lhs, value); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecUShr(HVecUShr* instruction) { @@ -242,7 +688,32 @@ void LocationsBuilderMIPS64::VisitVecUShr(HVecUShr* instruction) { } void InstructionCodeGeneratorMIPS64::VisitVecUShr(HVecUShr* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + VectorRegister lhs = VectorRegisterFrom(locations->InAt(0)); + VectorRegister dst = VectorRegisterFrom(locations->Out()); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ SrliB(dst, lhs, value); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ SrliH(dst, lhs, value); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ SrliW(dst, lhs, value); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ SrliD(dst, lhs, value); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { @@ -253,20 +724,143 @@ void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccu LOG(FATAL) << "No SIMD for " << instr->GetId(); } +// Helper to set up locations for vector memory operations. +static void CreateVecMemLocations(ArenaAllocator* arena, + HVecMemoryOperation* instruction, + bool is_load) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + if (is_load) { + locations->SetOut(Location::RequiresFpuRegister()); + } else { + locations->SetInAt(2, Location::RequiresFpuRegister()); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to prepare register and offset for vector memory operations. Returns the offset and sets +// the output parameter adjusted_base to the original base or to a reserved temporary register (AT). 
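VisitVecShr above selects the arithmetic shifts (Srai*), which replicate the sign bit, while VisitVecUShr selects the logical shifts (Srli*), which shift in zeros. A one-lane sketch of the difference (illustrative only):

#include <cstdint>
#include <cstdio>

int main() {
  int32_t v = -16;
  int32_t  shr  = v >> 2;                         // arithmetic shift: -4
  uint32_t ushr = static_cast<uint32_t>(v) >> 2;  // logical shift: 0x3FFFFFFC
  std::printf("shr=%d ushr=0x%08x\n", shr, ushr);
  return 0;
}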
+int32_t InstructionCodeGeneratorMIPS64::VecAddress(LocationSummary* locations, + size_t size, + /* out */ GpuRegister* adjusted_base) { + GpuRegister base = locations->InAt(0).AsRegister<GpuRegister>(); + Location index = locations->InAt(1); + int scale = TIMES_1; + switch (size) { + case 2: scale = TIMES_2; break; + case 4: scale = TIMES_4; break; + case 8: scale = TIMES_8; break; + default: break; + } + int32_t offset = mirror::Array::DataOffset(size).Int32Value(); + + if (index.IsConstant()) { + offset += index.GetConstant()->AsIntConstant()->GetValue() << scale; + __ AdjustBaseOffsetAndElementSizeShift(base, offset, scale); + *adjusted_base = base; + } else { + GpuRegister index_reg = index.AsRegister<GpuRegister>(); + if (scale != TIMES_1) { + __ Dlsa(AT, index_reg, base, scale); + } else { + __ Daddu(AT, base, index_reg); + } + *adjusted_base = AT; + } + return offset; +} + void LocationsBuilderMIPS64::VisitVecLoad(HVecLoad* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + CreateVecMemLocations(GetGraph()->GetArena(), instruction, /* is_load */ true); } void InstructionCodeGeneratorMIPS64::VisitVecLoad(HVecLoad* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + size_t size = Primitive::ComponentSize(instruction->GetPackedType()); + VectorRegister reg = VectorRegisterFrom(locations->Out()); + GpuRegister base; + int32_t offset = VecAddress(locations, size, &base); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ LdB(reg, base, offset); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + // Loading 8-bytes (needed if dealing with compressed strings in StringCharAt) from unaligned + // memory address may cause a trap to the kernel if the CPU doesn't directly support unaligned + // loads and stores. + // TODO: Implement support for StringCharAt. 
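The MIPS64 VecAddress helper above splits an array access into a (possibly adjusted) base register plus a byte offset: a constant index is folded into the immediate offset, while a register index is scaled and added to the base up front (the DLSA/DADDU step). A standalone sketch of that split, with hypothetical names:

#include <cstddef>
#include <cstdint>

struct SplitAddress {
  uintptr_t base;  // base to use in the load/store (possibly adjusted)
  int64_t offset;  // byte offset from that base
};

inline SplitAddress SplitVecAddress(uintptr_t array_base,
                                    int64_t data_offset,   // offset of element 0
                                    size_t element_size,   // 1, 2, 4 or 8 bytes
                                    bool index_is_constant,
                                    int64_t index) {
  if (index_is_constant) {
    // Fold the constant index into the immediate offset; base is unchanged.
    return {array_base, data_offset + index * static_cast<int64_t>(element_size)};
  }
  // Scale the register index and add it to the base up front,
  // leaving only the data offset as the immediate.
  return {array_base + static_cast<uintptr_t>(index) * element_size, data_offset};
}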
+ DCHECK(!instruction->IsStringCharAt()); + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ LdH(reg, base, offset); + break; + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ LdW(reg, base, offset); + break; + case Primitive::kPrimLong: + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ LdD(reg, base, offset); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderMIPS64::VisitVecStore(HVecStore* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + CreateVecMemLocations(GetGraph()->GetArena(), instruction, /* is_load */ false); } void InstructionCodeGeneratorMIPS64::VisitVecStore(HVecStore* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + size_t size = Primitive::ComponentSize(instruction->GetPackedType()); + VectorRegister reg = VectorRegisterFrom(locations->InAt(2)); + GpuRegister base; + int32_t offset = VecAddress(locations, size, &base); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ StB(reg, base, offset); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ StH(reg, base, offset); + break; + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ StW(reg, base, offset); + break; + case Primitive::kPrimLong: + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ StD(reg, base, offset); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } #undef __ diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc index 013b092b5a..14782d70a1 100644 --- a/compiler/optimizing/code_generator_vector_x86.cc +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -201,6 +201,7 @@ void InstructionCodeGeneratorX86::VisitVecNeg(HVecNeg* instruction) { void LocationsBuilderX86::VisitVecAbs(HVecAbs* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); + // Integral-abs requires a temporary for the comparison. 
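The MSA float/double abs lowering earlier in this change builds an all-ones vector, shifts it right by one to obtain a sign-bit-clear mask, and ANDs it with the input: absolute value by clearing the IEEE sign bit. A scalar sketch of that bit trick (illustrative only, not the emitted code):

#include <cstdint>
#include <cstdio>
#include <cstring>

// abs(x) for IEEE floats is just "clear the sign bit": mask = ~0u >> 1.
float AbsBySignMask(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= ~0u >> 1;  // 0x7FFFFFFF
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}

int main() {
  std::printf("%g %g\n", AbsBySignMask(-2.5f), AbsBySignMask(3.0f));  // 2.5 3
  return 0;
}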
if (instruction->GetPackedType() == Primitive::kPrimInt) { instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); } @@ -482,7 +483,51 @@ void LocationsBuilderX86::VisitVecMin(HVecMin* instruction) { } void InstructionCodeGeneratorX86::VisitVecMin(HVecMin* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ pminub(dst, src); + } else { + __ pminsb(dst, src); + } + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ pminuw(dst, src); + } else { + __ pminsw(dst, src); + } + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ pminud(dst, src); + } else { + __ pminsd(dst, src); + } + break; + // Next cases are sloppy wrt 0.0 vs -0.0. + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + DCHECK(!instruction->IsUnsigned()); + __ minps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + DCHECK(!instruction->IsUnsigned()); + __ minpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderX86::VisitVecMax(HVecMax* instruction) { @@ -490,7 +535,51 @@ void LocationsBuilderX86::VisitVecMax(HVecMax* instruction) { } void InstructionCodeGeneratorX86::VisitVecMax(HVecMax* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ pmaxub(dst, src); + } else { + __ pmaxsb(dst, src); + } + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ pmaxuw(dst, src); + } else { + __ pmaxsw(dst, src); + } + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ pmaxud(dst, src); + } else { + __ pmaxsd(dst, src); + } + break; + // Next cases are sloppy wrt 0.0 vs -0.0. + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + DCHECK(!instruction->IsUnsigned()); + __ maxps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + DCHECK(!instruction->IsUnsigned()); + __ maxpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderX86::VisitVecAnd(HVecAnd* instruction) { @@ -766,16 +855,10 @@ static void CreateVecMemLocations(ArenaAllocator* arena, } } -// Helper to set up registers and address for vector memory operations. 
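The "sloppy wrt 0.0 vs -0.0" notes in the float/double cases above refer to the SSE per-lane rule for MINPS/MAXPS, which is effectively "a < b ? a : b": equal zeros and NaN inputs both fall through to the second operand, so the operand order decides the result. A scalar sketch of that rule (illustrative only):

#include <cstdio>

// Per-lane behavior of MINPS: if a < b return a, otherwise return b.
// Comparisons of +0.0 with -0.0 (and any NaN comparison) are not "less than",
// so they return the second operand.
static float SseMinLane(float a, float b) {
  return a < b ? a : b;
}

int main() {
  std::printf("%g\n", SseMinLane(-0.0f, 0.0f));  // prints 0, i.e. +0.0f
  std::printf("%g\n", SseMinLane(0.0f, -0.0f));  // prints -0
  return 0;
}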
-static Address CreateVecMemRegisters(HVecMemoryOperation* instruction, - Location* reg_loc, - bool is_load) { - LocationSummary* locations = instruction->GetLocations(); +// Helper to construct address for vector memory operations. +static Address VecAddress(LocationSummary* locations, size_t size, bool is_string_char_at) { Location base = locations->InAt(0); Location index = locations->InAt(1); - *reg_loc = is_load ? locations->Out() : locations->InAt(2); - size_t size = Primitive::ComponentSize(instruction->GetPackedType()); - uint32_t offset = mirror::Array::DataOffset(size).Uint32Value(); ScaleFactor scale = TIMES_1; switch (size) { case 2: scale = TIMES_2; break; @@ -783,22 +866,53 @@ static Address CreateVecMemRegisters(HVecMemoryOperation* instruction, case 8: scale = TIMES_8; break; default: break; } + uint32_t offset = is_string_char_at + ? mirror::String::ValueOffset().Uint32Value() + : mirror::Array::DataOffset(size).Uint32Value(); return CodeGeneratorX86::ArrayAddress(base.AsRegister<Register>(), index, scale, offset); } void LocationsBuilderX86::VisitVecLoad(HVecLoad* instruction) { CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ true); + // String load requires a temporary for the compressed load. + if (mirror::kUseStringCompression && instruction->IsStringCharAt()) { + instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); + } } void InstructionCodeGeneratorX86::VisitVecLoad(HVecLoad* instruction) { - Location reg_loc = Location::NoLocation(); - Address address = CreateVecMemRegisters(instruction, ®_loc, /*is_load*/ true); - XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>(); + LocationSummary* locations = instruction->GetLocations(); + size_t size = Primitive::ComponentSize(instruction->GetPackedType()); + Address address = VecAddress(locations, size, instruction->IsStringCharAt()); + XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>(); bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16); switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + DCHECK_EQ(8u, instruction->GetVectorLength()); + // Special handling of compressed/uncompressed string load. + if (mirror::kUseStringCompression && instruction->IsStringCharAt()) { + NearLabel done, not_compressed; + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + // Test compression bit. + static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u, + "Expecting 0=compressed, 1=uncompressed"); + uint32_t count_offset = mirror::String::CountOffset().Uint32Value(); + __ testb(Address(locations->InAt(0).AsRegister<Register>(), count_offset), Immediate(1)); + __ j(kNotZero, ¬_compressed); + // Zero extend 8 compressed bytes into 8 chars. + __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true)); + __ pxor(tmp, tmp); + __ punpcklbw(reg, tmp); + __ jmp(&done); + // Load 4 direct uncompressed chars. + __ Bind(¬_compressed); + is_aligned16 ? 
__ movdqa(reg, address) : __ movdqu(reg, address); + __ Bind(&done); + return; + } + FALLTHROUGH_INTENDED; case Primitive::kPrimBoolean: case Primitive::kPrimByte: - case Primitive::kPrimChar: case Primitive::kPrimShort: case Primitive::kPrimInt: case Primitive::kPrimLong: @@ -825,9 +939,10 @@ void LocationsBuilderX86::VisitVecStore(HVecStore* instruction) { } void InstructionCodeGeneratorX86::VisitVecStore(HVecStore* instruction) { - Location reg_loc = Location::NoLocation(); - Address address = CreateVecMemRegisters(instruction, ®_loc, /*is_load*/ false); - XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>(); + LocationSummary* locations = instruction->GetLocations(); + size_t size = Primitive::ComponentSize(instruction->GetPackedType()); + Address address = VecAddress(locations, size, /*is_string_char_at*/ false); + XmmRegister reg = locations->InAt(2).AsFpuRegister<XmmRegister>(); bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16); switch (instruction->GetPackedType()) { case Primitive::kPrimBoolean: diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc index 66f19a4376..246044ebb8 100644 --- a/compiler/optimizing/code_generator_vector_x86_64.cc +++ b/compiler/optimizing/code_generator_vector_x86_64.cc @@ -194,6 +194,7 @@ void InstructionCodeGeneratorX86_64::VisitVecNeg(HVecNeg* instruction) { void LocationsBuilderX86_64::VisitVecAbs(HVecAbs* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); + // Integral-abs requires a temporary for the comparison. if (instruction->GetPackedType() == Primitive::kPrimInt) { instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); } @@ -352,6 +353,10 @@ void InstructionCodeGeneratorX86_64::VisitVecHalvingAdd(HVecHalvingAdd* instruct DCHECK(locations->InAt(0).Equals(locations->Out())); XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + + DCHECK(instruction->IsRounded()); + DCHECK(instruction->IsUnsigned()); + switch (instruction->GetPackedType()) { case Primitive::kPrimByte: DCHECK_EQ(16u, instruction->GetVectorLength()); @@ -471,7 +476,51 @@ void LocationsBuilderX86_64::VisitVecMin(HVecMin* instruction) { } void InstructionCodeGeneratorX86_64::VisitVecMin(HVecMin* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ pminub(dst, src); + } else { + __ pminsb(dst, src); + } + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ pminuw(dst, src); + } else { + __ pminsw(dst, src); + } + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ pminud(dst, src); + } else { + __ pminsd(dst, src); + } + break; + // Next cases are sloppy wrt 0.0 vs -0.0. 
+ case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + DCHECK(!instruction->IsUnsigned()); + __ minps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + DCHECK(!instruction->IsUnsigned()); + __ minpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderX86_64::VisitVecMax(HVecMax* instruction) { @@ -479,7 +528,51 @@ void LocationsBuilderX86_64::VisitVecMax(HVecMax* instruction) { } void InstructionCodeGeneratorX86_64::VisitVecMax(HVecMax* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ pmaxub(dst, src); + } else { + __ pmaxsb(dst, src); + } + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ pmaxuw(dst, src); + } else { + __ pmaxsw(dst, src); + } + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + __ pmaxud(dst, src); + } else { + __ pmaxsd(dst, src); + } + break; + // Next cases are sloppy wrt 0.0 vs -0.0. + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + DCHECK(!instruction->IsUnsigned()); + __ maxps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + DCHECK(!instruction->IsUnsigned()); + __ maxpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } void LocationsBuilderX86_64::VisitVecAnd(HVecAnd* instruction) { @@ -755,16 +848,10 @@ static void CreateVecMemLocations(ArenaAllocator* arena, } } -// Helper to set up registers and address for vector memory operations. -static Address CreateVecMemRegisters(HVecMemoryOperation* instruction, - Location* reg_loc, - bool is_load) { - LocationSummary* locations = instruction->GetLocations(); +// Helper to construct address for vector memory operations. +static Address VecAddress(LocationSummary* locations, size_t size, bool is_string_char_at) { Location base = locations->InAt(0); Location index = locations->InAt(1); - *reg_loc = is_load ? locations->Out() : locations->InAt(2); - size_t size = Primitive::ComponentSize(instruction->GetPackedType()); - uint32_t offset = mirror::Array::DataOffset(size).Uint32Value(); ScaleFactor scale = TIMES_1; switch (size) { case 2: scale = TIMES_2; break; @@ -772,22 +859,53 @@ static Address CreateVecMemRegisters(HVecMemoryOperation* instruction, case 8: scale = TIMES_8; break; default: break; } + uint32_t offset = is_string_char_at + ? mirror::String::ValueOffset().Uint32Value() + : mirror::Array::DataOffset(size).Uint32Value(); return CodeGeneratorX86_64::ArrayAddress(base.AsRegister<CpuRegister>(), index, scale, offset); } void LocationsBuilderX86_64::VisitVecLoad(HVecLoad* instruction) { CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ true); + // String load requires a temporary for the compressed load. 
+ if (mirror::kUseStringCompression && instruction->IsStringCharAt()) { + instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); + } } void InstructionCodeGeneratorX86_64::VisitVecLoad(HVecLoad* instruction) { - Location reg_loc = Location::NoLocation(); - Address address = CreateVecMemRegisters(instruction, ®_loc, /*is_load*/ true); - XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>(); + LocationSummary* locations = instruction->GetLocations(); + size_t size = Primitive::ComponentSize(instruction->GetPackedType()); + Address address = VecAddress(locations, size, instruction->IsStringCharAt()); + XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>(); bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16); switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + DCHECK_EQ(8u, instruction->GetVectorLength()); + // Special handling of compressed/uncompressed string load. + if (mirror::kUseStringCompression && instruction->IsStringCharAt()) { + NearLabel done, not_compressed; + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + // Test compression bit. + static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u, + "Expecting 0=compressed, 1=uncompressed"); + uint32_t count_offset = mirror::String::CountOffset().Uint32Value(); + __ testb(Address(locations->InAt(0).AsRegister<CpuRegister>(), count_offset), Immediate(1)); + __ j(kNotZero, ¬_compressed); + // Zero extend 8 compressed bytes into 8 chars. + __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true)); + __ pxor(tmp, tmp); + __ punpcklbw(reg, tmp); + __ jmp(&done); + // Load 8 direct uncompressed chars. + __ Bind(¬_compressed); + is_aligned16 ? __ movdqa(reg, address) : __ movdqu(reg, address); + __ Bind(&done); + return; + } + FALLTHROUGH_INTENDED; case Primitive::kPrimBoolean: case Primitive::kPrimByte: - case Primitive::kPrimChar: case Primitive::kPrimShort: case Primitive::kPrimInt: case Primitive::kPrimLong: @@ -814,9 +932,10 @@ void LocationsBuilderX86_64::VisitVecStore(HVecStore* instruction) { } void InstructionCodeGeneratorX86_64::VisitVecStore(HVecStore* instruction) { - Location reg_loc = Location::NoLocation(); - Address address = CreateVecMemRegisters(instruction, ®_loc, /*is_load*/ false); - XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>(); + LocationSummary* locations = instruction->GetLocations(); + size_t size = Primitive::ComponentSize(instruction->GetPackedType()); + Address address = VecAddress(locations, size, /*is_string_char_at*/ false); + XmmRegister reg = locations->InAt(2).AsFpuRegister<XmmRegister>(); bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16); switch (instruction->GetPackedType()) { case Primitive::kPrimBoolean: diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 7e640c284f..b8465cd9d5 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -26,6 +26,7 @@ #include "intrinsics_x86.h" #include "mirror/array-inl.h" #include "mirror/class-inl.h" +#include "lock_word.h" #include "thread.h" #include "utils/assembler.h" #include "utils/stack_checks.h" @@ -1032,9 +1033,10 @@ CodeGeneratorX86::CodeGeneratorX86(HGraph* graph, assembler_(graph->GetArena()), isa_features_(isa_features), pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + 
boot_image_method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), boot_image_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_class_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), constant_area_start_(-1), @@ -2066,6 +2068,15 @@ void InstructionCodeGeneratorX86::VisitDoubleConstant(HDoubleConstant* constant // Will be generated at use site. } +void LocationsBuilderX86::VisitConstructorFence(HConstructorFence* constructor_fence) { + constructor_fence->SetLocations(nullptr); +} + +void InstructionCodeGeneratorX86::VisitConstructorFence( + HConstructorFence* constructor_fence ATTRIBUTE_UNUSED) { + codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); +} + void LocationsBuilderX86::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) { memory_barrier->SetLocations(nullptr); } @@ -2158,7 +2169,7 @@ void LocationsBuilderX86::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invok IntrinsicLocationsBuilderX86 intrinsic(codegen_); if (intrinsic.TryDispatch(invoke)) { - if (invoke->GetLocations()->CanCall() && invoke->HasPcRelativeDexCache()) { + if (invoke->GetLocations()->CanCall() && invoke->HasPcRelativeMethodLoadKind()) { invoke->GetLocations()->SetInAt(invoke->GetSpecialInputIndex(), Location::Any()); } return; @@ -2167,7 +2178,7 @@ void LocationsBuilderX86::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invok HandleInvoke(invoke); // For PC-relative dex cache the invoke has an extra input, the PC-relative address base. - if (invoke->HasPcRelativeDexCache()) { + if (invoke->HasPcRelativeMethodLoadKind()) { invoke->GetLocations()->SetInAt(invoke->GetSpecialInputIndex(), Location::RequiresRegister()); } } @@ -4510,18 +4521,16 @@ Register CodeGeneratorX86::GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOr // save one load. However, since this is just an intrinsic slow path we prefer this // simple and more robust approach rather that trying to determine if that's the case. SlowPathCode* slow_path = GetCurrentSlowPath(); - if (slow_path != nullptr) { - if (slow_path->IsCoreRegisterSaved(location.AsRegister<Register>())) { - int stack_offset = slow_path->GetStackOffsetOfCoreRegister(location.AsRegister<Register>()); - __ movl(temp, Address(ESP, stack_offset)); - return temp; - } + DCHECK(slow_path != nullptr); // For intrinsified invokes the call is emitted on the slow path. + if (slow_path->IsCoreRegisterSaved(location.AsRegister<Register>())) { + int stack_offset = slow_path->GetStackOffsetOfCoreRegister(location.AsRegister<Register>()); + __ movl(temp, Address(ESP, stack_offset)); + return temp; } return location.AsRegister<Register>(); } -Location CodeGeneratorX86::GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke, - Location temp) { +void CodeGeneratorX86::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) { Location callee_method = temp; // For all kinds except kRecursive, callee will be in temp. 
switch (invoke->GetMethodLoadKind()) { case HInvokeStaticOrDirect::MethodLoadKind::kStringInit: { @@ -4534,6 +4543,14 @@ Location CodeGeneratorX86::GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticO case HInvokeStaticOrDirect::MethodLoadKind::kRecursive: callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex()); break; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: { + DCHECK(GetCompilerOptions().IsBootImage()); + Register base_reg = GetInvokeStaticOrDirectExtraParameter(invoke, + temp.AsRegister<Register>()); + __ leal(temp.AsRegister<Register>(), Address(base_reg, CodeGeneratorX86::kDummy32BitOffset)); + RecordBootMethodPatch(invoke); + break; + } case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: __ movl(temp.AsRegister<Register>(), Immediate(invoke->GetMethodAddress())); break; @@ -4571,11 +4588,6 @@ Location CodeGeneratorX86::GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticO break; } } - return callee_method; -} - -void CodeGeneratorX86::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) { - Location callee_method = GenerateCalleeMethodStaticOrDirectCall(invoke, temp); switch (invoke->GetCodePtrLocation()) { case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf: @@ -4622,27 +4634,18 @@ void CodeGeneratorX86::GenerateVirtualCall(HInvokeVirtual* invoke, Location temp temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86PointerSize).Int32Value())); } -void CodeGeneratorX86::RecordBootStringPatch(HLoadString* load_string) { - DCHECK(GetCompilerOptions().IsBootImage()); - HX86ComputeBaseMethodAddress* address = nullptr; - if (GetCompilerOptions().GetCompilePic()) { - address = load_string->InputAt(0)->AsX86ComputeBaseMethodAddress(); - } else { - DCHECK_EQ(load_string->InputCount(), 0u); - } - string_patches_.emplace_back(address, - load_string->GetDexFile(), - load_string->GetStringIndex().index_); - __ Bind(&string_patches_.back().label); +void CodeGeneratorX86::RecordBootMethodPatch(HInvokeStaticOrDirect* invoke) { + DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u); + HX86ComputeBaseMethodAddress* address = + invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress(); + boot_image_method_patches_.emplace_back(address, + *invoke->GetTargetMethod().dex_file, + invoke->GetTargetMethod().dex_method_index); + __ Bind(&boot_image_method_patches_.back().label); } void CodeGeneratorX86::RecordBootTypePatch(HLoadClass* load_class) { - HX86ComputeBaseMethodAddress* address = nullptr; - if (GetCompilerOptions().GetCompilePic()) { - address = load_class->InputAt(0)->AsX86ComputeBaseMethodAddress(); - } else { - DCHECK_EQ(load_class->InputCount(), 0u); - } + HX86ComputeBaseMethodAddress* address = load_class->InputAt(0)->AsX86ComputeBaseMethodAddress(); boot_image_type_patches_.emplace_back(address, load_class->GetDexFile(), load_class->GetTypeIndex().index_); @@ -4657,6 +4660,15 @@ Label* CodeGeneratorX86::NewTypeBssEntryPatch(HLoadClass* load_class) { return &type_bss_entry_patches_.back().label; } +void CodeGeneratorX86::RecordBootStringPatch(HLoadString* load_string) { + DCHECK(GetCompilerOptions().IsBootImage()); + HX86ComputeBaseMethodAddress* address = load_string->InputAt(0)->AsX86ComputeBaseMethodAddress(); + string_patches_.emplace_back(address, + load_string->GetDexFile(), + load_string->GetStringIndex().index_); + __ Bind(&string_patches_.back().label); +} + Label* CodeGeneratorX86::NewStringBssEntryPatch(HLoadString* load_string) { 
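// Illustrative sketch, not part of the patch above; all names here are
// hypothetical. The kBootImageLinkTimePcRelative case emits a leal against
// kDummy32BitOffset and RecordBootMethodPatch remembers where that placeholder
// lives so a later pass (the linker, in ART's case) can rewrite the 32-bit
// displacement once the real method offset is known. The general pattern:
#include <cstdint>
#include <cstring>
#include <vector>

struct PendingPatch {
  size_t code_offset;     // where the placeholder immediate sits in the buffer
  uint32_t method_index;  // which method this patch refers to
};

void EmitPlaceholder(std::vector<uint8_t>* code,
                     std::vector<PendingPatch>* patches,
                     uint32_t method_index) {
  static constexpr uint32_t kDummyOffset = 0u;  // filler displacement
  patches->push_back({code->size(), method_index});
  const uint8_t* raw = reinterpret_cast<const uint8_t*>(&kDummyOffset);
  code->insert(code->end(), raw, raw + sizeof(kDummyOffset));
}

void ApplyPatches(std::vector<uint8_t>* code,
                  const std::vector<PendingPatch>& patches,
                  uint32_t (*resolve)(uint32_t method_index)) {
  for (const PendingPatch& patch : patches) {
    uint32_t target = resolve(patch.method_index);
    std::memcpy(code->data() + patch.code_offset, &target, sizeof(target));
  }
}

int main() {
  std::vector<uint8_t> code;
  std::vector<PendingPatch> patches;
  EmitPlaceholder(&code, &patches, /* method_index */ 5u);
  ApplyPatches(&code, patches,
               [](uint32_t method_index) -> uint32_t { return 0x1000u + method_index; });
  return code.size() == sizeof(uint32_t) ? 0 : 1;
}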
DCHECK(!GetCompilerOptions().IsBootImage()); HX86ComputeBaseMethodAddress* address = @@ -4694,29 +4706,23 @@ void CodeGeneratorX86::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patche DCHECK(linker_patches->empty()); size_t size = pc_relative_dex_cache_patches_.size() + - string_patches_.size() + + boot_image_method_patches_.size() + boot_image_type_patches_.size() + - type_bss_entry_patches_.size(); + type_bss_entry_patches_.size() + + string_patches_.size(); linker_patches->reserve(size); EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_, linker_patches); - if (!GetCompilerOptions().IsBootImage()) { - DCHECK(boot_image_type_patches_.empty()); - EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_patches_, linker_patches); - } else if (GetCompilerOptions().GetCompilePic()) { + if (GetCompilerOptions().IsBootImage()) { + EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(boot_image_method_patches_, + linker_patches); EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(boot_image_type_patches_, linker_patches); EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(string_patches_, linker_patches); } else { - for (const PatchInfo<Label>& info : boot_image_type_patches_) { - uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment; - linker_patches->push_back(LinkerPatch::TypePatch(literal_offset, &info.dex_file, info.index)); - } - for (const PatchInfo<Label>& info : string_patches_) { - uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment; - linker_patches->push_back( - LinkerPatch::StringPatch(literal_offset, &info.dex_file, info.index)); - } + DCHECK(boot_image_method_patches_.empty()); + DCHECK(boot_image_type_patches_.empty()); + EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_patches_, linker_patches); } EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_, linker_patches); @@ -6045,21 +6051,15 @@ HLoadClass::LoadKind CodeGeneratorX86::GetSupportedLoadClassKind( UNREACHABLE(); case HLoadClass::LoadKind::kReferrersClass: break; - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: - DCHECK(!GetCompilerOptions().GetCompilePic()); - break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: - DCHECK(GetCompilerOptions().GetCompilePic()); - FALLTHROUGH_INTENDED; case HLoadClass::LoadKind::kBssEntry: - DCHECK(!Runtime::Current()->UseJitCompilation()); // Note: boot image is also non-JIT. - break; - case HLoadClass::LoadKind::kBootImageAddress: + DCHECK(!Runtime::Current()->UseJitCompilation()); break; case HLoadClass::LoadKind::kJitTableAddress: DCHECK(Runtime::Current()->UseJitCompilation()); break; - case HLoadClass::LoadKind::kDexCacheViaMethod: + case HLoadClass::LoadKind::kBootImageAddress: + case HLoadClass::LoadKind::kRuntimeCall: break; } return desired_class_load_kind; @@ -6067,7 +6067,7 @@ HLoadClass::LoadKind CodeGeneratorX86::GetSupportedLoadClassKind( void LocationsBuilderX86::VisitLoadClass(HLoadClass* cls) { HLoadClass::LoadKind load_kind = cls->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadClass::LoadKind::kRuntimeCall) { InvokeRuntimeCallingConvention calling_convention; CodeGenerator::CreateLoadClassRuntimeCallLocationSummary( cls, @@ -6121,7 +6121,7 @@ Label* CodeGeneratorX86::NewJitRootClassPatch(const DexFile& dex_file, // move. 
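// Illustrative sketch, not part of the patch above; the types are simplified
// stand-ins. EmitLinkerPatches now keeps one container per patch kind, reserves
// the total up front, and emits boot-image method/type/string patches only when
// compiling the boot image, falling back to .bss string entries for apps:
#include <cstdint>
#include <vector>

struct PatchPos { uint32_t literal_offset; uint32_t index; };
enum class PatchKind { kRelativeMethod, kRelativeType, kRelativeString, kStringBssEntry };
struct EmittedPatch { PatchKind kind; uint32_t literal_offset; uint32_t index; };

template <PatchKind kKind>
void EmitAllOfKind(const std::vector<PatchPos>& infos, std::vector<EmittedPatch>* out) {
  for (const PatchPos& info : infos) {
    out->push_back({kKind, info.literal_offset, info.index});
  }
}

void EmitAll(const std::vector<PatchPos>& method_patches,
             const std::vector<PatchPos>& type_patches,
             const std::vector<PatchPos>& string_patches,
             bool is_boot_image,
             std::vector<EmittedPatch>* out) {
  out->reserve(method_patches.size() + type_patches.size() + string_patches.size());
  if (is_boot_image) {
    EmitAllOfKind<PatchKind::kRelativeMethod>(method_patches, out);
    EmitAllOfKind<PatchKind::kRelativeType>(type_patches, out);
    EmitAllOfKind<PatchKind::kRelativeString>(string_patches, out);
  } else {
    // For app compiles the boot-image containers must be empty.
    EmitAllOfKind<PatchKind::kStringBssEntry>(string_patches, out);
  }
}

int main() {
  std::vector<EmittedPatch> out;
  EmitAll({{8u, 1u}}, {{16u, 2u}}, {{24u, 3u}}, /* is_boot_image */ true, &out);
  return out.size() == 3u ? 0 : 1;
}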
void InstructionCodeGeneratorX86::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFETY_ANALYSIS { HLoadClass::LoadKind load_kind = cls->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadClass::LoadKind::kRuntimeCall) { codegen_->GenerateLoadClassRuntimeCall(cls); return; } @@ -6149,13 +6149,6 @@ void InstructionCodeGeneratorX86::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFE read_barrier_option); break; } - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: { - DCHECK(codegen_->GetCompilerOptions().IsBootImage()); - DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); - __ movl(out, Immediate(/* placeholder */ 0)); - codegen_->RecordBootTypePatch(cls); - break; - } case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: { DCHECK(codegen_->GetCompilerOptions().IsBootImage()); DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); @@ -6188,7 +6181,7 @@ void InstructionCodeGeneratorX86::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFE GenerateGcRootFieldLoad(cls, out_loc, address, fixup_label, read_barrier_option); break; } - case HLoadClass::LoadKind::kDexCacheViaMethod: + case HLoadClass::LoadKind::kRuntimeCall: case HLoadClass::LoadKind::kInvalid: LOG(FATAL) << "UNREACHABLE"; UNREACHABLE(); @@ -6243,21 +6236,15 @@ void InstructionCodeGeneratorX86::GenerateClassInitializationCheck( HLoadString::LoadKind CodeGeneratorX86::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { switch (desired_string_load_kind) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: - DCHECK(!GetCompilerOptions().GetCompilePic()); - break; case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - DCHECK(GetCompilerOptions().GetCompilePic()); - FALLTHROUGH_INTENDED; case HLoadString::LoadKind::kBssEntry: - DCHECK(!Runtime::Current()->UseJitCompilation()); // Note: boot image is also non-JIT. - break; - case HLoadString::LoadKind::kBootImageAddress: + DCHECK(!Runtime::Current()->UseJitCompilation()); break; case HLoadString::LoadKind::kJitTableAddress: DCHECK(Runtime::Current()->UseJitCompilation()); break; - case HLoadString::LoadKind::kDexCacheViaMethod: + case HLoadString::LoadKind::kBootImageAddress: + case HLoadString::LoadKind::kRuntimeCall: break; } return desired_string_load_kind; @@ -6271,7 +6258,7 @@ void LocationsBuilderX86::VisitLoadString(HLoadString* load) { load_kind == HLoadString::LoadKind::kBssEntry) { locations->SetInAt(0, Location::RequiresRegister()); } - if (load_kind == HLoadString::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadString::LoadKind::kRuntimeCall) { locations->SetOut(Location::RegisterLocation(EAX)); } else { locations->SetOut(Location::RequiresRegister()); @@ -6308,12 +6295,6 @@ void InstructionCodeGeneratorX86::VisitLoadString(HLoadString* load) NO_THREAD_S Register out = out_loc.AsRegister<Register>(); switch (load->GetLoadKind()) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: { - DCHECK(codegen_->GetCompilerOptions().IsBootImage()); - __ movl(out, Immediate(/* placeholder */ 0)); - codegen_->RecordBootStringPatch(load); - return; // No dex cache slow path. - } case HLoadString::LoadKind::kBootImageLinkTimePcRelative: { DCHECK(codegen_->GetCompilerOptions().IsBootImage()); Register method_address = locations->InAt(0).AsRegister<Register>(); @@ -7694,7 +7675,7 @@ void CodeGeneratorX86::Finalize(CodeAllocator* allocator) { constant_area_start_ = assembler->CodeSize(); // Populate any jump tables. 
- for (auto jump_table : fixups_to_jump_tables_) { + for (JumpTableRIPFixup* jump_table : fixups_to_jump_tables_) { jump_table->CreateJumpTable(); } @@ -7833,17 +7814,19 @@ void CodeGeneratorX86::PatchJitRootUse(uint8_t* code, void CodeGeneratorX86::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const PatchInfo<Label>& info : jit_string_patches_) { - const auto& it = jit_string_roots_.find( + const auto it = jit_string_roots_.find( StringReference(&info.dex_file, dex::StringIndex(info.index))); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } for (const PatchInfo<Label>& info : jit_class_patches_) { - const auto& it = jit_class_roots_.find( + const auto it = jit_class_roots_.find( TypeReference(&info.dex_file, dex::TypeIndex(info.index))); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } } diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index ca3a9eadd2..8130bd9d25 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -408,14 +408,14 @@ class CodeGeneratorX86 : public CodeGenerator { HInvokeStaticOrDirect* invoke) OVERRIDE; // Generate a call to a static or direct method. - Location GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp); void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) OVERRIDE; // Generate a call to a virtual method. void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE; - void RecordBootStringPatch(HLoadString* load_string); + void RecordBootMethodPatch(HInvokeStaticOrDirect* invoke); void RecordBootTypePatch(HLoadClass* load_class); Label* NewTypeBssEntryPatch(HLoadClass* load_class); + void RecordBootStringPatch(HLoadString* load_string); Label* NewStringBssEntryPatch(HLoadString* load_string); Label* NewPcRelativeDexCacheArrayPatch(HX86ComputeBaseMethodAddress* method_address, const DexFile& dex_file, @@ -633,16 +633,17 @@ class CodeGeneratorX86 : public CodeGenerator { // PC-relative DexCache access info. ArenaDeque<X86PcRelativePatchInfo> pc_relative_dex_cache_patches_; - // String patch locations; type depends on configuration (app .bss or boot image PIC/non-PIC). - ArenaDeque<X86PcRelativePatchInfo> string_patches_; - // Type patch locations for boot image; type depends on configuration (boot image PIC/non-PIC). + // PC-relative method patch info for kBootImageLinkTimePcRelative. + ArenaDeque<X86PcRelativePatchInfo> boot_image_method_patches_; + // PC-relative type patch info for kBootImageLinkTimePcRelative. ArenaDeque<X86PcRelativePatchInfo> boot_image_type_patches_; // Type patch locations for kBssEntry. ArenaDeque<X86PcRelativePatchInfo> type_bss_entry_patches_; + // String patch locations; type depends on configuration (app .bss or boot image). + ArenaDeque<X86PcRelativePatchInfo> string_patches_; // Patches for string root accesses in JIT compiled code. ArenaDeque<PatchInfo<Label>> jit_string_patches_; - // Patches for class root accesses in JIT compiled code. 
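// Illustrative note, not part of the patch above: jit_string_roots_.find()
// returns its iterator by value, so the earlier `const auto& it` only bound a
// reference to a lifetime-extended temporary; holding the iterator by value
// (`const auto it`) is equivalent and less misleading. A minimal analogue:
#include <cstdint>
#include <map>
#include <string>

int main() {
  std::map<std::string, uint64_t> roots{{"Ljava/lang/String;", 7u}};
  const auto it = roots.find("Ljava/lang/String;");  // iterator held by value
  uint64_t index_in_table = (it != roots.end()) ? it->second : 0u;
  return index_in_table == 7u ? 0 : 1;
}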
ArenaDeque<PatchInfo<Label>> jit_class_patches_; diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index dfb11aaba5..8dde298267 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -23,6 +23,7 @@ #include "gc/accounting/card_table.h" #include "intrinsics.h" #include "intrinsics_x86_64.h" +#include "lock_word.h" #include "mirror/array-inl.h" #include "mirror/class-inl.h" #include "mirror/object_reference.h" @@ -976,9 +977,10 @@ HInvokeStaticOrDirect::DispatchInfo CodeGeneratorX86_64::GetSupportedInvokeStati return desired_dispatch_info; } -Location CodeGeneratorX86_64::GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke, - Location temp) { +void CodeGeneratorX86_64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, + Location temp) { // All registers are assumed to be correctly set up. + Location callee_method = temp; // For all kinds except kRecursive, callee will be in temp. switch (invoke->GetMethodLoadKind()) { case HInvokeStaticOrDirect::MethodLoadKind::kStringInit: { @@ -991,6 +993,12 @@ Location CodeGeneratorX86_64::GenerateCalleeMethodStaticOrDirectCall(HInvokeStat case HInvokeStaticOrDirect::MethodLoadKind::kRecursive: callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex()); break; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: + DCHECK(GetCompilerOptions().IsBootImage()); + __ leal(temp.AsRegister<CpuRegister>(), + Address::Absolute(kDummy32BitOffset, /* no_rip */ false)); + RecordBootMethodPatch(invoke); + break; case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: Load64BitValue(temp.AsRegister<CpuRegister>(), invoke->GetMethodAddress()); break; @@ -1025,13 +1033,6 @@ Location CodeGeneratorX86_64::GenerateCalleeMethodStaticOrDirectCall(HInvokeStat break; } } - return callee_method; -} - -void CodeGeneratorX86_64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, - Location temp) { - // All registers are assumed to be correctly set up. 
- Location callee_method = GenerateCalleeMethodStaticOrDirectCall(invoke, temp); switch (invoke->GetCodePtrLocation()) { case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf: @@ -1079,10 +1080,10 @@ void CodeGeneratorX86_64::GenerateVirtualCall(HInvokeVirtual* invoke, Location t kX86_64PointerSize).SizeValue())); } -void CodeGeneratorX86_64::RecordBootStringPatch(HLoadString* load_string) { - DCHECK(GetCompilerOptions().IsBootImage()); - string_patches_.emplace_back(load_string->GetDexFile(), load_string->GetStringIndex().index_); - __ Bind(&string_patches_.back().label); +void CodeGeneratorX86_64::RecordBootMethodPatch(HInvokeStaticOrDirect* invoke) { + boot_image_method_patches_.emplace_back(*invoke->GetTargetMethod().dex_file, + invoke->GetTargetMethod().dex_method_index); + __ Bind(&boot_image_method_patches_.back().label); } void CodeGeneratorX86_64::RecordBootTypePatch(HLoadClass* load_class) { @@ -1096,6 +1097,12 @@ Label* CodeGeneratorX86_64::NewTypeBssEntryPatch(HLoadClass* load_class) { return &type_bss_entry_patches_.back().label; } +void CodeGeneratorX86_64::RecordBootStringPatch(HLoadString* load_string) { + DCHECK(GetCompilerOptions().IsBootImage()); + string_patches_.emplace_back(load_string->GetDexFile(), load_string->GetStringIndex().index_); + __ Bind(&string_patches_.back().label); +} + Label* CodeGeneratorX86_64::NewStringBssEntryPatch(HLoadString* load_string) { DCHECK(!GetCompilerOptions().IsBootImage()); string_patches_.emplace_back(load_string->GetDexFile(), load_string->GetStringIndex().index_); @@ -1128,20 +1135,23 @@ void CodeGeneratorX86_64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_pat DCHECK(linker_patches->empty()); size_t size = pc_relative_dex_cache_patches_.size() + - string_patches_.size() + + boot_image_method_patches_.size() + boot_image_type_patches_.size() + - type_bss_entry_patches_.size(); + type_bss_entry_patches_.size() + + string_patches_.size(); linker_patches->reserve(size); EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_, linker_patches); - if (!GetCompilerOptions().IsBootImage()) { - DCHECK(boot_image_type_patches_.empty()); - EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_patches_, linker_patches); - } else { - // These are always PC-relative, see GetSupportedLoadClassKind()/GetSupportedLoadStringKind(). 
+ if (GetCompilerOptions().IsBootImage()) { + EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(boot_image_method_patches_, + linker_patches); EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(boot_image_type_patches_, linker_patches); EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(string_patches_, linker_patches); + } else { + DCHECK(boot_image_method_patches_.empty()); + DCHECK(boot_image_type_patches_.empty()); + EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_patches_, linker_patches); } EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_, linker_patches); @@ -1232,12 +1242,13 @@ CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, isa_features_(isa_features), constant_area_start_(0), pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + boot_image_method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), boot_image_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - fixups_to_jump_tables_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - jit_class_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) { + jit_class_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + fixups_to_jump_tables_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) { AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister)); } @@ -2174,6 +2185,15 @@ void InstructionCodeGeneratorX86_64::VisitDoubleConstant( // Will be generated at use site. } +void LocationsBuilderX86_64::VisitConstructorFence(HConstructorFence* constructor_fence) { + constructor_fence->SetLocations(nullptr); +} + +void InstructionCodeGeneratorX86_64::VisitConstructorFence( + HConstructorFence* constructor_fence ATTRIBUTE_UNUSED) { + codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); +} + void LocationsBuilderX86_64::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) { memory_barrier->SetLocations(nullptr); } @@ -5449,22 +5469,15 @@ HLoadClass::LoadKind CodeGeneratorX86_64::GetSupportedLoadClassKind( UNREACHABLE(); case HLoadClass::LoadKind::kReferrersClass: break; - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: - DCHECK(!GetCompilerOptions().GetCompilePic()); - // We prefer the always-available RIP-relative address for the x86-64 boot image. 
- return HLoadClass::LoadKind::kBootImageLinkTimePcRelative; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: - DCHECK(GetCompilerOptions().GetCompilePic()); - break; - case HLoadClass::LoadKind::kBootImageAddress: - break; case HLoadClass::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; case HLoadClass::LoadKind::kJitTableAddress: DCHECK(Runtime::Current()->UseJitCompilation()); break; - case HLoadClass::LoadKind::kDexCacheViaMethod: + case HLoadClass::LoadKind::kBootImageAddress: + case HLoadClass::LoadKind::kRuntimeCall: break; } return desired_class_load_kind; @@ -5472,7 +5485,7 @@ HLoadClass::LoadKind CodeGeneratorX86_64::GetSupportedLoadClassKind( void LocationsBuilderX86_64::VisitLoadClass(HLoadClass* cls) { HLoadClass::LoadKind load_kind = cls->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadClass::LoadKind::kRuntimeCall) { // Custom calling convention: RAX serves as both input and output. CodeGenerator::CreateLoadClassRuntimeCallLocationSummary( cls, @@ -5523,7 +5536,7 @@ Label* CodeGeneratorX86_64::NewJitRootClassPatch(const DexFile& dex_file, // move. void InstructionCodeGeneratorX86_64::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFETY_ANALYSIS { HLoadClass::LoadKind load_kind = cls->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) { + if (load_kind == HLoadClass::LoadKind::kRuntimeCall) { codegen_->GenerateLoadClassRuntimeCall(cls); return; } @@ -5626,22 +5639,15 @@ void InstructionCodeGeneratorX86_64::VisitClinitCheck(HClinitCheck* check) { HLoadString::LoadKind CodeGeneratorX86_64::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { switch (desired_string_load_kind) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: - DCHECK(!GetCompilerOptions().GetCompilePic()); - // We prefer the always-available RIP-relative address for the x86-64 boot image. - return HLoadString::LoadKind::kBootImageLinkTimePcRelative; case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - DCHECK(GetCompilerOptions().GetCompilePic()); - break; - case HLoadString::LoadKind::kBootImageAddress: - break; case HLoadString::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; case HLoadString::LoadKind::kJitTableAddress: DCHECK(Runtime::Current()->UseJitCompilation()); break; - case HLoadString::LoadKind::kDexCacheViaMethod: + case HLoadString::LoadKind::kBootImageAddress: + case HLoadString::LoadKind::kRuntimeCall: break; } return desired_string_load_kind; @@ -5650,7 +5656,7 @@ HLoadString::LoadKind CodeGeneratorX86_64::GetSupportedLoadStringKind( void LocationsBuilderX86_64::VisitLoadString(HLoadString* load) { LocationSummary::CallKind call_kind = CodeGenerator::GetLoadStringCallKind(load); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind); - if (load->GetLoadKind() == HLoadString::LoadKind::kDexCacheViaMethod) { + if (load->GetLoadKind() == HLoadString::LoadKind::kRuntimeCall) { locations->SetOut(Location::RegisterLocation(RAX)); } else { locations->SetOut(Location::RequiresRegister()); @@ -7046,7 +7052,7 @@ void CodeGeneratorX86_64::Finalize(CodeAllocator* allocator) { constant_area_start_ = assembler->CodeSize(); // Populate any jump tables. 
- for (auto jump_table : fixups_to_jump_tables_) { + for (JumpTableRIPFixup* jump_table : fixups_to_jump_tables_) { jump_table->CreateJumpTable(); } @@ -7140,17 +7146,19 @@ void CodeGeneratorX86_64::PatchJitRootUse(uint8_t* code, void CodeGeneratorX86_64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const PatchInfo<Label>& info : jit_string_patches_) { - const auto& it = jit_string_roots_.find( + const auto it = jit_string_roots_.find( StringReference(&info.dex_file, dex::StringIndex(info.index))); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } for (const PatchInfo<Label>& info : jit_class_patches_) { - const auto& it = jit_class_roots_.find( + const auto it = jit_class_roots_.find( TypeReference(&info.dex_file, dex::TypeIndex(info.index))); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } } diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index c8336dabd9..25479814d0 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -404,13 +404,13 @@ class CodeGeneratorX86_64 : public CodeGenerator { const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info, HInvokeStaticOrDirect* invoke) OVERRIDE; - Location GenerateCalleeMethodStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp); void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) OVERRIDE; void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE; - void RecordBootStringPatch(HLoadString* load_string); + void RecordBootMethodPatch(HInvokeStaticOrDirect* invoke); void RecordBootTypePatch(HLoadClass* load_class); Label* NewTypeBssEntryPatch(HLoadClass* load_class); + void RecordBootStringPatch(HLoadString* load_string); Label* NewStringBssEntryPatch(HLoadString* load_string); Label* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, uint32_t element_offset); Label* NewJitRootStringPatch(const DexFile& dex_file, @@ -603,22 +603,23 @@ class CodeGeneratorX86_64 : public CodeGenerator { // PC-relative DexCache access info. ArenaDeque<PatchInfo<Label>> pc_relative_dex_cache_patches_; - // String patch locations; type depends on configuration (app .bss or boot image PIC). - ArenaDeque<PatchInfo<Label>> string_patches_; - // Type patch locations for boot image (always PIC). + // PC-relative method patch info for kBootImageLinkTimePcRelative. + ArenaDeque<PatchInfo<Label>> boot_image_method_patches_; + // PC-relative type patch info for kBootImageLinkTimePcRelative. ArenaDeque<PatchInfo<Label>> boot_image_type_patches_; // Type patch locations for kBssEntry. ArenaDeque<PatchInfo<Label>> type_bss_entry_patches_; - - // Fixups for jump tables need to be handled specially. - ArenaVector<JumpTableRIPFixup*> fixups_to_jump_tables_; + // String patch locations; type depends on configuration (app .bss or boot image). + ArenaDeque<PatchInfo<Label>> string_patches_; // Patches for string literals in JIT compiled code. ArenaDeque<PatchInfo<Label>> jit_string_patches_; - // Patches for class literals in JIT compiled code. ArenaDeque<PatchInfo<Label>> jit_class_patches_; + // Fixups for jump tables need to be handled specially. 
+ ArenaVector<JumpTableRIPFixup*> fixups_to_jump_tables_; + DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86_64); }; diff --git a/compiler/optimizing/code_sinking.cc b/compiler/optimizing/code_sinking.cc index 0b4dcd30a1..e598e19b67 100644 --- a/compiler/optimizing/code_sinking.cc +++ b/compiler/optimizing/code_sinking.cc @@ -56,6 +56,17 @@ static bool IsInterestingInstruction(HInstruction* instruction) { return true; } + // Check it is safe to move ConstructorFence. + // (Safe to move ConstructorFence for only protecting the new-instance but not for finals.) + if (instruction->IsConstructorFence()) { + HConstructorFence* ctor_fence = instruction->AsConstructorFence(); + + // A fence with "0" inputs is dead and should've been removed in a prior pass. + DCHECK_NE(0u, ctor_fence->InputCount()); + + return ctor_fence->GetAssociatedAllocation() != nullptr; + } + // All other instructions that can throw cannot be moved. if (instruction->CanThrow()) { return false; @@ -134,11 +145,11 @@ static bool ShouldFilterUse(HInstruction* instruction, HInstruction* user, const ArenaBitVector& post_dominated) { if (instruction->IsNewInstance()) { - return user->IsInstanceFieldSet() && + return (user->IsInstanceFieldSet() || user->IsConstructorFence()) && (user->InputAt(0) == instruction) && !post_dominated.IsBitSet(user->GetBlock()->GetBlockId()); } else if (instruction->IsNewArray()) { - return user->IsArraySet() && + return (user->IsArraySet() || user->IsConstructorFence()) && (user->InputAt(0) == instruction) && !post_dominated.IsBitSet(user->GetBlock()->GetBlockId()); } @@ -372,7 +383,9 @@ void CodeSinking::SinkCodeToUncommonBranch(HBasicBlock* end_block) { // Step (3): Try to move sinking candidates. for (HInstruction* instruction : move_in_order) { HInstruction* position = nullptr; - if (instruction->IsArraySet() || instruction->IsInstanceFieldSet()) { + if (instruction->IsArraySet() + || instruction->IsInstanceFieldSet() + || instruction->IsConstructorFence()) { if (!instructions_that_can_move.IsBitSet(instruction->InputAt(0)->GetId())) { // A store can trivially move, but it can safely do so only if the heap // location it stores to can also move. 
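// Illustrative sketch, not part of the patch above; Report() and the types are
// hypothetical. Allowing ConstructorFence (alongside the initializing stores)
// to be treated as a sinkable use means an allocation that is only observed on
// an uncommon branch can move, fence and all, into that branch. A source-level
// picture of the effect:
#include <cstdio>

struct Error { int code; };

static int Report(Error* error) {
  std::printf("failure %d\n", error->code);
  int code = error->code;
  delete error;
  return code;
}

static int BeforeSinking(bool uncommon_failure) {
  Error* error = new Error{42};  // allocation + initialization (+ fence in HIR)
  if (uncommon_failure) {
    return Report(error);        // ...but the object is only used here
  }
  delete error;
  return 0;
}

static int AfterSinking(bool uncommon_failure) {
  if (uncommon_failure) {
    Error* error = new Error{42};  // moved next to its only use
    return Report(error);
  }
  return 0;
}

int main() {
  return BeforeSinking(false) + AfterSinking(false);
}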
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc index 4ba5c5580f..fe25b7690d 100644 --- a/compiler/optimizing/codegen_test.cc +++ b/compiler/optimizing/codegen_test.cc @@ -64,7 +64,7 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { #endif }; - for (auto test_config : test_config_candidates) { + for (const CodegenTargetConfig& test_config : test_config_candidates) { if (CanExecute(test_config.GetInstructionSet())) { v.push_back(test_config); } @@ -76,7 +76,7 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { static void TestCode(const uint16_t* data, bool has_result = false, int32_t expected = 0) { - for (CodegenTargetConfig target_config : GetTargetConfigs()) { + for (const CodegenTargetConfig& target_config : GetTargetConfigs()) { ArenaPool pool; ArenaAllocator arena(&pool); HGraph* graph = CreateCFG(&arena, data); @@ -89,7 +89,7 @@ static void TestCode(const uint16_t* data, static void TestCodeLong(const uint16_t* data, bool has_result, int64_t expected) { - for (CodegenTargetConfig target_config : GetTargetConfigs()) { + for (const CodegenTargetConfig& target_config : GetTargetConfigs()) { ArenaPool pool; ArenaAllocator arena(&pool); HGraph* graph = CreateCFG(&arena, data, Primitive::kPrimLong); @@ -754,7 +754,28 @@ TEST_F(CodegenTest, ARM64ParallelMoveResolverB34760542) { // // Assertion failed (!available->IsEmpty()) // - // in vixl::aarch64::UseScratchRegisterScope::AcquireNextAvailable. + // in vixl::aarch64::UseScratchRegisterScope::AcquireNextAvailable, + // because of the following situation: + // + // 1. a temp register (IP0) is allocated as a scratch register by + // the parallel move resolver to solve a cycle (swap): + // + // [ source=DS0 destination=DS257 type=PrimDouble instruction=null ] + // [ source=DS257 destination=DS0 type=PrimDouble instruction=null ] + // + // 2. within CodeGeneratorARM64::MoveLocation, another temp + // register (IP1) is allocated to generate the swap between two + // double stack slots; + // + // 3. VIXL requires a third temp register to emit the `Ldr` or + // `Str` operation from CodeGeneratorARM64::MoveLocation (as + // one of the stack slots' offsets cannot be encoded as an + // immediate), but the pool of (core) temp registers is now + // empty. + // + // The solution used so far is to use a floating-point temp register + // (D31) in step #2, so that IP1 is available for step #3. 
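// Illustrative sketch, not part of the test above: the cycle
// [DS0 -> DS257, DS257 -> DS0] is a swap, and resolving a swap always needs a
// scratch location, which is where the temp-register pressure described in
// steps 1-3 comes from. A scalar analogue of what the resolver emits:
#include <cstdio>

static void ResolveSwap(double* slot_a, double* slot_b) {
  double scratch = *slot_a;  // the scratch register acquired by the resolver
  *slot_a = *slot_b;
  *slot_b = scratch;
}

int main() {
  double ds0 = 1.0;
  double ds257 = 2.0;
  ResolveSwap(&ds0, &ds257);
  std::printf("DS0=%g DS257=%g\n", ds0, ds257);  // DS0=2 DS257=1
  return 0;
}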
+ HParallelMove* move = new (graph->GetArena()) HParallelMove(graph->GetArena()); move->AddMove(Location::DoubleStackSlot(0), Location::DoubleStackSlot(257), @@ -807,7 +828,6 @@ TEST_F(CodegenTest, ARM64ParallelMoveResolverSIMD) { InternalCodeAllocator code_allocator; codegen.Finalize(&code_allocator); } - #endif #ifdef ART_ENABLE_CODEGEN_mips diff --git a/compiler/optimizing/codegen_test_utils.h b/compiler/optimizing/codegen_test_utils.h index 31cd204c9f..00a16fe849 100644 --- a/compiler/optimizing/codegen_test_utils.h +++ b/compiler/optimizing/codegen_test_utils.h @@ -243,7 +243,7 @@ static void ValidateGraph(HGraph* graph) { GraphChecker graph_checker(graph); graph_checker.Run(); if (!graph_checker.IsValid()) { - for (const auto& error : graph_checker.GetErrors()) { + for (const std::string& error : graph_checker.GetErrors()) { std::cout << error << std::endl; } } diff --git a/compiler/optimizing/graph_checker.cc b/compiler/optimizing/graph_checker.cc index 34b52a87b5..aea901dec7 100644 --- a/compiler/optimizing/graph_checker.cc +++ b/compiler/optimizing/graph_checker.cc @@ -338,14 +338,21 @@ void GraphChecker::VisitInstruction(HInstruction* instruction) { // Ensure the inputs of `instruction` are defined in a block of the graph. for (HInstruction* input : instruction->GetInputs()) { - const HInstructionList& list = input->IsPhi() - ? input->GetBlock()->GetPhis() - : input->GetBlock()->GetInstructions(); - if (!list.Contains(input)) { - AddError(StringPrintf("Input %d of instruction %d is not defined " - "in a basic block of the control-flow graph.", + if (input->GetBlock() == nullptr) { + AddError(StringPrintf("Input %d of instruction %d is not in any " + "basic block of the control-flow graph.", input->GetId(), instruction->GetId())); + } else { + const HInstructionList& list = input->IsPhi() + ? 
input->GetBlock()->GetPhis() + : input->GetBlock()->GetInstructions(); + if (!list.Contains(input)) { + AddError(StringPrintf("Input %d of instruction %d is not defined " + "in a basic block of the control-flow graph.", + input->GetId(), + instruction->GetId())); + } } } @@ -497,8 +504,7 @@ void GraphChecker::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) { "has a null pointer as last input.", invoke->DebugName(), invoke->GetId())); - } - if (!last_input->IsClinitCheck() && !last_input->IsLoadClass()) { + } else if (!last_input->IsClinitCheck() && !last_input->IsLoadClass()) { AddError(StringPrintf("Static invoke %s:%d marked as having an explicit clinit check " "has a last instruction (%s:%d) which is neither a clinit check " "nor a load class instruction.", diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index e5d94c3504..02816cf7ce 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -514,6 +514,14 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { StartAttributeStream("rounded") << std::boolalpha << hadd->IsRounded() << std::noboolalpha; } + void VisitVecMin(HVecMin* min) OVERRIDE { + StartAttributeStream("unsigned") << std::boolalpha << min->IsUnsigned() << std::noboolalpha; + } + + void VisitVecMax(HVecMax* max) OVERRIDE { + StartAttributeStream("unsigned") << std::boolalpha << max->IsUnsigned() << std::noboolalpha; + } + void VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) OVERRIDE { StartAttributeStream("kind") << instruction->GetOpKind(); } diff --git a/compiler/optimizing/gvn.cc b/compiler/optimizing/gvn.cc index c93bc210be..8ea312d0ea 100644 --- a/compiler/optimizing/gvn.cc +++ b/compiler/optimizing/gvn.cc @@ -516,13 +516,13 @@ void GlobalValueNumberer::VisitBasicBlock(HBasicBlock* block) { bool GlobalValueNumberer::WillBeReferencedAgain(HBasicBlock* block) const { DCHECK(visited_blocks_.IsBitSet(block->GetBlockId())); - for (auto dominated_block : block->GetDominatedBlocks()) { + for (const HBasicBlock* dominated_block : block->GetDominatedBlocks()) { if (!visited_blocks_.IsBitSet(dominated_block->GetBlockId())) { return true; } } - for (auto successor : block->GetSuccessors()) { + for (const HBasicBlock* successor : block->GetSuccessors()) { if (!visited_blocks_.IsBitSet(successor->GetBlockId())) { return true; } diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc index 4f6ca17de0..142c95780e 100644 --- a/compiler/optimizing/inliner.cc +++ b/compiler/optimizing/inliner.cc @@ -140,6 +140,14 @@ void HInliner::Run() { DCHECK_NE(total_number_of_instructions_, 0u); DCHECK_NE(inlining_budget_, 0u); + // If we're compiling with a core image (which is only used for + // test purposes), honor inlining directives in method names: + // - if a method's name contains the substring "$inline$", ensure + // that this method is actually inlined; + // - if a method's name contains the substring "$noinline$", do not + // inline that method. + const bool honor_inlining_directives = IsCompilingWithCoreImage(); + // Keep a copy of all blocks when starting the visit. ArenaVector<HBasicBlock*> blocks = graph_->GetReversePostOrder(); DCHECK(!blocks.empty()); @@ -152,7 +160,7 @@ void HInliner::Run() { HInvoke* call = instruction->AsInvoke(); // As long as the call is not intrinsified, it is worth trying to inline. 
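// Illustrative sketch, not part of the patch above: the inlining directives are
// plain substring checks against the callee's pretty-printed name, so test
// methods opt in or out of inlining by naming convention alone:
#include <cassert>
#include <string>

static bool HasInlineDirective(const std::string& callee_name) {
  return callee_name.find("$inline$") != std::string::npos;
}

static bool HasNoinlineDirective(const std::string& callee_name) {
  return callee_name.find("$noinline$") != std::string::npos;
}

int main() {
  assert(HasInlineDirective("int Main.$inline$add(int, int)"));
  assert(HasNoinlineDirective("void Main.$noinline$slowPath()"));
  assert(!HasNoinlineDirective("int Main.$inline$add(int, int)"));
  return 0;
}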
if (call != nullptr && call->GetIntrinsic() == Intrinsics::kNone) { - if (kIsDebugBuild && IsCompilingWithCoreImage()) { + if (honor_inlining_directives) { // Debugging case: directives in method names control or assert on inlining. std::string callee_name = outer_compilation_unit_.GetDexFile()->PrettyMethod( call->GetDexMethodIndex(), /* with_signature */ false); @@ -1501,8 +1509,13 @@ bool HInliner::TryPatternSubstitution(HInvoke* invoke_instruction, } } if (needs_constructor_barrier) { - HMemoryBarrier* barrier = new (graph_->GetArena()) HMemoryBarrier(kStoreStore, kNoDexPc); - invoke_instruction->GetBlock()->InsertInstructionBefore(barrier, invoke_instruction); + // See CompilerDriver::RequiresConstructorBarrier for more details. + DCHECK(obj != nullptr) << "only non-static methods can have a constructor fence"; + + HConstructorFence* constructor_fence = + new (graph_->GetArena()) HConstructorFence(obj, kNoDexPc, graph_->GetArena()); + invoke_instruction->GetBlock()->InsertInstructionBefore(constructor_fence, + invoke_instruction); } *return_replacement = nullptr; break; @@ -1870,7 +1883,7 @@ void HInliner::RunOptimizations(HGraph* callee_graph, HDeadCodeElimination dce(callee_graph, inline_stats_, "dead_code_elimination$inliner"); HConstantFolding fold(callee_graph, "constant_folding$inliner"); HSharpening sharpening(callee_graph, codegen_, dex_compilation_unit, compiler_driver_, handles_); - InstructionSimplifier simplify(callee_graph, codegen_, inline_stats_); + InstructionSimplifier simplify(callee_graph, codegen_, compiler_driver_, inline_stats_); IntrinsicsRecognizer intrinsics(callee_graph, inline_stats_); HOptimization* optimizations[] = { diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc index 978c6a2d71..df9e7164ed 100644 --- a/compiler/optimizing/instruction_builder.cc +++ b/compiler/optimizing/instruction_builder.cc @@ -451,10 +451,13 @@ void HInstructionBuilder::InitializeParameters() { referrer_method_id.class_idx_, parameter_index++, Primitive::kPrimNot, - true); + /* is_this */ true); AppendInstruction(parameter); UpdateLocal(locals_index++, parameter); number_of_parameters--; + current_this_parameter_ = parameter; + } else { + DCHECK(current_this_parameter_ == nullptr); } const DexFile::ProtoId& proto = dex_file_->GetMethodPrototype(referrer_method_id); @@ -465,7 +468,7 @@ void HInstructionBuilder::InitializeParameters() { arg_types->GetTypeItem(shorty_pos - 1).type_idx_, parameter_index++, Primitive::GetType(shorty[shorty_pos]), - false); + /* is_this */ false); ++shorty_pos; AppendInstruction(parameter); // Store the parameter value in the local that the dex code will use @@ -588,6 +591,8 @@ void HInstructionBuilder::Binop_22b(const Instruction& instruction, bool reverse UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction()); } +// Does the method being compiled need any constructor barriers being inserted? +// (Always 'false' for methods that aren't <init>.) static bool RequiresConstructorBarrier(const DexCompilationUnit* cu, CompilerDriver* driver) { // Can be null in unit tests only. if (UNLIKELY(cu == nullptr)) { @@ -596,6 +601,11 @@ static bool RequiresConstructorBarrier(const DexCompilationUnit* cu, CompilerDri Thread* self = Thread::Current(); return cu->IsConstructor() + && !cu->IsStatic() + // RequiresConstructorBarrier must only be queried for <init> methods; + // it's effectively "false" for every other method. + // + // See CompilerDriver::RequiresConstructBarrier for more explanation. 
&& driver->RequiresConstructorBarrier(self, cu->GetDexFile(), cu->GetClassDefIndex()); } @@ -639,13 +649,24 @@ void HInstructionBuilder::BuildReturn(const Instruction& instruction, Primitive::Type type, uint32_t dex_pc) { if (type == Primitive::kPrimVoid) { + // Only <init> (which is a return-void) could possibly have a constructor fence. // This may insert additional redundant constructor fences from the super constructors. // TODO: remove redundant constructor fences (b/36656456). if (RequiresConstructorBarrier(dex_compilation_unit_, compiler_driver_)) { - AppendInstruction(new (arena_) HMemoryBarrier(kStoreStore, dex_pc)); + // Compiling instance constructor. + if (kIsDebugBuild) { + std::string method_name = graph_->GetMethodName(); + CHECK_EQ(std::string("<init>"), method_name); + } + + HInstruction* fence_target = current_this_parameter_; + DCHECK(fence_target != nullptr); + + AppendInstruction(new (arena_) HConstructorFence(fence_target, dex_pc, arena_)); } AppendInstruction(new (arena_) HReturnVoid(dex_pc)); } else { + DCHECK(!RequiresConstructorBarrier(dex_compilation_unit_, compiler_driver_)); HInstruction* value = LoadLocal(instruction.VRegA(), type); AppendInstruction(new (arena_) HReturn(value, dex_pc)); } @@ -941,7 +962,7 @@ bool HInstructionBuilder::BuildInvokePolymorphic(const Instruction& instruction false /* is_unresolved */); } -bool HInstructionBuilder::BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc) { +HNewInstance* HInstructionBuilder::BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc) { ScopedObjectAccess soa(Thread::Current()); HLoadClass* load_class = BuildLoadClass(type_index, dex_pc); @@ -965,14 +986,65 @@ bool HInstructionBuilder::BuildNewInstance(dex::TypeIndex type_index, uint32_t d // Consider classes we haven't resolved as potentially finalizable. bool finalizable = (klass == nullptr) || klass->IsFinalizable(); - AppendInstruction(new (arena_) HNewInstance( + HNewInstance* new_instance = new (arena_) HNewInstance( cls, dex_pc, type_index, *dex_compilation_unit_->GetDexFile(), finalizable, - entrypoint)); - return true; + entrypoint); + AppendInstruction(new_instance); + + return new_instance; +} + +void HInstructionBuilder::BuildConstructorFenceForAllocation(HInstruction* allocation) { + DCHECK(allocation != nullptr && + (allocation->IsNewInstance() || + allocation->IsNewArray())); // corresponding to "new" keyword in JLS. + + if (allocation->IsNewInstance()) { + // STRING SPECIAL HANDLING: + // ------------------------------- + // Strings have a real HNewInstance node but they end up always having 0 uses. + // All uses of a String HNewInstance are always transformed to replace their input + // of the HNewInstance with an input of the invoke to StringFactory. + // + // Do not emit an HConstructorFence here since it can inhibit some String new-instance + // optimizations (to pass checker tests that rely on those optimizations). + HNewInstance* new_inst = allocation->AsNewInstance(); + HLoadClass* load_class = new_inst->GetLoadClass(); + + Thread* self = Thread::Current(); + ScopedObjectAccess soa(self); + StackHandleScope<1> hs(self); + Handle<mirror::Class> klass = load_class->GetClass(); + if (klass != nullptr && klass->IsStringClass()) { + return; + // Note: Do not use allocation->IsStringAlloc which requires + // a valid ReferenceTypeInfo, but that doesn't get made until after reference type + // propagation (and instruction builder is too early). 
+ } + // (In terms of correctness, the StringFactory needs to provide its own + // default initialization barrier, see below.) + } + + // JLS 17.4.5 "Happens-before Order" describes: + // + // The default initialization of any object happens-before any other actions (other than + // default-writes) of a program. + // + // In our implementation the default initialization of an object to type T means + // setting all of its initial data (object[0..size)) to 0, and setting the + // object's class header (i.e. object.getClass() == T.class). + // + // In practice this fence ensures that the writes to the object header + // are visible to other threads if this object escapes the current thread. + // (and in theory the 0-initializing, but that happens automatically + // when new memory pages are mapped in by the OS). + HConstructorFence* ctor_fence = + new (arena_) HConstructorFence(allocation, allocation->GetDexPc(), arena_); + AppendInstruction(ctor_fence); } static bool IsSubClass(mirror::Class* to_test, mirror::Class* super_class) @@ -1501,15 +1573,15 @@ void HInstructionBuilder::BuildArrayAccess(const Instruction& instruction, graph_->SetHasBoundsChecks(true); } -void HInstructionBuilder::BuildFilledNewArray(uint32_t dex_pc, - dex::TypeIndex type_index, - uint32_t number_of_vreg_arguments, - bool is_range, - uint32_t* args, - uint32_t register_index) { +HNewArray* HInstructionBuilder::BuildFilledNewArray(uint32_t dex_pc, + dex::TypeIndex type_index, + uint32_t number_of_vreg_arguments, + bool is_range, + uint32_t* args, + uint32_t register_index) { HInstruction* length = graph_->GetIntConstant(number_of_vreg_arguments, dex_pc); HLoadClass* cls = BuildLoadClass(type_index, dex_pc); - HInstruction* object = new (arena_) HNewArray(cls, length, dex_pc); + HNewArray* const object = new (arena_) HNewArray(cls, length, dex_pc); AppendInstruction(object); const char* descriptor = dex_file_->StringByTypeIdx(type_index); @@ -1529,6 +1601,8 @@ void HInstructionBuilder::BuildFilledNewArray(uint32_t dex_pc, AppendInstruction(aset); } latest_result_ = object; + + return object; } template <typename T> @@ -2513,10 +2587,12 @@ bool HInstructionBuilder::ProcessDexInstruction(const Instruction& instruction, } case Instruction::NEW_INSTANCE: { - if (!BuildNewInstance(dex::TypeIndex(instruction.VRegB_21c()), dex_pc)) { - return false; - } + HNewInstance* new_instance = + BuildNewInstance(dex::TypeIndex(instruction.VRegB_21c()), dex_pc); + DCHECK(new_instance != nullptr); + UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction()); + BuildConstructorFenceForAllocation(new_instance); break; } @@ -2524,8 +2600,11 @@ bool HInstructionBuilder::ProcessDexInstruction(const Instruction& instruction, dex::TypeIndex type_index(instruction.VRegC_22c()); HInstruction* length = LoadLocal(instruction.VRegB_22c(), Primitive::kPrimInt); HLoadClass* cls = BuildLoadClass(type_index, dex_pc); - AppendInstruction(new (arena_) HNewArray(cls, length, dex_pc)); + + HNewArray* new_array = new (arena_) HNewArray(cls, length, dex_pc); + AppendInstruction(new_array); UpdateLocal(instruction.VRegA_22c(), current_block_->GetLastInstruction()); + BuildConstructorFenceForAllocation(new_array); break; } @@ -2534,7 +2613,13 @@ bool HInstructionBuilder::ProcessDexInstruction(const Instruction& instruction, dex::TypeIndex type_index(instruction.VRegB_35c()); uint32_t args[5]; instruction.GetVarArgs(args); - BuildFilledNewArray(dex_pc, type_index, number_of_vreg_arguments, false, args, 0); + HNewArray* new_array = 
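// Illustrative standalone sketch, not part of the patch above: a C++ analogue
// of the happens-before guarantee the constructor fence provides. The
// store-store ordering makes the object's initializing writes visible before
// the reference itself is published to another thread.
#include <atomic>
#include <cassert>
#include <thread>

struct Widget {
  int field = 42;  // the "default initialization" writes
};

std::atomic<Widget*> published{nullptr};

void Producer() {
  Widget* widget = new Widget();                        // initialize the object
  std::atomic_thread_fence(std::memory_order_release);  // ~ constructor fence
  published.store(widget, std::memory_order_relaxed);   // publish the reference
}

void Consumer() {
  Widget* widget = nullptr;
  while ((widget = published.load(std::memory_order_acquire)) == nullptr) {
    // Spin until the reference escapes the producer thread.
  }
  assert(widget->field == 42);  // initializing writes are guaranteed visible
  delete widget;
}

int main() {
  std::thread producer(Producer);
  std::thread consumer(Consumer);
  producer.join();
  consumer.join();
  return 0;
}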
BuildFilledNewArray(dex_pc, + type_index, + number_of_vreg_arguments, + /* is_range */ false, + args, + /* register_index */ 0); + BuildConstructorFenceForAllocation(new_array); break; } @@ -2542,8 +2627,13 @@ bool HInstructionBuilder::ProcessDexInstruction(const Instruction& instruction, uint32_t number_of_vreg_arguments = instruction.VRegA_3rc(); dex::TypeIndex type_index(instruction.VRegB_3rc()); uint32_t register_index = instruction.VRegC_3rc(); - BuildFilledNewArray( - dex_pc, type_index, number_of_vreg_arguments, true, nullptr, register_index); + HNewArray* new_array = BuildFilledNewArray(dex_pc, + type_index, + number_of_vreg_arguments, + /* is_range */ true, + /* args*/ nullptr, + register_index); + BuildConstructorFenceForAllocation(new_array); break; } diff --git a/compiler/optimizing/instruction_builder.h b/compiler/optimizing/instruction_builder.h index 7fdc1883ca..e968760d84 100644 --- a/compiler/optimizing/instruction_builder.h +++ b/compiler/optimizing/instruction_builder.h @@ -62,6 +62,7 @@ class HInstructionBuilder : public ValueObject { current_block_(nullptr), current_locals_(nullptr), latest_result_(nullptr), + current_this_parameter_(nullptr), compiler_driver_(driver), code_generator_(code_generator), dex_compilation_unit_(dex_compilation_unit), @@ -193,12 +194,12 @@ class HInstructionBuilder : public ValueObject { uint32_t register_index); // Builds a new array node and the instructions that fill it. - void BuildFilledNewArray(uint32_t dex_pc, - dex::TypeIndex type_index, - uint32_t number_of_vreg_arguments, - bool is_range, - uint32_t* args, - uint32_t register_index); + HNewArray* BuildFilledNewArray(uint32_t dex_pc, + dex::TypeIndex type_index, + uint32_t number_of_vreg_arguments, + bool is_range, + uint32_t* args, + uint32_t register_index); void BuildFillArrayData(const Instruction& instruction, uint32_t dex_pc); @@ -287,7 +288,11 @@ class HInstructionBuilder : public ValueObject { REQUIRES_SHARED(Locks::mutator_lock_); // Build a HNewInstance instruction. - bool BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc); + HNewInstance* BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc); + + // Build a HConstructorFence for HNewInstance and HNewArray instructions. This ensures the + // happens-before ordering for default-initialization of the object referred to by new_instance. + void BuildConstructorFenceForAllocation(HInstruction* allocation); // Return whether the compiler can assume `cls` is initialized. bool IsInitialized(Handle<mirror::Class> cls) const @@ -325,6 +330,11 @@ class HInstructionBuilder : public ValueObject { HBasicBlock* current_block_; ArenaVector<HInstruction*>* current_locals_; HInstruction* latest_result_; + // Current "this" parameter. + // Valid only after InitializeParameters() finishes. + // * Null for static methods. + // * Non-null for instance methods. 
+ HParameterValue* current_this_parameter_; CompilerDriver* const compiler_driver_; diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc index 2dcc12e294..d14716601c 100644 --- a/compiler/optimizing/instruction_simplifier.cc +++ b/compiler/optimizing/instruction_simplifier.cc @@ -30,9 +30,11 @@ class InstructionSimplifierVisitor : public HGraphDelegateVisitor { public: InstructionSimplifierVisitor(HGraph* graph, CodeGenerator* codegen, + CompilerDriver* compiler_driver, OptimizingCompilerStats* stats) : HGraphDelegateVisitor(graph), codegen_(codegen), + compiler_driver_(compiler_driver), stats_(stats) {} void Run(); @@ -119,6 +121,7 @@ class InstructionSimplifierVisitor : public HGraphDelegateVisitor { void SimplifyMemBarrier(HInvoke* invoke, MemBarrierKind barrier_kind); CodeGenerator* codegen_; + CompilerDriver* compiler_driver_; OptimizingCompilerStats* stats_; bool simplification_occurred_ = false; int simplifications_at_current_position_ = 0; @@ -130,7 +133,7 @@ class InstructionSimplifierVisitor : public HGraphDelegateVisitor { }; void InstructionSimplifier::Run() { - InstructionSimplifierVisitor visitor(graph_, codegen_, stats_); + InstructionSimplifierVisitor visitor(graph_, codegen_, compiler_driver_, stats_); visitor.Run(); } @@ -257,7 +260,8 @@ void InstructionSimplifierVisitor::VisitShift(HBinaryOperation* instruction) { if (shift_amount->IsConstant()) { int64_t cst = Int64FromConstant(shift_amount->AsConstant()); - if ((cst & implicit_mask) == 0) { + int64_t masked_cst = cst & implicit_mask; + if (masked_cst == 0) { // Replace code looking like // SHL dst, value, 0 // with @@ -266,6 +270,17 @@ void InstructionSimplifierVisitor::VisitShift(HBinaryOperation* instruction) { instruction->GetBlock()->RemoveInstruction(instruction); RecordSimplification(); return; + } else if (masked_cst != cst) { + // Replace code looking like + // SHL dst, value, cst + // where cst exceeds maximum distance with the equivalent + // SHL dst, value, cst & implicit_mask + // (as defined by shift semantics). This ensures other + // optimizations do not need to special case for such situations. + DCHECK_EQ(shift_amount->GetType(), Primitive::kPrimInt); + instruction->ReplaceInput(GetGraph()->GetIntConstant(masked_cst), /* index */ 1); + RecordSimplification(); + return; } } @@ -1884,7 +1899,7 @@ void InstructionSimplifierVisitor::SimplifySystemArrayCopy(HInvoke* instruction) // the invoke, as we would need to look it up in the current dex file, and it // is unlikely that it exists. The most usual situation for such typed // arraycopy methods is a direct pointer to the boot image. - HSharpening::SharpenInvokeStaticOrDirect(invoke, codegen_); + HSharpening::SharpenInvokeStaticOrDirect(invoke, codegen_, compiler_driver_); } } } diff --git a/compiler/optimizing/instruction_simplifier.h b/compiler/optimizing/instruction_simplifier.h index f7329a4a1f..5e2045580b 100644 --- a/compiler/optimizing/instruction_simplifier.h +++ b/compiler/optimizing/instruction_simplifier.h @@ -24,6 +24,7 @@ namespace art { class CodeGenerator; +class CompilerDriver; /** * Implements optimizations specific to each instruction. 
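// Illustrative sketch, not part of the patch above: dex (and Java) shift
// semantics mask the shift distance to the operand width, & 31 for int and
// & 63 for long, so a distance that "exceeds maximum distance" is equivalent
// to its masked value and can be canonicalized as the simplifier now does.
#include <cassert>
#include <cstdint>

static int32_t ShlInt(int32_t value, int32_t distance) {
  return value << (distance & 31);  // the implicit_mask applied above
}

int main() {
  assert(ShlInt(1, 35) == ShlInt(1, 3));  // SHL v, 35  ==  SHL v, 3
  assert(ShlInt(1, 32) == 1);             // masked to 0: shift removed entirely
  return 0;
}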
@@ -37,12 +38,14 @@ class CodeGenerator; */ class InstructionSimplifier : public HOptimization { public: - explicit InstructionSimplifier(HGraph* graph, - CodeGenerator* codegen, - OptimizingCompilerStats* stats = nullptr, - const char* name = kInstructionSimplifierPassName) + InstructionSimplifier(HGraph* graph, + CodeGenerator* codegen, + CompilerDriver* compiler_driver, + OptimizingCompilerStats* stats = nullptr, + const char* name = kInstructionSimplifierPassName) : HOptimization(graph, name, stats), - codegen_(codegen) {} + codegen_(codegen), + compiler_driver_(compiler_driver) {} static constexpr const char* kInstructionSimplifierPassName = "instruction_simplifier"; @@ -50,6 +53,7 @@ class InstructionSimplifier : public HOptimization { private: CodeGenerator* codegen_; + CompilerDriver* compiler_driver_; DISALLOW_COPY_AND_ASSIGN(InstructionSimplifier); }; diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc index f16e3727c8..311be1fb49 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.cc +++ b/compiler/optimizing/instruction_simplifier_arm64.cc @@ -216,5 +216,18 @@ void InstructionSimplifierArm64Visitor::VisitVecMul(HVecMul* instruction) { } } +void InstructionSimplifierArm64Visitor::VisitVecLoad(HVecLoad* instruction) { + if (!instruction->IsStringCharAt() + && TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) { + RecordSimplification(); + } +} + +void InstructionSimplifierArm64Visitor::VisitVecStore(HVecStore* instruction) { + if (TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) { + RecordSimplification(); + } +} + } // namespace arm64 } // namespace art diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h index eec4e49792..8596f6ad40 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.h +++ b/compiler/optimizing/instruction_simplifier_arm64.h @@ -75,6 +75,8 @@ class InstructionSimplifierArm64Visitor : public HGraphVisitor { void VisitUShr(HUShr* instruction) OVERRIDE; void VisitXor(HXor* instruction) OVERRIDE; void VisitVecMul(HVecMul* instruction) OVERRIDE; + void VisitVecLoad(HVecLoad* instruction) OVERRIDE; + void VisitVecStore(HVecStore* instruction) OVERRIDE; OptimizingCompilerStats* stats_; }; diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc index 7d1f146587..e5a8499ff4 100644 --- a/compiler/optimizing/instruction_simplifier_shared.cc +++ b/compiler/optimizing/instruction_simplifier_shared.cc @@ -16,6 +16,8 @@ #include "instruction_simplifier_shared.h" +#include "mirror/array-inl.h" + namespace art { namespace { @@ -247,6 +249,7 @@ bool TryExtractArrayAccessAddress(HInstruction* access, access->GetType() == Primitive::kPrimNot) { // For object arrays, the read barrier instrumentation requires // the original array pointer. + // TODO: This can be relaxed for Baker CC. return false; } @@ -345,4 +348,59 @@ bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa) { return false; } +bool TryExtractVecArrayAccessAddress(HVecMemoryOperation* access, HInstruction* index) { + if (index->IsConstant()) { + // If index is constant the whole address calculation often can be done by LDR/STR themselves. + // TODO: Treat the case with not-embedable constant. 
+ return false; + } + + HGraph* graph = access->GetBlock()->GetGraph(); + ArenaAllocator* arena = graph->GetArena(); + Primitive::Type packed_type = access->GetPackedType(); + uint32_t data_offset = mirror::Array::DataOffset( + Primitive::ComponentSize(packed_type)).Uint32Value(); + size_t component_shift = Primitive::ComponentSizeShift(packed_type); + + bool is_extracting_beneficial = false; + // It is beneficial to extract index intermediate address only if there are at least 2 users. + for (const HUseListNode<HInstruction*>& use : index->GetUses()) { + HInstruction* user = use.GetUser(); + if (user->IsVecMemoryOperation() && user != access) { + HVecMemoryOperation* another_access = user->AsVecMemoryOperation(); + Primitive::Type another_packed_type = another_access->GetPackedType(); + uint32_t another_data_offset = mirror::Array::DataOffset( + Primitive::ComponentSize(another_packed_type)).Uint32Value(); + size_t another_component_shift = Primitive::ComponentSizeShift(another_packed_type); + if (another_data_offset == data_offset && another_component_shift == component_shift) { + is_extracting_beneficial = true; + break; + } + } else if (user->IsIntermediateAddressIndex()) { + HIntermediateAddressIndex* another_access = user->AsIntermediateAddressIndex(); + uint32_t another_data_offset = another_access->GetOffset()->AsIntConstant()->GetValue(); + size_t another_component_shift = another_access->GetShift()->AsIntConstant()->GetValue(); + if (another_data_offset == data_offset && another_component_shift == component_shift) { + is_extracting_beneficial = true; + break; + } + } + } + + if (!is_extracting_beneficial) { + return false; + } + + // Proceed to extract the index + data_offset address computation. + HIntConstant* offset = graph->GetIntConstant(data_offset); + HIntConstant* shift = graph->GetIntConstant(component_shift); + HIntermediateAddressIndex* address = + new (arena) HIntermediateAddressIndex(index, offset, shift, kNoDexPc); + + access->GetBlock()->InsertInstructionBefore(address, access); + access->ReplaceInput(address, 1); + + return true; +} + } // namespace art diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h index 2ea103a518..371619fa2e 100644 --- a/compiler/optimizing/instruction_simplifier_shared.h +++ b/compiler/optimizing/instruction_simplifier_shared.h @@ -59,6 +59,7 @@ bool TryExtractArrayAccessAddress(HInstruction* access, size_t data_offset); bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa); +bool TryExtractVecArrayAccessAddress(HVecMemoryOperation* access, HInstruction* index); } // namespace art diff --git a/compiler/optimizing/intrinsics.cc b/compiler/optimizing/intrinsics.cc index 6236bd87ab..b664d41013 100644 --- a/compiler/optimizing/intrinsics.cc +++ b/compiler/optimizing/intrinsics.cc @@ -25,7 +25,7 @@ #include "mirror/dex_cache-inl.h" #include "nodes.h" #include "scoped_thread_state_change-inl.h" -#include "thread-inl.h" +#include "thread-current-inl.h" #include "utils.h" namespace art { @@ -146,7 +146,7 @@ void IntrinsicsRecognizer::Run() { Intrinsics intrinsic = static_cast<Intrinsics>(art_method->GetIntrinsic()); if (!CheckInvokeType(intrinsic, invoke)) { LOG(WARNING) << "Found an intrinsic with unexpected invoke type: " - << intrinsic << " for " + << static_cast<uint32_t>(intrinsic) << " for " << art_method->PrettyMethod() << invoke->DebugName(); } else { diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc index 
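// Annotation (not part of the patch): what TryExtractVecArrayAccessAddress above factors
// out. Each vector access needs base + data_offset + (index << component_shift); the
// data_offset/shift part is hoisted into a single HIntermediateAddressIndex that several
// accesses can share (access->ReplaceInput(address, 1)), leaving only the array base for
// each LDR/STR to fold into its addressing mode. The "is_extracting_beneficial" loop is
// exactly the check that at least two accesses agree on the same data_offset/shift pair,
// since otherwise the extra node would not pay for itself.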
750f9cc213..ae5f8d1760 100644 --- a/compiler/optimizing/intrinsics_arm.cc +++ b/compiler/optimizing/intrinsics_arm.cc @@ -28,7 +28,7 @@ #include "mirror/reference.h" #include "mirror/string.h" #include "scoped_thread_state_change-inl.h" -#include "thread-inl.h" +#include "thread-current-inl.h" #include "utils/arm/assembler_arm.h" namespace art { @@ -1010,17 +1010,14 @@ static void GenCas(HInvoke* invoke, Primitive::Type type, CodeGeneratorARM* code if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { // Need to make sure the reference stored in the field is a to-space // one before attempting the CAS or the CAS could fail incorrectly. - codegen->GenerateReferenceLoadWithBakerReadBarrier( + codegen->UpdateReferenceFieldWithBakerReadBarrier( invoke, out_loc, // Unused, used only as a "temporary" within the read barrier. base, - /* offset */ 0u, - /* index */ offset_loc, - ScaleFactor::TIMES_1, + /* field_offset */ offset_loc, tmp_ptr_loc, /* needs_null_check */ false, - /* always_update_field */ true, - &tmp); + tmp); } } @@ -1648,6 +1645,8 @@ void IntrinsicLocationsBuilderARM::VisitSystemArrayCopy(HInvoke* invoke) { // is clobbered by ReadBarrierMarkRegX entry points). Get an extra // temporary register from the register allocator. locations->AddTemp(Location::RequiresRegister()); + CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen_); + arm_codegen->MaybeAddBakerCcEntrypointTempForFields(locations); } } @@ -2599,11 +2598,7 @@ void IntrinsicCodeGeneratorARM::VisitFloatIsInfinite(HInvoke* invoke) { // We don't care about the sign bit, so shift left. __ Lsl(out, out, 1); __ eor(out, out, ShifterOperand(infinity)); - // If the result is 0, then it has 32 leading zeros, and less than that otherwise. - __ clz(out, out); - // Any number less than 32 logically shifted right by 5 bits results in 0; - // the same operation on 32 yields 1. - __ Lsr(out, out, 5); + codegen_->GenerateConditionWithZero(kCondEQ, out, out); } void IntrinsicLocationsBuilderARM::VisitDoubleIsInfinite(HInvoke* invoke) { @@ -2626,63 +2621,7 @@ void IntrinsicCodeGeneratorARM::VisitDoubleIsInfinite(HInvoke* invoke) { __ eor(out, out, ShifterOperand(infinity_high2)); // We don't care about the sign bit, so shift left. __ orr(out, IP, ShifterOperand(out, LSL, 1)); - // If the result is 0, then it has 32 leading zeros, and less than that otherwise. - __ clz(out, out); - // Any number less than 32 logically shifted right by 5 bits results in 0; - // the same operation on 32 yields 1. - __ Lsr(out, out, 5); -} - -void IntrinsicLocationsBuilderARM::VisitReferenceGetReferent(HInvoke* invoke) { - if (kEmitCompilerReadBarrier) { - // Do not intrinsify this call with the read barrier configuration. - return; - } - LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCallOnSlowPath, - kIntrinsified); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetOut(Location::SameAsFirstInput()); - locations->AddTemp(Location::RequiresRegister()); -} - -void IntrinsicCodeGeneratorARM::VisitReferenceGetReferent(HInvoke* invoke) { - DCHECK(!kEmitCompilerReadBarrier); - ArmAssembler* const assembler = GetAssembler(); - LocationSummary* locations = invoke->GetLocations(); - - Register obj = locations->InAt(0).AsRegister<Register>(); - Register out = locations->Out().AsRegister<Register>(); - - SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathARM(invoke); - codegen_->AddSlowPath(slow_path); - - // Load ArtMethod first. 
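// Annotation (not part of the patch): the clz/lsr pair deleted above was a manual
// materialization of the predicate (out == 0):
//   clz out, out     // yields 32 iff out was 0, otherwise a value below 32
//   lsr out, out, 5  // 32 >> 5 == 1, anything below 32 >> 5 == 0
// GenerateConditionWithZero(kCondEQ, out, out) produces the same (out == 0) result
// through the shared condition-generation helper, so the materialization logic lives
// in one place instead of being duplicated per intrinsic.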
- HInvokeStaticOrDirect* invoke_direct = invoke->AsInvokeStaticOrDirect(); - DCHECK(invoke_direct != nullptr); - Register temp = codegen_->GenerateCalleeMethodStaticOrDirectCall( - invoke_direct, locations->GetTemp(0)).AsRegister<Register>(); - - // Now get declaring class. - __ ldr(temp, Address(temp, ArtMethod::DeclaringClassOffset().Int32Value())); - - uint32_t slow_path_flag_offset = codegen_->GetReferenceSlowFlagOffset(); - uint32_t disable_flag_offset = codegen_->GetReferenceDisableFlagOffset(); - DCHECK_NE(slow_path_flag_offset, 0u); - DCHECK_NE(disable_flag_offset, 0u); - DCHECK_NE(slow_path_flag_offset, disable_flag_offset); - - // Check static flags that prevent using intrinsic. - __ ldr(IP, Address(temp, disable_flag_offset)); - __ ldr(temp, Address(temp, slow_path_flag_offset)); - __ orr(IP, IP, ShifterOperand(temp)); - __ CompareAndBranchIfNonZero(IP, slow_path->GetEntryLabel()); - - // Fast path. - __ ldr(out, Address(obj, mirror::Reference::ReferentOffset().Int32Value())); - codegen_->MaybeRecordImplicitNullCheck(invoke); - __ MaybeUnpoisonHeapReference(out); - __ Bind(slow_path->GetExitLabel()); + codegen_->GenerateConditionWithZero(kCondEQ, out, out); } void IntrinsicLocationsBuilderARM::VisitIntegerValueOf(HInvoke* invoke) { @@ -2754,6 +2693,30 @@ void IntrinsicCodeGeneratorARM::VisitIntegerValueOf(HInvoke* invoke) { } } +void IntrinsicLocationsBuilderARM::VisitThreadInterrupted(HInvoke* invoke) { + LocationSummary* locations = new (arena_) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetOut(Location::RequiresRegister()); +} + +void IntrinsicCodeGeneratorARM::VisitThreadInterrupted(HInvoke* invoke) { + ArmAssembler* assembler = GetAssembler(); + Register out = invoke->GetLocations()->Out().AsRegister<Register>(); + int32_t offset = Thread::InterruptedOffset<kArmPointerSize>().Int32Value(); + __ LoadFromOffset(kLoadWord, out, TR, offset); + Label done; + Label* const final_label = codegen_->GetFinalLabel(invoke, &done); + __ CompareAndBranchIfZero(out, final_label); + __ dmb(ISH); + __ LoadImmediate(IP, 0); + __ StoreToOffset(kStoreWord, IP, TR, offset); + __ dmb(ISH); + if (done.IsLinked()) { + __ Bind(&done); + } +} + UNIMPLEMENTED_INTRINSIC(ARM, MathMinDoubleDouble) UNIMPLEMENTED_INTRINSIC(ARM, MathMinFloatFloat) UNIMPLEMENTED_INTRINSIC(ARM, MathMaxDoubleDouble) @@ -2767,6 +2730,7 @@ UNIMPLEMENTED_INTRINSIC(ARM, MathRoundDouble) // Could be done by changing rou UNIMPLEMENTED_INTRINSIC(ARM, MathRoundFloat) // Could be done by changing rounding mode, maybe? UNIMPLEMENTED_INTRINSIC(ARM, UnsafeCASLong) // High register pressure. 
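// Annotation (not part of the patch): the Thread.interrupted() fast path emitted above,
// rewritten as plain code. The type and member names below are illustrative stand-ins,
// not ART's actual ones; the point is that the flag is cleared only when it was observed
// set, with ordering on both sides of the clear (dmb ish on ARM, ldar/stlr on ARM64,
// a MemoryFence on x86).

#include <atomic>
#include <cstdint>

struct SelfThread {                      // stand-in for the runtime thread object
  std::atomic<int32_t> interrupted{0};   // flag the intrinsic reads and clears
};

int32_t Interrupted(SelfThread* self) {
  int32_t value = self->interrupted.load(std::memory_order_relaxed);
  if (value != 0) {
    std::atomic_thread_fence(std::memory_order_seq_cst);
    self->interrupted.store(0, std::memory_order_relaxed);
    std::atomic_thread_fence(std::memory_order_seq_cst);
  }
  return value;
}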
UNIMPLEMENTED_INTRINSIC(ARM, SystemArrayCopyChar) +UNIMPLEMENTED_INTRINSIC(ARM, ReferenceGetReferent) UNIMPLEMENTED_INTRINSIC(ARM, IntegerHighestOneBit) UNIMPLEMENTED_INTRINSIC(ARM, LongHighestOneBit) UNIMPLEMENTED_INTRINSIC(ARM, IntegerLowestOneBit) diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 4d360158a2..990a773a95 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -28,7 +28,7 @@ #include "mirror/reference.h" #include "mirror/string-inl.h" #include "scoped_thread_state_change-inl.h" -#include "thread-inl.h" +#include "thread-current-inl.h" #include "utils/arm64/assembler_arm64.h" using namespace vixl::aarch64; // NOLINT(build/namespaces) @@ -1154,17 +1154,14 @@ static void GenCas(HInvoke* invoke, Primitive::Type type, CodeGeneratorARM64* co Register temp = WRegisterFrom(locations->GetTemp(0)); // Need to make sure the reference stored in the field is a to-space // one before attempting the CAS or the CAS could fail incorrectly. - codegen->GenerateReferenceLoadWithBakerReadBarrier( + codegen->UpdateReferenceFieldWithBakerReadBarrier( invoke, out_loc, // Unused, used only as a "temporary" within the read barrier. base, - /* offset */ 0u, - /* index */ offset_loc, - /* scale_factor */ 0u, + /* field_offset */ offset_loc, temp, /* needs_null_check */ false, - /* use_load_acquire */ false, - /* always_update_field */ true); + /* use_load_acquire */ false); } } @@ -2900,69 +2897,6 @@ void IntrinsicCodeGeneratorARM64::VisitDoubleIsInfinite(HInvoke* invoke) { GenIsInfinite(invoke->GetLocations(), /* is64bit */ true, GetVIXLAssembler()); } -void IntrinsicLocationsBuilderARM64::VisitReferenceGetReferent(HInvoke* invoke) { - if (kEmitCompilerReadBarrier) { - // Do not intrinsify this call with the read barrier configuration. - return; - } - LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCallOnSlowPath, - kIntrinsified); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetOut(Location::SameAsFirstInput()); - locations->AddTemp(Location::RequiresRegister()); -} - -void IntrinsicCodeGeneratorARM64::VisitReferenceGetReferent(HInvoke* invoke) { - DCHECK(!kEmitCompilerReadBarrier); - MacroAssembler* masm = GetVIXLAssembler(); - LocationSummary* locations = invoke->GetLocations(); - - Register obj = InputRegisterAt(invoke, 0); - Register out = OutputRegister(invoke); - - SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke); - codegen_->AddSlowPath(slow_path); - - // Load ArtMethod first. - HInvokeStaticOrDirect* invoke_direct = invoke->AsInvokeStaticOrDirect(); - DCHECK(invoke_direct != nullptr); - Register temp0 = XRegisterFrom(codegen_->GenerateCalleeMethodStaticOrDirectCall( - invoke_direct, locations->GetTemp(0))); - - // Now get declaring class. - __ Ldr(temp0.W(), MemOperand(temp0, ArtMethod::DeclaringClassOffset().Int32Value())); - - uint32_t slow_path_flag_offset = codegen_->GetReferenceSlowFlagOffset(); - uint32_t disable_flag_offset = codegen_->GetReferenceDisableFlagOffset(); - DCHECK_NE(slow_path_flag_offset, 0u); - DCHECK_NE(disable_flag_offset, 0u); - DCHECK_NE(slow_path_flag_offset, disable_flag_offset); - - // Check static flags that prevent using intrinsic. - if (slow_path_flag_offset == disable_flag_offset + 1) { - // Load two adjacent flags in one 64-bit load. 
- __ Ldr(temp0, MemOperand(temp0, disable_flag_offset)); - } else { - UseScratchRegisterScope temps(masm); - Register temp1 = temps.AcquireW(); - __ Ldr(temp1.W(), MemOperand(temp0, disable_flag_offset)); - __ Ldr(temp0.W(), MemOperand(temp0, slow_path_flag_offset)); - __ Orr(temp0, temp1, temp0); - } - __ Cbnz(temp0, slow_path->GetEntryLabel()); - - { - // Ensure that between load and MaybeRecordImplicitNullCheck there are no pools emitted. - vixl::EmissionCheckScope guard(codegen_->GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); - // Fast path. - __ Ldr(out, HeapOperand(obj, mirror::Reference::ReferentOffset().Int32Value())); - codegen_->MaybeRecordImplicitNullCheck(invoke); - } - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(out); - __ Bind(slow_path->GetExitLabel()); -} - void IntrinsicLocationsBuilderARM64::VisitIntegerValueOf(HInvoke* invoke) { InvokeRuntimeCallingConvention calling_convention; IntrinsicVisitor::ComputeIntegerValueOfLocations( @@ -3036,6 +2970,29 @@ void IntrinsicCodeGeneratorARM64::VisitIntegerValueOf(HInvoke* invoke) { } } +void IntrinsicLocationsBuilderARM64::VisitThreadInterrupted(HInvoke* invoke) { + LocationSummary* locations = new (arena_) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetOut(Location::RequiresRegister()); +} + +void IntrinsicCodeGeneratorARM64::VisitThreadInterrupted(HInvoke* invoke) { + MacroAssembler* masm = GetVIXLAssembler(); + Register out = RegisterFrom(invoke->GetLocations()->Out(), Primitive::kPrimInt); + UseScratchRegisterScope temps(masm); + Register temp = temps.AcquireX(); + + __ Add(temp, tr, Thread::InterruptedOffset<kArm64PointerSize>().Int32Value()); + __ Ldar(out.W(), MemOperand(temp)); + + vixl::aarch64::Label done; + __ Cbz(out.W(), &done); + __ Stlr(wzr, MemOperand(temp)); + __ Bind(&done); +} + +UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent) UNIMPLEMENTED_INTRINSIC(ARM64, IntegerHighestOneBit) UNIMPLEMENTED_INTRINSIC(ARM64, LongHighestOneBit) UNIMPLEMENTED_INTRINSIC(ARM64, IntegerLowestOneBit) diff --git a/compiler/optimizing/intrinsics_arm64.h b/compiler/optimizing/intrinsics_arm64.h index 3c53517b28..ff59ce9658 100644 --- a/compiler/optimizing/intrinsics_arm64.h +++ b/compiler/optimizing/intrinsics_arm64.h @@ -24,7 +24,8 @@ namespace aarch64 { class MacroAssembler; -}} // namespace vixl::aarch64 +} // namespace aarch64 +} // namespace vixl namespace art { diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc index fd8a37ae05..0e04b9a950 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.cc +++ b/compiler/optimizing/intrinsics_arm_vixl.cc @@ -26,7 +26,7 @@ #include "mirror/reference.h" #include "mirror/string.h" #include "scoped_thread_state_change-inl.h" -#include "thread-inl.h" +#include "thread-current-inl.h" #include "aarch32/constants-aarch32.h" @@ -1347,17 +1347,14 @@ static void GenCas(HInvoke* invoke, Primitive::Type type, CodeGeneratorARMVIXL* if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { // Need to make sure the reference stored in the field is a to-space // one before attempting the CAS or the CAS could fail incorrectly. - codegen->GenerateReferenceLoadWithBakerReadBarrier( + codegen->UpdateReferenceFieldWithBakerReadBarrier( invoke, out_loc, // Unused, used only as a "temporary" within the read barrier. 
base, - /* offset */ 0u, - /* index */ offset_loc, - ScaleFactor::TIMES_1, + /* field_offset */ offset_loc, tmp_ptr_loc, /* needs_null_check */ false, - /* always_update_field */ true, - &tmp); + tmp); } } @@ -2026,6 +2023,8 @@ void IntrinsicLocationsBuilderARMVIXL::VisitSystemArrayCopy(HInvoke* invoke) { // is clobbered by ReadBarrierMarkRegX entry points). Get an extra // temporary register from the register allocator. locations->AddTemp(Location::RequiresRegister()); + CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen_); + arm_codegen->MaybeAddBakerCcEntrypointTempForFields(locations); } } @@ -2972,11 +2971,7 @@ void IntrinsicCodeGeneratorARMVIXL::VisitFloatIsInfinite(HInvoke* invoke) { // We don't care about the sign bit, so shift left. __ Lsl(out, out, 1); __ Eor(out, out, infinity); - // If the result is 0, then it has 32 leading zeros, and less than that otherwise. - __ Clz(out, out); - // Any number less than 32 logically shifted right by 5 bits results in 0; - // the same operation on 32 yields 1. - __ Lsr(out, out, 5); + codegen_->GenerateConditionWithZero(kCondEQ, out, out); } void IntrinsicLocationsBuilderARMVIXL::VisitDoubleIsInfinite(HInvoke* invoke) { @@ -3002,65 +2997,7 @@ void IntrinsicCodeGeneratorARMVIXL::VisitDoubleIsInfinite(HInvoke* invoke) { __ Eor(out, out, infinity_high2); // We don't care about the sign bit, so shift left. __ Orr(out, temp, Operand(out, vixl32::LSL, 1)); - // If the result is 0, then it has 32 leading zeros, and less than that otherwise. - __ Clz(out, out); - // Any number less than 32 logically shifted right by 5 bits results in 0; - // the same operation on 32 yields 1. - __ Lsr(out, out, 5); -} - -void IntrinsicLocationsBuilderARMVIXL::VisitReferenceGetReferent(HInvoke* invoke) { - if (kEmitCompilerReadBarrier) { - // Do not intrinsify this call with the read barrier configuration. - return; - } - LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCallOnSlowPath, - kIntrinsified); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetOut(Location::SameAsFirstInput()); - locations->AddTemp(Location::RequiresRegister()); -} - -void IntrinsicCodeGeneratorARMVIXL::VisitReferenceGetReferent(HInvoke* invoke) { - DCHECK(!kEmitCompilerReadBarrier); - ArmVIXLAssembler* assembler = GetAssembler(); - LocationSummary* locations = invoke->GetLocations(); - - vixl32::Register obj = InputRegisterAt(invoke, 0); - vixl32::Register out = OutputRegister(invoke); - - SlowPathCodeARMVIXL* slow_path = new (GetAllocator()) IntrinsicSlowPathARMVIXL(invoke); - codegen_->AddSlowPath(slow_path); - - // Load ArtMethod first. - HInvokeStaticOrDirect* invoke_direct = invoke->AsInvokeStaticOrDirect(); - DCHECK(invoke_direct != nullptr); - vixl32::Register temp0 = RegisterFrom(codegen_->GenerateCalleeMethodStaticOrDirectCall( - invoke_direct, locations->GetTemp(0))); - - // Now get declaring class. - __ Ldr(temp0, MemOperand(temp0, ArtMethod::DeclaringClassOffset().Int32Value())); - - uint32_t slow_path_flag_offset = codegen_->GetReferenceSlowFlagOffset(); - uint32_t disable_flag_offset = codegen_->GetReferenceDisableFlagOffset(); - DCHECK_NE(slow_path_flag_offset, 0u); - DCHECK_NE(disable_flag_offset, 0u); - DCHECK_NE(slow_path_flag_offset, disable_flag_offset); - - // Check static flags that prevent using intrinsic. 
- UseScratchRegisterScope temps(assembler->GetVIXLAssembler()); - vixl32::Register temp1 = temps.Acquire(); - __ Ldr(temp1, MemOperand(temp0, disable_flag_offset)); - __ Ldr(temp0, MemOperand(temp0, slow_path_flag_offset)); - __ Orr(temp0, temp1, temp0); - __ CompareAndBranchIfNonZero(temp0, slow_path->GetEntryLabel()); - - // Fast path. - __ Ldr(out, MemOperand(obj, mirror::Reference::ReferentOffset().Int32Value())); - codegen_->MaybeRecordImplicitNullCheck(invoke); - assembler->MaybeUnpoisonHeapReference(out); - __ Bind(slow_path->GetExitLabel()); + codegen_->GenerateConditionWithZero(kCondEQ, out, out); } void IntrinsicLocationsBuilderARMVIXL::VisitMathCeil(HInvoke* invoke) { @@ -3136,7 +3073,7 @@ void IntrinsicCodeGeneratorARMVIXL::VisitIntegerValueOf(HInvoke* invoke) { __ Add(out, in, -info.low); __ Cmp(out, info.high - info.low + 1); vixl32::Label allocate, done; - __ B(hs, &allocate); + __ B(hs, &allocate, /* is_far_target */ false); // If the value is within the bounds, load the j.l.Integer directly from the array. uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value(); uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache)); @@ -3158,9 +3095,36 @@ void IntrinsicCodeGeneratorARMVIXL::VisitIntegerValueOf(HInvoke* invoke) { } } +void IntrinsicLocationsBuilderARMVIXL::VisitThreadInterrupted(HInvoke* invoke) { + LocationSummary* locations = new (arena_) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetOut(Location::RequiresRegister()); +} + +void IntrinsicCodeGeneratorARMVIXL::VisitThreadInterrupted(HInvoke* invoke) { + ArmVIXLAssembler* assembler = GetAssembler(); + vixl32::Register out = RegisterFrom(invoke->GetLocations()->Out()); + int32_t offset = Thread::InterruptedOffset<kArmPointerSize>().Int32Value(); + __ Ldr(out, MemOperand(tr, offset)); + UseScratchRegisterScope temps(assembler->GetVIXLAssembler()); + vixl32::Register temp = temps.Acquire(); + vixl32::Label done; + vixl32::Label* const final_label = codegen_->GetFinalLabel(invoke, &done); + __ CompareAndBranchIfZero(out, final_label, /* far_target */ false); + __ Dmb(vixl32::ISH); + __ Mov(temp, 0); + assembler->StoreToOffset(kStoreWord, temp, tr, offset); + __ Dmb(vixl32::ISH); + if (done.IsReferenced()) { + __ Bind(&done); + } +} + UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathRoundDouble) // Could be done by changing rounding mode, maybe? UNIMPLEMENTED_INTRINSIC(ARMVIXL, UnsafeCASLong) // High register pressure. 
UNIMPLEMENTED_INTRINSIC(ARMVIXL, SystemArrayCopyChar) +UNIMPLEMENTED_INTRINSIC(ARMVIXL, ReferenceGetReferent) UNIMPLEMENTED_INTRINSIC(ARMVIXL, IntegerHighestOneBit) UNIMPLEMENTED_INTRINSIC(ARMVIXL, LongHighestOneBit) UNIMPLEMENTED_INTRINSIC(ARMVIXL, IntegerLowestOneBit) diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc index 41df56b514..ea3e9e5ec9 100644 --- a/compiler/optimizing/intrinsics_mips.cc +++ b/compiler/optimizing/intrinsics_mips.cc @@ -23,6 +23,7 @@ #include "intrinsics.h" #include "mirror/array-inl.h" #include "mirror/string.h" +#include "scoped_thread_state_change-inl.h" #include "thread.h" #include "utils/mips/assembler_mips.h" #include "utils/mips/constants_mips.h" @@ -32,7 +33,7 @@ namespace art { namespace mips { IntrinsicLocationsBuilderMIPS::IntrinsicLocationsBuilderMIPS(CodeGeneratorMIPS* codegen) - : arena_(codegen->GetGraph()->GetArena()) { + : codegen_(codegen), arena_(codegen->GetGraph()->GetArena()) { } MipsAssembler* IntrinsicCodeGeneratorMIPS::GetAssembler() { @@ -1525,6 +1526,9 @@ static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall), kIntrinsified); + if (can_call && kUseBakerReadBarrier) { + locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + } locations->SetInAt(0, Location::NoLocation()); // Unused receiver. locations->SetInAt(1, Location::RequiresRegister()); locations->SetInAt(2, Location::RequiresRegister()); @@ -2552,101 +2556,110 @@ void IntrinsicCodeGeneratorMIPS::VisitMathRoundFloat(HInvoke* invoke) { Register out = locations->Out().AsRegister<Register>(); MipsLabel done; - MipsLabel finite; - MipsLabel add; - // if (in.isNaN) { - // return 0; - // } - // - // out = floor.w.s(in); - // - // /* - // * This "if" statement is only needed for the pre-R6 version of floor.w.s - // * which outputs Integer.MAX_VALUE for negative numbers with magnitudes - // * too large to fit in a 32-bit integer. - // * - // * Starting with MIPSR6, which always sets FCSR.NAN2008=1, negative - // * numbers which are too large to be represented in a 32-bit signed - // * integer will be processed by floor.w.s to output Integer.MIN_VALUE, - // * and will no longer be processed by this "if" statement. - // */ - // if (out == Integer.MAX_VALUE) { - // TMP = (in < 0.0f) ? 1 : 0; - // /* - // * If TMP is 1, then adding it to out will wrap its value from - // * Integer.MAX_VALUE to Integer.MIN_VALUE. - // */ - // return out += TMP; - // } - // - // /* - // * For negative values not handled by the previous "if" statement the - // * test here will correctly set the value of TMP. - // */ - // TMP = ((in - out) >= 0.5f) ? 1 : 0; - // return out += TMP; - - // Test for NaN. if (IsR6()) { - __ CmpUnS(FTMP, in, in); + // out = floor(in); + // + // if (out != MAX_VALUE && out != MIN_VALUE) { + // TMP = ((in - out) >= 0.5) ? 1 : 0; + // return out += TMP; + // } + // return out; + + // out = floor(in); + __ FloorWS(FTMP, in); + __ Mfc1(out, FTMP); + + // if (out != MAX_VALUE && out != MIN_VALUE) + __ Addiu(TMP, out, 1); + __ Aui(TMP, TMP, 0x8000); // TMP = out + 0x8000 0001 + // or out - 0x7FFF FFFF. + // IOW, TMP = 1 if out = Int.MIN_VALUE + // or TMP = 0 if out = Int.MAX_VALUE. + __ Srl(TMP, TMP, 1); // TMP = 0 if out = Int.MIN_VALUE + // or out = Int.MAX_VALUE. + __ Beqz(TMP, &done); + + // TMP = (0.5f <= (in - out)) ? -1 : 0; + __ Cvtsw(FTMP, FTMP); // Convert output of floor.w.s back to "float". 
+ __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f)); + __ SubS(FTMP, in, FTMP); + __ Mtc1(AT, half); + + __ CmpLeS(FTMP, half, FTMP); + __ Mfc1(TMP, FTMP); + + // Return out -= TMP. + __ Subu(out, out, TMP); } else { + // if (in.isNaN) { + // return 0; + // } + // + // out = floor.w.s(in); + // + // /* + // * This "if" statement is only needed for the pre-R6 version of floor.w.s + // * which outputs Integer.MAX_VALUE for negative numbers with magnitudes + // * too large to fit in a 32-bit integer. + // */ + // if (out == Integer.MAX_VALUE) { + // TMP = (in < 0.0f) ? 1 : 0; + // /* + // * If TMP is 1, then adding it to out will wrap its value from + // * Integer.MAX_VALUE to Integer.MIN_VALUE. + // */ + // return out += TMP; + // } + // + // /* + // * For negative values not handled by the previous "if" statement the + // * test here will correctly set the value of TMP. + // */ + // TMP = ((in - out) >= 0.5f) ? 1 : 0; + // return out += TMP; + + MipsLabel finite; + MipsLabel add; + + // Test for NaN. __ CunS(in, in); - } - // Return zero for NaN. - __ Move(out, ZERO); - if (IsR6()) { - __ Bc1nez(FTMP, &done); - } else { + // Return zero for NaN. + __ Move(out, ZERO); __ Bc1t(&done); - } - // out = floor(in); - __ FloorWS(FTMP, in); - __ Mfc1(out, FTMP); + // out = floor(in); + __ FloorWS(FTMP, in); + __ Mfc1(out, FTMP); - if (!IsR6()) { __ LoadConst32(TMP, -1); - } - // TMP = (out = java.lang.Integer.MAX_VALUE) ? -1 : 0; - __ LoadConst32(AT, std::numeric_limits<int32_t>::max()); - __ Bne(AT, out, &finite); + // TMP = (out = java.lang.Integer.MAX_VALUE) ? -1 : 0; + __ LoadConst32(AT, std::numeric_limits<int32_t>::max()); + __ Bne(AT, out, &finite); - __ Mtc1(ZERO, FTMP); - if (IsR6()) { - __ CmpLtS(FTMP, in, FTMP); - __ Mfc1(TMP, FTMP); - } else { + __ Mtc1(ZERO, FTMP); __ ColtS(in, FTMP); - } - __ B(&add); + __ B(&add); - __ Bind(&finite); + __ Bind(&finite); - // TMP = (0.5f <= (in - out)) ? -1 : 0; - __ Cvtsw(FTMP, FTMP); // Convert output of floor.w.s back to "float". - __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f)); - __ SubS(FTMP, in, FTMP); - __ Mtc1(AT, half); - if (IsR6()) { - __ CmpLeS(FTMP, half, FTMP); - __ Mfc1(TMP, FTMP); - } else { + // TMP = (0.5f <= (in - out)) ? -1 : 0; + __ Cvtsw(FTMP, FTMP); // Convert output of floor.w.s back to "float". + __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f)); + __ SubS(FTMP, in, FTMP); + __ Mtc1(AT, half); __ ColeS(half, FTMP); - } - __ Bind(&add); + __ Bind(&add); - if (!IsR6()) { __ Movf(TMP, ZERO); - } - - // Return out -= TMP. - __ Subu(out, out, TMP); + // Return out -= TMP. 
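// Annotation (not part of the patch): the Addiu/Aui/Srl sequence in the R6 path above
// detects a saturated floor result (Int.MIN_VALUE or Int.MAX_VALUE) without loading
// either constant. A standalone sketch of the same test:

#include <cstdint>

bool IsSaturatedInt32(int32_t out) {
  // out + 0x80000001 wraps to 0 for INT32_MAX and to 1 for INT32_MIN, so a single
  // logical shift right by one maps exactly those two cases (and only them) to 0.
  uint32_t tmp = static_cast<uint32_t>(out) + 0x80000001u;
  return (tmp >> 1) == 0;
}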
+ __ Subu(out, out, TMP); + } __ Bind(&done); } @@ -3133,6 +3146,89 @@ void IntrinsicCodeGeneratorMIPS::VisitSystemArrayCopyChar(HInvoke* invoke) { __ Bind(slow_path->GetExitLabel()); } +// long java.lang.Integer.valueOf(long) +void IntrinsicLocationsBuilderMIPS::VisitIntegerValueOf(HInvoke* invoke) { + InvokeRuntimeCallingConvention calling_convention; + IntrinsicVisitor::ComputeIntegerValueOfLocations( + invoke, + codegen_, + calling_convention.GetReturnLocation(Primitive::kPrimNot), + Location::RegisterLocation(calling_convention.GetRegisterAt(0))); +} + +void IntrinsicCodeGeneratorMIPS::VisitIntegerValueOf(HInvoke* invoke) { + IntrinsicVisitor::IntegerValueOfInfo info = IntrinsicVisitor::ComputeIntegerValueOfInfo(); + LocationSummary* locations = invoke->GetLocations(); + MipsAssembler* assembler = GetAssembler(); + InstructionCodeGeneratorMIPS* icodegen = + down_cast<InstructionCodeGeneratorMIPS*>(codegen_->GetInstructionVisitor()); + + Register out = locations->Out().AsRegister<Register>(); + InvokeRuntimeCallingConvention calling_convention; + if (invoke->InputAt(0)->IsConstant()) { + int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue(); + if (value >= info.low && value <= info.high) { + // Just embed the j.l.Integer in the code. + ScopedObjectAccess soa(Thread::Current()); + mirror::Object* boxed = info.cache->Get(value + (-info.low)); + DCHECK(boxed != nullptr && Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(boxed)); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(boxed)); + __ LoadConst32(out, address); + } else { + // Allocate and initialize a new j.l.Integer. + // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the + // JIT object table. + uint32_t address = + dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer)); + __ LoadConst32(calling_convention.GetRegisterAt(0), address); + codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + __ StoreConstToOffset(kStoreWord, value, out, info.value_offset, TMP); + // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation + // one. + icodegen->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); + } + } else { + Register in = locations->InAt(0).AsRegister<Register>(); + MipsLabel allocate, done; + int32_t count = static_cast<uint32_t>(info.high) - info.low + 1; + + // Is (info.low <= in) && (in <= info.high)? + __ Addiu32(out, in, -info.low); + // As unsigned quantities is out < (info.high - info.low + 1)? + if (IsInt<16>(count)) { + __ Sltiu(AT, out, count); + } else { + __ LoadConst32(AT, count); + __ Sltu(AT, out, AT); + } + // Branch if out >= (info.high - info.low + 1). + // This means that "in" is outside of the range [info.low, info.high]. + __ Beqz(AT, &allocate); + + // If the value is within the bounds, load the j.l.Integer directly from the array. + uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value(); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache)); + __ LoadConst32(TMP, data_offset + address); + __ ShiftAndAdd(out, out, TMP, TIMES_4); + __ Lw(out, out, 0); + __ MaybeUnpoisonHeapReference(out); + __ B(&done); + + __ Bind(&allocate); + // Otherwise allocate and initialize a new j.l.Integer. 
+ address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer)); + __ LoadConst32(calling_convention.GetRegisterAt(0), address); + codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + __ StoreToOffset(kStoreWord, in, out, info.value_offset); + // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation + // one. + icodegen->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); + __ Bind(&done); + } +} + // Unimplemented intrinsics. UNIMPLEMENTED_INTRINSIC(MIPS, MathCeil) @@ -3162,7 +3258,7 @@ UNIMPLEMENTED_INTRINSIC(MIPS, UnsafeGetAndSetInt) UNIMPLEMENTED_INTRINSIC(MIPS, UnsafeGetAndSetLong) UNIMPLEMENTED_INTRINSIC(MIPS, UnsafeGetAndSetObject) -UNIMPLEMENTED_INTRINSIC(MIPS, IntegerValueOf) +UNIMPLEMENTED_INTRINSIC(MIPS, ThreadInterrupted) UNREACHABLE_INTRINSICS(MIPS) diff --git a/compiler/optimizing/intrinsics_mips.h b/compiler/optimizing/intrinsics_mips.h index e134cb882e..eaadad2515 100644 --- a/compiler/optimizing/intrinsics_mips.h +++ b/compiler/optimizing/intrinsics_mips.h @@ -49,6 +49,7 @@ INTRINSICS_LIST(OPTIMIZING_INTRINSICS) bool TryDispatch(HInvoke* invoke); private: + CodeGeneratorMIPS* codegen_; ArenaAllocator* arena_; DISALLOW_COPY_AND_ASSIGN(IntrinsicLocationsBuilderMIPS); diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc index b57b41f686..2ecb1a3b02 100644 --- a/compiler/optimizing/intrinsics_mips64.cc +++ b/compiler/optimizing/intrinsics_mips64.cc @@ -23,6 +23,7 @@ #include "intrinsics.h" #include "mirror/array-inl.h" #include "mirror/string.h" +#include "scoped_thread_state_change-inl.h" #include "thread.h" #include "utils/mips64/assembler_mips64.h" #include "utils/mips64/constants_mips64.h" @@ -32,7 +33,7 @@ namespace art { namespace mips64 { IntrinsicLocationsBuilderMIPS64::IntrinsicLocationsBuilderMIPS64(CodeGeneratorMIPS64* codegen) - : arena_(codegen->GetGraph()->GetArena()) { + : codegen_(codegen), arena_(codegen->GetGraph()->GetArena()) { } Mips64Assembler* IntrinsicCodeGeneratorMIPS64::GetAssembler() { @@ -890,54 +891,14 @@ static void GenRound(LocationSummary* locations, Mips64Assembler* assembler, Pri DCHECK(type == Primitive::kPrimFloat || type == Primitive::kPrimDouble); Mips64Label done; - Mips64Label finite; - Mips64Label add; - // if (in.isNaN) { - // return 0; - // } - // // out = floor(in); // - // /* - // * TODO: Amend this code when emulator FCSR.NAN2008=1 bug is fixed. - // * - // * Starting with MIPSR6, which always sets FCSR.NAN2008=1, negative - // * numbers which are too large to be represented in a 32-/64-bit - // * signed integer will be processed by floor.X.Y to output - // * Integer.MIN_VALUE/Long.MIN_VALUE, and will no longer be - // * processed by this "if" statement. - // * - // * However, this bug in the 64-bit MIPS emulator causes the - // * behavior of floor.X.Y to be the same as pre-R6 implementations - // * of MIPS64. When that bug is fixed this logic should be amended. - // */ - // if (out == MAX_VALUE) { - // TMP = (in < 0.0) ? 1 : 0; - // /* - // * If TMP is 1, then adding it to out will wrap its value from - // * MAX_VALUE to MIN_VALUE. - // */ + // if (out != MAX_VALUE && out != MIN_VALUE) { + // TMP = ((in - out) >= 0.5) ? 1 : 0; // return out += TMP; // } - // - // /* - // * For negative values not handled by the previous "if" statement the - // * test here will correctly set the value of TMP. 
- // */ - // TMP = ((in - out) >= 0.5) ? 1 : 0; - // return out += TMP; - - // Test for NaN. - if (type == Primitive::kPrimDouble) { - __ CmpUnD(FTMP, in, in); - } else { - __ CmpUnS(FTMP, in, in); - } - - // Return zero for NaN. - __ Move(out, ZERO); - __ Bc1nez(FTMP, &done); + // return out; // out = floor(in); if (type == Primitive::kPrimDouble) { @@ -948,27 +909,26 @@ static void GenRound(LocationSummary* locations, Mips64Assembler* assembler, Pri __ Mfc1(out, FTMP); } - // TMP = (out = java.lang.Integer.MAX_VALUE) ? 1 : 0; + // if (out != MAX_VALUE && out != MIN_VALUE) if (type == Primitive::kPrimDouble) { - __ LoadConst64(AT, std::numeric_limits<int64_t>::max()); + __ Daddiu(TMP, out, 1); + __ Dati(TMP, 0x8000); // TMP = out + 0x8000 0000 0000 0001 + // or out - 0x7FFF FFFF FFFF FFFF. + // IOW, TMP = 1 if out = Long.MIN_VALUE + // or TMP = 0 if out = Long.MAX_VALUE. + __ Dsrl(TMP, TMP, 1); // TMP = 0 if out = Long.MIN_VALUE + // or out = Long.MAX_VALUE. + __ Beqzc(TMP, &done); } else { - __ LoadConst32(AT, std::numeric_limits<int32_t>::max()); + __ Addiu(TMP, out, 1); + __ Aui(TMP, TMP, 0x8000); // TMP = out + 0x8000 0001 + // or out - 0x7FFF FFFF. + // IOW, TMP = 1 if out = Int.MIN_VALUE + // or TMP = 0 if out = Int.MAX_VALUE. + __ Srl(TMP, TMP, 1); // TMP = 0 if out = Int.MIN_VALUE + // or out = Int.MAX_VALUE. + __ Beqzc(TMP, &done); } - __ Bnec(AT, out, &finite); - - if (type == Primitive::kPrimDouble) { - __ Dmtc1(ZERO, FTMP); - __ CmpLtD(FTMP, in, FTMP); - __ Dmfc1(AT, FTMP); - } else { - __ Mtc1(ZERO, FTMP); - __ CmpLtS(FTMP, in, FTMP); - __ Mfc1(AT, FTMP); - } - - __ Bc(&add); - - __ Bind(&finite); // TMP = (0.5 <= (in - out)) ? -1 : 0; if (type == Primitive::kPrimDouble) { @@ -977,23 +937,21 @@ static void GenRound(LocationSummary* locations, Mips64Assembler* assembler, Pri __ SubD(FTMP, in, FTMP); __ Dmtc1(AT, half); __ CmpLeD(FTMP, half, FTMP); - __ Dmfc1(AT, FTMP); + __ Dmfc1(TMP, FTMP); } else { __ Cvtsw(FTMP, FTMP); // Convert output of floor.w.s back to "float". __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f)); __ SubS(FTMP, in, FTMP); __ Mtc1(AT, half); __ CmpLeS(FTMP, half, FTMP); - __ Mfc1(AT, FTMP); + __ Mfc1(TMP, FTMP); } - __ Bind(&add); - // Return out -= TMP. if (type == Primitive::kPrimDouble) { - __ Dsubu(out, out, AT); + __ Dsubu(out, out, TMP); } else { - __ Subu(out, out, AT); + __ Subu(out, out, TMP); } __ Bind(&done); @@ -1168,6 +1126,9 @@ static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall), kIntrinsified); + if (can_call && kUseBakerReadBarrier) { + locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + } locations->SetInAt(0, Location::NoLocation()); // Unused receiver. 
locations->SetInAt(1, Location::RequiresRegister()); locations->SetInAt(2, Location::RequiresRegister()); @@ -2564,6 +2525,84 @@ void IntrinsicCodeGeneratorMIPS64::VisitMathTanh(HInvoke* invoke) { GenFPToFPCall(invoke, codegen_, kQuickTanh); } +// long java.lang.Integer.valueOf(long) +void IntrinsicLocationsBuilderMIPS64::VisitIntegerValueOf(HInvoke* invoke) { + InvokeRuntimeCallingConvention calling_convention; + IntrinsicVisitor::ComputeIntegerValueOfLocations( + invoke, + codegen_, + calling_convention.GetReturnLocation(Primitive::kPrimNot), + Location::RegisterLocation(calling_convention.GetRegisterAt(0))); +} + +void IntrinsicCodeGeneratorMIPS64::VisitIntegerValueOf(HInvoke* invoke) { + IntrinsicVisitor::IntegerValueOfInfo info = IntrinsicVisitor::ComputeIntegerValueOfInfo(); + LocationSummary* locations = invoke->GetLocations(); + Mips64Assembler* assembler = GetAssembler(); + InstructionCodeGeneratorMIPS64* icodegen = + down_cast<InstructionCodeGeneratorMIPS64*>(codegen_->GetInstructionVisitor()); + + GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + InvokeRuntimeCallingConvention calling_convention; + if (invoke->InputAt(0)->IsConstant()) { + int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue(); + if (value >= info.low && value <= info.high) { + // Just embed the j.l.Integer in the code. + ScopedObjectAccess soa(Thread::Current()); + mirror::Object* boxed = info.cache->Get(value + (-info.low)); + DCHECK(boxed != nullptr && Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(boxed)); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(boxed)); + __ LoadConst64(out, address); + } else { + // Allocate and initialize a new j.l.Integer. + // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the + // JIT object table. + uint32_t address = + dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer)); + __ LoadConst64(calling_convention.GetRegisterAt(0), address); + codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + __ StoreConstToOffset(kStoreWord, value, out, info.value_offset, TMP); + // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation + // one. + icodegen->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); + } + } else { + GpuRegister in = locations->InAt(0).AsRegister<GpuRegister>(); + Mips64Label allocate, done; + int32_t count = static_cast<uint32_t>(info.high) - info.low + 1; + + // Is (info.low <= in) && (in <= info.high)? + __ Addiu32(out, in, -info.low); + // As unsigned quantities is out < (info.high - info.low + 1)? + __ LoadConst32(AT, count); + // Branch if out >= (info.high - info.low + 1). + // This means that "in" is outside of the range [info.low, info.high]. + __ Bgeuc(out, AT, &allocate); + + // If the value is within the bounds, load the j.l.Integer directly from the array. + uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value(); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache)); + __ LoadConst64(TMP, data_offset + address); + __ Dlsa(out, out, TMP, TIMES_4); + __ Lwu(out, out, 0); + __ MaybeUnpoisonHeapReference(out); + __ Bc(&done); + + __ Bind(&allocate); + // Otherwise allocate and initialize a new j.l.Integer. 
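// Annotation (not part of the patch): both IntegerValueOf intrinsics above use the
// classic biased unsigned comparison to test the cache bounds with one branch.
// Equivalent standalone form, assuming high - low + 1 fits in an int (true for the
// boxed-Integer cache):

#include <cstdint>

bool IsInIntegerCache(int32_t in, int32_t low, int32_t high) {
  // (low <= in && in <= high) folded into a single unsigned compare.
  uint32_t biased = static_cast<uint32_t>(in) - static_cast<uint32_t>(low);
  return biased < static_cast<uint32_t>(high - low + 1);
}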
+ address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer)); + __ LoadConst64(calling_convention.GetRegisterAt(0), address); + codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + __ StoreToOffset(kStoreWord, in, out, info.value_offset); + // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation + // one. + icodegen->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); + __ Bind(&done); + } +} + UNIMPLEMENTED_INTRINSIC(MIPS64, ReferenceGetReferent) UNIMPLEMENTED_INTRINSIC(MIPS64, SystemArrayCopy) @@ -2583,7 +2622,7 @@ UNIMPLEMENTED_INTRINSIC(MIPS64, UnsafeGetAndSetInt) UNIMPLEMENTED_INTRINSIC(MIPS64, UnsafeGetAndSetLong) UNIMPLEMENTED_INTRINSIC(MIPS64, UnsafeGetAndSetObject) -UNIMPLEMENTED_INTRINSIC(MIPS64, IntegerValueOf) +UNIMPLEMENTED_INTRINSIC(MIPS64, ThreadInterrupted) UNREACHABLE_INTRINSICS(MIPS64) diff --git a/compiler/optimizing/intrinsics_mips64.h b/compiler/optimizing/intrinsics_mips64.h index 5b95c26a21..179627ab20 100644 --- a/compiler/optimizing/intrinsics_mips64.h +++ b/compiler/optimizing/intrinsics_mips64.h @@ -49,6 +49,7 @@ INTRINSICS_LIST(OPTIMIZING_INTRINSICS) bool TryDispatch(HInvoke* invoke); private: + CodeGeneratorMIPS64* codegen_; ArenaAllocator* arena_; DISALLOW_COPY_AND_ASSIGN(IntrinsicLocationsBuilderMIPS64); diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc index 8e4574774f..a9da15d2ce 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -31,7 +31,7 @@ #include "mirror/reference.h" #include "mirror/string.h" #include "scoped_thread_state_change-inl.h" -#include "thread-inl.h" +#include "thread-current-inl.h" #include "utils/x86/assembler_x86.h" #include "utils/x86/constants_x86.h" @@ -2819,65 +2819,6 @@ void IntrinsicCodeGeneratorX86::VisitLongNumberOfTrailingZeros(HInvoke* invoke) GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true); } -void IntrinsicLocationsBuilderX86::VisitReferenceGetReferent(HInvoke* invoke) { - if (kEmitCompilerReadBarrier) { - // Do not intrinsify this call with the read barrier configuration. - return; - } - LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCallOnSlowPath, - kIntrinsified); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetOut(Location::SameAsFirstInput()); - locations->AddTemp(Location::RequiresRegister()); -} - -void IntrinsicCodeGeneratorX86::VisitReferenceGetReferent(HInvoke* invoke) { - DCHECK(!kEmitCompilerReadBarrier); - LocationSummary* locations = invoke->GetLocations(); - X86Assembler* assembler = GetAssembler(); - - Register obj = locations->InAt(0).AsRegister<Register>(); - Register out = locations->Out().AsRegister<Register>(); - - SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke); - codegen_->AddSlowPath(slow_path); - - // Load ArtMethod first. - HInvokeStaticOrDirect* invoke_direct = invoke->AsInvokeStaticOrDirect(); - DCHECK(invoke_direct != nullptr); - Location temp_loc = codegen_->GenerateCalleeMethodStaticOrDirectCall( - invoke_direct, locations->GetTemp(0)); - DCHECK(temp_loc.Equals(locations->GetTemp(0))); - Register temp = temp_loc.AsRegister<Register>(); - - // Now get declaring class. 
- __ movl(temp, Address(temp, ArtMethod::DeclaringClassOffset().Int32Value())); - - uint32_t slow_path_flag_offset = codegen_->GetReferenceSlowFlagOffset(); - uint32_t disable_flag_offset = codegen_->GetReferenceDisableFlagOffset(); - DCHECK_NE(slow_path_flag_offset, 0u); - DCHECK_NE(disable_flag_offset, 0u); - DCHECK_NE(slow_path_flag_offset, disable_flag_offset); - - // Check static flags preventing us for using intrinsic. - if (slow_path_flag_offset == disable_flag_offset + 1) { - __ cmpw(Address(temp, disable_flag_offset), Immediate(0)); - __ j(kNotEqual, slow_path->GetEntryLabel()); - } else { - __ cmpb(Address(temp, disable_flag_offset), Immediate(0)); - __ j(kNotEqual, slow_path->GetEntryLabel()); - __ cmpb(Address(temp, slow_path_flag_offset), Immediate(0)); - __ j(kNotEqual, slow_path->GetEntryLabel()); - } - - // Fast path. - __ movl(out, Address(obj, mirror::Reference::ReferentOffset().Int32Value())); - codegen_->MaybeRecordImplicitNullCheck(invoke); - __ MaybeUnpoisonHeapReference(out); - __ Bind(slow_path->GetExitLabel()); -} - static bool IsSameInput(HInstruction* instruction, size_t input0, size_t input1) { return instruction->InputAt(input0) == instruction->InputAt(input1); } @@ -3407,7 +3348,29 @@ void IntrinsicCodeGeneratorX86::VisitIntegerValueOf(HInvoke* invoke) { } } +void IntrinsicLocationsBuilderX86::VisitThreadInterrupted(HInvoke* invoke) { + LocationSummary* locations = new (arena_) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetOut(Location::RequiresRegister()); +} + +void IntrinsicCodeGeneratorX86::VisitThreadInterrupted(HInvoke* invoke) { + X86Assembler* assembler = GetAssembler(); + Register out = invoke->GetLocations()->Out().AsRegister<Register>(); + Address address = Address::Absolute(Thread::InterruptedOffset<kX86PointerSize>().Int32Value()); + NearLabel done; + __ fs()->movl(out, address); + __ testl(out, out); + __ j(kEqual, &done); + __ fs()->movl(address, Immediate(0)); + codegen_->MemoryFence(); + __ Bind(&done); +} + + UNIMPLEMENTED_INTRINSIC(X86, MathRoundDouble) +UNIMPLEMENTED_INTRINSIC(X86, ReferenceGetReferent) UNIMPLEMENTED_INTRINSIC(X86, FloatIsInfinite) UNIMPLEMENTED_INTRINSIC(X86, DoubleIsInfinite) UNIMPLEMENTED_INTRINSIC(X86, IntegerHighestOneBit) diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index 8ed2ad86bf..8100645e54 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -31,7 +31,7 @@ #include "mirror/reference.h" #include "mirror/string.h" #include "scoped_thread_state_change-inl.h" -#include "thread-inl.h" +#include "thread-current-inl.h" #include "utils/x86_64/assembler_x86_64.h" #include "utils/x86_64/constants_x86_64.h" @@ -759,7 +759,7 @@ static void CreateFPToFPCallLocations(ArenaAllocator* arena, // We have to ensure that the native code doesn't clobber the XMM registers which are // non-volatile for ART, but volatile for Native calls. This will ensure that they are // saved in the prologue and properly restored. - for (auto fp_reg : non_volatile_xmm_regs) { + for (FloatRegister fp_reg : non_volatile_xmm_regs) { locations->AddTemp(Location::FpuRegisterLocation(fp_reg)); } } @@ -898,7 +898,7 @@ static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, // We have to ensure that the native code doesn't clobber the XMM registers which are // non-volatile for ART, but volatile for Native calls. This will ensure that they are // saved in the prologue and properly restored. 
- for (auto fp_reg : non_volatile_xmm_regs) { + for (FloatRegister fp_reg : non_volatile_xmm_regs) { locations->AddTemp(Location::FpuRegisterLocation(fp_reg)); } } @@ -2959,65 +2959,6 @@ void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invok GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true); } -void IntrinsicLocationsBuilderX86_64::VisitReferenceGetReferent(HInvoke* invoke) { - if (kEmitCompilerReadBarrier) { - // Do not intrinsify this call with the read barrier configuration. - return; - } - LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCallOnSlowPath, - kIntrinsified); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetOut(Location::SameAsFirstInput()); - locations->AddTemp(Location::RequiresRegister()); -} - -void IntrinsicCodeGeneratorX86_64::VisitReferenceGetReferent(HInvoke* invoke) { - DCHECK(!kEmitCompilerReadBarrier); - LocationSummary* locations = invoke->GetLocations(); - X86_64Assembler* assembler = GetAssembler(); - - CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>(); - CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - - SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke); - codegen_->AddSlowPath(slow_path); - - // Load ArtMethod first. - HInvokeStaticOrDirect* invoke_direct = invoke->AsInvokeStaticOrDirect(); - DCHECK(invoke_direct != nullptr); - Location temp_loc = codegen_->GenerateCalleeMethodStaticOrDirectCall( - invoke_direct, locations->GetTemp(0)); - DCHECK(temp_loc.Equals(locations->GetTemp(0))); - CpuRegister temp = temp_loc.AsRegister<CpuRegister>(); - - // Now get declaring class. - __ movl(temp, Address(temp, ArtMethod::DeclaringClassOffset().Int32Value())); - - uint32_t slow_path_flag_offset = codegen_->GetReferenceSlowFlagOffset(); - uint32_t disable_flag_offset = codegen_->GetReferenceDisableFlagOffset(); - DCHECK_NE(slow_path_flag_offset, 0u); - DCHECK_NE(disable_flag_offset, 0u); - DCHECK_NE(slow_path_flag_offset, disable_flag_offset); - - // Check static flags preventing us for using intrinsic. - if (slow_path_flag_offset == disable_flag_offset + 1) { - __ cmpw(Address(temp, disable_flag_offset), Immediate(0)); - __ j(kNotEqual, slow_path->GetEntryLabel()); - } else { - __ cmpb(Address(temp, disable_flag_offset), Immediate(0)); - __ j(kNotEqual, slow_path->GetEntryLabel()); - __ cmpb(Address(temp, slow_path_flag_offset), Immediate(0)); - __ j(kNotEqual, slow_path->GetEntryLabel()); - } - - // Fast path. 
- __ movl(out, Address(obj, mirror::Reference::ReferentOffset().Int32Value())); - codegen_->MaybeRecordImplicitNullCheck(invoke); - __ MaybeUnpoisonHeapReference(out); - __ Bind(slow_path->GetExitLabel()); -} - void IntrinsicLocationsBuilderX86_64::VisitIntegerValueOf(HInvoke* invoke) { InvokeRuntimeCallingConvention calling_convention; IntrinsicVisitor::ComputeIntegerValueOfLocations( @@ -3085,6 +3026,28 @@ void IntrinsicCodeGeneratorX86_64::VisitIntegerValueOf(HInvoke* invoke) { } } +void IntrinsicLocationsBuilderX86_64::VisitThreadInterrupted(HInvoke* invoke) { + LocationSummary* locations = new (arena_) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetOut(Location::RequiresRegister()); +} + +void IntrinsicCodeGeneratorX86_64::VisitThreadInterrupted(HInvoke* invoke) { + X86_64Assembler* assembler = GetAssembler(); + CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>(); + Address address = Address::Absolute + (Thread::InterruptedOffset<kX86_64PointerSize>().Int32Value(), /* no_rip */ true); + NearLabel done; + __ gs()->movl(out, address); + __ testl(out, out); + __ j(kEqual, &done); + __ gs()->movl(address, Immediate(0)); + codegen_->MemoryFence(); + __ Bind(&done); +} + +UNIMPLEMENTED_INTRINSIC(X86_64, ReferenceGetReferent) UNIMPLEMENTED_INTRINSIC(X86_64, FloatIsInfinite) UNIMPLEMENTED_INTRINSIC(X86_64, DoubleIsInfinite) diff --git a/compiler/optimizing/load_store_analysis.cc b/compiler/optimizing/load_store_analysis.cc new file mode 100644 index 0000000000..f2ee345c8c --- /dev/null +++ b/compiler/optimizing/load_store_analysis.cc @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "load_store_analysis.h" + +namespace art { + +// A cap for the number of heap locations to prevent pathological time/space consumption. +// The number of heap locations for most of the methods stays below this threshold. +constexpr size_t kMaxNumberOfHeapLocations = 32; + +void LoadStoreAnalysis::Run() { + for (HBasicBlock* block : graph_->GetReversePostOrder()) { + heap_location_collector_.VisitBasicBlock(block); + } + + if (heap_location_collector_.GetNumberOfHeapLocations() > kMaxNumberOfHeapLocations) { + // Bail out if there are too many heap locations to deal with. + heap_location_collector_.CleanUp(); + return; + } + if (!heap_location_collector_.HasHeapStores()) { + // Without heap stores, this pass would act mostly as GVN on heap accesses. + heap_location_collector_.CleanUp(); + return; + } + if (heap_location_collector_.HasVolatile() || heap_location_collector_.HasMonitorOps()) { + // Don't do load/store elimination if the method has volatile field accesses or + // monitor operations, for now. + // TODO: do it right. 
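// Annotation (not part of the patch): the "singleton" terminology used by the new
// load_store_analysis files, on a hypothetical Java snippet:
//
//   Point p = new Point();   // singleton: p is the only name for the object
//   p.x = 1;                 // store to a singleton heap location
//   return p;                // still a singleton, but returned => not removable
//
// IsSingletonAndRemovable() is the property that later lets load/store elimination drop
// the allocation and its stores, while HasIndexAliasing() records that arr[i] and arr[j]
// may still name the same element even when arr itself is a singleton.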
+ heap_location_collector_.CleanUp(); + return; + } + + heap_location_collector_.BuildAliasingMatrix(); +} + +} // namespace art diff --git a/compiler/optimizing/load_store_analysis.h b/compiler/optimizing/load_store_analysis.h new file mode 100644 index 0000000000..4e940f30bf --- /dev/null +++ b/compiler/optimizing/load_store_analysis.h @@ -0,0 +1,518 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_LOAD_STORE_ANALYSIS_H_ +#define ART_COMPILER_OPTIMIZING_LOAD_STORE_ANALYSIS_H_ + +#include "escape.h" +#include "nodes.h" +#include "optimization.h" + +namespace art { + +// A ReferenceInfo contains additional info about a reference such as +// whether it's a singleton, returned, etc. +class ReferenceInfo : public ArenaObject<kArenaAllocMisc> { + public: + ReferenceInfo(HInstruction* reference, size_t pos) + : reference_(reference), + position_(pos), + is_singleton_(true), + is_singleton_and_not_returned_(true), + is_singleton_and_not_deopt_visible_(true), + has_index_aliasing_(false) { + CalculateEscape(reference_, + nullptr, + &is_singleton_, + &is_singleton_and_not_returned_, + &is_singleton_and_not_deopt_visible_); + } + + HInstruction* GetReference() const { + return reference_; + } + + size_t GetPosition() const { + return position_; + } + + // Returns true if reference_ is the only name that can refer to its value during + // the lifetime of the method. So it's guaranteed to not have any alias in + // the method (including its callees). + bool IsSingleton() const { + return is_singleton_; + } + + // Returns true if reference_ is a singleton and not returned to the caller or + // used as an environment local of an HDeoptimize instruction. + // The allocation and stores into reference_ may be eliminated for such cases. + bool IsSingletonAndRemovable() const { + return is_singleton_and_not_returned_ && is_singleton_and_not_deopt_visible_; + } + + // Returns true if reference_ is a singleton and returned to the caller or + // used as an environment local of an HDeoptimize instruction. + bool IsSingletonAndNonRemovable() const { + return is_singleton_ && + (!is_singleton_and_not_returned_ || !is_singleton_and_not_deopt_visible_); + } + + bool HasIndexAliasing() { + return has_index_aliasing_; + } + + void SetHasIndexAliasing(bool has_index_aliasing) { + // Only allow setting to true. + DCHECK(has_index_aliasing); + has_index_aliasing_ = has_index_aliasing; + } + + private: + HInstruction* const reference_; + const size_t position_; // position in HeapLocationCollector's ref_info_array_. + + // Can only be referred to by a single name in the method. + bool is_singleton_; + // Is singleton and not returned to caller. + bool is_singleton_and_not_returned_; + // Is singleton and not used as an environment local of HDeoptimize. + bool is_singleton_and_not_deopt_visible_; + // Some heap locations with reference_ have array index aliasing, + // e.g. 
arr[i] and arr[j] may be the same location. + bool has_index_aliasing_; + + DISALLOW_COPY_AND_ASSIGN(ReferenceInfo); +}; + +// A heap location is a reference-offset/index pair that a value can be loaded from +// or stored to. +class HeapLocation : public ArenaObject<kArenaAllocMisc> { + public: + static constexpr size_t kInvalidFieldOffset = -1; + + // TODO: more fine-grained array types. + static constexpr int16_t kDeclaringClassDefIndexForArrays = -1; + + HeapLocation(ReferenceInfo* ref_info, + size_t offset, + HInstruction* index, + int16_t declaring_class_def_index) + : ref_info_(ref_info), + offset_(offset), + index_(index), + declaring_class_def_index_(declaring_class_def_index), + value_killed_by_loop_side_effects_(true) { + DCHECK(ref_info != nullptr); + DCHECK((offset == kInvalidFieldOffset && index != nullptr) || + (offset != kInvalidFieldOffset && index == nullptr)); + if (ref_info->IsSingleton() && !IsArrayElement()) { + // Assume this location's value cannot be killed by loop side effects + // until proven otherwise. + value_killed_by_loop_side_effects_ = false; + } + } + + ReferenceInfo* GetReferenceInfo() const { return ref_info_; } + size_t GetOffset() const { return offset_; } + HInstruction* GetIndex() const { return index_; } + + // Returns the definition of declaring class' dex index. + // It's kDeclaringClassDefIndexForArrays for an array element. + int16_t GetDeclaringClassDefIndex() const { + return declaring_class_def_index_; + } + + bool IsArrayElement() const { + return index_ != nullptr; + } + + bool IsValueKilledByLoopSideEffects() const { + return value_killed_by_loop_side_effects_; + } + + void SetValueKilledByLoopSideEffects(bool val) { + value_killed_by_loop_side_effects_ = val; + } + + private: + ReferenceInfo* const ref_info_; // reference for instance/static field or array access. + const size_t offset_; // offset of static/instance field. + HInstruction* const index_; // index of an array element. + const int16_t declaring_class_def_index_; // declaring class's def's dex index. + bool value_killed_by_loop_side_effects_; // value of this location may be killed by loop + // side effects because this location is stored + // into inside a loop. This gives + // better info on whether a singleton's location + // value may be killed by loop side effects. + + DISALLOW_COPY_AND_ASSIGN(HeapLocation); +}; + +// A HeapLocationCollector collects all relevant heap locations and keeps +// an aliasing matrix for all locations. +class HeapLocationCollector : public HGraphVisitor { + public: + static constexpr size_t kHeapLocationNotFound = -1; + // Start with a single uint32_t word. That's enough bits for pair-wise + // aliasing matrix of 8 heap locations. 
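As a quick check of the sizing comment above, a standalone C++ sketch (illustrative, not part of the patch): a pair-wise aliasing matrix over n locations needs n*(n-1)/2 bits, so the initial 32-bit word is indeed enough for 8 locations, and larger methods simply need a longer bit vector.

#include <cstddef>

// Number of unordered pairs among n heap locations.
constexpr size_t PairCount(size_t n) { return n * (n - 1) / 2; }

static_assert(PairCount(8) == 28, "8 locations need 28 pair-wise aliasing bits");
static_assert(PairCount(8) <= 32, "so they fit in the initial 32-bit word");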
+ static constexpr uint32_t kInitialAliasingMatrixBitVectorSize = 32; + + explicit HeapLocationCollector(HGraph* graph) + : HGraphVisitor(graph), + ref_info_array_(graph->GetArena()->Adapter(kArenaAllocLSE)), + heap_locations_(graph->GetArena()->Adapter(kArenaAllocLSE)), + aliasing_matrix_(graph->GetArena(), + kInitialAliasingMatrixBitVectorSize, + true, + kArenaAllocLSE), + has_heap_stores_(false), + has_volatile_(false), + has_monitor_operations_(false) {} + + void CleanUp() { + heap_locations_.clear(); + ref_info_array_.clear(); + } + + size_t GetNumberOfHeapLocations() const { + return heap_locations_.size(); + } + + HeapLocation* GetHeapLocation(size_t index) const { + return heap_locations_[index]; + } + + HInstruction* HuntForOriginalReference(HInstruction* ref) const { + DCHECK(ref != nullptr); + while (ref->IsNullCheck() || ref->IsBoundType()) { + ref = ref->InputAt(0); + } + return ref; + } + + ReferenceInfo* FindReferenceInfoOf(HInstruction* ref) const { + for (size_t i = 0; i < ref_info_array_.size(); i++) { + ReferenceInfo* ref_info = ref_info_array_[i]; + if (ref_info->GetReference() == ref) { + DCHECK_EQ(i, ref_info->GetPosition()); + return ref_info; + } + } + return nullptr; + } + + bool HasHeapStores() const { + return has_heap_stores_; + } + + bool HasVolatile() const { + return has_volatile_; + } + + bool HasMonitorOps() const { + return has_monitor_operations_; + } + + // Find and return the heap location index in heap_locations_. + size_t FindHeapLocationIndex(ReferenceInfo* ref_info, + size_t offset, + HInstruction* index, + int16_t declaring_class_def_index) const { + for (size_t i = 0; i < heap_locations_.size(); i++) { + HeapLocation* loc = heap_locations_[i]; + if (loc->GetReferenceInfo() == ref_info && + loc->GetOffset() == offset && + loc->GetIndex() == index && + loc->GetDeclaringClassDefIndex() == declaring_class_def_index) { + return i; + } + } + return kHeapLocationNotFound; + } + + // Returns true if heap_locations_[index1] and heap_locations_[index2] may alias. + bool MayAlias(size_t index1, size_t index2) const { + if (index1 < index2) { + return aliasing_matrix_.IsBitSet(AliasingMatrixPosition(index1, index2)); + } else if (index1 > index2) { + return aliasing_matrix_.IsBitSet(AliasingMatrixPosition(index2, index1)); + } else { + DCHECK(false) << "index1 and index2 are expected to be different"; + return true; + } + } + + void BuildAliasingMatrix() { + const size_t number_of_locations = heap_locations_.size(); + if (number_of_locations == 0) { + return; + } + size_t pos = 0; + // Compute aliasing info between every pair of different heap locations. + // Save the result in a matrix represented as a BitVector. + for (size_t i = 0; i < number_of_locations - 1; i++) { + for (size_t j = i + 1; j < number_of_locations; j++) { + if (ComputeMayAlias(i, j)) { + aliasing_matrix_.SetBit(CheckedAliasingMatrixPosition(i, j, pos)); + } + pos++; + } + } + } + + private: + // An allocation cannot alias with a name which already exists at the point + // of the allocation, such as a parameter or a load happening before the allocation. + bool MayAliasWithPreexistenceChecking(ReferenceInfo* ref_info1, ReferenceInfo* ref_info2) const { + if (ref_info1->GetReference()->IsNewInstance() || ref_info1->GetReference()->IsNewArray()) { + // Any reference that can alias with the allocation must appear after it in the block/in + // the block's successors. In reverse post order, those instructions will be visited after + // the allocation. 
+ return ref_info2->GetPosition() >= ref_info1->GetPosition(); + } + return true; + } + + bool CanReferencesAlias(ReferenceInfo* ref_info1, ReferenceInfo* ref_info2) const { + if (ref_info1 == ref_info2) { + return true; + } else if (ref_info1->IsSingleton()) { + return false; + } else if (ref_info2->IsSingleton()) { + return false; + } else if (!MayAliasWithPreexistenceChecking(ref_info1, ref_info2) || + !MayAliasWithPreexistenceChecking(ref_info2, ref_info1)) { + return false; + } + return true; + } + + // `index1` and `index2` are indices in the array of collected heap locations. + // Returns the position in the bit vector that tracks whether the two heap + // locations may alias. + size_t AliasingMatrixPosition(size_t index1, size_t index2) const { + DCHECK(index2 > index1); + const size_t number_of_locations = heap_locations_.size(); + // It's (num_of_locations - 1) + ... + (num_of_locations - index1) + (index2 - index1 - 1). + return (number_of_locations * index1 - (1 + index1) * index1 / 2 + (index2 - index1 - 1)); + } + + // An additional position is passed in to make sure the calculated position is correct. + size_t CheckedAliasingMatrixPosition(size_t index1, size_t index2, size_t position) { + size_t calculated_position = AliasingMatrixPosition(index1, index2); + DCHECK_EQ(calculated_position, position); + return calculated_position; + } + + // Compute if two locations may alias to each other. + bool ComputeMayAlias(size_t index1, size_t index2) const { + HeapLocation* loc1 = heap_locations_[index1]; + HeapLocation* loc2 = heap_locations_[index2]; + if (loc1->GetOffset() != loc2->GetOffset()) { + // Either two different instance fields, or one is an instance + // field and the other is an array element. + return false; + } + if (loc1->GetDeclaringClassDefIndex() != loc2->GetDeclaringClassDefIndex()) { + // Different types. + return false; + } + if (!CanReferencesAlias(loc1->GetReferenceInfo(), loc2->GetReferenceInfo())) { + return false; + } + if (loc1->IsArrayElement() && loc2->IsArrayElement()) { + HInstruction* array_index1 = loc1->GetIndex(); + HInstruction* array_index2 = loc2->GetIndex(); + DCHECK(array_index1 != nullptr); + DCHECK(array_index2 != nullptr); + if (array_index1->IsIntConstant() && + array_index2->IsIntConstant() && + array_index1->AsIntConstant()->GetValue() != array_index2->AsIntConstant()->GetValue()) { + // Different constant indices do not alias. 
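The closed-form position used by AliasingMatrixPosition() above can be cross-checked against the sequential pos counter that BuildAliasingMatrix() increments; a minimal standalone sketch, in plain C++ rather than ART types:

#include <cassert>
#include <cstddef>

// Same formula as AliasingMatrixPosition(), for n locations and indices i < j.
size_t MatrixPos(size_t n, size_t i, size_t j) {
  return n * i - (1 + i) * i / 2 + (j - i - 1);
}

int main() {
  const size_t n = 8;
  size_t pos = 0;  // sequential counter, as in BuildAliasingMatrix()
  for (size_t i = 0; i + 1 < n; ++i) {
    for (size_t j = i + 1; j < n; ++j) {
      assert(MatrixPos(n, i, j) == pos);
      ++pos;
    }
  }
  return 0;
}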
+ return false; + } + ReferenceInfo* ref_info = loc1->GetReferenceInfo(); + ref_info->SetHasIndexAliasing(true); + } + return true; + } + + ReferenceInfo* GetOrCreateReferenceInfo(HInstruction* instruction) { + ReferenceInfo* ref_info = FindReferenceInfoOf(instruction); + if (ref_info == nullptr) { + size_t pos = ref_info_array_.size(); + ref_info = new (GetGraph()->GetArena()) ReferenceInfo(instruction, pos); + ref_info_array_.push_back(ref_info); + } + return ref_info; + } + + void CreateReferenceInfoForReferenceType(HInstruction* instruction) { + if (instruction->GetType() != Primitive::kPrimNot) { + return; + } + DCHECK(FindReferenceInfoOf(instruction) == nullptr); + GetOrCreateReferenceInfo(instruction); + } + + HeapLocation* GetOrCreateHeapLocation(HInstruction* ref, + size_t offset, + HInstruction* index, + int16_t declaring_class_def_index) { + HInstruction* original_ref = HuntForOriginalReference(ref); + ReferenceInfo* ref_info = GetOrCreateReferenceInfo(original_ref); + size_t heap_location_idx = FindHeapLocationIndex( + ref_info, offset, index, declaring_class_def_index); + if (heap_location_idx == kHeapLocationNotFound) { + HeapLocation* heap_loc = new (GetGraph()->GetArena()) + HeapLocation(ref_info, offset, index, declaring_class_def_index); + heap_locations_.push_back(heap_loc); + return heap_loc; + } + return heap_locations_[heap_location_idx]; + } + + HeapLocation* VisitFieldAccess(HInstruction* ref, const FieldInfo& field_info) { + if (field_info.IsVolatile()) { + has_volatile_ = true; + } + const uint16_t declaring_class_def_index = field_info.GetDeclaringClassDefIndex(); + const size_t offset = field_info.GetFieldOffset().SizeValue(); + return GetOrCreateHeapLocation(ref, offset, nullptr, declaring_class_def_index); + } + + void VisitArrayAccess(HInstruction* array, HInstruction* index) { + GetOrCreateHeapLocation(array, HeapLocation::kInvalidFieldOffset, + index, HeapLocation::kDeclaringClassDefIndexForArrays); + } + + void VisitInstanceFieldGet(HInstanceFieldGet* instruction) OVERRIDE { + VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo()); + CreateReferenceInfoForReferenceType(instruction); + } + + void VisitInstanceFieldSet(HInstanceFieldSet* instruction) OVERRIDE { + HeapLocation* location = VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo()); + has_heap_stores_ = true; + if (location->GetReferenceInfo()->IsSingleton()) { + // A singleton's location value may be killed by loop side effects if it's + // defined before that loop, and it's stored into inside that loop. + HLoopInformation* loop_info = instruction->GetBlock()->GetLoopInformation(); + if (loop_info != nullptr) { + HInstruction* ref = location->GetReferenceInfo()->GetReference(); + DCHECK(ref->IsNewInstance()); + if (loop_info->IsDefinedOutOfTheLoop(ref)) { + // ref's location value may be killed by this loop's side effects. + location->SetValueKilledByLoopSideEffects(true); + } else { + // ref is defined inside this loop so this loop's side effects cannot + // kill its location value at the loop header since ref/its location doesn't + // exist yet at the loop header. + } + } + } else { + // For non-singletons, value_killed_by_loop_side_effects_ is inited to + // true. 
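For readers less familiar with the flag handled above, a minimal source-level illustration of "defined before the loop, stored into inside the loop" (plain C++ standing in for the compiled code; the names are made up):

struct Point { int x; };

int Sum(int n) {
  Point* p = new Point();   // singleton: no other name ever refers to this allocation
  p->x = 0;                 // store before the loop
  int total = 0;
  for (int i = 0; i < n; ++i) {
    total += p->x;          // at the loop header, the previously cached value of p->x
    p->x = i;               // has been killed by this store inside the loop body
  }
  delete p;
  return total;
}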
+ DCHECK_EQ(location->IsValueKilledByLoopSideEffects(), true); + } + } + + void VisitStaticFieldGet(HStaticFieldGet* instruction) OVERRIDE { + VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo()); + CreateReferenceInfoForReferenceType(instruction); + } + + void VisitStaticFieldSet(HStaticFieldSet* instruction) OVERRIDE { + VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo()); + has_heap_stores_ = true; + } + + // We intentionally don't collect HUnresolvedInstanceField/HUnresolvedStaticField accesses + // since we cannot accurately track the fields. + + void VisitArrayGet(HArrayGet* instruction) OVERRIDE { + VisitArrayAccess(instruction->InputAt(0), instruction->InputAt(1)); + CreateReferenceInfoForReferenceType(instruction); + } + + void VisitArraySet(HArraySet* instruction) OVERRIDE { + VisitArrayAccess(instruction->InputAt(0), instruction->InputAt(1)); + has_heap_stores_ = true; + } + + void VisitNewInstance(HNewInstance* new_instance) OVERRIDE { + // Any references appearing in the ref_info_array_ so far cannot alias with new_instance. + CreateReferenceInfoForReferenceType(new_instance); + } + + void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* instruction) OVERRIDE { + CreateReferenceInfoForReferenceType(instruction); + } + + void VisitInvokeVirtual(HInvokeVirtual* instruction) OVERRIDE { + CreateReferenceInfoForReferenceType(instruction); + } + + void VisitInvokeInterface(HInvokeInterface* instruction) OVERRIDE { + CreateReferenceInfoForReferenceType(instruction); + } + + void VisitParameterValue(HParameterValue* instruction) OVERRIDE { + CreateReferenceInfoForReferenceType(instruction); + } + + void VisitSelect(HSelect* instruction) OVERRIDE { + CreateReferenceInfoForReferenceType(instruction); + } + + void VisitMonitorOperation(HMonitorOperation* monitor ATTRIBUTE_UNUSED) OVERRIDE { + has_monitor_operations_ = true; + } + + ArenaVector<ReferenceInfo*> ref_info_array_; // All references used for heap accesses. + ArenaVector<HeapLocation*> heap_locations_; // All heap locations. + ArenaBitVector aliasing_matrix_; // aliasing info between each pair of locations. + bool has_heap_stores_; // If there is no heap stores, LSE acts as GVN with better + // alias analysis and won't be as effective. + bool has_volatile_; // If there are volatile field accesses. + bool has_monitor_operations_; // If there are monitor operations. + + DISALLOW_COPY_AND_ASSIGN(HeapLocationCollector); +}; + +class LoadStoreAnalysis : public HOptimization { + public: + explicit LoadStoreAnalysis(HGraph* graph) + : HOptimization(graph, kLoadStoreAnalysisPassName), + heap_location_collector_(graph) {} + + const HeapLocationCollector& GetHeapLocationCollector() const { + return heap_location_collector_; + } + + void Run() OVERRIDE; + + static constexpr const char* kLoadStoreAnalysisPassName = "load_store_analysis"; + + private: + HeapLocationCollector heap_location_collector_; + + DISALLOW_COPY_AND_ASSIGN(LoadStoreAnalysis); +}; + +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_LOAD_STORE_ANALYSIS_H_ diff --git a/compiler/optimizing/load_store_analysis_test.cc b/compiler/optimizing/load_store_analysis_test.cc new file mode 100644 index 0000000000..24187777f6 --- /dev/null +++ b/compiler/optimizing/load_store_analysis_test.cc @@ -0,0 +1,187 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "load_store_analysis.h" +#include "nodes.h" +#include "optimizing_unit_test.h" + +#include "gtest/gtest.h" + +namespace art { + +class LoadStoreAnalysisTest : public CommonCompilerTest { + public: + LoadStoreAnalysisTest() : pool_(), allocator_(&pool_) { + graph_ = CreateGraph(&allocator_); + } + + ArenaPool pool_; + ArenaAllocator allocator_; + HGraph* graph_; +}; + +TEST_F(LoadStoreAnalysisTest, ArrayHeapLocations) { + HBasicBlock* entry = new (&allocator_) HBasicBlock(graph_); + graph_->AddBlock(entry); + graph_->SetEntryBlock(entry); + + // entry: + // array ParameterValue + // index ParameterValue + // c1 IntConstant + // c2 IntConstant + // c3 IntConstant + // array_get1 ArrayGet [array, c1] + // array_get2 ArrayGet [array, c2] + // array_set1 ArraySet [array, c1, c3] + // array_set2 ArraySet [array, index, c3] + HInstruction* array = new (&allocator_) HParameterValue( + graph_->GetDexFile(), dex::TypeIndex(0), 0, Primitive::kPrimNot); + HInstruction* index = new (&allocator_) HParameterValue( + graph_->GetDexFile(), dex::TypeIndex(1), 1, Primitive::kPrimInt); + HInstruction* c1 = graph_->GetIntConstant(1); + HInstruction* c2 = graph_->GetIntConstant(2); + HInstruction* c3 = graph_->GetIntConstant(3); + HInstruction* array_get1 = new (&allocator_) HArrayGet(array, c1, Primitive::kPrimInt, 0); + HInstruction* array_get2 = new (&allocator_) HArrayGet(array, c2, Primitive::kPrimInt, 0); + HInstruction* array_set1 = new (&allocator_) HArraySet(array, c1, c3, Primitive::kPrimInt, 0); + HInstruction* array_set2 = new (&allocator_) HArraySet(array, index, c3, Primitive::kPrimInt, 0); + entry->AddInstruction(array); + entry->AddInstruction(index); + entry->AddInstruction(array_get1); + entry->AddInstruction(array_get2); + entry->AddInstruction(array_set1); + entry->AddInstruction(array_set2); + + // Test HeapLocationCollector initialization. + // Should be no heap locations, no operations on the heap. + HeapLocationCollector heap_location_collector(graph_); + ASSERT_EQ(heap_location_collector.GetNumberOfHeapLocations(), 0U); + ASSERT_FALSE(heap_location_collector.HasHeapStores()); + + // Test that after visiting the graph_, it must see following heap locations + // array[c1], array[c2], array[index]; and it should see heap stores. + heap_location_collector.VisitBasicBlock(entry); + ASSERT_EQ(heap_location_collector.GetNumberOfHeapLocations(), 3U); + ASSERT_TRUE(heap_location_collector.HasHeapStores()); + + // Test queries on HeapLocationCollector's ref info and index records. + ReferenceInfo* ref = heap_location_collector.FindReferenceInfoOf(array); + size_t field_off = HeapLocation::kInvalidFieldOffset; + size_t class_def = HeapLocation::kDeclaringClassDefIndexForArrays; + size_t loc1 = heap_location_collector.FindHeapLocationIndex(ref, field_off, c1, class_def); + size_t loc2 = heap_location_collector.FindHeapLocationIndex(ref, field_off, c2, class_def); + size_t loc3 = heap_location_collector.FindHeapLocationIndex(ref, field_off, index, class_def); + // must find this reference info for array in HeapLocationCollector. 
+ ASSERT_TRUE(ref != nullptr); + // must find these heap locations; + // and array[1], array[2], array[3] should be different heap locations. + ASSERT_TRUE(loc1 != HeapLocationCollector::kHeapLocationNotFound); + ASSERT_TRUE(loc2 != HeapLocationCollector::kHeapLocationNotFound); + ASSERT_TRUE(loc3 != HeapLocationCollector::kHeapLocationNotFound); + ASSERT_TRUE(loc1 != loc2); + ASSERT_TRUE(loc2 != loc3); + ASSERT_TRUE(loc1 != loc3); + + // Test alias relationships after building aliasing matrix. + // array[1] and array[2] clearly should not alias; + // array[index] should alias with the others, because index is an unknow value. + heap_location_collector.BuildAliasingMatrix(); + ASSERT_FALSE(heap_location_collector.MayAlias(loc1, loc2)); + ASSERT_TRUE(heap_location_collector.MayAlias(loc1, loc3)); + ASSERT_TRUE(heap_location_collector.MayAlias(loc1, loc3)); +} + +TEST_F(LoadStoreAnalysisTest, FieldHeapLocations) { + HBasicBlock* entry = new (&allocator_) HBasicBlock(graph_); + graph_->AddBlock(entry); + graph_->SetEntryBlock(entry); + + // entry: + // object ParameterValue + // c1 IntConstant + // set_field10 InstanceFieldSet [object, c1, 10] + // get_field10 InstanceFieldGet [object, 10] + // get_field20 InstanceFieldGet [object, 20] + + HInstruction* c1 = graph_->GetIntConstant(1); + HInstruction* object = new (&allocator_) HParameterValue(graph_->GetDexFile(), + dex::TypeIndex(0), + 0, + Primitive::kPrimNot); + HInstanceFieldSet* set_field10 = new (&allocator_) HInstanceFieldSet(object, + c1, + nullptr, + Primitive::kPrimInt, + MemberOffset(10), + false, + kUnknownFieldIndex, + kUnknownClassDefIndex, + graph_->GetDexFile(), + 0); + HInstanceFieldGet* get_field10 = new (&allocator_) HInstanceFieldGet(object, + nullptr, + Primitive::kPrimInt, + MemberOffset(10), + false, + kUnknownFieldIndex, + kUnknownClassDefIndex, + graph_->GetDexFile(), + 0); + HInstanceFieldGet* get_field20 = new (&allocator_) HInstanceFieldGet(object, + nullptr, + Primitive::kPrimInt, + MemberOffset(20), + false, + kUnknownFieldIndex, + kUnknownClassDefIndex, + graph_->GetDexFile(), + 0); + entry->AddInstruction(object); + entry->AddInstruction(set_field10); + entry->AddInstruction(get_field10); + entry->AddInstruction(get_field20); + + // Test HeapLocationCollector initialization. + // Should be no heap locations, no operations on the heap. + HeapLocationCollector heap_location_collector(graph_); + ASSERT_EQ(heap_location_collector.GetNumberOfHeapLocations(), 0U); + ASSERT_FALSE(heap_location_collector.HasHeapStores()); + + // Test that after visiting the graph, it must see following heap locations + // object.field10, object.field20 and it should see heap stores. + heap_location_collector.VisitBasicBlock(entry); + ASSERT_EQ(heap_location_collector.GetNumberOfHeapLocations(), 2U); + ASSERT_TRUE(heap_location_collector.HasHeapStores()); + + // Test queries on HeapLocationCollector's ref info and index records. + ReferenceInfo* ref = heap_location_collector.FindReferenceInfoOf(object); + size_t loc1 = heap_location_collector.FindHeapLocationIndex( + ref, 10, nullptr, kUnknownClassDefIndex); + size_t loc2 = heap_location_collector.FindHeapLocationIndex( + ref, 20, nullptr, kUnknownClassDefIndex); + // must find references info for object and in HeapLocationCollector. + ASSERT_TRUE(ref != nullptr); + // must find these heap locations. + ASSERT_TRUE(loc1 != HeapLocationCollector::kHeapLocationNotFound); + ASSERT_TRUE(loc2 != HeapLocationCollector::kHeapLocationNotFound); + // different fields of same object. 
+ ASSERT_TRUE(loc1 != loc2); + // accesses to different fields of the same object should not alias. + ASSERT_FALSE(heap_location_collector.MayAlias(loc1, loc2)); +} + +} // namespace art diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc index 48699b33ae..211528b4bd 100644 --- a/compiler/optimizing/load_store_elimination.cc +++ b/compiler/optimizing/load_store_elimination.cc @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "load_store_analysis.h" #include "load_store_elimination.h" #include "escape.h" @@ -23,477 +24,6 @@ namespace art { -class ReferenceInfo; - -// A cap for the number of heap locations to prevent pathological time/space consumption. -// The number of heap locations for most of the methods stays below this threshold. -constexpr size_t kMaxNumberOfHeapLocations = 32; - -// A ReferenceInfo contains additional info about a reference such as -// whether it's a singleton, returned, etc. -class ReferenceInfo : public ArenaObject<kArenaAllocMisc> { - public: - ReferenceInfo(HInstruction* reference, size_t pos) - : reference_(reference), - position_(pos), - is_singleton_(true), - is_singleton_and_not_returned_(true), - is_singleton_and_not_deopt_visible_(true), - has_index_aliasing_(false) { - CalculateEscape(reference_, - nullptr, - &is_singleton_, - &is_singleton_and_not_returned_, - &is_singleton_and_not_deopt_visible_); - } - - HInstruction* GetReference() const { - return reference_; - } - - size_t GetPosition() const { - return position_; - } - - // Returns true if reference_ is the only name that can refer to its value during - // the lifetime of the method. So it's guaranteed to not have any alias in - // the method (including its callees). - bool IsSingleton() const { - return is_singleton_; - } - - // Returns true if reference_ is a singleton and not returned to the caller or - // used as an environment local of an HDeoptimize instruction. - // The allocation and stores into reference_ may be eliminated for such cases. - bool IsSingletonAndRemovable() const { - return is_singleton_and_not_returned_ && is_singleton_and_not_deopt_visible_; - } - - // Returns true if reference_ is a singleton and returned to the caller or - // used as an environment local of an HDeoptimize instruction. - bool IsSingletonAndNonRemovable() const { - return is_singleton_ && - (!is_singleton_and_not_returned_ || !is_singleton_and_not_deopt_visible_); - } - - bool HasIndexAliasing() { - return has_index_aliasing_; - } - - void SetHasIndexAliasing(bool has_index_aliasing) { - // Only allow setting to true. - DCHECK(has_index_aliasing); - has_index_aliasing_ = has_index_aliasing; - } - - private: - HInstruction* const reference_; - const size_t position_; // position in HeapLocationCollector's ref_info_array_. - - // Can only be referred to by a single name in the method. - bool is_singleton_; - // Is singleton and not returned to caller. - bool is_singleton_and_not_returned_; - // Is singleton and not used as an environment local of HDeoptimize. - bool is_singleton_and_not_deopt_visible_; - // Some heap locations with reference_ have array index aliasing, - // e.g. arr[i] and arr[j] may be the same location. - bool has_index_aliasing_; - - DISALLOW_COPY_AND_ASSIGN(ReferenceInfo); -}; - -// A heap location is a reference-offset/index pair that a value can be loaded from -// or stored to. 
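The two tests above exercise the collector one basic block at a time; a hypothetical companion test (sketch only, the test name is made up) could instead go through the pass entry point declared in load_store_analysis.h, which visits the blocks in reverse post order and builds the aliasing matrix itself:

TEST_F(LoadStoreAnalysisTest, RunWholePass) {
  HBasicBlock* entry = new (&allocator_) HBasicBlock(graph_);
  graph_->AddBlock(entry);
  graph_->SetEntryBlock(entry);

  LoadStoreAnalysis lsa(graph_);
  lsa.Run();
  const HeapLocationCollector& collector = lsa.GetHeapLocationCollector();
  // With no heap accesses (and hence no heap stores), Run() cleans up after
  // itself, so no heap locations are reported.
  ASSERT_EQ(collector.GetNumberOfHeapLocations(), 0U);
}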
-class HeapLocation : public ArenaObject<kArenaAllocMisc> { - public: - static constexpr size_t kInvalidFieldOffset = -1; - - // TODO: more fine-grained array types. - static constexpr int16_t kDeclaringClassDefIndexForArrays = -1; - - HeapLocation(ReferenceInfo* ref_info, - size_t offset, - HInstruction* index, - int16_t declaring_class_def_index) - : ref_info_(ref_info), - offset_(offset), - index_(index), - declaring_class_def_index_(declaring_class_def_index), - value_killed_by_loop_side_effects_(true) { - DCHECK(ref_info != nullptr); - DCHECK((offset == kInvalidFieldOffset && index != nullptr) || - (offset != kInvalidFieldOffset && index == nullptr)); - if (ref_info->IsSingleton() && !IsArrayElement()) { - // Assume this location's value cannot be killed by loop side effects - // until proven otherwise. - value_killed_by_loop_side_effects_ = false; - } - } - - ReferenceInfo* GetReferenceInfo() const { return ref_info_; } - size_t GetOffset() const { return offset_; } - HInstruction* GetIndex() const { return index_; } - - // Returns the definition of declaring class' dex index. - // It's kDeclaringClassDefIndexForArrays for an array element. - int16_t GetDeclaringClassDefIndex() const { - return declaring_class_def_index_; - } - - bool IsArrayElement() const { - return index_ != nullptr; - } - - bool IsValueKilledByLoopSideEffects() const { - return value_killed_by_loop_side_effects_; - } - - void SetValueKilledByLoopSideEffects(bool val) { - value_killed_by_loop_side_effects_ = val; - } - - private: - ReferenceInfo* const ref_info_; // reference for instance/static field or array access. - const size_t offset_; // offset of static/instance field. - HInstruction* const index_; // index of an array element. - const int16_t declaring_class_def_index_; // declaring class's def's dex index. - bool value_killed_by_loop_side_effects_; // value of this location may be killed by loop - // side effects because this location is stored - // into inside a loop. This gives - // better info on whether a singleton's location - // value may be killed by loop side effects. - - DISALLOW_COPY_AND_ASSIGN(HeapLocation); -}; - -static HInstruction* HuntForOriginalReference(HInstruction* ref) { - DCHECK(ref != nullptr); - while (ref->IsNullCheck() || ref->IsBoundType()) { - ref = ref->InputAt(0); - } - return ref; -} - -// A HeapLocationCollector collects all relevant heap locations and keeps -// an aliasing matrix for all locations. -class HeapLocationCollector : public HGraphVisitor { - public: - static constexpr size_t kHeapLocationNotFound = -1; - // Start with a single uint32_t word. That's enough bits for pair-wise - // aliasing matrix of 8 heap locations. 
- static constexpr uint32_t kInitialAliasingMatrixBitVectorSize = 32; - - explicit HeapLocationCollector(HGraph* graph) - : HGraphVisitor(graph), - ref_info_array_(graph->GetArena()->Adapter(kArenaAllocLSE)), - heap_locations_(graph->GetArena()->Adapter(kArenaAllocLSE)), - aliasing_matrix_(graph->GetArena(), - kInitialAliasingMatrixBitVectorSize, - true, - kArenaAllocLSE), - has_heap_stores_(false), - has_volatile_(false), - has_monitor_operations_(false) {} - - size_t GetNumberOfHeapLocations() const { - return heap_locations_.size(); - } - - HeapLocation* GetHeapLocation(size_t index) const { - return heap_locations_[index]; - } - - ReferenceInfo* FindReferenceInfoOf(HInstruction* ref) const { - for (size_t i = 0; i < ref_info_array_.size(); i++) { - ReferenceInfo* ref_info = ref_info_array_[i]; - if (ref_info->GetReference() == ref) { - DCHECK_EQ(i, ref_info->GetPosition()); - return ref_info; - } - } - return nullptr; - } - - bool HasHeapStores() const { - return has_heap_stores_; - } - - bool HasVolatile() const { - return has_volatile_; - } - - bool HasMonitorOps() const { - return has_monitor_operations_; - } - - // Find and return the heap location index in heap_locations_. - size_t FindHeapLocationIndex(ReferenceInfo* ref_info, - size_t offset, - HInstruction* index, - int16_t declaring_class_def_index) const { - for (size_t i = 0; i < heap_locations_.size(); i++) { - HeapLocation* loc = heap_locations_[i]; - if (loc->GetReferenceInfo() == ref_info && - loc->GetOffset() == offset && - loc->GetIndex() == index && - loc->GetDeclaringClassDefIndex() == declaring_class_def_index) { - return i; - } - } - return kHeapLocationNotFound; - } - - // Returns true if heap_locations_[index1] and heap_locations_[index2] may alias. - bool MayAlias(size_t index1, size_t index2) const { - if (index1 < index2) { - return aliasing_matrix_.IsBitSet(AliasingMatrixPosition(index1, index2)); - } else if (index1 > index2) { - return aliasing_matrix_.IsBitSet(AliasingMatrixPosition(index2, index1)); - } else { - DCHECK(false) << "index1 and index2 are expected to be different"; - return true; - } - } - - void BuildAliasingMatrix() { - const size_t number_of_locations = heap_locations_.size(); - if (number_of_locations == 0) { - return; - } - size_t pos = 0; - // Compute aliasing info between every pair of different heap locations. - // Save the result in a matrix represented as a BitVector. - for (size_t i = 0; i < number_of_locations - 1; i++) { - for (size_t j = i + 1; j < number_of_locations; j++) { - if (ComputeMayAlias(i, j)) { - aliasing_matrix_.SetBit(CheckedAliasingMatrixPosition(i, j, pos)); - } - pos++; - } - } - } - - private: - // An allocation cannot alias with a name which already exists at the point - // of the allocation, such as a parameter or a load happening before the allocation. - bool MayAliasWithPreexistenceChecking(ReferenceInfo* ref_info1, ReferenceInfo* ref_info2) const { - if (ref_info1->GetReference()->IsNewInstance() || ref_info1->GetReference()->IsNewArray()) { - // Any reference that can alias with the allocation must appear after it in the block/in - // the block's successors. In reverse post order, those instructions will be visited after - // the allocation. 
- return ref_info2->GetPosition() >= ref_info1->GetPosition(); - } - return true; - } - - bool CanReferencesAlias(ReferenceInfo* ref_info1, ReferenceInfo* ref_info2) const { - if (ref_info1 == ref_info2) { - return true; - } else if (ref_info1->IsSingleton()) { - return false; - } else if (ref_info2->IsSingleton()) { - return false; - } else if (!MayAliasWithPreexistenceChecking(ref_info1, ref_info2) || - !MayAliasWithPreexistenceChecking(ref_info2, ref_info1)) { - return false; - } - return true; - } - - // `index1` and `index2` are indices in the array of collected heap locations. - // Returns the position in the bit vector that tracks whether the two heap - // locations may alias. - size_t AliasingMatrixPosition(size_t index1, size_t index2) const { - DCHECK(index2 > index1); - const size_t number_of_locations = heap_locations_.size(); - // It's (num_of_locations - 1) + ... + (num_of_locations - index1) + (index2 - index1 - 1). - return (number_of_locations * index1 - (1 + index1) * index1 / 2 + (index2 - index1 - 1)); - } - - // An additional position is passed in to make sure the calculated position is correct. - size_t CheckedAliasingMatrixPosition(size_t index1, size_t index2, size_t position) { - size_t calculated_position = AliasingMatrixPosition(index1, index2); - DCHECK_EQ(calculated_position, position); - return calculated_position; - } - - // Compute if two locations may alias to each other. - bool ComputeMayAlias(size_t index1, size_t index2) const { - HeapLocation* loc1 = heap_locations_[index1]; - HeapLocation* loc2 = heap_locations_[index2]; - if (loc1->GetOffset() != loc2->GetOffset()) { - // Either two different instance fields, or one is an instance - // field and the other is an array element. - return false; - } - if (loc1->GetDeclaringClassDefIndex() != loc2->GetDeclaringClassDefIndex()) { - // Different types. - return false; - } - if (!CanReferencesAlias(loc1->GetReferenceInfo(), loc2->GetReferenceInfo())) { - return false; - } - if (loc1->IsArrayElement() && loc2->IsArrayElement()) { - HInstruction* array_index1 = loc1->GetIndex(); - HInstruction* array_index2 = loc2->GetIndex(); - DCHECK(array_index1 != nullptr); - DCHECK(array_index2 != nullptr); - if (array_index1->IsIntConstant() && - array_index2->IsIntConstant() && - array_index1->AsIntConstant()->GetValue() != array_index2->AsIntConstant()->GetValue()) { - // Different constant indices do not alias. 
- return false; - } - ReferenceInfo* ref_info = loc1->GetReferenceInfo(); - ref_info->SetHasIndexAliasing(true); - } - return true; - } - - ReferenceInfo* GetOrCreateReferenceInfo(HInstruction* instruction) { - ReferenceInfo* ref_info = FindReferenceInfoOf(instruction); - if (ref_info == nullptr) { - size_t pos = ref_info_array_.size(); - ref_info = new (GetGraph()->GetArena()) ReferenceInfo(instruction, pos); - ref_info_array_.push_back(ref_info); - } - return ref_info; - } - - void CreateReferenceInfoForReferenceType(HInstruction* instruction) { - if (instruction->GetType() != Primitive::kPrimNot) { - return; - } - DCHECK(FindReferenceInfoOf(instruction) == nullptr); - GetOrCreateReferenceInfo(instruction); - } - - HeapLocation* GetOrCreateHeapLocation(HInstruction* ref, - size_t offset, - HInstruction* index, - int16_t declaring_class_def_index) { - HInstruction* original_ref = HuntForOriginalReference(ref); - ReferenceInfo* ref_info = GetOrCreateReferenceInfo(original_ref); - size_t heap_location_idx = FindHeapLocationIndex( - ref_info, offset, index, declaring_class_def_index); - if (heap_location_idx == kHeapLocationNotFound) { - HeapLocation* heap_loc = new (GetGraph()->GetArena()) - HeapLocation(ref_info, offset, index, declaring_class_def_index); - heap_locations_.push_back(heap_loc); - return heap_loc; - } - return heap_locations_[heap_location_idx]; - } - - HeapLocation* VisitFieldAccess(HInstruction* ref, const FieldInfo& field_info) { - if (field_info.IsVolatile()) { - has_volatile_ = true; - } - const uint16_t declaring_class_def_index = field_info.GetDeclaringClassDefIndex(); - const size_t offset = field_info.GetFieldOffset().SizeValue(); - return GetOrCreateHeapLocation(ref, offset, nullptr, declaring_class_def_index); - } - - void VisitArrayAccess(HInstruction* array, HInstruction* index) { - GetOrCreateHeapLocation(array, HeapLocation::kInvalidFieldOffset, - index, HeapLocation::kDeclaringClassDefIndexForArrays); - } - - void VisitInstanceFieldGet(HInstanceFieldGet* instruction) OVERRIDE { - VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo()); - CreateReferenceInfoForReferenceType(instruction); - } - - void VisitInstanceFieldSet(HInstanceFieldSet* instruction) OVERRIDE { - HeapLocation* location = VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo()); - has_heap_stores_ = true; - if (location->GetReferenceInfo()->IsSingleton()) { - // A singleton's location value may be killed by loop side effects if it's - // defined before that loop, and it's stored into inside that loop. - HLoopInformation* loop_info = instruction->GetBlock()->GetLoopInformation(); - if (loop_info != nullptr) { - HInstruction* ref = location->GetReferenceInfo()->GetReference(); - DCHECK(ref->IsNewInstance()); - if (loop_info->IsDefinedOutOfTheLoop(ref)) { - // ref's location value may be killed by this loop's side effects. - location->SetValueKilledByLoopSideEffects(true); - } else { - // ref is defined inside this loop so this loop's side effects cannot - // kill its location value at the loop header since ref/its location doesn't - // exist yet at the loop header. - } - } - } else { - // For non-singletons, value_killed_by_loop_side_effects_ is inited to - // true. 
- DCHECK_EQ(location->IsValueKilledByLoopSideEffects(), true); - } - } - - void VisitStaticFieldGet(HStaticFieldGet* instruction) OVERRIDE { - VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo()); - CreateReferenceInfoForReferenceType(instruction); - } - - void VisitStaticFieldSet(HStaticFieldSet* instruction) OVERRIDE { - VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo()); - has_heap_stores_ = true; - } - - // We intentionally don't collect HUnresolvedInstanceField/HUnresolvedStaticField accesses - // since we cannot accurately track the fields. - - void VisitArrayGet(HArrayGet* instruction) OVERRIDE { - VisitArrayAccess(instruction->InputAt(0), instruction->InputAt(1)); - CreateReferenceInfoForReferenceType(instruction); - } - - void VisitArraySet(HArraySet* instruction) OVERRIDE { - VisitArrayAccess(instruction->InputAt(0), instruction->InputAt(1)); - has_heap_stores_ = true; - } - - void VisitNewInstance(HNewInstance* new_instance) OVERRIDE { - // Any references appearing in the ref_info_array_ so far cannot alias with new_instance. - CreateReferenceInfoForReferenceType(new_instance); - } - - void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* instruction) OVERRIDE { - CreateReferenceInfoForReferenceType(instruction); - } - - void VisitInvokeVirtual(HInvokeVirtual* instruction) OVERRIDE { - CreateReferenceInfoForReferenceType(instruction); - } - - void VisitInvokeInterface(HInvokeInterface* instruction) OVERRIDE { - CreateReferenceInfoForReferenceType(instruction); - } - - void VisitParameterValue(HParameterValue* instruction) OVERRIDE { - CreateReferenceInfoForReferenceType(instruction); - } - - void VisitSelect(HSelect* instruction) OVERRIDE { - CreateReferenceInfoForReferenceType(instruction); - } - - void VisitMonitorOperation(HMonitorOperation* monitor ATTRIBUTE_UNUSED) OVERRIDE { - has_monitor_operations_ = true; - } - - ArenaVector<ReferenceInfo*> ref_info_array_; // All references used for heap accesses. - ArenaVector<HeapLocation*> heap_locations_; // All heap locations. - ArenaBitVector aliasing_matrix_; // aliasing info between each pair of locations. - bool has_heap_stores_; // If there is no heap stores, LSE acts as GVN with better - // alias analysis and won't be as effective. - bool has_volatile_; // If there are volatile field accesses. - bool has_monitor_operations_; // If there are monitor operations. - - DISALLOW_COPY_AND_ASSIGN(HeapLocationCollector); -}; - // An unknown heap value. Loads with such a value in the heap location cannot be eliminated. // A heap location can be set to kUnknownHeapValue when: // - initially set a value. @@ -516,7 +46,7 @@ class LSEVisitor : public HGraphVisitor { side_effects_(side_effects), heap_values_for_(graph->GetBlocks().size(), ArenaVector<HInstruction*>(heap_locations_collector. - GetNumberOfHeapLocations(), + GetNumberOfHeapLocations(), kUnknownHeapValue, graph->GetArena()->Adapter(kArenaAllocLSE)), graph->GetArena()->Adapter(kArenaAllocLSE)), @@ -566,14 +96,20 @@ class LSEVisitor : public HGraphVisitor { store->GetBlock()->RemoveInstruction(store); } - // Eliminate allocations that are not used. + // Eliminate singleton-classified instructions: + // * - Constructor fences (they never escape this thread). + // * - Allocations (if they are unused). 
for (HInstruction* new_instance : singleton_new_instances_) { + HConstructorFence::RemoveConstructorFences(new_instance); + if (!new_instance->HasNonEnvironmentUses()) { new_instance->RemoveEnvironmentUsers(); new_instance->GetBlock()->RemoveInstruction(new_instance); } } for (HInstruction* new_array : singleton_new_arrays_) { + HConstructorFence::RemoveConstructorFences(new_array); + if (!new_array->HasNonEnvironmentUses()) { new_array->RemoveEnvironmentUsers(); new_array->GetBlock()->RemoveInstruction(new_array); @@ -754,7 +290,7 @@ class LSEVisitor : public HGraphVisitor { size_t offset, HInstruction* index, int16_t declaring_class_def_index) { - HInstruction* original_ref = HuntForOriginalReference(ref); + HInstruction* original_ref = heap_location_collector_.HuntForOriginalReference(ref); ReferenceInfo* ref_info = heap_location_collector_.FindReferenceInfoOf(original_ref); size_t idx = heap_location_collector_.FindHeapLocationIndex( ref_info, offset, index, declaring_class_def_index); @@ -821,7 +357,7 @@ class LSEVisitor : public HGraphVisitor { HInstruction* index, int16_t declaring_class_def_index, HInstruction* value) { - HInstruction* original_ref = HuntForOriginalReference(ref); + HInstruction* original_ref = heap_location_collector_.HuntForOriginalReference(ref); ReferenceInfo* ref_info = heap_location_collector_.FindReferenceInfoOf(original_ref); size_t idx = heap_location_collector_.FindHeapLocationIndex( ref_info, offset, index, declaring_class_def_index); @@ -1121,25 +657,12 @@ void LoadStoreElimination::Run() { // Skip this optimization. return; } - HeapLocationCollector heap_location_collector(graph_); - for (HBasicBlock* block : graph_->GetReversePostOrder()) { - heap_location_collector.VisitBasicBlock(block); - } - if (heap_location_collector.GetNumberOfHeapLocations() > kMaxNumberOfHeapLocations) { - // Bail out if there are too many heap locations to deal with. - return; - } - if (!heap_location_collector.HasHeapStores()) { - // Without heap stores, this pass would act mostly as GVN on heap accesses. + const HeapLocationCollector& heap_location_collector = lsa_.GetHeapLocationCollector(); + if (heap_location_collector.GetNumberOfHeapLocations() == 0) { + // No HeapLocation information from LSA, skip this optimization. return; } - if (heap_location_collector.HasVolatile() || heap_location_collector.HasMonitorOps()) { - // Don't do load/store elimination if the method has volatile field accesses or - // monitor operations, for now. - // TODO: do it right. 
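With the collection and bail-out logic moved into LoadStoreAnalysis, the elimination pass below now assumes the analysis has already run; a sketch of the expected ordering (the pass-manager wiring itself is not part of these hunks, so this is illustrative only):

SideEffectsAnalysis side_effects(graph);
side_effects.Run();
LoadStoreAnalysis lsa(graph);
lsa.Run();  // collects heap locations and builds the aliasing matrix, or bails out
LoadStoreElimination lse(graph, side_effects, lsa);  // new three-argument constructor
lse.Run();  // skips itself if the collector reports zero heap locations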
- return; - } - heap_location_collector.BuildAliasingMatrix(); + LSEVisitor lse_visitor(graph_, heap_location_collector, side_effects_); for (HBasicBlock* block : graph_->GetReversePostOrder()) { lse_visitor.VisitBasicBlock(block); diff --git a/compiler/optimizing/load_store_elimination.h b/compiler/optimizing/load_store_elimination.h index 1d9e5c8da6..efe71c733a 100644 --- a/compiler/optimizing/load_store_elimination.h +++ b/compiler/optimizing/load_store_elimination.h @@ -22,12 +22,16 @@ namespace art { class SideEffectsAnalysis; +class LoadStoreAnalysis; class LoadStoreElimination : public HOptimization { public: - LoadStoreElimination(HGraph* graph, const SideEffectsAnalysis& side_effects) + LoadStoreElimination(HGraph* graph, + const SideEffectsAnalysis& side_effects, + const LoadStoreAnalysis& lsa) : HOptimization(graph, kLoadStoreEliminationPassName), - side_effects_(side_effects) {} + side_effects_(side_effects), + lsa_(lsa) {} void Run() OVERRIDE; @@ -35,6 +39,7 @@ class LoadStoreElimination : public HOptimization { private: const SideEffectsAnalysis& side_effects_; + const LoadStoreAnalysis& lsa_; DISALLOW_COPY_AND_ASSIGN(LoadStoreElimination); }; diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index d5e105951b..4334ab10bd 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -71,7 +71,7 @@ static bool IsSignExtensionAndGet(HInstruction* instruction, // extension when represented in the *width* of the given narrower data type // (the fact that char normally zero extends does not matter here). int64_t value = 0; - if (IsInt64AndGet(instruction, &value)) { + if (IsInt64AndGet(instruction, /*out*/ &value)) { switch (type) { case Primitive::kPrimByte: if (std::numeric_limits<int8_t>::min() <= value && @@ -119,7 +119,7 @@ static bool IsZeroExtensionAndGet(HInstruction* instruction, // extension when represented in the *width* of the given narrower data type // (the fact that byte/short normally sign extend does not matter here). int64_t value = 0; - if (IsInt64AndGet(instruction, &value)) { + if (IsInt64AndGet(instruction, /*out*/ &value)) { switch (type) { case Primitive::kPrimByte: if (std::numeric_limits<uint8_t>::min() <= value && @@ -173,6 +173,84 @@ static bool IsZeroExtensionAndGet(HInstruction* instruction, return false; } +// Detect situations with same-extension narrower operands. +// Returns true on success and sets is_unsigned accordingly. +static bool IsNarrowerOperands(HInstruction* a, + HInstruction* b, + Primitive::Type type, + /*out*/ HInstruction** r, + /*out*/ HInstruction** s, + /*out*/ bool* is_unsigned) { + if (IsSignExtensionAndGet(a, type, r) && IsSignExtensionAndGet(b, type, s)) { + *is_unsigned = false; + return true; + } else if (IsZeroExtensionAndGet(a, type, r) && IsZeroExtensionAndGet(b, type, s)) { + *is_unsigned = true; + return true; + } + return false; +} + +// As above, single operand. +static bool IsNarrowerOperand(HInstruction* a, + Primitive::Type type, + /*out*/ HInstruction** r, + /*out*/ bool* is_unsigned) { + if (IsSignExtensionAndGet(a, type, r)) { + *is_unsigned = false; + return true; + } else if (IsZeroExtensionAndGet(a, type, r)) { + *is_unsigned = true; + return true; + } + return false; +} + +// Detect up to two instructions a and b, and an acccumulated constant c. 
+static bool IsAddConstHelper(HInstruction* instruction, + /*out*/ HInstruction** a, + /*out*/ HInstruction** b, + /*out*/ int64_t* c, + int32_t depth) { + static constexpr int32_t kMaxDepth = 8; // don't search too deep + int64_t value = 0; + if (IsInt64AndGet(instruction, &value)) { + *c += value; + return true; + } else if (instruction->IsAdd() && depth <= kMaxDepth) { + return IsAddConstHelper(instruction->InputAt(0), a, b, c, depth + 1) && + IsAddConstHelper(instruction->InputAt(1), a, b, c, depth + 1); + } else if (*a == nullptr) { + *a = instruction; + return true; + } else if (*b == nullptr) { + *b = instruction; + return true; + } + return false; // too many non-const operands +} + +// Detect a + b + c for an optional constant c. +static bool IsAddConst(HInstruction* instruction, + /*out*/ HInstruction** a, + /*out*/ HInstruction** b, + /*out*/ int64_t* c) { + if (instruction->IsAdd()) { + // Try to find a + b and accumulated c. + if (IsAddConstHelper(instruction->InputAt(0), a, b, c, /*depth*/ 0) && + IsAddConstHelper(instruction->InputAt(1), a, b, c, /*depth*/ 0) && + *b != nullptr) { + return true; + } + // Found a + b. + *a = instruction->InputAt(0); + *b = instruction->InputAt(1); + *c = 0; + return true; + } + return false; +} + // Test vector restrictions. static bool HasVectorRestrictions(uint64_t restrictions, uint64_t tested) { return (restrictions & tested) != 0; @@ -733,7 +811,7 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node, return !IsUsedOutsideLoop(node->loop_info, instruction) && !instruction->DoesAnyWrite(); } -// TODO: more operations and intrinsics, detect saturation arithmetic, etc. +// TODO: saturation arithmetic. bool HLoopOptimization::VectorizeUse(LoopNode* node, HInstruction* instruction, bool generate_code, @@ -755,10 +833,9 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, } return true; } else if (instruction->IsArrayGet()) { - // Strings are different, with a different offset to the actual data - // and some compressed to save memory. For now, all cases are rejected - // to avoid the complexity. - if (instruction->AsArrayGet()->IsStringCharAt()) { + // Deal with vector restrictions. + if (instruction->AsArrayGet()->IsStringCharAt() && + HasVectorRestrictions(restrictions, kNoStringCharAt)) { return false; } // Accept a right-hand-side array base[index] for @@ -850,30 +927,38 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, return true; } // Deal with vector restrictions. + HInstruction* opa = instruction->InputAt(0); + HInstruction* opb = instruction->InputAt(1); + HInstruction* r = opa; + bool is_unsigned = false; if ((HasVectorRestrictions(restrictions, kNoShift)) || (instruction->IsShr() && HasVectorRestrictions(restrictions, kNoShr))) { return false; // unsupported instruction - } else if ((instruction->IsShr() || instruction->IsUShr()) && - HasVectorRestrictions(restrictions, kNoHiBits)) { - return false; // hibits may impact lobits; TODO: we can do better! + } else if (HasVectorRestrictions(restrictions, kNoHiBits)) { + // Shifts right need extra care to account for higher order bits. + // TODO: less likely shr/unsigned and ushr/signed can by flipping signess. 
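A concrete instance of the higher-order-bits concern noted just above, as standalone C++ (not ART code): for a byte lane holding 0x80, shifting the packed byte right arithmetically matches the widened scalar result only when the scalar operand is the sign extension of that byte; a zero-extended operand yields a different low byte. This is why the checks below accept Shr only with sign-extension-narrower operands and UShr only with zero-extension-narrower ones.

#include <cstdint>
#include <cstdio>

int main() {
  int8_t lane = static_cast<int8_t>(0x80);              // packed byte value -128
  int8_t packed_shr = static_cast<int8_t>(lane >> 1);   // what a per-lane arithmetic shift yields: -64

  int32_t sign_ext = lane;                               // 0xFFFFFF80, as after sign extension
  int32_t zero_ext = static_cast<uint8_t>(lane);         // 0x00000080, as after zero extension

  printf("packed=%d  via sign-ext=%d  via zero-ext=%d\n",
         packed_shr,
         static_cast<int8_t>(sign_ext >> 1),             // -64: agrees with the packed result
         static_cast<int8_t>(zero_ext >> 1));            // 64: does not agree
  return 0;
}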
+ if (instruction->IsShr() && + (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) { + return false; // reject, unless all operands are sign-extension narrower + } else if (instruction->IsUShr() && + (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || !is_unsigned)) { + return false; // reject, unless all operands are zero-extension narrower + } } // Accept shift operator for vectorizable/invariant operands. // TODO: accept symbolic, albeit loop invariant shift factors. - HInstruction* opa = instruction->InputAt(0); - HInstruction* opb = instruction->InputAt(1); - int64_t value = 0; - if (VectorizeUse(node, opa, generate_code, type, restrictions) && IsInt64AndGet(opb, &value)) { - // Make sure shift distance only looks at lower bits, as defined for sequential shifts. - int64_t mask = (instruction->GetType() == Primitive::kPrimLong) - ? kMaxLongShiftDistance - : kMaxIntShiftDistance; - int64_t distance = value & mask; + DCHECK(r != nullptr); + if (generate_code && vector_mode_ != kVector) { // de-idiom + r = opa; + } + int64_t distance = 0; + if (VectorizeUse(node, r, generate_code, type, restrictions) && + IsInt64AndGet(opb, /*out*/ &distance)) { // Restrict shift distance to packed data type width. int64_t max_distance = Primitive::ComponentSize(type) * 8; if (0 <= distance && distance < max_distance) { if (generate_code) { - HInstruction* s = graph_->GetIntConstant(distance); - GenerateVecOp(instruction, vector_map_->Get(opa), s, type); + GenerateVecOp(instruction, vector_map_->Get(r), opb, type); } return true; } @@ -887,16 +972,59 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, case Intrinsics::kMathAbsFloat: case Intrinsics::kMathAbsDouble: { // Deal with vector restrictions. - if (HasVectorRestrictions(restrictions, kNoAbs) || - HasVectorRestrictions(restrictions, kNoHiBits)) { - // TODO: we can do better for some hibits cases. + HInstruction* opa = instruction->InputAt(0); + HInstruction* r = opa; + bool is_unsigned = false; + if (HasVectorRestrictions(restrictions, kNoAbs)) { return false; + } else if (HasVectorRestrictions(restrictions, kNoHiBits) && + (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) { + return false; // reject, unless operand is sign-extension narrower } // Accept ABS(x) for vectorizable operand. + DCHECK(r != nullptr); + if (generate_code && vector_mode_ != kVector) { // de-idiom + r = opa; + } + if (VectorizeUse(node, r, generate_code, type, restrictions)) { + if (generate_code) { + GenerateVecOp(instruction, vector_map_->Get(r), nullptr, type); + } + return true; + } + return false; + } + case Intrinsics::kMathMinIntInt: + case Intrinsics::kMathMinLongLong: + case Intrinsics::kMathMinFloatFloat: + case Intrinsics::kMathMinDoubleDouble: + case Intrinsics::kMathMaxIntInt: + case Intrinsics::kMathMaxLongLong: + case Intrinsics::kMathMaxFloatFloat: + case Intrinsics::kMathMaxDoubleDouble: { + // Deal with vector restrictions. HInstruction* opa = instruction->InputAt(0); - if (VectorizeUse(node, opa, generate_code, type, restrictions)) { + HInstruction* opb = instruction->InputAt(1); + HInstruction* r = opa; + HInstruction* s = opb; + bool is_unsigned = false; + if (HasVectorRestrictions(restrictions, kNoMinMax)) { + return false; + } else if (HasVectorRestrictions(restrictions, kNoHiBits) && + !IsNarrowerOperands(opa, opb, type, &r, &s, &is_unsigned)) { + return false; // reject, unless all operands are same-extension narrower + } + // Accept MIN/MAX(x, y) for vectorizable operands. 
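The kNoMinMax restriction that the per-target code further below attaches to float and double (annotated "-0.0 vs +0.0") reflects Java's Math.min/Math.max semantics; a standalone C++ sketch of the discrepancy a plain compare-and-select lowering would introduce (NaN handling differs as well):

#include <cmath>
#include <cstdio>

int main() {
  float a = -0.0f;
  float b = +0.0f;
  // Math.min(-0.0f, +0.0f) is defined to be -0.0f, but the obvious vector
  // lowering (a compare and select) produces +0.0f, because -0.0f < +0.0f is false.
  float naive = (a < b) ? a : b;
  printf("naive min is %s0.0\n", std::signbit(naive) ? "-" : "+");
  return 0;
}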
+ DCHECK(r != nullptr && s != nullptr); + if (generate_code && vector_mode_ != kVector) { // de-idiom + r = opa; + s = opb; + } + if (VectorizeUse(node, r, generate_code, type, restrictions) && + VectorizeUse(node, s, generate_code, type, restrictions)) { if (generate_code) { - GenerateVecOp(instruction, vector_map_->Get(opa), nullptr, type); + GenerateVecOp( + instruction, vector_map_->Get(r), vector_map_->Get(s), type, is_unsigned); } return true; } @@ -921,17 +1049,17 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric switch (type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: - *restrictions |= kNoDiv | kNoAbs; + *restrictions |= kNoDiv; return TrySetVectorLength(16); case Primitive::kPrimChar: case Primitive::kPrimShort: - *restrictions |= kNoDiv | kNoAbs; + *restrictions |= kNoDiv; return TrySetVectorLength(8); case Primitive::kPrimInt: *restrictions |= kNoDiv; return TrySetVectorLength(4); case Primitive::kPrimLong: - *restrictions |= kNoDiv | kNoMul; + *restrictions |= kNoDiv | kNoMul | kNoMinMax; return TrySetVectorLength(2); case Primitive::kPrimFloat: return TrySetVectorLength(4); @@ -957,11 +1085,13 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric *restrictions |= kNoDiv; return TrySetVectorLength(4); case Primitive::kPrimLong: - *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs; + *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax; return TrySetVectorLength(2); case Primitive::kPrimFloat: + *restrictions |= kNoMinMax; // -0.0 vs +0.0 return TrySetVectorLength(4); case Primitive::kPrimDouble: + *restrictions |= kNoMinMax; // -0.0 vs +0.0 return TrySetVectorLength(2); default: break; @@ -969,9 +1099,36 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric } return false; case kMips: - case kMips64: // TODO: implement MIPS SIMD. return false; + case kMips64: + if (features->AsMips64InstructionSetFeatures()->HasMsa()) { + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + *restrictions |= kNoDiv | kNoMinMax; + return TrySetVectorLength(16); + case Primitive::kPrimChar: + case Primitive::kPrimShort: + *restrictions |= kNoDiv | kNoMinMax | kNoStringCharAt; + return TrySetVectorLength(8); + case Primitive::kPrimInt: + *restrictions |= kNoDiv | kNoMinMax; + return TrySetVectorLength(4); + case Primitive::kPrimLong: + *restrictions |= kNoDiv | kNoMinMax; + return TrySetVectorLength(2); + case Primitive::kPrimFloat: + *restrictions |= kNoMinMax; + return TrySetVectorLength(4); + case Primitive::kPrimDouble: + *restrictions |= kNoMinMax; + return TrySetVectorLength(2); + default: + break; + } // switch type + } + return false; default: return false; } // switch instruction set @@ -1058,13 +1215,14 @@ void HLoopOptimization::GenerateVecMem(HInstruction* org, void HLoopOptimization::GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, - Primitive::Type type) { + Primitive::Type type, + bool is_unsigned) { if (vector_mode_ == kSequential) { - // Scalar code follows implicit integral promotion. - if (type == Primitive::kPrimBoolean || - type == Primitive::kPrimByte || - type == Primitive::kPrimChar || - type == Primitive::kPrimShort) { + // Non-converting scalar code follows implicit integral promotion. 
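// Illustrative sketch (not part of this change): one reason kNoMinMax is set for
// float and double above ("-0.0 vs +0.0"). A purely comparison-based minimum
// cannot distinguish the two zeroes, while Java's Math.min(+0.0f, -0.0f) is
// required to return -0.0f.
#include <cmath>
#include <cstdio>

static float comparison_min(float a, float b) { return (b < a) ? b : a; }

int main() {
  float m = comparison_min(+0.0f, -0.0f);  // -0.0f < +0.0f is false, so +0.0f is returned
  std::printf("sign bit: %d\n", std::signbit(m) ? 1 : 0);  // prints 0; Java requires -0.0f here
  return 0;
}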
+ if (!org->IsTypeConversion() && (type == Primitive::kPrimBoolean || + type == Primitive::kPrimByte || + type == Primitive::kPrimChar || + type == Primitive::kPrimShort)) { type = Primitive::kPrimInt; } } @@ -1141,6 +1299,22 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org, DCHECK(opb == nullptr); vector = new (global_allocator_) HVecAbs(global_allocator_, opa, type, vector_length_); break; + case Intrinsics::kMathMinIntInt: + case Intrinsics::kMathMinLongLong: + case Intrinsics::kMathMinFloatFloat: + case Intrinsics::kMathMinDoubleDouble: { + vector = new (global_allocator_) + HVecMin(global_allocator_, opa, opb, type, vector_length_, is_unsigned); + break; + } + case Intrinsics::kMathMaxIntInt: + case Intrinsics::kMathMaxLongLong: + case Intrinsics::kMathMaxFloatFloat: + case Intrinsics::kMathMaxDoubleDouble: { + vector = new (global_allocator_) + HVecMax(global_allocator_, opa, opb, type, vector_length_, is_unsigned); + break; + } default: LOG(FATAL) << "Unsupported SIMD intrinsic"; UNREACHABLE(); @@ -1150,9 +1324,10 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org, // corresponding new scalar instructions in the loop. The instruction will get an // environment while being inserted from the instruction map in original program order. DCHECK(vector_mode_ == kSequential); + size_t num_args = invoke->GetNumberOfArguments(); HInvokeStaticOrDirect* new_invoke = new (global_allocator_) HInvokeStaticOrDirect( global_allocator_, - invoke->GetNumberOfArguments(), + num_args, invoke->GetType(), invoke->GetDexPc(), invoke->GetDexMethodIndex(), @@ -1162,8 +1337,14 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org, invoke->GetTargetMethod(), invoke->GetClinitCheckRequirement()); HInputsRef inputs = invoke->GetInputs(); - for (size_t index = 0; index < inputs.size(); ++index) { - new_invoke->SetArgumentAt(index, vector_map_->Get(inputs[index])); + size_t num_inputs = inputs.size(); + DCHECK_LE(num_args, num_inputs); + DCHECK_EQ(num_inputs, new_invoke->GetInputs().size()); // both invokes agree + for (size_t index = 0; index < num_inputs; ++index) { + HInstruction* new_input = index < num_args + ? vector_map_->Get(inputs[index]) + : inputs[index]; // beyond arguments: just pass through + new_invoke->SetArgumentAt(index, new_input); } new_invoke->SetIntrinsic(invoke->GetIntrinsic(), kNeedsEnvironmentOrCache, @@ -1200,34 +1381,30 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, Primitive::Type type, uint64_t restrictions) { // Test for top level arithmetic shift right x >> 1 or logical shift right x >>> 1 - // (note whether the sign bit in higher precision is shifted in has no effect + // (note whether the sign bit in wider precision is shifted in has no effect // on the narrow precision computed by the idiom). - int64_t value = 0; + int64_t distance = 0; if ((instruction->IsShr() || instruction->IsUShr()) && - IsInt64AndGet(instruction->InputAt(1), &value) && value == 1) { - // - // TODO: make following code less sensitive to associativity and commutativity differences. - // - HInstruction* x = instruction->InputAt(0); - // Test for an optional rounding part (x + 1) >> 1. - bool is_rounded = false; - if (x->IsAdd() && IsInt64AndGet(x->InputAt(1), &value) && value == 1) { - x = x->InputAt(0); - is_rounded = true; - } - // Test for a core addition (a + b) >> 1 (possibly rounded), either unsigned or signed. 
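// Illustrative sketch: the implicit integral promotion that the sequential
// fallback mirrors above; arithmetic on byte/short/char-sized values is
// performed in 32-bit int, in C++ just as in Java bytecode.
#include <cstdint>
#include <type_traits>

int main() {
  int8_t a = 100, b = 100;
  auto sum = a + b;  // both operands are promoted to int before the add
  static_assert(std::is_same<decltype(sum), int>::value, "promoted to int");
  return (sum == 200) ? 0 : 1;  // 200, not the wrapped 8-bit value -56
}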
- if (x->IsAdd()) { - HInstruction* a = x->InputAt(0); - HInstruction* b = x->InputAt(1); + IsInt64AndGet(instruction->InputAt(1), /*out*/ &distance) && distance == 1) { + // Test for (a + b + c) >> 1 for optional constant c. + HInstruction* a = nullptr; + HInstruction* b = nullptr; + int64_t c = 0; + if (IsAddConst(instruction->InputAt(0), /*out*/ &a, /*out*/ &b, /*out*/ &c)) { + DCHECK(a != nullptr && b != nullptr); + // Accept c == 1 (rounded) or c == 0 (not rounded). + bool is_rounded = false; + if (c == 1) { + is_rounded = true; + } else if (c != 0) { + return false; + } + // Accept consistent zero or sign extension on operands a and b. HInstruction* r = nullptr; HInstruction* s = nullptr; bool is_unsigned = false; - if (IsZeroExtensionAndGet(a, type, &r) && IsZeroExtensionAndGet(b, type, &s)) { - is_unsigned = true; - } else if (IsSignExtensionAndGet(a, type, &r) && IsSignExtensionAndGet(b, type, &s)) { - is_unsigned = false; - } else { + if (!IsNarrowerOperands(a, b, type, &r, &s, &is_unsigned)) { return false; } // Deal with vector restrictions. @@ -1238,6 +1415,10 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, // Accept recognized halving add for vectorizable operands. Vectorized code uses the // shorthand idiomatic operation. Sequential code uses the original scalar expressions. DCHECK(r != nullptr && s != nullptr); + if (generate_code && vector_mode_ != kVector) { // de-idiom + r = instruction->InputAt(0); + s = instruction->InputAt(1); + } if (VectorizeUse(node, r, generate_code, type, restrictions) && VectorizeUse(node, s, generate_code, type, restrictions)) { if (generate_code) { @@ -1251,12 +1432,7 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, is_unsigned, is_rounded)); } else { - VectorizeUse(node, instruction->InputAt(0), generate_code, type, restrictions); - VectorizeUse(node, instruction->InputAt(1), generate_code, type, restrictions); - GenerateVecOp(instruction, - vector_map_->Get(instruction->InputAt(0)), - vector_map_->Get(instruction->InputAt(1)), - type); + GenerateVecOp(instruction, vector_map_->Get(r), vector_map_->Get(s), type); } } return true; diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index c3b0b5d996..cc6343aeb5 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -71,6 +71,8 @@ class HLoopOptimization : public HOptimization { kNoSignedHAdd = 32, // no signed halving add kNoUnroundedHAdd = 64, // no unrounded halving add kNoAbs = 128, // no absolute value + kNoMinMax = 256, // no min/max + kNoStringCharAt = 512, // no StringCharAt }; /* @@ -136,7 +138,11 @@ class HLoopOptimization : public HOptimization { HInstruction* opa, HInstruction* opb, Primitive::Type type); - void GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, Primitive::Type type); + void GenerateVecOp(HInstruction* org, + HInstruction* opa, + HInstruction* opb, + Primitive::Type type, + bool is_unsigned = false); // Vectorization idioms. 
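// Illustrative sketch: the scalar shape of the halving-add idiom matched above,
// with the optional constant c == 0 (truncated) or c == 1 (rounded). The add is
// evaluated in wider precision, so the intermediate sum cannot overflow the
// narrow type.
#include <cassert>
#include <cstdint>

static uint8_t hadd(uint8_t a, uint8_t b)  { return static_cast<uint8_t>((a + b) >> 1); }
static uint8_t rhadd(uint8_t a, uint8_t b) { return static_cast<uint8_t>((a + b + 1) >> 1); }

int main() {
  assert(hadd(254, 255) == 254);   // (509) >> 1; the wider add avoids 8-bit overflow
  assert(rhadd(254, 255) == 255);  // (510) >> 1; rounds up
  assert(hadd(3, 4) == 3 && rhadd(3, 4) == 4);
  return 0;
}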
bool VectorizeHalvingAddIdiom(LoopNode* node, diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index cd05a884e9..9a91287670 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -528,6 +528,15 @@ HCurrentMethod* HGraph::GetCurrentMethod() { return cached_current_method_; } +const char* HGraph::GetMethodName() const { + const DexFile::MethodId& method_id = dex_file_.GetMethodId(method_idx_); + return dex_file_.GetMethodName(method_id); +} + +std::string HGraph::PrettyMethod(bool with_signature) const { + return dex_file_.PrettyMethod(method_idx_, with_signature); +} + HConstant* HGraph::GetConstant(Primitive::Type type, int64_t value, uint32_t dex_pc) { switch (type) { case Primitive::Type::kPrimBoolean: @@ -1150,6 +1159,95 @@ void HVariableInputSizeInstruction::RemoveInputAt(size_t index) { } } +void HVariableInputSizeInstruction::RemoveAllInputs() { + RemoveAsUserOfAllInputs(); + DCHECK(!HasNonEnvironmentUses()); + + inputs_.clear(); + DCHECK_EQ(0u, InputCount()); +} + +void HConstructorFence::RemoveConstructorFences(HInstruction* instruction) { + DCHECK(instruction->GetBlock() != nullptr); + // Removing constructor fences only makes sense for instructions with an object return type. + DCHECK_EQ(Primitive::kPrimNot, instruction->GetType()); + + // Efficient implementation that simultaneously (in one pass): + // * Scans the uses list for all constructor fences. + // * Deletes that constructor fence from the uses list of `instruction`. + // * Deletes `instruction` from the constructor fence's inputs. + // * Deletes the constructor fence if it now has 0 inputs. + + const HUseList<HInstruction*>& uses = instruction->GetUses(); + // Warning: Although this is "const", we might mutate the list when calling RemoveInputAt. + for (auto it = uses.begin(), end = uses.end(); it != end; ) { + const HUseListNode<HInstruction*>& use_node = *it; + HInstruction* const use_instruction = use_node.GetUser(); + + // Advance the iterator immediately once we fetch the use_node. + // Warning: If the input is removed, the current iterator becomes invalid. + ++it; + + if (use_instruction->IsConstructorFence()) { + HConstructorFence* ctor_fence = use_instruction->AsConstructorFence(); + size_t input_index = use_node.GetIndex(); + + // Process the candidate instruction for removal + // from the graph. + + // Constructor fence instructions are never + // used by other instructions. + // + // If we wanted to make this more generic, it + // could be a runtime if statement. + DCHECK(!ctor_fence->HasUses()); + + // A constructor fence's return type is "kPrimVoid" + // and therefore it can't have any environment uses. + DCHECK(!ctor_fence->HasEnvironmentUses()); + + // Remove the inputs first, otherwise removing the instruction + // will try to remove its uses while we are already removing uses + // and this operation will fail. + DCHECK_EQ(instruction, ctor_fence->InputAt(input_index)); + + // Removing the input will also remove the `use_node`. + // (Do not look at `use_node` after this, it will be a dangling reference). + ctor_fence->RemoveInputAt(input_index); + + // Once all inputs are removed, the fence is considered dead and + // is removed. + if (ctor_fence->InputCount() == 0u) { + ctor_fence->GetBlock()->RemoveInstruction(ctor_fence); + } + } + } + + if (kIsDebugBuild) { + // Post-condition checks: + // * None of the uses of `instruction` are a constructor fence. + // * The `instruction` itself did not get removed from a block. 
+ for (const HUseListNode<HInstruction*>& use_node : instruction->GetUses()) { + CHECK(!use_node.GetUser()->IsConstructorFence()); + } + CHECK(instruction->GetBlock() != nullptr); + } +} + +HInstruction* HConstructorFence::GetAssociatedAllocation() { + HInstruction* new_instance_inst = GetPrevious(); + // Check if the immediately preceding instruction is a new-instance/new-array. + // Otherwise this fence is for protecting final fields. + if (new_instance_inst != nullptr && + (new_instance_inst->IsNewInstance() || new_instance_inst->IsNewArray())) { + // TODO: Need to update this code to handle multiple inputs. + DCHECK_EQ(InputCount(), 1u); + return new_instance_inst; + } else { + return nullptr; + } +} + #define DEFINE_ACCEPT(name, super) \ void H##name::Accept(HGraphVisitor* visitor) { \ visitor->Visit##name(this); \ @@ -2538,15 +2636,17 @@ bool HInvokeStaticOrDirect::NeedsDexCacheOfDeclaringClass() const { std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::MethodLoadKind rhs) { switch (rhs) { case HInvokeStaticOrDirect::MethodLoadKind::kStringInit: - return os << "string_init"; + return os << "StringInit"; case HInvokeStaticOrDirect::MethodLoadKind::kRecursive: - return os << "recursive"; + return os << "Recursive"; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: + return os << "BootImageLinkTimePcRelative"; case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: - return os << "direct"; + return os << "DirectAddress"; case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative: - return os << "dex_cache_pc_relative"; + return os << "DexCachePcRelative"; case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: - return os << "dex_cache_via_method"; + return os << "DexCacheViaMethod"; default: LOG(FATAL) << "Unknown MethodLoadKind: " << static_cast<int>(rhs); UNREACHABLE(); @@ -2590,7 +2690,7 @@ bool HLoadClass::InstructionDataEquals(const HInstruction* other) const { void HLoadClass::SetLoadKind(LoadKind load_kind) { SetPackedField<LoadKindField>(load_kind); - if (load_kind != LoadKind::kDexCacheViaMethod && + if (load_kind != LoadKind::kRuntimeCall && load_kind != LoadKind::kReferrersClass) { RemoveAsUserOfInput(0u); SetRawInputAt(0u, nullptr); @@ -2606,8 +2706,6 @@ std::ostream& operator<<(std::ostream& os, HLoadClass::LoadKind rhs) { switch (rhs) { case HLoadClass::LoadKind::kReferrersClass: return os << "ReferrersClass"; - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: - return os << "BootImageLinkTimeAddress"; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: return os << "BootImageLinkTimePcRelative"; case HLoadClass::LoadKind::kBootImageAddress: @@ -2616,8 +2714,8 @@ std::ostream& operator<<(std::ostream& os, HLoadClass::LoadKind rhs) { return os << "BssEntry"; case HLoadClass::LoadKind::kJitTableAddress: return os << "JitTableAddress"; - case HLoadClass::LoadKind::kDexCacheViaMethod: - return os << "DexCacheViaMethod"; + case HLoadClass::LoadKind::kRuntimeCall: + return os << "RuntimeCall"; default: LOG(FATAL) << "Unknown HLoadClass::LoadKind: " << static_cast<int>(rhs); UNREACHABLE(); @@ -2645,10 +2743,10 @@ bool HLoadString::InstructionDataEquals(const HInstruction* other) const { void HLoadString::SetLoadKind(LoadKind load_kind) { // Once sharpened, the load kind should not be changed again. 
- DCHECK_EQ(GetLoadKind(), LoadKind::kDexCacheViaMethod); + DCHECK_EQ(GetLoadKind(), LoadKind::kRuntimeCall); SetPackedField<LoadKindField>(load_kind); - if (load_kind != LoadKind::kDexCacheViaMethod) { + if (load_kind != LoadKind::kRuntimeCall) { RemoveAsUserOfInput(0u); SetRawInputAt(0u, nullptr); } @@ -2660,8 +2758,6 @@ void HLoadString::SetLoadKind(LoadKind load_kind) { std::ostream& operator<<(std::ostream& os, HLoadString::LoadKind rhs) { switch (rhs) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: - return os << "BootImageLinkTimeAddress"; case HLoadString::LoadKind::kBootImageLinkTimePcRelative: return os << "BootImageLinkTimePcRelative"; case HLoadString::LoadKind::kBootImageAddress: @@ -2670,8 +2766,8 @@ std::ostream& operator<<(std::ostream& os, HLoadString::LoadKind rhs) { return os << "BssEntry"; case HLoadString::LoadKind::kJitTableAddress: return os << "JitTableAddress"; - case HLoadString::LoadKind::kDexCacheViaMethod: - return os << "DexCacheViaMethod"; + case HLoadString::LoadKind::kRuntimeCall: + return os << "RuntimeCall"; default: LOG(FATAL) << "Unknown HLoadString::LoadKind: " << static_cast<int>(rhs); UNREACHABLE(); diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index 872b9083fe..befd0ff97b 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -47,6 +47,7 @@ namespace art { class GraphChecker; class HBasicBlock; +class HConstructorFence; class HCurrentMethod; class HDoubleConstant; class HEnvironment; @@ -58,6 +59,7 @@ class HIntConstant; class HInvoke; class HLongConstant; class HNullConstant; +class HParameterValue; class HPhi; class HSuspendCheck; class HTryBoundary; @@ -538,6 +540,12 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { return method_idx_; } + // Get the method name (without the signature), e.g. "<init>" + const char* GetMethodName() const; + + // Get the pretty method name (class + name + optionally signature). + std::string PrettyMethod(bool with_signature = true) const; + InvokeType GetInvokeType() const { return invoke_type_; } @@ -1298,6 +1306,7 @@ class HLoopInformationOutwardIterator : public ValueObject { M(ClearException, Instruction) \ M(ClinitCheck, Instruction) \ M(Compare, BinaryOperation) \ + M(ConstructorFence, Instruction) \ M(CurrentMethod, Instruction) \ M(ShouldDeoptimizeFlag, Instruction) \ M(Deoptimize, Instruction) \ @@ -1397,7 +1406,8 @@ class HLoopInformationOutwardIterator : public ValueObject { M(BitwiseNegatedRight, Instruction) \ M(DataProcWithShifterOp, Instruction) \ M(MultiplyAccumulate, Instruction) \ - M(IntermediateAddress, Instruction) + M(IntermediateAddress, Instruction) \ + M(IntermediateAddressIndex, Instruction) #endif #ifndef ART_ENABLE_CODEGEN_arm @@ -1477,8 +1487,11 @@ FOR_EACH_INSTRUCTION(FORWARD_DECLARATION) template <typename T> class HUseListNode : public ArenaObject<kArenaAllocUseListNode> { public: + // Get the instruction which has this use as one of the inputs. T GetUser() const { return user_; } + // Get the position of the input record that this use corresponds to. size_t GetIndex() const { return index_; } + // Set the position of the input record that this use corresponds to. void SetIndex(size_t index) { index_ = index; } // Hook for the IntrusiveForwardList<>. 
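// Illustrative sketch (standalone): the advance-before-mutate iteration pattern
// that RemoveConstructorFences applies to the use list documented above. Fetch
// the node, step the iterator, and only then mutate, so removing the fetched
// element never invalidates the iterator that is kept.
#include <cassert>
#include <list>

int main() {
  std::list<int> uses = {1, 2, 3, 4};
  for (auto it = uses.begin(), end = uses.end(); it != end; ) {
    int use = *it;
    ++it;                // advance first; erasing `use` below is then safe
    if (use % 2 == 0) {
      uses.remove(use);  // drops a node the kept iterator no longer points at
    }
  }
  assert((uses == std::list<int>{1, 3}));
  return 0;
}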
@@ -1777,7 +1790,7 @@ class HEnvironment : public ArenaObject<kArenaAllocEnvironment> { uint32_t dex_pc, HInstruction* holder) : vregs_(number_of_vregs, arena->Adapter(kArenaAllocEnvironmentVRegs)), - locations_(number_of_vregs, arena->Adapter(kArenaAllocEnvironmentLocations)), + locations_(arena->Adapter(kArenaAllocEnvironmentLocations)), parent_(nullptr), method_(method), dex_pc_(dex_pc), @@ -1791,6 +1804,11 @@ class HEnvironment : public ArenaObject<kArenaAllocEnvironment> { to_copy.GetDexPc(), holder) {} + void AllocateLocations() { + DCHECK(locations_.empty()); + locations_.resize(vregs_.size()); + } + void SetAndCopyParentChain(ArenaAllocator* allocator, HEnvironment* parent) { if (parent_ != nullptr) { parent_->SetAndCopyParentChain(allocator, parent); @@ -2038,7 +2056,8 @@ class HInstruction : public ArenaObject<kArenaAllocInstruction> { !IsNativeDebugInfo() && !IsParameterValue() && // If we added an explicit barrier then we should keep it. - !IsMemoryBarrier(); + !IsMemoryBarrier() && + !IsConstructorFence(); } bool IsDeadAndRemovable() const { @@ -2432,6 +2451,11 @@ class HVariableInputSizeInstruction : public HInstruction { void InsertInputAt(size_t index, HInstruction* input); void RemoveInputAt(size_t index); + // Removes all the inputs. + // Also removes this instructions from each input's use list + // (for non-environment uses only). + void RemoveAllInputs(); + protected: HVariableInputSizeInstruction(SideEffects side_effects, uint32_t dex_pc, @@ -4134,6 +4158,10 @@ class HInvokeStaticOrDirect FINAL : public HInvoke { // Use the method's own ArtMethod* loaded by the register allocator. kRecursive, + // Use PC-relative boot image ArtMethod* address that will be known at link time. + // Used for boot image methods referenced by boot image code. + kBootImageLinkTimePcRelative, + // Use ArtMethod* at a known address, embed the direct address in the code. // Used for app->boot calls with non-relocatable image and for JIT-compiled calls. kDirectAddress, @@ -4273,6 +4301,10 @@ class HInvokeStaticOrDirect FINAL : public HInvoke { bool HasPcRelativeDexCache() const { return GetMethodLoadKind() == MethodLoadKind::kDexCachePcRelative; } + bool HasPcRelativeMethodLoadKind() const { + return GetMethodLoadKind() == MethodLoadKind::kBootImageLinkTimePcRelative || + GetMethodLoadKind() == MethodLoadKind::kDexCachePcRelative; + } bool HasCurrentMethodInput() const { // This function can be called only after the invoke has been fully initialized by the builder. if (NeedsCurrentMethodInput(GetMethodLoadKind())) { @@ -5063,7 +5095,7 @@ class HParameterValue FINAL : public HExpression<0> { const DexFile& GetDexFile() const { return dex_file_; } dex::TypeIndex GetTypeIndex() const { return type_index_; } uint8_t GetIndex() const { return index_; } - bool IsThis() const ATTRIBUTE_UNUSED { return GetPackedFlag<kFlagIsThis>(); } + bool IsThis() const { return GetPackedFlag<kFlagIsThis>(); } bool CanBeNull() const OVERRIDE { return GetPackedFlag<kFlagCanBeNull>(); } void SetCanBeNull(bool can_be_null) { SetPackedFlag<kFlagCanBeNull>(can_be_null); } @@ -5371,10 +5403,16 @@ class HArrayGet FINAL : public HExpression<2> { } bool CanDoImplicitNullCheckOn(HInstruction* obj ATTRIBUTE_UNUSED) const OVERRIDE { // TODO: We can be smarter here. - // Currently, the array access is always preceded by an ArrayLength or a NullCheck - // which generates the implicit null check. There are cases when these can be removed - // to produce better code. 
If we ever add optimizations to do so we should allow an - // implicit check here (as long as the address falls in the first page). + // Currently, unless the array is the result of NewArray, the array access is always + // preceded by some form of null NullCheck necessary for the bounds check, usually + // implicit null check on the ArrayLength input to BoundsCheck or Deoptimize for + // dynamic BCE. There are cases when these could be removed to produce better code. + // If we ever add optimizations to do so we should allow an implicit check here + // (as long as the address falls in the first page). + // + // As an example of such fancy optimization, we could eliminate BoundsCheck for + // a = cond ? new int[1] : null; + // a[0]; // The Phi does not need bounds check for either input. return false; } @@ -5639,12 +5677,8 @@ class HLoadClass FINAL : public HInstruction { // Use the Class* from the method's own ArtMethod*. kReferrersClass, - // Use boot image Class* address that will be known at link time. - // Used for boot image classes referenced by boot image code in non-PIC mode. - kBootImageLinkTimeAddress, - // Use PC-relative boot image Class* address that will be known at link time. - // Used for boot image classes referenced by boot image code in PIC mode. + // Used for boot image classes referenced by boot image code. kBootImageLinkTimePcRelative, // Use a known boot image Class* address, embedded in the code by the codegen. @@ -5658,12 +5692,11 @@ class HLoadClass FINAL : public HInstruction { // Load from the root table associated with the JIT compiled method. kJitTableAddress, - // Load from resolved types array accessed through the class loaded from - // the compiled method's own ArtMethod*. This is the default access type when - // all other types are unavailable. - kDexCacheViaMethod, + // Load using a simple runtime call. This is the fall-back load kind when + // the codegen is unable to use another appropriate kind. + kRuntimeCall, - kLast = kDexCacheViaMethod + kLast = kRuntimeCall }; HLoadClass(HCurrentMethod* current_method, @@ -5684,7 +5717,7 @@ class HLoadClass FINAL : public HInstruction { DCHECK(!is_referrers_class || !needs_access_check); SetPackedField<LoadKindField>( - is_referrers_class ? LoadKind::kReferrersClass : LoadKind::kDexCacheViaMethod); + is_referrers_class ? LoadKind::kReferrersClass : LoadKind::kRuntimeCall); SetPackedFlag<kFlagNeedsAccessCheck>(needs_access_check); SetPackedFlag<kFlagIsInBootImage>(false); SetPackedFlag<kFlagGenerateClInitCheck>(false); @@ -5718,7 +5751,7 @@ class HLoadClass FINAL : public HInstruction { bool CanCallRuntime() const { return NeedsAccessCheck() || MustGenerateClinitCheck() || - GetLoadKind() == LoadKind::kDexCacheViaMethod || + GetLoadKind() == LoadKind::kRuntimeCall || GetLoadKind() == LoadKind::kBssEntry; } @@ -5728,7 +5761,7 @@ class HLoadClass FINAL : public HInstruction { // If the class is in the boot image, the lookup in the runtime call cannot throw. // This keeps CanThrow() consistent between non-PIC (using kBootImageAddress) and // PIC and subsequently avoids a DCE behavior dependency on the PIC option. 
- ((GetLoadKind() == LoadKind::kDexCacheViaMethod || + ((GetLoadKind() == LoadKind::kRuntimeCall || GetLoadKind() == LoadKind::kBssEntry) && !IsInBootImage()); } @@ -5747,7 +5780,7 @@ class HLoadClass FINAL : public HInstruction { const DexFile& GetDexFile() const { return dex_file_; } bool NeedsDexCacheOfDeclaringClass() const OVERRIDE { - return GetLoadKind() == LoadKind::kDexCacheViaMethod; + return GetLoadKind() == LoadKind::kRuntimeCall; } static SideEffects SideEffectsForArchRuntimeCalls() { @@ -5796,15 +5829,14 @@ class HLoadClass FINAL : public HInstruction { static bool HasTypeReference(LoadKind load_kind) { return load_kind == LoadKind::kReferrersClass || - load_kind == LoadKind::kBootImageLinkTimeAddress || load_kind == LoadKind::kBootImageLinkTimePcRelative || load_kind == LoadKind::kBssEntry || - load_kind == LoadKind::kDexCacheViaMethod; + load_kind == LoadKind::kRuntimeCall; } void SetLoadKindInternal(LoadKind load_kind); - // The special input is the HCurrentMethod for kDexCacheViaMethod or kReferrersClass. + // The special input is the HCurrentMethod for kRuntimeCall or kReferrersClass. // For other load kinds it's empty or possibly some architecture-specific instruction // for PC-relative loads, i.e. kBssEntry or kBootImageLinkTimePcRelative. HUserRecord<HInstruction*> special_input_; @@ -5813,7 +5845,7 @@ class HLoadClass FINAL : public HInstruction { // - The compiling method's dex file if the class is defined there too. // - The compiling method's dex file if the class is referenced there. // - The dex file where the class is defined. When the load kind can only be - // kBssEntry or kDexCacheViaMethod, we cannot emit code for this `HLoadClass`. + // kBssEntry or kRuntimeCall, we cannot emit code for this `HLoadClass`. const dex::TypeIndex type_index_; const DexFile& dex_file_; @@ -5830,7 +5862,6 @@ inline void HLoadClass::AddSpecialInput(HInstruction* special_input) { // The special input is used for PC-relative loads on some architectures, // including literal pool loads, which are PC-relative too. DCHECK(GetLoadKind() == LoadKind::kBootImageLinkTimePcRelative || - GetLoadKind() == LoadKind::kBootImageLinkTimeAddress || GetLoadKind() == LoadKind::kBootImageAddress || GetLoadKind() == LoadKind::kBssEntry) << GetLoadKind(); DCHECK(special_input_.GetInstruction() == nullptr); @@ -5842,12 +5873,8 @@ class HLoadString FINAL : public HInstruction { public: // Determines how to load the String. enum class LoadKind { - // Use boot image String* address that will be known at link time. - // Used for boot image strings referenced by boot image code in non-PIC mode. - kBootImageLinkTimeAddress, - // Use PC-relative boot image String* address that will be known at link time. - // Used for boot image strings referenced by boot image code in PIC mode. + // Used for boot image strings referenced by boot image code. kBootImageLinkTimePcRelative, // Use a known boot image String* address, embedded in the code by the codegen. @@ -5861,12 +5888,11 @@ class HLoadString FINAL : public HInstruction { // Load from the root table associated with the JIT compiled method. kJitTableAddress, - // Load from resolved strings array accessed through the class loaded from - // the compiled method's own ArtMethod*. This is the default access type when - // all other types are unavailable. - kDexCacheViaMethod, + // Load using a simple runtime call. This is the fall-back load kind when + // the codegen is unable to use another appropriate kind. 
+ kRuntimeCall, - kLast = kDexCacheViaMethod, + kLast = kRuntimeCall, }; HLoadString(HCurrentMethod* current_method, @@ -5877,7 +5903,7 @@ class HLoadString FINAL : public HInstruction { special_input_(HUserRecord<HInstruction*>(current_method)), string_index_(string_index), dex_file_(dex_file) { - SetPackedField<LoadKindField>(LoadKind::kDexCacheViaMethod); + SetPackedField<LoadKindField>(LoadKind::kRuntimeCall); } void SetLoadKind(LoadKind load_kind); @@ -5912,8 +5938,7 @@ class HLoadString FINAL : public HInstruction { // the dex cache and the string is not guaranteed to be there yet. bool NeedsEnvironment() const OVERRIDE { LoadKind load_kind = GetLoadKind(); - if (load_kind == LoadKind::kBootImageLinkTimeAddress || - load_kind == LoadKind::kBootImageLinkTimePcRelative || + if (load_kind == LoadKind::kBootImageLinkTimePcRelative || load_kind == LoadKind::kBootImageAddress || load_kind == LoadKind::kJitTableAddress) { return false; @@ -5922,7 +5947,7 @@ class HLoadString FINAL : public HInstruction { } bool NeedsDexCacheOfDeclaringClass() const OVERRIDE { - return GetLoadKind() == LoadKind::kDexCacheViaMethod; + return GetLoadKind() == LoadKind::kRuntimeCall; } bool CanBeNull() const OVERRIDE { return false; } @@ -5956,7 +5981,7 @@ class HLoadString FINAL : public HInstruction { void SetLoadKindInternal(LoadKind load_kind); - // The special input is the HCurrentMethod for kDexCacheViaMethod. + // The special input is the HCurrentMethod for kRuntimeCall. // For other load kinds it's empty or possibly some architecture-specific instruction // for PC-relative loads, i.e. kBssEntry or kBootImageLinkTimePcRelative. HUserRecord<HInstruction*> special_input_; @@ -5976,7 +6001,6 @@ inline void HLoadString::AddSpecialInput(HInstruction* special_input) { // including literal pool loads, which are PC-relative too. DCHECK(GetLoadKind() == LoadKind::kBootImageLinkTimePcRelative || GetLoadKind() == LoadKind::kBssEntry || - GetLoadKind() == LoadKind::kBootImageLinkTimeAddress || GetLoadKind() == LoadKind::kBootImageAddress) << GetLoadKind(); // HLoadString::GetInputRecords() returns an empty array at this point, // so use the GetInputRecords() from the base class to set the input record. @@ -6495,6 +6519,145 @@ class HMemoryBarrier FINAL : public HTemplateInstruction<0> { DISALLOW_COPY_AND_ASSIGN(HMemoryBarrier); }; +// A constructor fence orders all prior stores to fields that could be accessed via a final field of +// the specified object(s), with respect to any subsequent store that might "publish" +// (i.e. make visible) the specified object to another thread. +// +// JLS 17.5.1 "Semantics of final fields" states that a freeze action happens +// for all final fields (that were set) at the end of the invoked constructor. +// +// The constructor fence models the freeze actions for the final fields of an object +// being constructed (semantically at the end of the constructor). Constructor fences +// have a per-object affinity; two separate objects being constructed get two separate +// constructor fences. +// +// (Note: that if calling a super-constructor or forwarding to another constructor, +// the freezes would happen at the end of *that* constructor being invoked). +// +// The memory model guarantees that when the object being constructed is "published" after +// constructor completion (i.e. escapes the current thread via a store), then any final field +// writes must be observable on other threads (once they observe that publication). 
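// Illustrative C++ analogue (not part of this change, and deliberately more
// conservative than the Java final-field rule): the "freeze, then publish"
// ordering modeled by HConstructorFence corresponds to a release fence placed
// between the final-field stores and the store that publishes the object.
#include <atomic>

struct HasFinal { int field; };

std::atomic<HasFinal*> shared{nullptr};

void ConstructAndPublish() {
  HasFinal* obj = new HasFinal();
  obj->field = 123;                                     // the "final field" store
  std::atomic_thread_fence(std::memory_order_release);  // plays the role of the fence
  shared.store(obj, std::memory_order_relaxed);         // publication
}

void Reader() {
  HasFinal* obj = shared.load(std::memory_order_acquire);
  if (obj != nullptr) {
    (void)obj->field;  // with the release/acquire pairing, reads 123 on any thread
  }
}

int main() {
  ConstructAndPublish();
  Reader();
  return 0;
}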
+// +// Further, anything written before the freeze, and read by dereferencing through the final field, +// must also be visible (so final object field could itself have an object with non-final fields; +// yet the freeze must also extend to them). +// +// Constructor example: +// +// class HasFinal { +// final int field; Optimizing IR for <init>()V: +// HasFinal() { +// field = 123; HInstanceFieldSet(this, HasFinal.field, 123) +// // freeze(this.field); HConstructorFence(this) +// } HReturn +// } +// +// HConstructorFence can serve double duty as a fence for new-instance/new-array allocations of +// already-initialized classes; in that case the allocation must act as a "default-initializer" +// of the object which effectively writes the class pointer "final field". +// +// For example, we can model default-initialiation as roughly the equivalent of the following: +// +// class Object { +// private final Class header; +// } +// +// Java code: Optimizing IR: +// +// T new_instance<T>() { +// Object obj = allocate_memory(T.class.size); obj = HInvoke(art_quick_alloc_object, T) +// obj.header = T.class; // header write is done by above call. +// // freeze(obj.header) HConstructorFence(obj) +// return (T)obj; +// } +// +// See also: +// * CompilerDriver::RequiresConstructorBarrier +// * QuasiAtomic::ThreadFenceForConstructor +// +class HConstructorFence FINAL : public HVariableInputSizeInstruction { + // A fence has variable inputs because the inputs can be removed + // after prepare_for_register_allocation phase. + // (TODO: In the future a fence could freeze multiple objects + // after merging two fences together.) + public: + // `fence_object` is the reference that needs to be protected for correct publication. + // + // It makes sense in the following situations: + // * <init> constructors, it's the "this" parameter (i.e. HParameterValue, s.t. IsThis() == true). + // * new-instance-like instructions, it's the return value (i.e. HNewInstance). + // + // After construction the `fence_object` becomes the 0th input. + // This is not an input in a real sense, but just a convenient place to stash the information + // about the associated object. + HConstructorFence(HInstruction* fence_object, + uint32_t dex_pc, + ArenaAllocator* arena) + // We strongly suspect there is not a more accurate way to describe the fine-grained reordering + // constraints described in the class header. We claim that these SideEffects constraints + // enforce a superset of the real constraints. + // + // The ordering described above is conservatively modeled with SideEffects as follows: + // + // * To prevent reordering of the publication stores: + // ----> "Reads of objects" is the initial SideEffect. + // * For every primitive final field store in the constructor: + // ----> Union that field's type as a read (e.g. "Read of T") into the SideEffect. + // * If there are any stores to reference final fields in the constructor: + // ----> Use a more conservative "AllReads" SideEffect because any stores to any references + // that are reachable from `fence_object` also need to be prevented for reordering + // (and we do not want to do alias analysis to figure out what those stores are). + // + // In the implementation, this initially starts out as an "all reads" side effect; this is an + // even more conservative approach than the one described above, and prevents all of the + // above reordering without analyzing any of the instructions in the constructor. 
+ // + // If in a later phase we discover that there are no writes to reference final fields, + // we can refine the side effect to a smaller set of type reads (see above constraints). + : HVariableInputSizeInstruction(SideEffects::AllReads(), + dex_pc, + arena, + /* number_of_inputs */ 1, + kArenaAllocConstructorFenceInputs) { + DCHECK(fence_object != nullptr); + SetRawInputAt(0, fence_object); + } + + // The object associated with this constructor fence. + // + // (Note: This will be null after the prepare_for_register_allocation phase, + // as all constructor fence inputs are removed there). + HInstruction* GetFenceObject() const { + return InputAt(0); + } + + // Find all the HConstructorFence uses (`fence_use`) for `this` and: + // - Delete `fence_use` from `this`'s use list. + // - Delete `this` from `fence_use`'s inputs list. + // - If the `fence_use` is dead, remove it from the graph. + // + // A fence is considered dead once it no longer has any uses + // and all of the inputs are dead. + // + // This must *not* be called during/after prepare_for_register_allocation, + // because that removes all the inputs to the fences but the fence is actually + // still considered live. + static void RemoveConstructorFences(HInstruction* instruction); + + // Check if this constructor fence is protecting + // an HNewInstance or HNewArray that is also the immediate + // predecessor of `this`. + // + // Returns the associated HNewArray or HNewInstance, + // or null otherwise. + HInstruction* GetAssociatedAllocation(); + + DECLARE_INSTRUCTION(ConstructorFence); + + private: + DISALLOW_COPY_AND_ASSIGN(HConstructorFence); +}; + class HMonitorOperation FINAL : public HTemplateInstruction<1> { public: enum class OperationKind { diff --git a/compiler/optimizing/nodes_shared.h b/compiler/optimizing/nodes_shared.h index c6bfbcc7fb..075a816f3f 100644 --- a/compiler/optimizing/nodes_shared.h +++ b/compiler/optimizing/nodes_shared.h @@ -150,6 +150,49 @@ class HIntermediateAddress FINAL : public HExpression<2> { DISALLOW_COPY_AND_ASSIGN(HIntermediateAddress); }; +// This instruction computes part of the array access offset (data and index offset). +// +// For array accesses the element address has the following structure: +// Address = CONST_OFFSET + base_addr + index << ELEM_SHIFT. Taking into account LDR/STR addressing +// modes address part (CONST_OFFSET + index << ELEM_SHIFT) can be shared across array access with +// the same data type and index. For example, for the following loop 5 accesses can share address +// computation: +// +// void foo(int[] a, int[] b, int[] c) { +// for (i...) { +// a[i] = a[i] + 5; +// b[i] = b[i] + c[i]; +// } +// } +// +// Note: as the instruction doesn't involve base array address into computations it has no side +// effects (in comparison of HIntermediateAddress). 
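// Illustrative sketch (not part of this change; the offsets are hypothetical):
// the address decomposition HIntermediateAddressIndex captures. The
// (offset + (index << shift)) part does not depend on the array base, so it can
// be computed once and reused across several arrays accessed with the same
// index and element size.
#include <cassert>
#include <cstdint>

static uintptr_t ElementAddress(uintptr_t base, uintptr_t shared_part) {
  return base + shared_part;  // the per-array add folds into the LDR/STR addressing mode
}

int main() {
  const uintptr_t kDataOffset = 12;  // hypothetical offset of int[] data in the object
  const uintptr_t kIntShift = 2;     // log2(sizeof(int32_t))
  uintptr_t index = 5;
  uintptr_t shared_part = kDataOffset + (index << kIntShift);  // shared by a[i], b[i], c[i]
  assert(ElementAddress(/* a */ 0x1000, shared_part) == 0x1000 + 12 + 20);
  assert(ElementAddress(/* b */ 0x2000, shared_part) == 0x2000 + 12 + 20);
  return 0;
}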
+class HIntermediateAddressIndex FINAL : public HExpression<3> { + public: + HIntermediateAddressIndex( + HInstruction* index, HInstruction* offset, HInstruction* shift, uint32_t dex_pc) + : HExpression(Primitive::kPrimInt, SideEffects::None(), dex_pc) { + SetRawInputAt(0, index); + SetRawInputAt(1, offset); + SetRawInputAt(2, shift); + } + + bool CanBeMoved() const OVERRIDE { return true; } + bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE { + return true; + } + bool IsActualObject() const OVERRIDE { return false; } + + HInstruction* GetIndex() const { return InputAt(0); } + HInstruction* GetOffset() const { return InputAt(1); } + HInstruction* GetShift() const { return InputAt(2); } + + DECLARE_INSTRUCTION(IntermediateAddressIndex); + + private: + DISALLOW_COPY_AND_ASSIGN(HIntermediateAddressIndex); +}; + class HDataProcWithShifterOp FINAL : public HExpression<2> { public: enum OpKind { diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index 52c247b52f..5dbe29b4fa 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -178,12 +178,17 @@ class HVecMemoryOperation : public HVecOperation { size_t vector_length, uint32_t dex_pc) : HVecOperation(arena, packed_type, side_effects, number_of_inputs, vector_length, dex_pc), - alignment_(Primitive::ComponentSize(packed_type), 0) { } + alignment_(Primitive::ComponentSize(packed_type), 0) { + DCHECK_GE(number_of_inputs, 2u); + } void SetAlignment(Alignment alignment) { alignment_ = alignment; } Alignment GetAlignment() const { return alignment_; } + HInstruction* GetArray() const { return InputAt(0); } + HInstruction* GetIndex() const { return InputAt(1); } + DECLARE_ABSTRACT_INSTRUCTION(VecMemoryOperation); private: @@ -451,13 +456,24 @@ class HVecMin FINAL : public HVecBinaryOperation { HInstruction* right, Primitive::Type packed_type, size_t vector_length, + bool is_unsigned, uint32_t dex_pc = kNoDexPc) : HVecBinaryOperation(arena, left, right, packed_type, vector_length, dex_pc) { DCHECK(HasConsistentPackedTypes(left, packed_type)); DCHECK(HasConsistentPackedTypes(right, packed_type)); + SetPackedFlag<kFieldMinOpIsUnsigned>(is_unsigned); } + + bool IsUnsigned() const { return GetPackedFlag<kFieldMinOpIsUnsigned>(); } + DECLARE_INSTRUCTION(VecMin); + private: + // Additional packed bits. + static constexpr size_t kFieldMinOpIsUnsigned = HVecOperation::kNumberOfVectorOpPackedBits; + static constexpr size_t kNumberOfMinOpPackedBits = kFieldMinOpIsUnsigned + 1; + static_assert(kNumberOfMinOpPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields."); + DISALLOW_COPY_AND_ASSIGN(HVecMin); }; @@ -470,13 +486,24 @@ class HVecMax FINAL : public HVecBinaryOperation { HInstruction* right, Primitive::Type packed_type, size_t vector_length, + bool is_unsigned, uint32_t dex_pc = kNoDexPc) : HVecBinaryOperation(arena, left, right, packed_type, vector_length, dex_pc) { DCHECK(HasConsistentPackedTypes(left, packed_type)); DCHECK(HasConsistentPackedTypes(right, packed_type)); + SetPackedFlag<kFieldMaxOpIsUnsigned>(is_unsigned); } + + bool IsUnsigned() const { return GetPackedFlag<kFieldMaxOpIsUnsigned>(); } + DECLARE_INSTRUCTION(VecMax); + private: + // Additional packed bits. 
+ static constexpr size_t kFieldMaxOpIsUnsigned = HVecOperation::kNumberOfVectorOpPackedBits; + static constexpr size_t kNumberOfMaxOpPackedBits = kFieldMaxOpIsUnsigned + 1; + static_assert(kNumberOfMaxOpPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields."); + DISALLOW_COPY_AND_ASSIGN(HVecMax); }; diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 065c11eddb..e5ab00bce3 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -83,6 +83,7 @@ #include "jit/jit_code_cache.h" #include "jni/quick/jni_compiler.h" #include "licm.h" +#include "load_store_analysis.h" #include "load_store_elimination.h" #include "loop_optimization.h" #include "nodes.h" @@ -465,7 +466,8 @@ static HOptimization* BuildOptimization( const DexCompilationUnit& dex_compilation_unit, VariableSizedHandleScope* handles, SideEffectsAnalysis* most_recent_side_effects, - HInductionVarAnalysis* most_recent_induction) { + HInductionVarAnalysis* most_recent_induction, + LoadStoreAnalysis* most_recent_lsa) { std::string opt_name = ConvertPassNameToOptimizationName(pass_name); if (opt_name == BoundsCheckElimination::kBoundsCheckEliminationPassName) { CHECK(most_recent_side_effects != nullptr && most_recent_induction != nullptr); @@ -499,15 +501,18 @@ static HOptimization* BuildOptimization( } else if (opt_name == HInductionVarAnalysis::kInductionPassName) { return new (arena) HInductionVarAnalysis(graph); } else if (opt_name == InstructionSimplifier::kInstructionSimplifierPassName) { - return new (arena) InstructionSimplifier(graph, codegen, stats, pass_name.c_str()); + return new (arena) InstructionSimplifier(graph, codegen, driver, stats, pass_name.c_str()); } else if (opt_name == IntrinsicsRecognizer::kIntrinsicsRecognizerPassName) { return new (arena) IntrinsicsRecognizer(graph, stats); } else if (opt_name == LICM::kLoopInvariantCodeMotionPassName) { CHECK(most_recent_side_effects != nullptr); return new (arena) LICM(graph, *most_recent_side_effects, stats); + } else if (opt_name == LoadStoreAnalysis::kLoadStoreAnalysisPassName) { + return new (arena) LoadStoreAnalysis(graph); } else if (opt_name == LoadStoreElimination::kLoadStoreEliminationPassName) { CHECK(most_recent_side_effects != nullptr); - return new (arena) LoadStoreElimination(graph, *most_recent_side_effects); + CHECK(most_recent_lsa != nullptr); + return new (arena) LoadStoreElimination(graph, *most_recent_side_effects, *most_recent_lsa); } else if (opt_name == SideEffectsAnalysis::kSideEffectsAnalysisPassName) { return new (arena) SideEffectsAnalysis(graph); } else if (opt_name == HLoopOptimization::kLoopOptimizationPassName) { @@ -556,6 +561,7 @@ static ArenaVector<HOptimization*> BuildOptimizations( // in the pass name list. 
SideEffectsAnalysis* most_recent_side_effects = nullptr; HInductionVarAnalysis* most_recent_induction = nullptr; + LoadStoreAnalysis* most_recent_lsa = nullptr; ArenaVector<HOptimization*> ret(arena->Adapter()); for (const std::string& pass_name : pass_names) { HOptimization* opt = BuildOptimization( @@ -568,7 +574,8 @@ static ArenaVector<HOptimization*> BuildOptimizations( dex_compilation_unit, handles, most_recent_side_effects, - most_recent_induction); + most_recent_induction, + most_recent_lsa); CHECK(opt != nullptr) << "Couldn't build optimization: \"" << pass_name << "\""; ret.push_back(opt); @@ -577,6 +584,8 @@ static ArenaVector<HOptimization*> BuildOptimizations( most_recent_side_effects = down_cast<SideEffectsAnalysis*>(opt); } else if (opt_name == HInductionVarAnalysis::kInductionPassName) { most_recent_induction = down_cast<HInductionVarAnalysis*>(opt); + } else if (opt_name == LoadStoreAnalysis::kLoadStoreAnalysisPassName) { + most_recent_lsa = down_cast<LoadStoreAnalysis*>(opt); } } return ret; @@ -638,11 +647,14 @@ void OptimizingCompiler::RunArchOptimizations(InstructionSet instruction_set, new (arena) arm::InstructionSimplifierArm(graph, stats); SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph); GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects, "GVN$after_arch"); + HInstructionScheduling* scheduling = + new (arena) HInstructionScheduling(graph, instruction_set, codegen); HOptimization* arm_optimizations[] = { simplifier, side_effects, gvn, - fixups + fixups, + scheduling, }; RunOptimizations(arm_optimizations, arraysize(arm_optimizations), pass_observer); break; @@ -760,7 +772,8 @@ void OptimizingCompiler::RunOptimizations(HGraph* graph, HDeadCodeElimination* dce3 = new (arena) HDeadCodeElimination( graph, stats, "dead_code_elimination$final"); HConstantFolding* fold1 = new (arena) HConstantFolding(graph, "constant_folding"); - InstructionSimplifier* simplify1 = new (arena) InstructionSimplifier(graph, codegen, stats); + InstructionSimplifier* simplify1 = new (arena) InstructionSimplifier( + graph, codegen, driver, stats); HSelectGenerator* select_generator = new (arena) HSelectGenerator(graph, stats); HConstantFolding* fold2 = new (arena) HConstantFolding( graph, "constant_folding$after_inlining"); @@ -774,15 +787,16 @@ void OptimizingCompiler::RunOptimizations(HGraph* graph, HInductionVarAnalysis* induction = new (arena) HInductionVarAnalysis(graph); BoundsCheckElimination* bce = new (arena) BoundsCheckElimination(graph, *side_effects1, induction); HLoopOptimization* loop = new (arena) HLoopOptimization(graph, driver, induction); - LoadStoreElimination* lse = new (arena) LoadStoreElimination(graph, *side_effects2); + LoadStoreAnalysis* lsa = new (arena) LoadStoreAnalysis(graph); + LoadStoreElimination* lse = new (arena) LoadStoreElimination(graph, *side_effects2, *lsa); HSharpening* sharpening = new (arena) HSharpening( graph, codegen, dex_compilation_unit, driver, handles); InstructionSimplifier* simplify2 = new (arena) InstructionSimplifier( - graph, codegen, stats, "instruction_simplifier$after_inlining"); + graph, codegen, driver, stats, "instruction_simplifier$after_inlining"); InstructionSimplifier* simplify3 = new (arena) InstructionSimplifier( - graph, codegen, stats, "instruction_simplifier$after_bce"); + graph, codegen, driver, stats, "instruction_simplifier$after_bce"); InstructionSimplifier* simplify4 = new (arena) InstructionSimplifier( - graph, codegen, stats, "instruction_simplifier$before_codegen"); + graph, 
codegen, driver, stats, "instruction_simplifier$before_codegen"); IntrinsicsRecognizer* intrinsics = new (arena) IntrinsicsRecognizer(graph, stats); CHAGuardOptimization* cha_guard = new (arena) CHAGuardOptimization(graph); CodeSinking* code_sinking = new (arena) CodeSinking(graph, stats); @@ -814,6 +828,7 @@ void OptimizingCompiler::RunOptimizations(HGraph* graph, fold3, // evaluates code generated by dynamic bce simplify3, side_effects2, + lsa, lse, cha_guard, dce3, diff --git a/compiler/optimizing/pc_relative_fixups_mips.cc b/compiler/optimizing/pc_relative_fixups_mips.cc index a0fdde169d..bce54bf49a 100644 --- a/compiler/optimizing/pc_relative_fixups_mips.cc +++ b/compiler/optimizing/pc_relative_fixups_mips.cc @@ -58,10 +58,22 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { DCHECK(base_ != nullptr); } + void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) OVERRIDE { + // If this is an invoke with PC-relative pointer to a method, + // we need to add the base as the special input. + if (invoke->GetMethodLoadKind() == + HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative && + !IsCallFreeIntrinsic<IntrinsicLocationsBuilderMIPS>(invoke, codegen_)) { + InitializePCRelativeBasePointer(); + // Add the special argument base to the method. + DCHECK(!invoke->HasCurrentMethodInput()); + invoke->AddSpecialInput(base_); + } + } + void VisitLoadClass(HLoadClass* load_class) OVERRIDE { HLoadClass::LoadKind load_kind = load_class->GetLoadKind(); switch (load_kind) { - case HLoadClass::LoadKind::kBootImageLinkTimeAddress: case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: case HLoadClass::LoadKind::kBootImageAddress: case HLoadClass::LoadKind::kBssEntry: @@ -77,7 +89,6 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { void VisitLoadString(HLoadString* load_string) OVERRIDE { HLoadString::LoadKind load_kind = load_string->GetLoadKind(); switch (load_kind) { - case HLoadString::LoadKind::kBootImageLinkTimeAddress: case HLoadString::LoadKind::kBootImageAddress: case HLoadString::LoadKind::kBootImageLinkTimePcRelative: case HLoadString::LoadKind::kBssEntry: diff --git a/compiler/optimizing/pc_relative_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc index a1c916f43a..2743df9dcf 100644 --- a/compiler/optimizing/pc_relative_fixups_x86.cc +++ b/compiler/optimizing/pc_relative_fixups_x86.cc @@ -205,13 +205,13 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { // method pointer from the invoke. if (invoke_static_or_direct != nullptr && invoke_static_or_direct->HasCurrentMethodInput()) { - DCHECK(!invoke_static_or_direct->HasPcRelativeDexCache()); + DCHECK(!invoke_static_or_direct->HasPcRelativeMethodLoadKind()); return; } bool base_added = false; if (invoke_static_or_direct != nullptr && - invoke_static_or_direct->HasPcRelativeDexCache() && + invoke_static_or_direct->HasPcRelativeMethodLoadKind() && !IsCallFreeIntrinsic<IntrinsicLocationsBuilderX86>(invoke, codegen_)) { HX86ComputeBaseMethodAddress* method_address = GetPCRelativeBasePointer(invoke); // Add the extra parameter. 
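// Illustrative sketch (standalone, hypothetical names): the dependency threading
// used by BuildOptimizations above, where the most recently constructed analysis
// (here load_store_analysis) is remembered so a later pass that requires it can
// be handed that exact instance.
#include <cassert>
#include <memory>
#include <string>
#include <vector>

struct Pass {
  std::string name;
  const Pass* prerequisite = nullptr;
};

static std::vector<std::unique_ptr<Pass>> BuildPasses(const std::vector<std::string>& names) {
  std::vector<std::unique_ptr<Pass>> passes;
  const Pass* most_recent_analysis = nullptr;
  for (const std::string& name : names) {
    auto pass = std::make_unique<Pass>();
    pass->name = name;
    if (name == "load_store_elimination") {
      assert(most_recent_analysis != nullptr);  // the analysis must appear earlier in the list
      pass->prerequisite = most_recent_analysis;
    }
    if (name == "load_store_analysis") {
      most_recent_analysis = pass.get();
    }
    passes.push_back(std::move(pass));
  }
  return passes;
}

int main() {
  auto passes = BuildPasses({"load_store_analysis", "load_store_elimination"});
  assert(passes[1]->prerequisite == passes[0].get());
  return 0;
}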
diff --git a/compiler/optimizing/prepare_for_register_allocation.cc b/compiler/optimizing/prepare_for_register_allocation.cc index 66bfea9860..aa42fd647b 100644 --- a/compiler/optimizing/prepare_for_register_allocation.cc +++ b/compiler/optimizing/prepare_for_register_allocation.cc @@ -167,6 +167,45 @@ void PrepareForRegisterAllocation::VisitCondition(HCondition* condition) { } } +void PrepareForRegisterAllocation::VisitConstructorFence(HConstructorFence* constructor_fence) { + // Trivially remove redundant HConstructorFence when it immediately follows an HNewInstance + // to an uninitialized class. In this special case, the art_quick_alloc_object_resolved + // will already have the 'dmb' which is strictly stronger than an HConstructorFence. + // + // The instruction builder always emits "x = HNewInstance; HConstructorFence(x)" so this + // is effectively pattern-matching that particular case and undoing the redundancy the builder + // had introduced. + // + // TODO: Move this to a separate pass. + HInstruction* allocation_inst = constructor_fence->GetAssociatedAllocation(); + if (allocation_inst != nullptr && allocation_inst->IsNewInstance()) { + HNewInstance* new_inst = allocation_inst->AsNewInstance(); + // This relies on the entrypoint already being set to the more optimized version; + // as that happens in this pass, this redundancy removal also cannot happen any earlier. + if (new_inst != nullptr && new_inst->GetEntrypoint() == kQuickAllocObjectResolved) { + // If this was done in an earlier pass, we would want to match that `previous` was an input + // to the `constructor_fence`. However, since this pass removes the inputs to the fence, + // we can ignore the inputs and just remove the instruction from its block. + DCHECK_EQ(1u, constructor_fence->InputCount()); + // TODO: GetAssociatedAllocation should not care about multiple inputs + // if we are in prepare_for_register_allocation pass only. + constructor_fence->GetBlock()->RemoveInstruction(constructor_fence); + return; + // TODO: actually remove the dmb from the .S entrypoints (initialized variants only). + } + + // HNewArray does not need this check because the art_quick_alloc_array does not itself + // have a dmb in any normal situation (i.e. the array class is never exactly in the + // "resolved" state). If the array class is not yet loaded, it will always go from + // Unloaded->Initialized state. + } + + // Remove all the inputs to the constructor fence; + // they aren't used by the InstructionCodeGenerator and this lets us avoid creating a + // LocationSummary in the LocationsBuilder. 
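// Illustrative sketch (hypothetical names; not part of this change): the shape of
// the peephole in VisitConstructorFence above. An explicit fence is dropped when
// it immediately follows an allocation whose entrypoint already issues an
// equivalent or stronger barrier.
#include <cassert>
#include <vector>

enum class Op { kAllocResolved /* entrypoint already emits a dmb */, kAllocOther, kFence, kStore };

static void RemoveRedundantFences(std::vector<Op>* code) {
  for (size_t i = 1; i < code->size(); /* advanced below */) {
    if ((*code)[i] == Op::kFence && (*code)[i - 1] == Op::kAllocResolved) {
      code->erase(code->begin() + i);  // the allocation already provides the barrier
    } else {
      ++i;
    }
  }
}

int main() {
  std::vector<Op> code = {Op::kAllocResolved, Op::kFence, Op::kAllocOther, Op::kFence, Op::kStore};
  RemoveRedundantFences(&code);
  assert((code == std::vector<Op>{Op::kAllocResolved, Op::kAllocOther, Op::kFence, Op::kStore}));
  return 0;
}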
+ constructor_fence->RemoveAllInputs(); +} + void PrepareForRegisterAllocation::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) { if (invoke->IsStaticWithExplicitClinitCheck()) { HLoadClass* last_input = invoke->GetInputs().back()->AsLoadClass(); diff --git a/compiler/optimizing/prepare_for_register_allocation.h b/compiler/optimizing/prepare_for_register_allocation.h index 7ffbe44ef6..395d4ba2ee 100644 --- a/compiler/optimizing/prepare_for_register_allocation.h +++ b/compiler/optimizing/prepare_for_register_allocation.h @@ -43,6 +43,7 @@ class PrepareForRegisterAllocation : public HGraphDelegateVisitor { void VisitArraySet(HArraySet* instruction) OVERRIDE; void VisitClinitCheck(HClinitCheck* check) OVERRIDE; void VisitCondition(HCondition* condition) OVERRIDE; + void VisitConstructorFence(HConstructorFence* constructor_fence) OVERRIDE; void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) OVERRIDE; void VisitDeoptimize(HDeoptimize* deoptimize) OVERRIDE; diff --git a/compiler/optimizing/register_allocator_graph_color.cc b/compiler/optimizing/register_allocator_graph_color.cc index 87f709f63d..2fd7b03151 100644 --- a/compiler/optimizing/register_allocator_graph_color.cc +++ b/compiler/optimizing/register_allocator_graph_color.cc @@ -20,7 +20,7 @@ #include "linear_order.h" #include "register_allocation_resolver.h" #include "ssa_liveness_analysis.h" -#include "thread-inl.h" +#include "thread-current-inl.h" namespace art { @@ -1968,8 +1968,7 @@ void RegisterAllocatorGraphColor::ColorSpillSlots(ArenaVector<LiveInterval*>* in ArenaVector<std::tuple<size_t, bool, LiveInterval*>> interval_endpoints( allocator_->Adapter(kArenaAllocRegisterAllocator)); - for (auto it = intervals->begin(), e = intervals->end(); it != e; ++it) { - LiveInterval* parent_interval = *it; + for (LiveInterval* parent_interval : *intervals) { DCHECK(parent_interval->IsParent()); DCHECK(!parent_interval->HasSpillSlot()); size_t start = parent_interval->GetStart(); diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc index d65d20cf43..320f01a727 100644 --- a/compiler/optimizing/scheduler.cc +++ b/compiler/optimizing/scheduler.cc @@ -23,6 +23,10 @@ #include "scheduler_arm64.h" #endif +#ifdef ART_ENABLE_CODEGEN_arm +#include "scheduler_arm.h" +#endif + namespace art { void SchedulingGraph::AddDependency(SchedulingNode* node, @@ -264,10 +268,11 @@ void SchedulingGraph::DumpAsDotGraph(const std::string& description, // Start the dot graph. Use an increasing index for easier differentiation. output << "digraph G {\n"; for (const auto& entry : nodes_map_) { - DumpAsDotNode(output, entry.second); + SchedulingNode* node = entry.second; + DumpAsDotNode(output, node); } // Create a fake 'end_of_scheduling' node to help visualization of critical_paths. - for (auto node : initial_candidates) { + for (SchedulingNode* node : initial_candidates) { const HInstruction* instruction = node->GetInstruction(); output << InstructionTypeId(instruction) << ":s -> end_of_scheduling:n " << "[label=\"" << node->GetLatency() << "\",dir=back]\n"; @@ -580,28 +585,39 @@ bool HScheduler::IsSchedulingBarrier(const HInstruction* instr) const { void HInstructionScheduling::Run(bool only_optimize_loop_blocks, bool schedule_randomly) { +#if defined(ART_ENABLE_CODEGEN_arm64) || defined(ART_ENABLE_CODEGEN_arm) + // Phase-local allocator that allocates scheduler internal data structures like + // scheduling nodes, internel nodes map, dependencies, etc. 
+ ArenaAllocator arena_allocator(graph_->GetArena()->GetArenaPool()); + CriticalPathSchedulingNodeSelector critical_path_selector; + RandomSchedulingNodeSelector random_selector; + SchedulingNodeSelector* selector = schedule_randomly + ? static_cast<SchedulingNodeSelector*>(&random_selector) + : static_cast<SchedulingNodeSelector*>(&critical_path_selector); +#else // Avoid compilation error when compiling for unsupported instruction set. UNUSED(only_optimize_loop_blocks); UNUSED(schedule_randomly); +#endif switch (instruction_set_) { #ifdef ART_ENABLE_CODEGEN_arm64 case kArm64: { - // Phase-local allocator that allocates scheduler internal data structures like - // scheduling nodes, internel nodes map, dependencies, etc. - ArenaAllocator arena_allocator(graph_->GetArena()->GetArenaPool()); - - CriticalPathSchedulingNodeSelector critical_path_selector; - RandomSchedulingNodeSelector random_selector; - SchedulingNodeSelector* selector = schedule_randomly - ? static_cast<SchedulingNodeSelector*>(&random_selector) - : static_cast<SchedulingNodeSelector*>(&critical_path_selector); - arm64::HSchedulerARM64 scheduler(&arena_allocator, selector); scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks); scheduler.Schedule(graph_); break; } #endif +#if defined(ART_ENABLE_CODEGEN_arm) + case kThumb2: + case kArm: { + arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen_); + arm::HSchedulerARM scheduler(&arena_allocator, selector, &arm_latency_visitor); + scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks); + scheduler.Schedule(graph_); + break; + } +#endif default: break; } diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h index 9236a0e4fa..73e8087cd0 100644 --- a/compiler/optimizing/scheduler.h +++ b/compiler/optimizing/scheduler.h @@ -23,6 +23,7 @@ #include "driver/compiler_driver.h" #include "nodes.h" #include "optimization.h" +#include "code_generator.h" namespace art { @@ -469,8 +470,9 @@ inline bool SchedulingGraph::IsSchedulingBarrier(const HInstruction* instruction class HInstructionScheduling : public HOptimization { public: - HInstructionScheduling(HGraph* graph, InstructionSet instruction_set) + HInstructionScheduling(HGraph* graph, InstructionSet instruction_set, CodeGenerator* cg = nullptr) : HOptimization(graph, kInstructionScheduling), + codegen_(cg), instruction_set_(instruction_set) {} void Run() { @@ -480,6 +482,7 @@ class HInstructionScheduling : public HOptimization { static constexpr const char* kInstructionScheduling = "scheduler"; + CodeGenerator* const codegen_; const InstructionSet instruction_set_; private: diff --git a/compiler/optimizing/scheduler_arm.cc b/compiler/optimizing/scheduler_arm.cc new file mode 100644 index 0000000000..832a7e1571 --- /dev/null +++ b/compiler/optimizing/scheduler_arm.cc @@ -0,0 +1,827 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arch/arm/instruction_set_features_arm.h" +#include "code_generator_utils.h" +#include "common_arm.h" +#include "mirror/array-inl.h" +#include "scheduler_arm.h" + +namespace art { +namespace arm { + +using helpers::Int32ConstantFrom; +using helpers::Uint64ConstantFrom; + +void SchedulingLatencyVisitorARM::HandleBinaryOperationLantencies(HBinaryOperation* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimLong: + // HAdd and HSub long operations translate to ADDS+ADC or SUBS+SBC pairs, + // so a bubble (kArmNopLatency) is added to represent the internal carry flag + // dependency inside these pairs. + last_visited_internal_latency_ = kArmIntegerOpLatency + kArmNopLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmIntegerOpLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::VisitAdd(HAdd* instr) { + HandleBinaryOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitSub(HSub* instr) { + HandleBinaryOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitMul(HMul* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimLong: + last_visited_internal_latency_ = 3 * kArmMulIntegerLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_latency_ = kArmMulFloatingPointLatency; + break; + default: + last_visited_latency_ = kArmMulIntegerLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::HandleBitwiseOperationLantencies(HBinaryOperation* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimLong: + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmIntegerOpLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::VisitAnd(HAnd* instr) { + HandleBitwiseOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitOr(HOr* instr) { + HandleBitwiseOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitXor(HXor* instr) { + HandleBitwiseOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitRor(HRor* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimInt: + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimLong: { + // HandleLongRotate + HInstruction* rhs = instr->GetRight(); + if (rhs->IsConstant()) { + uint64_t rot = Uint64ConstantFrom(rhs->AsConstant()) & kMaxLongShiftDistance; + if (rot != 0u) { + last_visited_internal_latency_ = 3 * kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } + } else { + last_visited_internal_latency_ = 9 * kArmIntegerOpLatency + kArmBranchLatency; + last_visited_latency_ = kArmBranchLatency; + } + break; + } + default: + LOG(FATAL) << "Unexpected operation type " << instr->GetResultType(); + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::HandleShiftLatencies(HBinaryOperation* instr) { + Primitive::Type type = instr->GetResultType(); + HInstruction* rhs = instr->GetRight(); + switch (type) { + case Primitive::kPrimInt: + if 
(!rhs->IsConstant()) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + } + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimLong: + if (!rhs->IsConstant()) { + last_visited_internal_latency_ = 8 * kArmIntegerOpLatency; + } else { + uint32_t shift_value = Int32ConstantFrom(rhs->AsConstant()) & kMaxLongShiftDistance; + if (shift_value == 1 || shift_value >= 32) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + } + } + last_visited_latency_ = kArmIntegerOpLatency; + break; + default: + LOG(FATAL) << "Unexpected operation type " << type; + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::VisitShl(HShl* instr) { + HandleShiftLatencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitShr(HShr* instr) { + HandleShiftLatencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitUShr(HUShr* instr) { + HandleShiftLatencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitCondition(HCondition* instr) { + switch (instr->GetLeft()->GetType()) { + case Primitive::kPrimLong: + last_visited_internal_latency_ = 4 * kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_internal_latency_ = 2 * kArmFloatingPointOpLatency; + break; + default: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + break; + } + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitCompare(HCompare* instr) { + Primitive::Type type = instr->InputAt(0)->GetType(); + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + break; + case Primitive::kPrimLong: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency + 3 * kArmBranchLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_internal_latency_ = kArmIntegerOpLatency + 2 * kArmFloatingPointOpLatency; + break; + default: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + break; + } + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitBitwiseNegatedRight(HBitwiseNegatedRight* instruction) { + if (instruction->GetResultType() == Primitive::kPrimInt) { + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } +} + +void SchedulingLatencyVisitorARM::HandleGenerateDataProcInstruction(bool internal_latency) { + if (internal_latency) { + last_visited_internal_latency_ += kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmDataProcWithShifterOpLatency; + } +} + +void SchedulingLatencyVisitorARM::HandleGenerateDataProc(HDataProcWithShifterOp* instruction) { + const HInstruction::InstructionKind kind = instruction->GetInstrKind(); + if (kind == HInstruction::kAdd) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else if (kind == HInstruction::kSub) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + HandleGenerateDataProcInstruction(/* internal_latency */ true); + HandleGenerateDataProcInstruction(); + } +} + +void SchedulingLatencyVisitorARM::HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction) { + DCHECK_EQ(instruction->GetType(), 
Primitive::kPrimLong); + DCHECK(HDataProcWithShifterOp::IsShiftOp(instruction->GetOpKind())); + + const uint32_t shift_value = instruction->GetShiftAmount(); + const HInstruction::InstructionKind kind = instruction->GetInstrKind(); + + if (shift_value >= 32) { + // Different shift types actually generate similar code here, + // no need to differentiate shift types like the codegen pass does, + // which also avoids handling shift types from different ARM backends. + HandleGenerateDataProc(instruction); + } else { + DCHECK_GT(shift_value, 1U); + DCHECK_LT(shift_value, 32U); + + if (kind == HInstruction::kOr || kind == HInstruction::kXor) { + HandleGenerateDataProcInstruction(/* internal_latency */ true); + HandleGenerateDataProcInstruction(/* internal_latency */ true); + HandleGenerateDataProcInstruction(); + } else { + last_visited_internal_latency_ += 2 * kArmIntegerOpLatency; + HandleGenerateDataProc(instruction); + } + } +} + +void SchedulingLatencyVisitorARM::VisitDataProcWithShifterOp(HDataProcWithShifterOp* instruction) { + const HDataProcWithShifterOp::OpKind op_kind = instruction->GetOpKind(); + + if (instruction->GetType() == Primitive::kPrimInt) { + DCHECK(!HDataProcWithShifterOp::IsExtensionOp(op_kind)); + HandleGenerateDataProcInstruction(); + } else { + DCHECK_EQ(instruction->GetType(), Primitive::kPrimLong); + if (HDataProcWithShifterOp::IsExtensionOp(op_kind)) { + HandleGenerateDataProc(instruction); + } else { + HandleGenerateLongDataProc(instruction); + } + } +} + +void SchedulingLatencyVisitorARM::VisitIntermediateAddress(HIntermediateAddress* ATTRIBUTE_UNUSED) { + // Although the code generated is a simple `add` instruction, we found through empirical results + // that spacing it from its use in memory accesses was beneficial. + last_visited_internal_latency_ = kArmNopLatency; + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* ATTRIBUTE_UNUSED) { + UNIMPLEMENTED(FATAL) << "IntermediateAddressIndex is not implemented for ARM"; +} + +void SchedulingLatencyVisitorARM::VisitMultiplyAccumulate(HMultiplyAccumulate* ATTRIBUTE_UNUSED) { + last_visited_latency_ = kArmMulIntegerLatency; +} + +void SchedulingLatencyVisitorARM::VisitArrayGet(HArrayGet* instruction) { + Primitive::Type type = instruction->GetType(); + const bool maybe_compressed_char_at = + mirror::kUseStringCompression && instruction->IsStringCharAt(); + HInstruction* array_instr = instruction->GetArray(); + bool has_intermediate_address = array_instr->IsIntermediateAddress(); + HInstruction* index = instruction->InputAt(1); + + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: { + if (maybe_compressed_char_at) { + last_visited_internal_latency_ += kArmMemoryLoadLatency; + } + if (index->IsConstant()) { + if (maybe_compressed_char_at) { + last_visited_internal_latency_ += + kArmIntegerOpLatency + kArmBranchLatency + kArmMemoryLoadLatency; + last_visited_latency_ = kArmBranchLatency; + } else { + last_visited_latency_ += kArmMemoryLoadLatency; + } + } else { + if (has_intermediate_address) { + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + } + if (maybe_compressed_char_at) { + last_visited_internal_latency_ += + kArmIntegerOpLatency + kArmBranchLatency + kArmMemoryLoadLatency; + last_visited_latency_ = kArmBranchLatency; + } else { + last_visited_latency_ += kArmMemoryLoadLatency; + } + } 
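        // Worked example (illustrative): an int a[i] load with a non-constant index,
        // no string compression and no HIntermediateAddress accumulates
        //   last_visited_internal_latency_ += kArmIntegerOpLatency;   // address computation
        //   last_visited_latency_          += kArmMemoryLoadLatency;  // the load itself
        // i.e. 2 and 9 with the constants declared in scheduler_arm.h.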
+ break; + } + + case Primitive::kPrimNot: { + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + last_visited_latency_ = kArmLoadWithBakerReadBarrierLatency; + } else { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + if (has_intermediate_address) { + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + } + last_visited_internal_latency_ = kArmMemoryLoadLatency; + } + } + break; + } + + case Primitive::kPrimLong: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimFloat: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimDouble: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + default: + LOG(FATAL) << "Unreachable type " << type; + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::VisitArrayLength(HArrayLength* instruction) { + last_visited_latency_ = kArmMemoryLoadLatency; + if (mirror::kUseStringCompression && instruction->IsStringLength()) { + last_visited_internal_latency_ = kArmMemoryLoadLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } +} + +void SchedulingLatencyVisitorARM::VisitArraySet(HArraySet* instruction) { + HInstruction* index = instruction->InputAt(1); + Primitive::Type value_type = instruction->GetComponentType(); + HInstruction* array_instr = instruction->GetArray(); + bool has_intermediate_address = array_instr->IsIntermediateAddress(); + + switch (value_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryStoreLatency; + } else { + if (has_intermediate_address) { + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + } + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + } + + case Primitive::kPrimNot: { + if (instruction->InputAt(2)->IsNullConstant()) { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryStoreLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryStoreLatency; + } + } else { + // Following the exact instructions of runtime type checks is too complicated, + // just giving it a simple slow latency. 
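        // (Illustrative note: kArmRuntimeTypeCheckLatency is 46 in scheduler_arm.h,
        // the largest entry in the latency table, so reference stores that need a
        // runtime type check tend to dominate the critical path seen by the scheduler.)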
+ last_visited_latency_ = kArmRuntimeTypeCheckLatency; + } + break; + } + + case Primitive::kPrimLong: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimFloat: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimDouble: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + default: + LOG(FATAL) << "Unreachable type " << value_type; + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::VisitBoundsCheck(HBoundsCheck* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + // Users do not use any data results. + last_visited_latency_ = 0; +} + +void SchedulingLatencyVisitorARM::HandleDivRemConstantIntegralLatencies(int32_t imm) { + if (imm == 0) { + last_visited_internal_latency_ = 0; + last_visited_latency_ = 0; + } else if (imm == 1 || imm == -1) { + last_visited_latency_ = kArmIntegerOpLatency; + } else if (IsPowerOfTwo(AbsOrMin(imm))) { + last_visited_internal_latency_ = 3 * kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = kArmMulIntegerLatency + 2 * kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } +} + +void SchedulingLatencyVisitorARM::VisitDiv(HDiv* instruction) { + Primitive::Type type = instruction->GetResultType(); + switch (type) { + case Primitive::kPrimInt: { + HInstruction* rhs = instruction->GetRight(); + if (rhs->IsConstant()) { + int32_t imm = Int32ConstantFrom(rhs->AsConstant()); + HandleDivRemConstantIntegralLatencies(imm); + } else { + last_visited_latency_ = kArmDivIntegerLatency; + } + break; + } + case Primitive::kPrimFloat: + last_visited_latency_ = kArmDivFloatLatency; + break; + case Primitive::kPrimDouble: + last_visited_latency_ = kArmDivDoubleLatency; + break; + default: + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { + HandleFieldGetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { + HandleFieldSetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitInstanceOf(HInstanceOf* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitInvoke(HInvoke* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; +} + +void SchedulingLatencyVisitorARM::VisitLoadString(HLoadString* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmLoadStringInternalLatency; + last_visited_latency_ = kArmMemoryLoadLatency; +} + +void SchedulingLatencyVisitorARM::VisitNewArray(HNewArray* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmIntegerOpLatency + kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; +} + +void 
SchedulingLatencyVisitorARM::VisitNewInstance(HNewInstance* instruction) { + if (instruction->IsStringAlloc()) { + last_visited_internal_latency_ = 2 * kArmMemoryLoadLatency + kArmCallInternalLatency; + } else { + last_visited_internal_latency_ = kArmCallInternalLatency; + } + last_visited_latency_ = kArmCallLatency; +} + +void SchedulingLatencyVisitorARM::VisitRem(HRem* instruction) { + Primitive::Type type = instruction->GetResultType(); + switch (type) { + case Primitive::kPrimInt: { + HInstruction* rhs = instruction->GetRight(); + if (rhs->IsConstant()) { + int32_t imm = Int32ConstantFrom(rhs->AsConstant()); + HandleDivRemConstantIntegralLatencies(imm); + } else { + last_visited_internal_latency_ = kArmDivIntegerLatency; + last_visited_latency_ = kArmMulIntegerLatency; + } + break; + } + default: + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::HandleFieldGetLatencies(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet()); + DCHECK(codegen_ != nullptr); + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd(); + + switch (field_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: + last_visited_latency_ = kArmMemoryLoadLatency; + break; + + case Primitive::kPrimNot: + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + last_visited_internal_latency_ = kArmMemoryLoadLatency + kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + + case Primitive::kPrimLong: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = kArmMemoryLoadLatency + kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + + case Primitive::kPrimFloat: + last_visited_latency_ = kArmMemoryLoadLatency; + break; + + case Primitive::kPrimDouble: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = + kArmMemoryLoadLatency + kArmIntegerOpLatency + kArmMemoryLoadLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + + default: + last_visited_latency_ = kArmMemoryLoadLatency; + break; + } + + if (is_volatile) { + last_visited_internal_latency_ += kArmMemoryBarrierLatency; + } +} + +void SchedulingLatencyVisitorARM::HandleFieldSetLatencies(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldSet() || instruction->IsStaticFieldSet()); + DCHECK(codegen_ != nullptr); + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + bool needs_write_barrier = + CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1)); + bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd(); + + switch (field_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + if (is_volatile) { + last_visited_internal_latency_ = kArmMemoryBarrierLatency + kArmMemoryStoreLatency; + last_visited_latency_ = kArmMemoryBarrierLatency; + } 
else { + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + + case Primitive::kPrimInt: + case Primitive::kPrimNot: + if (kPoisonHeapReferences && needs_write_barrier) { + last_visited_internal_latency_ += kArmIntegerOpLatency * 2; + } + last_visited_latency_ = kArmMemoryStoreLatency; + break; + + case Primitive::kPrimLong: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = + kArmIntegerOpLatency + kArmMemoryLoadLatency + kArmMemoryStoreLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + + case Primitive::kPrimFloat: + last_visited_latency_ = kArmMemoryStoreLatency; + break; + + case Primitive::kPrimDouble: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = kArmIntegerOpLatency + + kArmIntegerOpLatency + kArmMemoryLoadLatency + kArmMemoryStoreLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + + default: + last_visited_latency_ = kArmMemoryStoreLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::VisitStaticFieldGet(HStaticFieldGet* instruction) { + HandleFieldGetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitStaticFieldSet(HStaticFieldSet* instruction) { + HandleFieldSetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitSuspendCheck(HSuspendCheck* instruction) { + HBasicBlock* block = instruction->GetBlock(); + DCHECK((block->GetLoopInformation() != nullptr) || + (block->IsEntryBlock() && instruction->GetNext()->IsGoto())); + // Users do not use any data results. + last_visited_latency_ = 0; +} + +void SchedulingLatencyVisitorARM::VisitTypeConversion(HTypeConversion* instr) { + Primitive::Type result_type = instr->GetResultType(); + Primitive::Type input_type = instr->GetInputType(); + + switch (result_type) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + last_visited_latency_ = kArmIntegerOpLatency; // SBFX or UBFX + break; + + case Primitive::kPrimInt: + switch (input_type) { + case Primitive::kPrimLong: + last_visited_latency_ = kArmIntegerOpLatency; // MOV + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmIntegerOpLatency; + break; + } + break; + + case Primitive::kPrimLong: + switch (input_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + // MOV and extension + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + // invokes runtime + last_visited_internal_latency_ = kArmCallInternalLatency; + break; + default: + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + } + break; + + case Primitive::kPrimFloat: + switch (input_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + case 
Primitive::kPrimLong: + // invokes runtime + last_visited_internal_latency_ = kArmCallInternalLatency; + break; + case Primitive::kPrimDouble: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + } + break; + + case Primitive::kPrimDouble: + switch (input_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + case Primitive::kPrimLong: + last_visited_internal_latency_ = 5 * kArmFloatingPointOpLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + case Primitive::kPrimFloat: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + } + break; + + default: + last_visited_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::VisitArmDexCacheArraysBase(art::HArmDexCacheArraysBase*) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; +} + +} // namespace arm +} // namespace art diff --git a/compiler/optimizing/scheduler_arm.h b/compiler/optimizing/scheduler_arm.h new file mode 100644 index 0000000000..897e97da49 --- /dev/null +++ b/compiler/optimizing/scheduler_arm.h @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_ +#define ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_ + +#ifdef ART_USE_OLD_ARM_BACKEND +#include "code_generator_arm.h" +#else +#include "code_generator_arm_vixl.h" +#endif +#include "scheduler.h" + +namespace art { +namespace arm { +#ifdef ART_USE_OLD_ARM_BACKEND +typedef CodeGeneratorARM CodeGeneratorARMType; +#else +typedef CodeGeneratorARMVIXL CodeGeneratorARMType; +#endif + +// AArch32 instruction latencies. +// We currently assume that all ARM CPUs share the same instruction latency list. +// The following latencies were tuned based on performance experiments and +// automatic tuning using differential evolution approach on various benchmarks. 
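// Worked example (illustrative, using the values below): a long HAdd is modelled by
// HandleBinaryOperationLantencies as
//   last_visited_internal_latency_ = kArmIntegerOpLatency + kArmNopLatency  // 2 + 2
//   last_visited_latency_          = kArmIntegerOpLatency                   // 2
// reflecting the ADDS+ADC pair and the carry-flag bubble described in scheduler_arm.cc.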
+static constexpr uint32_t kArmIntegerOpLatency = 2; +static constexpr uint32_t kArmFloatingPointOpLatency = 11; +static constexpr uint32_t kArmDataProcWithShifterOpLatency = 4; +static constexpr uint32_t kArmMulIntegerLatency = 6; +static constexpr uint32_t kArmMulFloatingPointLatency = 11; +static constexpr uint32_t kArmDivIntegerLatency = 10; +static constexpr uint32_t kArmDivFloatLatency = 20; +static constexpr uint32_t kArmDivDoubleLatency = 25; +static constexpr uint32_t kArmTypeConversionFloatingPointIntegerLatency = 11; +static constexpr uint32_t kArmMemoryLoadLatency = 9; +static constexpr uint32_t kArmMemoryStoreLatency = 9; +static constexpr uint32_t kArmMemoryBarrierLatency = 6; +static constexpr uint32_t kArmBranchLatency = 4; +static constexpr uint32_t kArmCallLatency = 5; +static constexpr uint32_t kArmCallInternalLatency = 29; +static constexpr uint32_t kArmLoadStringInternalLatency = 10; +static constexpr uint32_t kArmNopLatency = 2; +static constexpr uint32_t kArmLoadWithBakerReadBarrierLatency = 18; +static constexpr uint32_t kArmRuntimeTypeCheckLatency = 46; + +class SchedulingLatencyVisitorARM : public SchedulingLatencyVisitor { + public: + explicit SchedulingLatencyVisitorARM(CodeGenerator* codegen) + : codegen_(down_cast<CodeGeneratorARMType*>(codegen)) {} + + // Default visitor for instructions not handled specifically below. + void VisitInstruction(HInstruction* ATTRIBUTE_UNUSED) { + last_visited_latency_ = kArmIntegerOpLatency; + } + +// We add a second unused parameter to be able to use this macro like the others +// defined in `nodes.h`. +#define FOR_EACH_SCHEDULED_ARM_INSTRUCTION(M) \ + M(ArrayGet , unused) \ + M(ArrayLength , unused) \ + M(ArraySet , unused) \ + M(Add , unused) \ + M(Sub , unused) \ + M(And , unused) \ + M(Or , unused) \ + M(Ror , unused) \ + M(Xor , unused) \ + M(Shl , unused) \ + M(Shr , unused) \ + M(UShr , unused) \ + M(Mul , unused) \ + M(Div , unused) \ + M(Condition , unused) \ + M(Compare , unused) \ + M(BoundsCheck , unused) \ + M(InstanceFieldGet , unused) \ + M(InstanceFieldSet , unused) \ + M(InstanceOf , unused) \ + M(Invoke , unused) \ + M(LoadString , unused) \ + M(NewArray , unused) \ + M(NewInstance , unused) \ + M(Rem , unused) \ + M(StaticFieldGet , unused) \ + M(StaticFieldSet , unused) \ + M(SuspendCheck , unused) \ + M(TypeConversion , unused) + +#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \ + M(BitwiseNegatedRight, unused) \ + M(MultiplyAccumulate, unused) \ + M(IntermediateAddress, unused) \ + M(IntermediateAddressIndex, unused) \ + M(DataProcWithShifterOp, unused) + +#define DECLARE_VISIT_INSTRUCTION(type, unused) \ + void Visit##type(H##type* instruction) OVERRIDE; + + FOR_EACH_SCHEDULED_ARM_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION) + +#undef DECLARE_VISIT_INSTRUCTION + + private: + void HandleBinaryOperationLantencies(HBinaryOperation* instr); + void HandleBitwiseOperationLantencies(HBinaryOperation* instr); + void HandleShiftLatencies(HBinaryOperation* instr); + void HandleDivRemConstantIntegralLatencies(int32_t imm); + void HandleFieldSetLatencies(HInstruction* instruction, const FieldInfo& field_info); + void HandleFieldGetLatencies(HInstruction* instruction, const FieldInfo& field_info); + void HandleGenerateDataProcInstruction(bool internal_latency = false); + void HandleGenerateDataProc(HDataProcWithShifterOp* instruction); + void HandleGenerateLongDataProc(HDataProcWithShifterOp* 
instruction); + + // The latency setting for each HInstruction depends on how CodeGenerator may generate code, + // latency visitors may query CodeGenerator for such information for accurate latency settings. + CodeGeneratorARMType* codegen_; +}; + +class HSchedulerARM : public HScheduler { + public: + HSchedulerARM(ArenaAllocator* arena, + SchedulingNodeSelector* selector, + SchedulingLatencyVisitorARM* arm_latency_visitor) + : HScheduler(arena, arm_latency_visitor, selector) {} + ~HSchedulerARM() OVERRIDE {} + + bool IsSchedulable(const HInstruction* instruction) const OVERRIDE { +#define CASE_INSTRUCTION_KIND(type, unused) case \ + HInstruction::InstructionKind::k##type: + switch (instruction->GetKind()) { + FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND) + return true; + FOR_EACH_CONCRETE_INSTRUCTION_ARM(CASE_INSTRUCTION_KIND) + return true; + default: + return HScheduler::IsSchedulable(instruction); + } +#undef CASE_INSTRUCTION_KIND + } + + private: + DISALLOW_COPY_AND_ASSIGN(HSchedulerARM); +}; + +} // namespace arm +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_ diff --git a/compiler/optimizing/scheduler_arm64.cc b/compiler/optimizing/scheduler_arm64.cc index 558dcc4cbc..83b487fb5b 100644 --- a/compiler/optimizing/scheduler_arm64.cc +++ b/compiler/optimizing/scheduler_arm64.cc @@ -16,6 +16,7 @@ #include "scheduler_arm64.h" #include "code_generator_utils.h" +#include "mirror/array-inl.h" namespace art { namespace arm64 { @@ -43,6 +44,13 @@ void SchedulingLatencyVisitorARM64::VisitIntermediateAddress( last_visited_latency_ = kArm64IntegerOpLatency + 2; } +void SchedulingLatencyVisitorARM64::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instr ATTRIBUTE_UNUSED) { + // Although the code generated is a simple `add` instruction, we found through empirical results + // that spacing it from its use in memory accesses was beneficial. 
+ last_visited_latency_ = kArm64DataProcWithShifterOpLatency + 2; +} + void SchedulingLatencyVisitorARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* ATTRIBUTE_UNUSED) { last_visited_latency_ = kArm64MulIntegerLatency; } @@ -192,5 +200,148 @@ void SchedulingLatencyVisitorARM64::VisitTypeConversion(HTypeConversion* instr) } } +void SchedulingLatencyVisitorARM64::HandleSimpleArithmeticSIMD(HVecOperation *instr) { + if (Primitive::IsFloatingPointType(instr->GetPackedType())) { + last_visited_latency_ = kArm64SIMDFloatingPointOpLatency; + } else { + last_visited_latency_ = kArm64SIMDIntegerOpLatency; + } +} + +void SchedulingLatencyVisitorARM64::VisitVecReplicateScalar( + HVecReplicateScalar* instr ATTRIBUTE_UNUSED) { + last_visited_latency_ = kArm64SIMDReplicateOpLatency; +} + +void SchedulingLatencyVisitorARM64::VisitVecSetScalars(HVecSetScalars* instr) { + LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId(); +} + +void SchedulingLatencyVisitorARM64::VisitVecSumReduce(HVecSumReduce* instr) { + LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId(); +} + +void SchedulingLatencyVisitorARM64::VisitVecCnv(HVecCnv* instr ATTRIBUTE_UNUSED) { + last_visited_latency_ = kArm64SIMDTypeConversionInt2FPLatency; +} + +void SchedulingLatencyVisitorARM64::VisitVecNeg(HVecNeg* instr) { + HandleSimpleArithmeticSIMD(instr); +} + +void SchedulingLatencyVisitorARM64::VisitVecAbs(HVecAbs* instr) { + HandleSimpleArithmeticSIMD(instr); +} + +void SchedulingLatencyVisitorARM64::VisitVecNot(HVecNot* instr) { + if (instr->GetPackedType() == Primitive::kPrimBoolean) { + last_visited_internal_latency_ = kArm64SIMDIntegerOpLatency; + } + last_visited_latency_ = kArm64SIMDIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM64::VisitVecAdd(HVecAdd* instr) { + HandleSimpleArithmeticSIMD(instr); +} + +void SchedulingLatencyVisitorARM64::VisitVecHalvingAdd(HVecHalvingAdd* instr) { + HandleSimpleArithmeticSIMD(instr); +} + +void SchedulingLatencyVisitorARM64::VisitVecSub(HVecSub* instr) { + HandleSimpleArithmeticSIMD(instr); +} + +void SchedulingLatencyVisitorARM64::VisitVecMul(HVecMul* instr) { + if (Primitive::IsFloatingPointType(instr->GetPackedType())) { + last_visited_latency_ = kArm64SIMDMulFloatingPointLatency; + } else { + last_visited_latency_ = kArm64SIMDMulIntegerLatency; + } +} + +void SchedulingLatencyVisitorARM64::VisitVecDiv(HVecDiv* instr) { + if (instr->GetPackedType() == Primitive::kPrimFloat) { + last_visited_latency_ = kArm64SIMDDivFloatLatency; + } else { + DCHECK(instr->GetPackedType() == Primitive::kPrimDouble); + last_visited_latency_ = kArm64SIMDDivDoubleLatency; + } +} + +void SchedulingLatencyVisitorARM64::VisitVecMin(HVecMin* instr) { + HandleSimpleArithmeticSIMD(instr); +} + +void SchedulingLatencyVisitorARM64::VisitVecMax(HVecMax* instr) { + HandleSimpleArithmeticSIMD(instr); +} + +void SchedulingLatencyVisitorARM64::VisitVecAnd(HVecAnd* instr ATTRIBUTE_UNUSED) { + last_visited_latency_ = kArm64SIMDIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM64::VisitVecAndNot(HVecAndNot* instr) { + LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId(); +} + +void SchedulingLatencyVisitorARM64::VisitVecOr(HVecOr* instr ATTRIBUTE_UNUSED) { + last_visited_latency_ = kArm64SIMDIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM64::VisitVecXor(HVecXor* instr ATTRIBUTE_UNUSED) { + last_visited_latency_ = kArm64SIMDIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM64::VisitVecShl(HVecShl* instr) { + HandleSimpleArithmeticSIMD(instr); +} + +void 
SchedulingLatencyVisitorARM64::VisitVecShr(HVecShr* instr) { + HandleSimpleArithmeticSIMD(instr); +} + +void SchedulingLatencyVisitorARM64::VisitVecUShr(HVecUShr* instr) { + HandleSimpleArithmeticSIMD(instr); +} + +void SchedulingLatencyVisitorARM64::VisitVecMultiplyAccumulate( + HVecMultiplyAccumulate* instr ATTRIBUTE_UNUSED) { + last_visited_latency_ = kArm64SIMDMulIntegerLatency; +} + +void SchedulingLatencyVisitorARM64::HandleVecAddress( + HVecMemoryOperation* instruction, + size_t size ATTRIBUTE_UNUSED) { + HInstruction* index = instruction->InputAt(1); + if (!index->IsConstant()) { + last_visited_internal_latency_ += kArm64DataProcWithShifterOpLatency; + } +} + +void SchedulingLatencyVisitorARM64::VisitVecLoad(HVecLoad* instr) { + last_visited_internal_latency_ = 0; + size_t size = Primitive::ComponentSize(instr->GetPackedType()); + + if (instr->GetPackedType() == Primitive::kPrimChar + && mirror::kUseStringCompression + && instr->IsStringCharAt()) { + // Set latencies for the uncompressed case. + last_visited_internal_latency_ += kArm64MemoryLoadLatency + kArm64BranchLatency; + HandleVecAddress(instr, size); + last_visited_latency_ = kArm64SIMDMemoryLoadLatency; + } else { + HandleVecAddress(instr, size); + last_visited_latency_ = kArm64SIMDMemoryLoadLatency; + } +} + +void SchedulingLatencyVisitorARM64::VisitVecStore(HVecStore* instr) { + last_visited_internal_latency_ = 0; + size_t size = Primitive::ComponentSize(instr->GetPackedType()); + HandleVecAddress(instr, size); + last_visited_latency_ = kArm64SIMDMemoryStoreLatency; +} + } // namespace arm64 } // namespace art diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h index 7a33720655..63d5b7d6b6 100644 --- a/compiler/optimizing/scheduler_arm64.h +++ b/compiler/optimizing/scheduler_arm64.h @@ -42,6 +42,18 @@ static constexpr uint32_t kArm64LoadStringInternalLatency = 7; static constexpr uint32_t kArm64MulFloatingPointLatency = 6; static constexpr uint32_t kArm64MulIntegerLatency = 6; static constexpr uint32_t kArm64TypeConversionFloatingPointIntegerLatency = 5; +static constexpr uint32_t kArm64BranchLatency = kArm64IntegerOpLatency; + +static constexpr uint32_t kArm64SIMDFloatingPointOpLatency = 10; +static constexpr uint32_t kArm64SIMDIntegerOpLatency = 6; +static constexpr uint32_t kArm64SIMDMemoryLoadLatency = 10; +static constexpr uint32_t kArm64SIMDMemoryStoreLatency = 6; +static constexpr uint32_t kArm64SIMDMulFloatingPointLatency = 12; +static constexpr uint32_t kArm64SIMDMulIntegerLatency = 12; +static constexpr uint32_t kArm64SIMDReplicateOpLatency = 16; +static constexpr uint32_t kArm64SIMDDivDoubleLatency = 60; +static constexpr uint32_t kArm64SIMDDivFloatLatency = 30; +static constexpr uint32_t kArm64SIMDTypeConversionInt2FPLatency = 10; class SchedulingLatencyVisitorARM64 : public SchedulingLatencyVisitor { public: @@ -52,29 +64,54 @@ class SchedulingLatencyVisitorARM64 : public SchedulingLatencyVisitor { // We add a second unused parameter to be able to use this macro like the others // defined in `nodes.h`. 
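// For illustration, a single entry such as M(VecAdd, unused) expands through
// DECLARE_VISIT_INSTRUCTION further below into
//   void VisitVecAdd(HVecAdd* instruction) OVERRIDE;
// which is why every newly listed SIMD instruction also gets a Visit method in
// scheduler_arm64.cc.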
-#define FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(M) \ - M(ArrayGet , unused) \ - M(ArrayLength , unused) \ - M(ArraySet , unused) \ - M(BinaryOperation , unused) \ - M(BoundsCheck , unused) \ - M(Div , unused) \ - M(InstanceFieldGet , unused) \ - M(InstanceOf , unused) \ - M(Invoke , unused) \ - M(LoadString , unused) \ - M(Mul , unused) \ - M(NewArray , unused) \ - M(NewInstance , unused) \ - M(Rem , unused) \ - M(StaticFieldGet , unused) \ - M(SuspendCheck , unused) \ - M(TypeConversion , unused) +#define FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(M) \ + M(ArrayGet , unused) \ + M(ArrayLength , unused) \ + M(ArraySet , unused) \ + M(BinaryOperation , unused) \ + M(BoundsCheck , unused) \ + M(Div , unused) \ + M(InstanceFieldGet , unused) \ + M(InstanceOf , unused) \ + M(Invoke , unused) \ + M(LoadString , unused) \ + M(Mul , unused) \ + M(NewArray , unused) \ + M(NewInstance , unused) \ + M(Rem , unused) \ + M(StaticFieldGet , unused) \ + M(SuspendCheck , unused) \ + M(TypeConversion , unused) \ + M(VecReplicateScalar , unused) \ + M(VecSetScalars , unused) \ + M(VecSumReduce , unused) \ + M(VecCnv , unused) \ + M(VecNeg , unused) \ + M(VecAbs , unused) \ + M(VecNot , unused) \ + M(VecAdd , unused) \ + M(VecHalvingAdd , unused) \ + M(VecSub , unused) \ + M(VecMul , unused) \ + M(VecDiv , unused) \ + M(VecMin , unused) \ + M(VecMax , unused) \ + M(VecAnd , unused) \ + M(VecAndNot , unused) \ + M(VecOr , unused) \ + M(VecXor , unused) \ + M(VecShl , unused) \ + M(VecShr , unused) \ + M(VecUShr , unused) \ + M(VecMultiplyAccumulate, unused) \ + M(VecLoad , unused) \ + M(VecStore , unused) #define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \ M(BitwiseNegatedRight, unused) \ M(MultiplyAccumulate, unused) \ M(IntermediateAddress, unused) \ + M(IntermediateAddressIndex, unused) \ M(DataProcWithShifterOp, unused) #define DECLARE_VISIT_INSTRUCTION(type, unused) \ @@ -85,6 +122,10 @@ class SchedulingLatencyVisitorARM64 : public SchedulingLatencyVisitor { FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION) #undef DECLARE_VISIT_INSTRUCTION + + private: + void HandleSimpleArithmeticSIMD(HVecOperation *instr); + void HandleVecAddress(HVecMemoryOperation* instruction, size_t size); }; class HSchedulerARM64 : public HScheduler { @@ -101,6 +142,8 @@ class HSchedulerARM64 : public HScheduler { return true; FOR_EACH_CONCRETE_INSTRUCTION_ARM64(CASE_INSTRUCTION_KIND) return true; + FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(CASE_INSTRUCTION_KIND) + return true; default: return HScheduler::IsSchedulable(instruction); } diff --git a/compiler/optimizing/scheduler_test.cc b/compiler/optimizing/scheduler_test.cc index 31d13e2a26..d87600aa5e 100644 --- a/compiler/optimizing/scheduler_test.cc +++ b/compiler/optimizing/scheduler_test.cc @@ -28,6 +28,10 @@ #include "scheduler_arm64.h" #endif +#ifdef ART_ENABLE_CODEGEN_arm +#include "scheduler_arm.h" +#endif + namespace art { // Return all combinations of ISA and code generator that are executable on @@ -56,7 +60,7 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { #endif }; - for (auto test_config : test_config_candidates) { + for (const CodegenTargetConfig& test_config : test_config_candidates) { if (CanExecute(test_config.GetInstructionSet())) { v.push_back(test_config); } @@ -65,133 +69,151 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { return v; } -class SchedulerTest : public CommonCompilerTest {}; - -#ifdef ART_ENABLE_CODEGEN_arm64 -TEST_F(SchedulerTest, DependencyGraph) { - ArenaPool pool; - ArenaAllocator allocator(&pool); - 
HGraph* graph = CreateGraph(&allocator); - HBasicBlock* entry = new (&allocator) HBasicBlock(graph); - HBasicBlock* block1 = new (&allocator) HBasicBlock(graph); - graph->AddBlock(entry); - graph->AddBlock(block1); - graph->SetEntryBlock(entry); - - // entry: - // array ParameterValue - // c1 IntConstant - // c2 IntConstant - // block1: - // add1 Add [c1, c2] - // add2 Add [add1, c2] - // mul Mul [add1, add2] - // div_check DivZeroCheck [add2] (env: add2, mul) - // div Div [add1, div_check] - // array_get1 ArrayGet [array, add1] - // array_set1 ArraySet [array, add1, add2] - // array_get2 ArrayGet [array, add1] - // array_set2 ArraySet [array, add1, add2] - - HInstruction* array = new (&allocator) HParameterValue(graph->GetDexFile(), - dex::TypeIndex(0), - 0, - Primitive::kPrimNot); - HInstruction* c1 = graph->GetIntConstant(1); - HInstruction* c2 = graph->GetIntConstant(10); - HInstruction* add1 = new (&allocator) HAdd(Primitive::kPrimInt, c1, c2); - HInstruction* add2 = new (&allocator) HAdd(Primitive::kPrimInt, add1, c2); - HInstruction* mul = new (&allocator) HMul(Primitive::kPrimInt, add1, add2); - HInstruction* div_check = new (&allocator) HDivZeroCheck(add2, 0); - HInstruction* div = new (&allocator) HDiv(Primitive::kPrimInt, add1, div_check, 0); - HInstruction* array_get1 = new (&allocator) HArrayGet(array, add1, Primitive::kPrimInt, 0); - HInstruction* array_set1 = new (&allocator) HArraySet(array, add1, add2, Primitive::kPrimInt, 0); - HInstruction* array_get2 = new (&allocator) HArrayGet(array, add1, Primitive::kPrimInt, 0); - HInstruction* array_set2 = new (&allocator) HArraySet(array, add1, add2, Primitive::kPrimInt, 0); - - DCHECK(div_check->CanThrow()); - - entry->AddInstruction(array); - - HInstruction* block_instructions[] = {add1, - add2, - mul, - div_check, - div, - array_get1, - array_set1, - array_get2, - array_set2}; - for (auto instr : block_instructions) { - block1->AddInstruction(instr); +class SchedulerTest : public CommonCompilerTest { + public: + SchedulerTest() : pool_(), allocator_(&pool_) { + graph_ = CreateGraph(&allocator_); } - HEnvironment* environment = new (&allocator) HEnvironment(&allocator, - 2, - graph->GetArtMethod(), + // Build scheduling graph, and run target specific scheduling on it. 
+ void TestBuildDependencyGraphAndSchedule(HScheduler* scheduler) { + HBasicBlock* entry = new (&allocator_) HBasicBlock(graph_); + HBasicBlock* block1 = new (&allocator_) HBasicBlock(graph_); + graph_->AddBlock(entry); + graph_->AddBlock(block1); + graph_->SetEntryBlock(entry); + + // entry: + // array ParameterValue + // c1 IntConstant + // c2 IntConstant + // block1: + // add1 Add [c1, c2] + // add2 Add [add1, c2] + // mul Mul [add1, add2] + // div_check DivZeroCheck [add2] (env: add2, mul) + // div Div [add1, div_check] + // array_get1 ArrayGet [array, add1] + // array_set1 ArraySet [array, add1, add2] + // array_get2 ArrayGet [array, add1] + // array_set2 ArraySet [array, add1, add2] + + HInstruction* array = new (&allocator_) HParameterValue(graph_->GetDexFile(), + dex::TypeIndex(0), 0, - div_check); - div_check->SetRawEnvironment(environment); - environment->SetRawEnvAt(0, add2); - add2->AddEnvUseAt(div_check->GetEnvironment(), 0); - environment->SetRawEnvAt(1, mul); - mul->AddEnvUseAt(div_check->GetEnvironment(), 1); - - ArenaAllocator* arena = graph->GetArena(); - CriticalPathSchedulingNodeSelector critical_path_selector; - arm64::HSchedulerARM64 scheduler(arena, &critical_path_selector); - SchedulingGraph scheduling_graph(&scheduler, arena); - // Instructions must be inserted in reverse order into the scheduling graph. - for (auto instr : ReverseRange(block_instructions)) { - scheduling_graph.AddNode(instr); + Primitive::kPrimNot); + HInstruction* c1 = graph_->GetIntConstant(1); + HInstruction* c2 = graph_->GetIntConstant(10); + HInstruction* add1 = new (&allocator_) HAdd(Primitive::kPrimInt, c1, c2); + HInstruction* add2 = new (&allocator_) HAdd(Primitive::kPrimInt, add1, c2); + HInstruction* mul = new (&allocator_) HMul(Primitive::kPrimInt, add1, add2); + HInstruction* div_check = new (&allocator_) HDivZeroCheck(add2, 0); + HInstruction* div = new (&allocator_) HDiv(Primitive::kPrimInt, add1, div_check, 0); + HInstruction* array_get1 = new (&allocator_) HArrayGet(array, add1, Primitive::kPrimInt, 0); + HInstruction* array_set1 = new (&allocator_) HArraySet(array, add1, add2, Primitive::kPrimInt, 0); + HInstruction* array_get2 = new (&allocator_) HArrayGet(array, add1, Primitive::kPrimInt, 0); + HInstruction* array_set2 = new (&allocator_) HArraySet(array, add1, add2, Primitive::kPrimInt, 0); + + DCHECK(div_check->CanThrow()); + + entry->AddInstruction(array); + + HInstruction* block_instructions[] = {add1, + add2, + mul, + div_check, + div, + array_get1, + array_set1, + array_get2, + array_set2}; + for (HInstruction* instr : block_instructions) { + block1->AddInstruction(instr); + } + + HEnvironment* environment = new (&allocator_) HEnvironment(&allocator_, + 2, + graph_->GetArtMethod(), + 0, + div_check); + div_check->SetRawEnvironment(environment); + environment->SetRawEnvAt(0, add2); + add2->AddEnvUseAt(div_check->GetEnvironment(), 0); + environment->SetRawEnvAt(1, mul); + mul->AddEnvUseAt(div_check->GetEnvironment(), 1); + + SchedulingGraph scheduling_graph(scheduler, graph_->GetArena()); + // Instructions must be inserted in reverse order into the scheduling graph. + for (HInstruction* instr : ReverseRange(block_instructions)) { + scheduling_graph.AddNode(instr); + } + + // Should not have dependencies cross basic blocks. + ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, c1)); + ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add2, c2)); + + // Define-use dependency. 
+ ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(add2, add1));
+ ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, add2));
+ ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div_check, add2));
+ ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(div_check, add1));
+ ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div, div_check));
+ ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add1));
+ ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add2));
+
+ // Read and write dependencies
+ ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, array_get1));
+ ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_get2));
+ ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_get2, array_set1));
+ ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_set1));
+
+ // Env dependency.
+ ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(div_check, mul));
+ ASSERT_FALSE(scheduling_graph.HasImmediateOtherDependency(mul, div_check));
+
+ // CanThrow.
+ ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, div_check));
+
+ // Exercise the code path of target specific scheduler and SchedulingLatencyVisitor.
+ scheduler->Schedule(graph_);
 }
- // Should not have dependencies cross basic blocks.
- ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, c1));
- ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add2, c2));
-
- // Define-use dependency.
- ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(add2, add1));
- ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, add2));
- ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div_check, add2));
- ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(div_check, add1));
- ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div, div_check));
- ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add1));
- ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add2));
-
- // Read and write dependencies
- ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, array_get1));
- ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_get2));
- ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_get2, array_set1));
- ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_set1));
-
- // Env dependency.
- ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(div_check, mul));
- ASSERT_FALSE(scheduling_graph.HasImmediateOtherDependency(mul, div_check));
-
- // CanThrow.
- ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, div_check));
+ void CompileWithRandomSchedulerAndRun(const uint16_t* data, bool has_result, int expected) {
+ for (CodegenTargetConfig target_config : GetTargetConfigs()) {
+ HGraph* graph = CreateCFG(&allocator_, data);
+
+ // Schedule the graph randomly.
+ HInstructionScheduling scheduling(graph, target_config.GetInstructionSet());
+ scheduling.Run(/*only_optimize_loop_blocks*/ false, /*schedule_randomly*/ true);
+
+ RunCode(target_config,
+ graph,
+ [](HGraph* graph_arg) { RemoveSuspendChecks(graph_arg); },
+ has_result, expected);
+ }
+ }
+
+ ArenaPool pool_;
+ ArenaAllocator allocator_;
+ HGraph* graph_;
+};
+
+#if defined(ART_ENABLE_CODEGEN_arm64)
+TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM64) {
+ CriticalPathSchedulingNodeSelector critical_path_selector;
+ arm64::HSchedulerARM64 scheduler(&allocator_, &critical_path_selector);
+ TestBuildDependencyGraphAndSchedule(&scheduler);
 }
 #endif
-static void CompileWithRandomSchedulerAndRun(const uint16_t* data,
- bool has_result,
- int expected) {
- for (CodegenTargetConfig target_config : GetTargetConfigs()) {
- ArenaPool pool;
- ArenaAllocator arena(&pool);
- HGraph* graph = CreateCFG(&arena, data);
-
- // Schedule the graph randomly.
- HInstructionScheduling scheduling(graph, target_config.GetInstructionSet());
- scheduling.Run(/*only_optimize_loop_blocks*/ false, /*schedule_randomly*/ true);
-
- RunCode(target_config,
- graph,
- [](HGraph* graph_arg) { RemoveSuspendChecks(graph_arg); },
- has_result, expected);
- }
+#if defined(ART_ENABLE_CODEGEN_arm)
+TEST_F(SchedulerTest, DependencyGrapAndSchedulerARM) {
+ CriticalPathSchedulingNodeSelector critical_path_selector;
+ arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr);
+ arm::HSchedulerARM scheduler(&allocator_, &critical_path_selector, &arm_latency_visitor);
+ TestBuildDependencyGraphAndSchedule(&scheduler);
 }
+#endif
 TEST_F(SchedulerTest, RandomScheduling) {
 //
diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc
index eedaf6e67e..106b709eda 100644
--- a/compiler/optimizing/sharpening.cc
+++ b/compiler/optimizing/sharpening.cc
@@ -16,6 +16,7 @@
 #include "sharpening.h"
+#include "art_method-inl.h"
 #include "base/casts.h"
 #include "base/enums.h"
 #include "class_linker.h"
@@ -41,7 +42,9 @@ void HSharpening::Run() {
 for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
 HInstruction* instruction = it.Current();
 if (instruction->IsInvokeStaticOrDirect()) {
- SharpenInvokeStaticOrDirect(instruction->AsInvokeStaticOrDirect(), codegen_);
+ SharpenInvokeStaticOrDirect(instruction->AsInvokeStaticOrDirect(),
+ codegen_,
+ compiler_driver_);
 } else if (instruction->IsLoadString()) {
 ProcessLoadString(instruction->AsLoadString());
 }
@@ -56,7 +59,7 @@ static bool IsInBootImage(ArtMethod* method) {
 const std::vector<gc::space::ImageSpace*>& image_spaces =
 Runtime::Current()->GetHeap()->GetBootImageSpaces();
 for (gc::space::ImageSpace* image_space : image_spaces) {
- const auto& method_section = image_space->GetImageHeader().GetMethodsSection();
+ const ImageSection& method_section = image_space->GetImageHeader().GetMethodsSection();
 if (method_section.Contains(reinterpret_cast<uint8_t*>(method) - image_space->Begin())) {
 return true;
 }
@@ -68,9 +71,21 @@ static bool AOTCanEmbedMethod(ArtMethod* method, const CompilerOptions& options) {
 return IsInBootImage(method) && !options.GetCompilePic();
 }
+static bool BootImageAOTCanEmbedMethod(ArtMethod* method, CompilerDriver* compiler_driver) {
+ DCHECK(compiler_driver->GetCompilerOptions().IsBootImage());
+ if (!compiler_driver->GetSupportBootImageFixup()) {
+ return false;
+ }
+ ScopedObjectAccess soa(Thread::Current());
+ ObjPtr<mirror::Class> klass = method->GetDeclaringClass();
+ DCHECK(klass != nullptr);
+ const DexFile& dex_file = klass->GetDexFile();
+ return compiler_driver->IsImageClass(dex_file.StringByTypeIdx(klass->GetDexTypeIndex()));
+}
 void HSharpening::SharpenInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke,
- CodeGenerator* codegen) {
+ CodeGenerator* codegen,
+ CompilerDriver* compiler_driver) {
 if (invoke->IsStringInit()) {
 // Not using the dex cache arrays. But we could still try to use a better dispatch...
 // TODO: Use direct_method and direct_code for the appropriate StringFactory method.
@@ -108,6 +123,10 @@ void HSharpening::SharpenInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke,
 method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress;
 method_load_data = reinterpret_cast<uintptr_t>(callee);
 code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod;
+ } else if (codegen->GetCompilerOptions().IsBootImage() &&
+ BootImageAOTCanEmbedMethod(callee, compiler_driver)) {
+ method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative;
+ code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod;
 } else {
 // Use PC-relative access to the dex cache arrays.
 method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative;
@@ -140,7 +159,7 @@ HLoadClass::LoadKind HSharpening::ComputeLoadClassKind(HLoadClass* load_class,
 CompilerDriver* compiler_driver,
 const DexCompilationUnit& dex_compilation_unit) {
 Handle<mirror::Class> klass = load_class->GetClass();
- DCHECK(load_class->GetLoadKind() == HLoadClass::LoadKind::kDexCacheViaMethod ||
+ DCHECK(load_class->GetLoadKind() == HLoadClass::LoadKind::kRuntimeCall ||
 load_class->GetLoadKind() == HLoadClass::LoadKind::kReferrersClass)
 << load_class->GetLoadKind();
 DCHECK(!load_class->IsInBootImage()) << "HLoadClass should not be optimized before sharpening.";
@@ -166,13 +185,11 @@ HLoadClass::LoadKind HSharpening::ComputeLoadClassKind(HLoadClass* load_class,
 DCHECK(!runtime->UseJitCompilation());
 if (!compiler_driver->GetSupportBootImageFixup()) {
 // compiler_driver_test. Do not sharpen.
- desired_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod;
- } else if ((klass != nullptr) && compiler_driver->IsImageClass(
- dex_file.StringDataByIdx(dex_file.GetTypeId(type_index).descriptor_idx_))) {
+ desired_load_kind = HLoadClass::LoadKind::kRuntimeCall;
+ } else if ((klass != nullptr) &&
+ compiler_driver->IsImageClass(dex_file.StringByTypeIdx(type_index))) {
 is_in_boot_image = true;
- desired_load_kind = codegen->GetCompilerOptions().GetCompilePic()
- ? HLoadClass::LoadKind::kBootImageLinkTimePcRelative
- : HLoadClass::LoadKind::kBootImageLinkTimeAddress;
+ desired_load_kind = HLoadClass::LoadKind::kBootImageLinkTimePcRelative;
 } else {
 // Not a boot image class.
 DCHECK(ContainsElement(compiler_driver->GetDexFilesForOatFile(), &dex_file));
@@ -182,8 +199,7 @@ HLoadClass::LoadKind HSharpening::ComputeLoadClassKind(HLoadClass* load_class,
 is_in_boot_image = (klass != nullptr) &&
 runtime->GetHeap()->ObjectIsInBootImageSpace(klass.Get());
 if (runtime->UseJitCompilation()) {
- // TODO: Make sure we don't set the "compile PIC" flag for JIT as that's bogus.
- // DCHECK(!codegen_->GetCompilerOptions().GetCompilePic());
+ DCHECK(!codegen->GetCompilerOptions().GetCompilePic());
 if (is_in_boot_image) {
 // TODO: Use direct pointers for all non-moving spaces, not just boot image. Bug: 29530787
 desired_load_kind = HLoadClass::LoadKind::kBootImageAddress;
@@ -194,7 +210,7 @@ HLoadClass::LoadKind HSharpening::ComputeLoadClassKind(HLoadClass* load_class,
 // this `HLoadClass` hasn't been executed in the interpreter.
 // Fallback to the dex cache.
 // TODO(ngeoffray): Generate HDeoptimize instead.
- desired_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod;
+ desired_load_kind = HLoadClass::LoadKind::kRuntimeCall;
 }
 } else if (is_in_boot_image && !codegen->GetCompilerOptions().GetCompilePic()) {
 // AOT app compilation. Check if the class is in the boot image.
@@ -213,7 +229,7 @@ HLoadClass::LoadKind HSharpening::ComputeLoadClassKind(HLoadClass* load_class,
 }
 if (!IsSameDexFile(load_class->GetDexFile(), *dex_compilation_unit.GetDexFile())) {
- if ((load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) ||
+ if ((load_kind == HLoadClass::LoadKind::kRuntimeCall) ||
 (load_kind == HLoadClass::LoadKind::kBssEntry)) {
 // We actually cannot reference this class, we're forced to bail.
 // We cannot reference this class with Bss, as the entrypoint will lookup the class
@@ -225,7 +241,7 @@ HLoadClass::LoadKind HSharpening::ComputeLoadClassKind(HLoadClass* load_class,
 }
 void HSharpening::ProcessLoadString(HLoadString* load_string) {
- DCHECK_EQ(load_string->GetLoadKind(), HLoadString::LoadKind::kDexCacheViaMethod);
+ DCHECK_EQ(load_string->GetLoadKind(), HLoadString::LoadKind::kRuntimeCall);
 const DexFile& dex_file = load_string->GetDexFile();
 dex::StringIndex string_index = load_string->GetStringIndex();
@@ -249,16 +265,13 @@ void HSharpening::ProcessLoadString(HLoadString* load_string) {
 CHECK(string != nullptr);
 if (compiler_driver_->GetSupportBootImageFixup()) {
 DCHECK(ContainsElement(compiler_driver_->GetDexFilesForOatFile(), &dex_file));
- desired_load_kind = codegen_->GetCompilerOptions().GetCompilePic()
- ? HLoadString::LoadKind::kBootImageLinkTimePcRelative
- : HLoadString::LoadKind::kBootImageLinkTimeAddress;
+ desired_load_kind = HLoadString::LoadKind::kBootImageLinkTimePcRelative;
 } else {
 // compiler_driver_test. Do not sharpen.
- desired_load_kind = HLoadString::LoadKind::kDexCacheViaMethod;
+ desired_load_kind = HLoadString::LoadKind::kRuntimeCall;
 }
 } else if (runtime->UseJitCompilation()) {
- // TODO: Make sure we don't set the "compile PIC" flag for JIT as that's bogus.
- // DCHECK(!codegen_->GetCompilerOptions().GetCompilePic());
+ DCHECK(!codegen_->GetCompilerOptions().GetCompilePic());
 string = class_linker->LookupString(dex_file, string_index, dex_cache.Get());
 if (string != nullptr) {
 if (runtime->GetHeap()->ObjectIsInBootImageSpace(string)) {
@@ -267,7 +280,7 @@ void HSharpening::ProcessLoadString(HLoadString* load_string) {
 desired_load_kind = HLoadString::LoadKind::kJitTableAddress;
 }
 } else {
- desired_load_kind = HLoadString::LoadKind::kDexCacheViaMethod;
+ desired_load_kind = HLoadString::LoadKind::kRuntimeCall;
 }
 } else {
 // AOT app compilation. Try to lookup the string without allocating if not found.
diff --git a/compiler/optimizing/sharpening.h b/compiler/optimizing/sharpening.h
index 10707c796f..f74b0afdbf 100644
--- a/compiler/optimizing/sharpening.h
+++ b/compiler/optimizing/sharpening.h
@@ -55,7 +55,9 @@ class HSharpening : public HOptimization {
 REQUIRES_SHARED(Locks::mutator_lock_);
 // Used by Sharpening and InstructionSimplifier.
- static void SharpenInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke, CodeGenerator* codegen);
+ static void SharpenInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke,
+ CodeGenerator* codegen,
+ CompilerDriver* compiler_driver);
 private:
 void ProcessLoadString(HLoadString* load_string);
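
The assertions at the top of the scheduler_test.cc hunk distinguish two kinds of edges in the scheduling graph: data dependencies, where one node consumes another node's result, and "other" dependencies that cover memory ordering, environment uses, and can-throw constraints. The following self-contained miniature illustrates what an "immediate" dependency query answers; MiniSchedulingGraph and its node ids are invented for illustration and are not the ART SchedulingGraph API.

#include <cassert>
#include <map>
#include <set>

// Hypothetical miniature of a scheduling graph: nodes are instruction ids,
// and each node records the nodes it depends on, split by edge kind.
class MiniSchedulingGraph {
 public:
  void AddDataDependency(int user, int def) { data_deps_[user].insert(def); }
  void AddOtherDependency(int later, int earlier) { other_deps_[later].insert(earlier); }

  // "Immediate" means a direct edge, not a transitive path.
  bool HasImmediateDataDependency(int user, int def) const {
    auto it = data_deps_.find(user);
    return it != data_deps_.end() && it->second.count(def) != 0;
  }
  bool HasImmediateOtherDependency(int later, int earlier) const {
    auto it = other_deps_.find(later);
    return it != other_deps_.end() && it->second.count(earlier) != 0;
  }

 private:
  std::map<int, std::set<int>> data_deps_;
  std::map<int, std::set<int>> other_deps_;
};

int main() {
  // Node 2 uses node 1's result (data edge); node 3 must stay after a
  // possibly-throwing node 4 (an "other" edge), as in the CanThrow check.
  MiniSchedulingGraph graph;
  graph.AddDataDependency(/*user=*/2, /*def=*/1);
  graph.AddOtherDependency(/*later=*/3, /*earlier=*/4);
  assert(graph.HasImmediateDataDependency(2, 1));
  assert(!graph.HasImmediateDataDependency(1, 2));
  assert(graph.HasImmediateOtherDependency(3, 4));
  return 0;
}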
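
The same scheduler_test.cc hunk also folds the per-backend checks into a SchedulerTest fixture so that each enabled backend only constructs its scheduler and calls the shared TestBuildDependencyGraphAndSchedule() helper. Below is a minimal sketch of that fixture pattern in plain googletest; SchedulerStub and its Schedule() method are invented stand-ins for the ART scheduler classes, and the file is assumed to link against gtest_main.

#include <gtest/gtest.h>

#include <string>
#include <vector>

// Invented stand-in for a backend scheduler; not an ART class.
class SchedulerStub {
 public:
  explicit SchedulerStub(const std::string& isa) : isa_(isa) {}

  // A real scheduler would reorder instructions; the stub keeps them as-is.
  std::vector<int> Schedule(const std::vector<int>& nodes) const { return nodes; }

  const std::string& isa() const { return isa_; }

 private:
  std::string isa_;
};

class SchedulerStubTest : public ::testing::Test {
 protected:
  // Shared checks parameterized over the backend scheduler, mirroring the
  // TestBuildDependencyGraphAndSchedule() helper introduced in the hunk above.
  void TestSchedule(const SchedulerStub& scheduler) {
    const std::vector<int> nodes = {1, 2, 3};
    const std::vector<int> scheduled = scheduler.Schedule(nodes);
    ASSERT_EQ(scheduled.size(), nodes.size());
  }
};

TEST_F(SchedulerStubTest, Arm64) {
  SchedulerStub scheduler("arm64");
  TestSchedule(scheduler);
}

TEST_F(SchedulerStubTest, Arm) {
  SchedulerStub scheduler("arm");
  TestSchedule(scheduler);
}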
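
The sharpening.cc and sharpening.h hunks retire the kDexCacheViaMethod fallback in favor of kRuntimeCall and, since boot image code is now always PIC, collapse the former PIC/non-PIC pair of boot image load kinds into kBootImageLinkTimePcRelative; boot image AOT invokes may likewise use a PC-relative method load when BootImageAOTCanEmbedMethod() approves the callee. The sketch below is a condensed, self-contained model of that decision for class loads only; the enum values mirror names visible in the diff, but CompileContext and ChooseClassLoadKind are hypothetical, and several cases handled by the real pass (kReferrersClass, the JIT class table, access-check paths) are omitted.

#include <cassert>

// Condensed, hypothetical model of the class load-kind choice after this
// change; not the ART HLoadClass::LoadKind enum or HSharpening API.
enum class ClassLoadKind {
  kBootImageLinkTimePcRelative,  // boot image AOT and the class is an image class
  kBootImageAddress,             // class object already lives in the boot image
  kBssEntry,                     // AOT: resolve at runtime through a .bss slot
  kRuntimeCall,                  // fallback, formerly "dex cache via method"
};

struct CompileContext {
  bool is_boot_image_compile;    // compiling the boot image itself (always PIC now)
  bool supports_image_fixup;     // false in compiler_driver_test-style setups
  bool is_jit;                   // JIT compilation
  bool compile_pic;              // AOT app compiled position-independent
  bool class_is_image_class;     // class will be part of the boot image
  bool class_in_boot_image_now;  // class object currently in boot image space
};

ClassLoadKind ChooseClassLoadKind(const CompileContext& ctx) {
  if (ctx.is_boot_image_compile) {
    if (!ctx.supports_image_fixup) {
      return ClassLoadKind::kRuntimeCall;  // do not sharpen
    }
    // Only the PC-relative variant remains; the non-PIC link-time address is gone.
    return ctx.class_is_image_class ? ClassLoadKind::kBootImageLinkTimePcRelative
                                    : ClassLoadKind::kBssEntry;
  }
  if (ctx.is_jit) {
    // JIT never compiles PIC; use a direct address if the class is already
    // in the boot image, otherwise fall back to a runtime call here.
    return ctx.class_in_boot_image_now ? ClassLoadKind::kBootImageAddress
                                       : ClassLoadKind::kRuntimeCall;
  }
  // AOT app compilation: a direct boot image address only works non-PIC;
  // otherwise prefer a .bss entry resolved at runtime.
  if (ctx.class_in_boot_image_now && !ctx.compile_pic) {
    return ClassLoadKind::kBootImageAddress;
  }
  return ClassLoadKind::kBssEntry;
}

int main() {
  CompileContext boot{true, true, false, true, true, false};
  assert(ChooseClassLoadKind(boot) == ClassLoadKind::kBootImageLinkTimePcRelative);
  CompileContext jit{false, true, true, false, false, true};
  assert(ChooseClassLoadKind(jit) == ClassLoadKind::kBootImageAddress);
  return 0;
}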