Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/scheduler.cc        |  17
-rw-r--r--  compiler/optimizing/scheduler.h         |  41
-rw-r--r--  compiler/optimizing/scheduler_arm.cc    | 134
-rw-r--r--  compiler/optimizing/scheduler_arm.h     | 140
-rw-r--r--  compiler/optimizing/scheduler_arm64.cc  | 134
-rw-r--r--  compiler/optimizing/scheduler_arm64.h   | 135
-rw-r--r--  compiler/optimizing/scheduler_test.cc   |   6
7 files changed, 326 insertions(+), 281 deletions(-)
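The core of the change below is that HScheduler no longer stores a SchedulingLatencyVisitor pointer; instead each backend overrides a pure-virtual BuildSchedulingGraph() and forwards to a protected templated helper with a locally constructed, concrete latency visitor, so latency calculation is dispatched statically. As a minimal, self-contained sketch of that pattern (not the ART classes themselves; Node, Graph, SchedulerBase, ArmLikeLatencyVisitor and ArmLikeScheduler are simplified stand-ins, and the real helper returns a pair of graph and node list):

```cpp
#include <vector>

// Simplified stand-ins for SchedulingNode / SchedulingGraph.
struct Node { int latency = 0; };
using Graph = std::vector<Node>;

class SchedulerBase {
 public:
  virtual ~SchedulerBase() = default;

  void Schedule() {
    // The base class only calls the virtual hook; it no longer keeps a
    // latency-visitor member.
    Graph graph = BuildGraph();
    // ... select nodes by latency, emit the scheduled order, etc.
    (void)graph;
  }

 protected:
  // Backend hook, analogous to the pure-virtual BuildSchedulingGraph() in the patch.
  virtual Graph BuildGraph() = 0;

  // Templated helper, analogous to the protected BuildSchedulingGraph() overload:
  // the concrete visitor type is known here, so CalculateLatency() is resolved
  // statically instead of through a virtual SchedulingLatencyVisitor*.
  template <typename LatencyVisitor>
  Graph BuildGraphWith(LatencyVisitor* visitor) {
    Graph graph(3);
    for (Node& node : graph) {
      node.latency = visitor->CalculateLatency(node);
    }
    return graph;
  }
};

// A backend-local latency model, mirroring how SchedulingLatencyVisitorARM moves
// out of the header and into scheduler_arm.cc.
class ArmLikeLatencyVisitor {
 public:
  int CalculateLatency(const Node&) const { return 2; }  // e.g. an integer-op latency
};

class ArmLikeScheduler final : public SchedulerBase {
 protected:
  Graph BuildGraph() override {
    // The visitor is created on demand, as HSchedulerARM now does inside its
    // BuildSchedulingGraph() override.
    ArmLikeLatencyVisitor visitor;
    return BuildGraphWith(&visitor);
  }
};

int main() {
  ArmLikeScheduler scheduler;
  scheduler.Schedule();
}
```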
diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc index 4236a545bc..f4cf7b0a49 100644 --- a/compiler/optimizing/scheduler.cc +++ b/compiler/optimizing/scheduler.cc @@ -548,20 +548,10 @@ void HScheduler::Schedule(HGraph* graph) { void HScheduler::Schedule(HBasicBlock* block, const HeapLocationCollector* heap_location_collector) { ScopedArenaAllocator allocator(block->GetGraph()->GetArenaStack()); - ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator.Adapter(kArenaAllocScheduler)); // Build the scheduling graph. - SchedulingGraph scheduling_graph(&allocator, heap_location_collector); - for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { - HInstruction* instruction = it.Current(); - CHECK_EQ(instruction->GetBlock(), block) - << instruction->DebugName() - << " is in block " << instruction->GetBlock()->GetBlockId() - << ", and expected in block " << block->GetBlockId(); - SchedulingNode* node = scheduling_graph.AddNode(instruction, IsSchedulingBarrier(instruction)); - CalculateLatency(node); - scheduling_nodes.push_back(node); - } + auto [scheduling_graph, scheduling_nodes] = + BuildSchedulingGraph(block, &allocator, heap_location_collector); if (scheduling_graph.Size() <= 1) { return; @@ -803,8 +793,7 @@ bool HInstructionScheduling::Run(bool only_optimize_loop_blocks, #if defined(ART_ENABLE_CODEGEN_arm) case InstructionSet::kThumb2: case InstructionSet::kArm: { - arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen_); - arm::HSchedulerARM scheduler(selector, &arm_latency_visitor); + arm::HSchedulerARM scheduler(selector, codegen_); scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks); scheduler.Schedule(graph_); break; diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h index 299fbc93f3..a9672ea732 100644 --- a/compiler/optimizing/scheduler.h +++ b/compiler/optimizing/scheduler.h @@ -497,9 +497,8 @@ class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector { class HScheduler { public: - HScheduler(SchedulingLatencyVisitor* latency_visitor, SchedulingNodeSelector* selector) - : latency_visitor_(latency_visitor), - selector_(selector), + explicit HScheduler(SchedulingNodeSelector* selector) + : selector_(selector), only_optimize_loop_blocks_(true), cursor_(nullptr) {} virtual ~HScheduler() {} @@ -512,6 +511,35 @@ class HScheduler { virtual bool IsSchedulingBarrier(const HInstruction* instruction) const; protected: + virtual std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> BuildSchedulingGraph( + HBasicBlock* block, + ScopedArenaAllocator* allocator, + const HeapLocationCollector* heap_location_collector) = 0; + + template <typename LatencyVisitor> + std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> BuildSchedulingGraph( + HBasicBlock* block, + ScopedArenaAllocator* allocator, + const HeapLocationCollector* heap_location_collector, + LatencyVisitor* latency_visitor) ALWAYS_INLINE { + SchedulingGraph scheduling_graph(allocator, heap_location_collector); + ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator->Adapter(kArenaAllocScheduler)); + for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + HInstruction* instruction = it.Current(); + CHECK_EQ(instruction->GetBlock(), block) + << instruction->DebugName() + << " is in block " << instruction->GetBlock()->GetBlockId() + << ", and expected in block " << block->GetBlockId(); + SchedulingNode* node = + 
scheduling_graph.AddNode(instruction, IsSchedulingBarrier(instruction)); + latency_visitor->CalculateLatency(node); + node->SetLatency(latency_visitor->GetLastVisitedLatency()); + node->SetInternalLatency(latency_visitor->GetLastVisitedInternalLatency()); + scheduling_nodes.push_back(node); + } + return {std::move(scheduling_graph), std::move(scheduling_nodes)}; + } + void Schedule(HBasicBlock* block, const HeapLocationCollector* heap_location_collector); void Schedule(SchedulingNode* scheduling_node, /*inout*/ ScopedArenaVector<SchedulingNode*>* candidates); @@ -529,13 +557,6 @@ class HScheduler { virtual bool IsSchedulable(const HInstruction* instruction) const; bool IsSchedulable(const HBasicBlock* block) const; - void CalculateLatency(SchedulingNode* node) { - latency_visitor_->CalculateLatency(node); - node->SetLatency(latency_visitor_->GetLastVisitedLatency()); - node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency()); - } - - SchedulingLatencyVisitor* const latency_visitor_; SchedulingNodeSelector* const selector_; bool only_optimize_loop_blocks_; diff --git a/compiler/optimizing/scheduler_arm.cc b/compiler/optimizing/scheduler_arm.cc index 510a0f5496..3ee6f06b46 100644 --- a/compiler/optimizing/scheduler_arm.cc +++ b/compiler/optimizing/scheduler_arm.cc @@ -17,6 +17,7 @@ #include "scheduler_arm.h" #include "arch/arm/instruction_set_features_arm.h" +#include "code_generator_arm_vixl.h" #include "code_generator_utils.h" #include "common_arm.h" #include "heap_poisoning.h" @@ -29,6 +30,116 @@ namespace arm { using helpers::Int32ConstantFrom; using helpers::Uint64ConstantFrom; +// AArch32 instruction latencies. +// We currently assume that all ARM CPUs share the same instruction latency list. +// The following latencies were tuned based on performance experiments and +// automatic tuning using differential evolution approach on various benchmarks. +static constexpr uint32_t kArmIntegerOpLatency = 2; +static constexpr uint32_t kArmFloatingPointOpLatency = 11; +static constexpr uint32_t kArmDataProcWithShifterOpLatency = 4; +static constexpr uint32_t kArmMulIntegerLatency = 6; +static constexpr uint32_t kArmMulFloatingPointLatency = 11; +static constexpr uint32_t kArmDivIntegerLatency = 10; +static constexpr uint32_t kArmDivFloatLatency = 20; +static constexpr uint32_t kArmDivDoubleLatency = 25; +static constexpr uint32_t kArmTypeConversionFloatingPointIntegerLatency = 11; +static constexpr uint32_t kArmMemoryLoadLatency = 9; +static constexpr uint32_t kArmMemoryStoreLatency = 9; +static constexpr uint32_t kArmMemoryBarrierLatency = 6; +static constexpr uint32_t kArmBranchLatency = 4; +static constexpr uint32_t kArmCallLatency = 5; +static constexpr uint32_t kArmCallInternalLatency = 29; +static constexpr uint32_t kArmLoadStringInternalLatency = 10; +static constexpr uint32_t kArmNopLatency = 2; +static constexpr uint32_t kArmLoadWithBakerReadBarrierLatency = 18; +static constexpr uint32_t kArmRuntimeTypeCheckLatency = 46; + +class SchedulingLatencyVisitorARM final : public SchedulingLatencyVisitor { + public: + explicit SchedulingLatencyVisitorARM(CodeGenerator* codegen) + : codegen_(down_cast<CodeGeneratorARMVIXL*>(codegen)) {} + + // Default visitor for instructions not handled specifically below. + void VisitInstruction([[maybe_unused]] HInstruction*) override { + last_visited_latency_ = kArmIntegerOpLatency; + } + +// We add a second unused parameter to be able to use this macro like the others +// defined in `nodes.h`. 
+#define FOR_EACH_SCHEDULED_ARM_INSTRUCTION(M) \ + M(ArrayGet, unused) \ + M(ArrayLength, unused) \ + M(ArraySet, unused) \ + M(Add, unused) \ + M(Sub, unused) \ + M(And, unused) \ + M(Or, unused) \ + M(Ror, unused) \ + M(Xor, unused) \ + M(Shl, unused) \ + M(Shr, unused) \ + M(UShr, unused) \ + M(Mul, unused) \ + M(Div, unused) \ + M(Condition, unused) \ + M(Compare, unused) \ + M(BoundsCheck, unused) \ + M(InstanceFieldGet, unused) \ + M(InstanceFieldSet, unused) \ + M(InstanceOf, unused) \ + M(Invoke, unused) \ + M(LoadString, unused) \ + M(NewArray, unused) \ + M(NewInstance, unused) \ + M(Rem, unused) \ + M(StaticFieldGet, unused) \ + M(StaticFieldSet, unused) \ + M(SuspendCheck, unused) \ + M(TypeConversion, unused) + +#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \ + M(BitwiseNegatedRight, unused) \ + M(MultiplyAccumulate, unused) \ + M(IntermediateAddress, unused) \ + M(IntermediateAddressIndex, unused) \ + M(DataProcWithShifterOp, unused) + +#define DECLARE_VISIT_INSTRUCTION(type, unused) \ + void Visit##type(H##type* instruction) override; + + FOR_EACH_SCHEDULED_ARM_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION) + +#undef DECLARE_VISIT_INSTRUCTION + + private: + bool CanGenerateTest(HCondition* cond); + void HandleGenerateConditionWithZero(IfCondition cond); + void HandleGenerateLongTestConstant(HCondition* cond); + void HandleGenerateLongTest(HCondition* cond); + void HandleGenerateLongComparesAndJumps(); + void HandleGenerateTest(HCondition* cond); + void HandleGenerateConditionGeneric(HCondition* cond); + void HandleGenerateEqualLong(HCondition* cond); + void HandleGenerateConditionLong(HCondition* cond); + void HandleGenerateConditionIntegralOrNonPrimitive(HCondition* cond); + void HandleCondition(HCondition* instr); + void HandleBinaryOperationLantencies(HBinaryOperation* instr); + void HandleBitwiseOperationLantencies(HBinaryOperation* instr); + void HandleShiftLatencies(HBinaryOperation* instr); + void HandleDivRemConstantIntegralLatencies(int32_t imm); + void HandleFieldSetLatencies(HInstruction* instruction, const FieldInfo& field_info); + void HandleFieldGetLatencies(HInstruction* instruction, const FieldInfo& field_info); + void HandleGenerateDataProcInstruction(bool internal_latency = false); + void HandleGenerateDataProc(HDataProcWithShifterOp* instruction); + void HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction); + + // The latency setting for each HInstruction depends on how CodeGenerator may generate code, + // latency visitors may query CodeGenerator for such information for accurate latency settings. 
+ CodeGeneratorARMVIXL* codegen_; +}; + void SchedulingLatencyVisitorARM::HandleBinaryOperationLantencies(HBinaryOperation* instr) { switch (instr->GetResultType()) { case DataType::Type::kInt64: @@ -1153,5 +1264,28 @@ void SchedulingLatencyVisitorARM::VisitTypeConversion(HTypeConversion* instr) { } } +bool HSchedulerARM::IsSchedulable(const HInstruction* instruction) const { + switch (instruction->GetKind()) { +#define SCHEDULABLE_CASE(type, unused) \ + case HInstruction::InstructionKind::k##type: \ + return true; + FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(SCHEDULABLE_CASE) + FOR_EACH_CONCRETE_INSTRUCTION_ARM(SCHEDULABLE_CASE) +#undef SCHEDULABLE_CASE + + default: + return HScheduler::IsSchedulable(instruction); + } +} + +std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> HSchedulerARM::BuildSchedulingGraph( + HBasicBlock* block, + ScopedArenaAllocator* allocator, + const HeapLocationCollector* heap_location_collector) { + SchedulingLatencyVisitorARM latency_visitor(codegen_); + return HScheduler::BuildSchedulingGraph( + block, allocator, heap_location_collector, &latency_visitor); +} + } // namespace arm } // namespace art diff --git a/compiler/optimizing/scheduler_arm.h b/compiler/optimizing/scheduler_arm.h index cf00fa12a3..25eac1b2c4 100644 --- a/compiler/optimizing/scheduler_arm.h +++ b/compiler/optimizing/scheduler_arm.h @@ -18,144 +18,32 @@ #define ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_ #include "base/macros.h" -#include "code_generator_arm_vixl.h" #include "scheduler.h" namespace art HIDDEN { -namespace arm { -// AArch32 instruction latencies. -// We currently assume that all ARM CPUs share the same instruction latency list. -// The following latencies were tuned based on performance experiments and -// automatic tuning using differential evolution approach on various benchmarks. -static constexpr uint32_t kArmIntegerOpLatency = 2; -static constexpr uint32_t kArmFloatingPointOpLatency = 11; -static constexpr uint32_t kArmDataProcWithShifterOpLatency = 4; -static constexpr uint32_t kArmMulIntegerLatency = 6; -static constexpr uint32_t kArmMulFloatingPointLatency = 11; -static constexpr uint32_t kArmDivIntegerLatency = 10; -static constexpr uint32_t kArmDivFloatLatency = 20; -static constexpr uint32_t kArmDivDoubleLatency = 25; -static constexpr uint32_t kArmTypeConversionFloatingPointIntegerLatency = 11; -static constexpr uint32_t kArmMemoryLoadLatency = 9; -static constexpr uint32_t kArmMemoryStoreLatency = 9; -static constexpr uint32_t kArmMemoryBarrierLatency = 6; -static constexpr uint32_t kArmBranchLatency = 4; -static constexpr uint32_t kArmCallLatency = 5; -static constexpr uint32_t kArmCallInternalLatency = 29; -static constexpr uint32_t kArmLoadStringInternalLatency = 10; -static constexpr uint32_t kArmNopLatency = 2; -static constexpr uint32_t kArmLoadWithBakerReadBarrierLatency = 18; -static constexpr uint32_t kArmRuntimeTypeCheckLatency = 46; - -class SchedulingLatencyVisitorARM final : public SchedulingLatencyVisitor { - public: - explicit SchedulingLatencyVisitorARM(CodeGenerator* codegen) - : codegen_(down_cast<CodeGeneratorARMVIXL*>(codegen)) {} - - // Default visitor for instructions not handled specifically below. - void VisitInstruction([[maybe_unused]] HInstruction*) override { - last_visited_latency_ = kArmIntegerOpLatency; - } - -// We add a second unused parameter to be able to use this macro like the others -// defined in `nodes.h`. 
-#define FOR_EACH_SCHEDULED_ARM_INSTRUCTION(M) \ - M(ArrayGet, unused) \ - M(ArrayLength, unused) \ - M(ArraySet, unused) \ - M(Add, unused) \ - M(Sub, unused) \ - M(And, unused) \ - M(Or, unused) \ - M(Ror, unused) \ - M(Xor, unused) \ - M(Shl, unused) \ - M(Shr, unused) \ - M(UShr, unused) \ - M(Mul, unused) \ - M(Div, unused) \ - M(Condition, unused) \ - M(Compare, unused) \ - M(BoundsCheck, unused) \ - M(InstanceFieldGet, unused) \ - M(InstanceFieldSet, unused) \ - M(InstanceOf, unused) \ - M(Invoke, unused) \ - M(LoadString, unused) \ - M(NewArray, unused) \ - M(NewInstance, unused) \ - M(Rem, unused) \ - M(StaticFieldGet, unused) \ - M(StaticFieldSet, unused) \ - M(SuspendCheck, unused) \ - M(TypeConversion, unused) - -#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \ - M(BitwiseNegatedRight, unused) \ - M(MultiplyAccumulate, unused) \ - M(IntermediateAddress, unused) \ - M(IntermediateAddressIndex, unused) \ - M(DataProcWithShifterOp, unused) - -#define DECLARE_VISIT_INSTRUCTION(type, unused) \ - void Visit##type(H##type* instruction) override; - FOR_EACH_SCHEDULED_ARM_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) - FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) - FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION) +class CodeGenerator; -#undef DECLARE_VISIT_INSTRUCTION - - private: - bool CanGenerateTest(HCondition* cond); - void HandleGenerateConditionWithZero(IfCondition cond); - void HandleGenerateLongTestConstant(HCondition* cond); - void HandleGenerateLongTest(HCondition* cond); - void HandleGenerateLongComparesAndJumps(); - void HandleGenerateTest(HCondition* cond); - void HandleGenerateConditionGeneric(HCondition* cond); - void HandleGenerateEqualLong(HCondition* cond); - void HandleGenerateConditionLong(HCondition* cond); - void HandleGenerateConditionIntegralOrNonPrimitive(HCondition* cond); - void HandleCondition(HCondition* instr); - void HandleBinaryOperationLantencies(HBinaryOperation* instr); - void HandleBitwiseOperationLantencies(HBinaryOperation* instr); - void HandleShiftLatencies(HBinaryOperation* instr); - void HandleDivRemConstantIntegralLatencies(int32_t imm); - void HandleFieldSetLatencies(HInstruction* instruction, const FieldInfo& field_info); - void HandleFieldGetLatencies(HInstruction* instruction, const FieldInfo& field_info); - void HandleGenerateDataProcInstruction(bool internal_latency = false); - void HandleGenerateDataProc(HDataProcWithShifterOp* instruction); - void HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction); - - // The latency setting for each HInstruction depends on how CodeGenerator may generate code, - // latency visitors may query CodeGenerator for such information for accurate latency settings. 
- CodeGeneratorARMVIXL* codegen_; -}; +namespace arm { -class HSchedulerARM : public HScheduler { +class HSchedulerARM final : public HScheduler { public: - HSchedulerARM(SchedulingNodeSelector* selector, - SchedulingLatencyVisitorARM* arm_latency_visitor) - : HScheduler(arm_latency_visitor, selector) {} + HSchedulerARM(SchedulingNodeSelector* selector, CodeGenerator* codegen) + : HScheduler(selector), codegen_(codegen) {} ~HSchedulerARM() override {} - bool IsSchedulable(const HInstruction* instruction) const override { -#define CASE_INSTRUCTION_KIND(type, unused) case \ - HInstruction::InstructionKind::k##type: - switch (instruction->GetKind()) { - FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND) - return true; - FOR_EACH_CONCRETE_INSTRUCTION_ARM(CASE_INSTRUCTION_KIND) - return true; - default: - return HScheduler::IsSchedulable(instruction); - } -#undef CASE_INSTRUCTION_KIND - } + bool IsSchedulable(const HInstruction* instruction) const override; + + protected: + std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> BuildSchedulingGraph( + HBasicBlock* block, + ScopedArenaAllocator* allocator, + const HeapLocationCollector* heap_location_collector) override; private: DISALLOW_COPY_AND_ASSIGN(HSchedulerARM); + + CodeGenerator* const codegen_; }; } // namespace arm diff --git a/compiler/optimizing/scheduler_arm64.cc b/compiler/optimizing/scheduler_arm64.cc index 5113cf446d..08b8a3fb78 100644 --- a/compiler/optimizing/scheduler_arm64.cc +++ b/compiler/optimizing/scheduler_arm64.cc @@ -23,6 +23,115 @@ namespace art HIDDEN { namespace arm64 { +static constexpr uint32_t kArm64MemoryLoadLatency = 5; +static constexpr uint32_t kArm64MemoryStoreLatency = 3; + +static constexpr uint32_t kArm64CallInternalLatency = 10; +static constexpr uint32_t kArm64CallLatency = 5; + +// AArch64 instruction latency. +// We currently assume that all arm64 CPUs share the same instruction latency list. +static constexpr uint32_t kArm64IntegerOpLatency = 2; +static constexpr uint32_t kArm64FloatingPointOpLatency = 5; + +static constexpr uint32_t kArm64DataProcWithShifterOpLatency = 3; +static constexpr uint32_t kArm64DivDoubleLatency = 30; +static constexpr uint32_t kArm64DivFloatLatency = 15; +static constexpr uint32_t kArm64DivIntegerLatency = 5; +static constexpr uint32_t kArm64LoadStringInternalLatency = 7; +static constexpr uint32_t kArm64MulFloatingPointLatency = 6; +static constexpr uint32_t kArm64MulIntegerLatency = 6; +static constexpr uint32_t kArm64TypeConversionFloatingPointIntegerLatency = 5; +static constexpr uint32_t kArm64BranchLatency = kArm64IntegerOpLatency; + +static constexpr uint32_t kArm64SIMDFloatingPointOpLatency = 10; +static constexpr uint32_t kArm64SIMDIntegerOpLatency = 6; +static constexpr uint32_t kArm64SIMDMemoryLoadLatency = 10; +static constexpr uint32_t kArm64SIMDMemoryStoreLatency = 6; +static constexpr uint32_t kArm64SIMDMulFloatingPointLatency = 12; +static constexpr uint32_t kArm64SIMDMulIntegerLatency = 12; +static constexpr uint32_t kArm64SIMDReplicateOpLatency = 16; +static constexpr uint32_t kArm64SIMDDivDoubleLatency = 60; +static constexpr uint32_t kArm64SIMDDivFloatLatency = 30; +static constexpr uint32_t kArm64SIMDTypeConversionInt2FPLatency = 10; + +class SchedulingLatencyVisitorARM64 final : public SchedulingLatencyVisitor { + public: + // Default visitor for instructions not handled specifically below. 
+ void VisitInstruction([[maybe_unused]] HInstruction*) override { + last_visited_latency_ = kArm64IntegerOpLatency; + } + +// We add a second unused parameter to be able to use this macro like the others +// defined in `nodes.h`. +#define FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(M) \ + M(ArrayGet , unused) \ + M(ArrayLength , unused) \ + M(ArraySet , unused) \ + M(BoundsCheck , unused) \ + M(Div , unused) \ + M(InstanceFieldGet , unused) \ + M(InstanceOf , unused) \ + M(LoadString , unused) \ + M(Mul , unused) \ + M(NewArray , unused) \ + M(NewInstance , unused) \ + M(Rem , unused) \ + M(StaticFieldGet , unused) \ + M(SuspendCheck , unused) \ + M(TypeConversion , unused) \ + M(VecReplicateScalar , unused) \ + M(VecExtractScalar , unused) \ + M(VecReduce , unused) \ + M(VecCnv , unused) \ + M(VecNeg , unused) \ + M(VecAbs , unused) \ + M(VecNot , unused) \ + M(VecAdd , unused) \ + M(VecHalvingAdd , unused) \ + M(VecSub , unused) \ + M(VecMul , unused) \ + M(VecDiv , unused) \ + M(VecMin , unused) \ + M(VecMax , unused) \ + M(VecAnd , unused) \ + M(VecAndNot , unused) \ + M(VecOr , unused) \ + M(VecXor , unused) \ + M(VecShl , unused) \ + M(VecShr , unused) \ + M(VecUShr , unused) \ + M(VecSetScalars , unused) \ + M(VecMultiplyAccumulate, unused) \ + M(VecLoad , unused) \ + M(VecStore , unused) + +#define FOR_EACH_SCHEDULED_ABSTRACT_INSTRUCTION(M) \ + M(BinaryOperation , unused) \ + M(Invoke , unused) + +#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \ + M(BitwiseNegatedRight, unused) \ + M(MultiplyAccumulate, unused) \ + M(IntermediateAddress, unused) \ + M(IntermediateAddressIndex, unused) \ + M(DataProcWithShifterOp, unused) + +#define DECLARE_VISIT_INSTRUCTION(type, unused) \ + void Visit##type(H##type* instruction) override; + + FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_SCHEDULED_ABSTRACT_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION) + +#undef DECLARE_VISIT_INSTRUCTION + + private: + void HandleSimpleArithmeticSIMD(HVecOperation *instr); + void HandleVecAddress(HVecMemoryOperation* instruction, size_t size); +}; + void SchedulingLatencyVisitorARM64::VisitBinaryOperation(HBinaryOperation* instr) { last_visited_latency_ = DataType::IsFloatingPointType(instr->GetResultType()) ? 
kArm64FloatingPointOpLatency @@ -348,5 +457,30 @@ void SchedulingLatencyVisitorARM64::VisitVecStore(HVecStore* instr) { last_visited_latency_ = kArm64SIMDMemoryStoreLatency; } +bool HSchedulerARM64::IsSchedulable(const HInstruction* instruction) const { + switch (instruction->GetKind()) { +#define SCHEDULABLE_CASE(type, unused) \ + case HInstruction::InstructionKind::k##type: \ + return true; + FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(SCHEDULABLE_CASE) + FOR_EACH_CONCRETE_INSTRUCTION_ARM64(SCHEDULABLE_CASE) + FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(SCHEDULABLE_CASE) +#undef SCHEDULABLE_CASE + + default: + return HScheduler::IsSchedulable(instruction); + } +} + +std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> +HSchedulerARM64::BuildSchedulingGraph( + HBasicBlock* block, + ScopedArenaAllocator* allocator, + const HeapLocationCollector* heap_location_collector) { + SchedulingLatencyVisitorARM64 latency_visitor; + return HScheduler::BuildSchedulingGraph( + block, allocator, heap_location_collector, &latency_visitor); +} + } // namespace arm64 } // namespace art diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h index 7ce00e00ab..044aa48a5a 100644 --- a/compiler/optimizing/scheduler_arm64.h +++ b/compiler/optimizing/scheduler_arm64.h @@ -23,137 +23,13 @@ namespace art HIDDEN { namespace arm64 { -static constexpr uint32_t kArm64MemoryLoadLatency = 5; -static constexpr uint32_t kArm64MemoryStoreLatency = 3; - -static constexpr uint32_t kArm64CallInternalLatency = 10; -static constexpr uint32_t kArm64CallLatency = 5; - -// AArch64 instruction latency. -// We currently assume that all arm64 CPUs share the same instruction latency list. -static constexpr uint32_t kArm64IntegerOpLatency = 2; -static constexpr uint32_t kArm64FloatingPointOpLatency = 5; - - -static constexpr uint32_t kArm64DataProcWithShifterOpLatency = 3; -static constexpr uint32_t kArm64DivDoubleLatency = 30; -static constexpr uint32_t kArm64DivFloatLatency = 15; -static constexpr uint32_t kArm64DivIntegerLatency = 5; -static constexpr uint32_t kArm64LoadStringInternalLatency = 7; -static constexpr uint32_t kArm64MulFloatingPointLatency = 6; -static constexpr uint32_t kArm64MulIntegerLatency = 6; -static constexpr uint32_t kArm64TypeConversionFloatingPointIntegerLatency = 5; -static constexpr uint32_t kArm64BranchLatency = kArm64IntegerOpLatency; - -static constexpr uint32_t kArm64SIMDFloatingPointOpLatency = 10; -static constexpr uint32_t kArm64SIMDIntegerOpLatency = 6; -static constexpr uint32_t kArm64SIMDMemoryLoadLatency = 10; -static constexpr uint32_t kArm64SIMDMemoryStoreLatency = 6; -static constexpr uint32_t kArm64SIMDMulFloatingPointLatency = 12; -static constexpr uint32_t kArm64SIMDMulIntegerLatency = 12; -static constexpr uint32_t kArm64SIMDReplicateOpLatency = 16; -static constexpr uint32_t kArm64SIMDDivDoubleLatency = 60; -static constexpr uint32_t kArm64SIMDDivFloatLatency = 30; -static constexpr uint32_t kArm64SIMDTypeConversionInt2FPLatency = 10; - -class SchedulingLatencyVisitorARM64 final : public SchedulingLatencyVisitor { - public: - // Default visitor for instructions not handled specifically below. - void VisitInstruction([[maybe_unused]] HInstruction*) override { - last_visited_latency_ = kArm64IntegerOpLatency; - } - -// We add a second unused parameter to be able to use this macro like the others -// defined in `nodes.h`. 
-#define FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(M) \ - M(ArrayGet , unused) \ - M(ArrayLength , unused) \ - M(ArraySet , unused) \ - M(BoundsCheck , unused) \ - M(Div , unused) \ - M(InstanceFieldGet , unused) \ - M(InstanceOf , unused) \ - M(LoadString , unused) \ - M(Mul , unused) \ - M(NewArray , unused) \ - M(NewInstance , unused) \ - M(Rem , unused) \ - M(StaticFieldGet , unused) \ - M(SuspendCheck , unused) \ - M(TypeConversion , unused) \ - M(VecReplicateScalar , unused) \ - M(VecExtractScalar , unused) \ - M(VecReduce , unused) \ - M(VecCnv , unused) \ - M(VecNeg , unused) \ - M(VecAbs , unused) \ - M(VecNot , unused) \ - M(VecAdd , unused) \ - M(VecHalvingAdd , unused) \ - M(VecSub , unused) \ - M(VecMul , unused) \ - M(VecDiv , unused) \ - M(VecMin , unused) \ - M(VecMax , unused) \ - M(VecAnd , unused) \ - M(VecAndNot , unused) \ - M(VecOr , unused) \ - M(VecXor , unused) \ - M(VecShl , unused) \ - M(VecShr , unused) \ - M(VecUShr , unused) \ - M(VecSetScalars , unused) \ - M(VecMultiplyAccumulate, unused) \ - M(VecLoad , unused) \ - M(VecStore , unused) - -#define FOR_EACH_SCHEDULED_ABSTRACT_INSTRUCTION(M) \ - M(BinaryOperation , unused) \ - M(Invoke , unused) - -#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \ - M(BitwiseNegatedRight, unused) \ - M(MultiplyAccumulate, unused) \ - M(IntermediateAddress, unused) \ - M(IntermediateAddressIndex, unused) \ - M(DataProcWithShifterOp, unused) - -#define DECLARE_VISIT_INSTRUCTION(type, unused) \ - void Visit##type(H##type* instruction) override; - - FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) - FOR_EACH_SCHEDULED_ABSTRACT_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) - FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) - FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION) - -#undef DECLARE_VISIT_INSTRUCTION - - private: - void HandleSimpleArithmeticSIMD(HVecOperation *instr); - void HandleVecAddress(HVecMemoryOperation* instruction, size_t size); -}; - class HSchedulerARM64 : public HScheduler { public: explicit HSchedulerARM64(SchedulingNodeSelector* selector) - : HScheduler(&arm64_latency_visitor_, selector) {} + : HScheduler(selector) {} ~HSchedulerARM64() override {} - bool IsSchedulable(const HInstruction* instruction) const override { -#define CASE_INSTRUCTION_KIND(type, unused) case \ - HInstruction::InstructionKind::k##type: - switch (instruction->GetKind()) { - FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND) - return true; - FOR_EACH_CONCRETE_INSTRUCTION_ARM64(CASE_INSTRUCTION_KIND) - return true; - FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(CASE_INSTRUCTION_KIND) - return true; - default: - return HScheduler::IsSchedulable(instruction); - } -#undef CASE_INSTRUCTION_KIND - } + bool IsSchedulable(const HInstruction* instruction) const override; // Treat as scheduling barriers those vector instructions whose live ranges exceed the vectorized // loop boundaries. 
This is a workaround for the lack of notion of SIMD register in the compiler; @@ -169,8 +45,13 @@ class HSchedulerARM64 : public HScheduler { instr->IsVecReplicateScalar(); } + protected: + std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> BuildSchedulingGraph( + HBasicBlock* block, + ScopedArenaAllocator* allocator, + const HeapLocationCollector* heap_location_collector) override; + private: - SchedulingLatencyVisitorARM64 arm64_latency_visitor_; DISALLOW_COPY_AND_ASSIGN(HSchedulerARM64); }; diff --git a/compiler/optimizing/scheduler_test.cc b/compiler/optimizing/scheduler_test.cc index c2b1fd6f7c..0b020f1460 100644 --- a/compiler/optimizing/scheduler_test.cc +++ b/compiler/optimizing/scheduler_test.cc @@ -406,15 +406,13 @@ TEST_F(SchedulerTest, ArrayAccessAliasingARM64) { #if defined(ART_ENABLE_CODEGEN_arm) TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM) { CriticalPathSchedulingNodeSelector critical_path_selector; - arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr); - arm::HSchedulerARM scheduler(&critical_path_selector, &arm_latency_visitor); + arm::HSchedulerARM scheduler(&critical_path_selector, /*codegen=*/ nullptr); TestBuildDependencyGraphAndSchedule(&scheduler); } TEST_F(SchedulerTest, ArrayAccessAliasingARM) { CriticalPathSchedulingNodeSelector critical_path_selector; - arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr); - arm::HSchedulerARM scheduler(&critical_path_selector, &arm_latency_visitor); + arm::HSchedulerARM scheduler(&critical_path_selector, /*codegen=*/ nullptr); TestDependencyGraphOnAliasingArrayAccesses(&scheduler); } #endif |
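For reference, the new IsSchedulable() overrides above are driven by an X-macro: SCHEDULABLE_CASE stamps out one `case ...: return true;` per instruction in the FOR_EACH_* lists, with everything else deferring to the base-class policy. A hedged, standalone sketch of that expansion, using a made-up FOR_EACH_DEMO_INSTRUCTION list rather than ART's real instruction lists:

```cpp
#include <iostream>

// Made-up list standing in for FOR_EACH_CONCRETE_INSTRUCTION_ARM /
// FOR_EACH_SCHEDULED_SHARED_INSTRUCTION; the second parameter is unused, matching
// the convention noted in the patch ("like the others defined in `nodes.h`").
#define FOR_EACH_DEMO_INSTRUCTION(M) \
  M(Add, unused)                     \
  M(Mul, unused)                     \
  M(DataProcWithShifterOp, unused)

enum class InstructionKind {
#define DECLARE_KIND(type, unused) k##type,
  FOR_EACH_DEMO_INSTRUCTION(DECLARE_KIND)
#undef DECLARE_KIND
  kOther,  // anything not in the list
};

// Mirrors the shape of HSchedulerARM::IsSchedulable() after the patch.
bool IsSchedulable(InstructionKind kind) {
  switch (kind) {
#define SCHEDULABLE_CASE(type, unused) \
  case InstructionKind::k##type:       \
    return true;
    FOR_EACH_DEMO_INSTRUCTION(SCHEDULABLE_CASE)
#undef SCHEDULABLE_CASE
    default:
      return false;  // the real code defers to HScheduler::IsSchedulable() here
  }
}

int main() {
  std::cout << IsSchedulable(InstructionKind::kMul) << '\n';    // prints 1
  std::cout << IsSchedulable(InstructionKind::kOther) << '\n';  // prints 0
}
```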