author Vladimir Marko <vmarko@google.com> 2024-02-12 11:17:28 +0100
committer Vladimír Marko <vmarko@google.com> 2024-02-13 15:28:35 +0000
commit 77e5997b524a133d38585da8bf58420f2411f7ad (patch)
tree 0273c71559a03593b358f04ef060214796159863
parent 2389869ed372eebd886c2f984f23ec7e342da22b (diff)
Optimizing: Refactor `HScheduler`.
Move `SchedulingLatencyVisitor{ARM,ARM64}` to .cc files.

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Change-Id: I15cb1a4cbef00a328fec947189412c502bf80f46
-rw-r--r--  compiler/optimizing/scheduler.cc        17
-rw-r--r--  compiler/optimizing/scheduler.h         41
-rw-r--r--  compiler/optimizing/scheduler_arm.cc    134
-rw-r--r--  compiler/optimizing/scheduler_arm.h     140
-rw-r--r--  compiler/optimizing/scheduler_arm64.cc  134
-rw-r--r--  compiler/optimizing/scheduler_arm64.h   135
-rw-r--r--  compiler/optimizing/scheduler_test.cc   6
7 files changed, 326 insertions, 281 deletions
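
The shape of the refactoring, condensed: the base `HScheduler` no longer stores a `SchedulingLatencyVisitor*` injected through its constructor. Instead, each backend overrides a pure-virtual `BuildSchedulingGraph()`, creates its (now file-local) latency visitor on the stack, and forwards to a protected template helper in the base class so per-node latency calls are resolved statically. Below is a minimal standalone sketch of that pattern; the types and the `FakeLatencyVisitorARM` class are simplified stand-ins, not the real ART interfaces.

#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

// Simplified stand-ins for the ART types; names and shapes here are
// illustrative only.
struct HInstruction {};
struct HBasicBlock { std::vector<HInstruction*> instructions; };
struct SchedulingNode { uint32_t latency = 0; };
using SchedulingGraph = std::vector<std::unique_ptr<SchedulingNode>>;

class HScheduler {
 public:
  virtual ~HScheduler() = default;

 protected:
  // Each backend decides which latency visitor to use.
  virtual std::pair<SchedulingGraph, std::vector<SchedulingNode*>>
  BuildSchedulingGraph(HBasicBlock* block) = 0;

  // Shared helper: the visitor type is a template parameter, so the per-node
  // latency calls are statically dispatched and the visitor can be stack-local.
  template <typename LatencyVisitor>
  std::pair<SchedulingGraph, std::vector<SchedulingNode*>> BuildSchedulingGraph(
      HBasicBlock* block, LatencyVisitor* latency_visitor) {
    SchedulingGraph graph;
    std::vector<SchedulingNode*> nodes;
    for (HInstruction* instruction : block->instructions) {
      graph.push_back(std::make_unique<SchedulingNode>());
      SchedulingNode* node = graph.back().get();
      node->latency = latency_visitor->CalculateLatency(instruction);
      nodes.push_back(node);
    }
    return {std::move(graph), std::move(nodes)};
  }
};

// In the real change the ARM visitor moved from scheduler_arm.h into
// scheduler_arm.cc; this fake one just returns a fixed cost.
class FakeLatencyVisitorARM {
 public:
  uint32_t CalculateLatency(HInstruction*) { return 2; }
};

class HSchedulerARM final : public HScheduler {
 protected:
  std::pair<SchedulingGraph, std::vector<SchedulingNode*>>
  BuildSchedulingGraph(HBasicBlock* block) override {
    FakeLatencyVisitorARM latency_visitor;  // created per call, on the stack
    return HScheduler::BuildSchedulingGraph(block, &latency_visitor);
  }
};

The caller in `HScheduler::Schedule()` then consumes the result with `auto [scheduling_graph, scheduling_nodes] = BuildSchedulingGraph(...)`, as the first hunk below shows.
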
diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc
index 4236a545bc..f4cf7b0a49 100644
--- a/compiler/optimizing/scheduler.cc
+++ b/compiler/optimizing/scheduler.cc
@@ -548,20 +548,10 @@ void HScheduler::Schedule(HGraph* graph) {
void HScheduler::Schedule(HBasicBlock* block,
const HeapLocationCollector* heap_location_collector) {
ScopedArenaAllocator allocator(block->GetGraph()->GetArenaStack());
- ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator.Adapter(kArenaAllocScheduler));
// Build the scheduling graph.
- SchedulingGraph scheduling_graph(&allocator, heap_location_collector);
- for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
- HInstruction* instruction = it.Current();
- CHECK_EQ(instruction->GetBlock(), block)
- << instruction->DebugName()
- << " is in block " << instruction->GetBlock()->GetBlockId()
- << ", and expected in block " << block->GetBlockId();
- SchedulingNode* node = scheduling_graph.AddNode(instruction, IsSchedulingBarrier(instruction));
- CalculateLatency(node);
- scheduling_nodes.push_back(node);
- }
+ auto [scheduling_graph, scheduling_nodes] =
+ BuildSchedulingGraph(block, &allocator, heap_location_collector);
if (scheduling_graph.Size() <= 1) {
return;
@@ -803,8 +793,7 @@ bool HInstructionScheduling::Run(bool only_optimize_loop_blocks,
#if defined(ART_ENABLE_CODEGEN_arm)
case InstructionSet::kThumb2:
case InstructionSet::kArm: {
- arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen_);
- arm::HSchedulerARM scheduler(selector, &arm_latency_visitor);
+ arm::HSchedulerARM scheduler(selector, codegen_);
scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
scheduler.Schedule(graph_);
break;
diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h
index 299fbc93f3..a9672ea732 100644
--- a/compiler/optimizing/scheduler.h
+++ b/compiler/optimizing/scheduler.h
@@ -497,9 +497,8 @@ class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
class HScheduler {
public:
- HScheduler(SchedulingLatencyVisitor* latency_visitor, SchedulingNodeSelector* selector)
- : latency_visitor_(latency_visitor),
- selector_(selector),
+ explicit HScheduler(SchedulingNodeSelector* selector)
+ : selector_(selector),
only_optimize_loop_blocks_(true),
cursor_(nullptr) {}
virtual ~HScheduler() {}
@@ -512,6 +511,35 @@ class HScheduler {
virtual bool IsSchedulingBarrier(const HInstruction* instruction) const;
protected:
+ virtual std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> BuildSchedulingGraph(
+ HBasicBlock* block,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector) = 0;
+
+ template <typename LatencyVisitor>
+ std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> BuildSchedulingGraph(
+ HBasicBlock* block,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector,
+ LatencyVisitor* latency_visitor) ALWAYS_INLINE {
+ SchedulingGraph scheduling_graph(allocator, heap_location_collector);
+ ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator->Adapter(kArenaAllocScheduler));
+ for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+ HInstruction* instruction = it.Current();
+ CHECK_EQ(instruction->GetBlock(), block)
+ << instruction->DebugName()
+ << " is in block " << instruction->GetBlock()->GetBlockId()
+ << ", and expected in block " << block->GetBlockId();
+ SchedulingNode* node =
+ scheduling_graph.AddNode(instruction, IsSchedulingBarrier(instruction));
+ latency_visitor->CalculateLatency(node);
+ node->SetLatency(latency_visitor->GetLastVisitedLatency());
+ node->SetInternalLatency(latency_visitor->GetLastVisitedInternalLatency());
+ scheduling_nodes.push_back(node);
+ }
+ return {std::move(scheduling_graph), std::move(scheduling_nodes)};
+ }
+
void Schedule(HBasicBlock* block, const HeapLocationCollector* heap_location_collector);
void Schedule(SchedulingNode* scheduling_node,
/*inout*/ ScopedArenaVector<SchedulingNode*>* candidates);
@@ -529,13 +557,6 @@ class HScheduler {
virtual bool IsSchedulable(const HInstruction* instruction) const;
bool IsSchedulable(const HBasicBlock* block) const;
- void CalculateLatency(SchedulingNode* node) {
- latency_visitor_->CalculateLatency(node);
- node->SetLatency(latency_visitor_->GetLastVisitedLatency());
- node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency());
- }
-
- SchedulingLatencyVisitor* const latency_visitor_;
SchedulingNodeSelector* const selector_;
bool only_optimize_loop_blocks_;
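
For context on the consumer side (first hunk of scheduler.cc above): the pair returned by the helper is unpacked with C++17 structured bindings, so neither container is copied out of the return value. A tiny standalone illustration of that consumption pattern, with placeholder element types in place of `SchedulingGraph` and `ScopedArenaVector<SchedulingNode*>`:

#include <string>
#include <utility>
#include <vector>

// Placeholders standing in for the real result types.
using Graph = std::vector<std::string>;
using Nodes = std::vector<int>;

std::pair<Graph, Nodes> BuildSchedulingGraph() {
  Graph graph = {"add", "mul"};
  Nodes nodes = {0, 1};
  // The locals are moved into the returned pair.
  return {std::move(graph), std::move(nodes)};
}

int main() {
  // The returned pair is materialized once; `graph` and `nodes` are names for
  // its members, so the containers themselves are never copied.
  auto [graph, nodes] = BuildSchedulingGraph();
  return graph.size() == nodes.size() ? 0 : 1;
}
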
diff --git a/compiler/optimizing/scheduler_arm.cc b/compiler/optimizing/scheduler_arm.cc
index 510a0f5496..3ee6f06b46 100644
--- a/compiler/optimizing/scheduler_arm.cc
+++ b/compiler/optimizing/scheduler_arm.cc
@@ -17,6 +17,7 @@
#include "scheduler_arm.h"
#include "arch/arm/instruction_set_features_arm.h"
+#include "code_generator_arm_vixl.h"
#include "code_generator_utils.h"
#include "common_arm.h"
#include "heap_poisoning.h"
@@ -29,6 +30,116 @@ namespace arm {
using helpers::Int32ConstantFrom;
using helpers::Uint64ConstantFrom;
+// AArch32 instruction latencies.
+// We currently assume that all ARM CPUs share the same instruction latency list.
+// The following latencies were tuned based on performance experiments and
+// automatic tuning using differential evolution approach on various benchmarks.
+static constexpr uint32_t kArmIntegerOpLatency = 2;
+static constexpr uint32_t kArmFloatingPointOpLatency = 11;
+static constexpr uint32_t kArmDataProcWithShifterOpLatency = 4;
+static constexpr uint32_t kArmMulIntegerLatency = 6;
+static constexpr uint32_t kArmMulFloatingPointLatency = 11;
+static constexpr uint32_t kArmDivIntegerLatency = 10;
+static constexpr uint32_t kArmDivFloatLatency = 20;
+static constexpr uint32_t kArmDivDoubleLatency = 25;
+static constexpr uint32_t kArmTypeConversionFloatingPointIntegerLatency = 11;
+static constexpr uint32_t kArmMemoryLoadLatency = 9;
+static constexpr uint32_t kArmMemoryStoreLatency = 9;
+static constexpr uint32_t kArmMemoryBarrierLatency = 6;
+static constexpr uint32_t kArmBranchLatency = 4;
+static constexpr uint32_t kArmCallLatency = 5;
+static constexpr uint32_t kArmCallInternalLatency = 29;
+static constexpr uint32_t kArmLoadStringInternalLatency = 10;
+static constexpr uint32_t kArmNopLatency = 2;
+static constexpr uint32_t kArmLoadWithBakerReadBarrierLatency = 18;
+static constexpr uint32_t kArmRuntimeTypeCheckLatency = 46;
+
+class SchedulingLatencyVisitorARM final : public SchedulingLatencyVisitor {
+ public:
+ explicit SchedulingLatencyVisitorARM(CodeGenerator* codegen)
+ : codegen_(down_cast<CodeGeneratorARMVIXL*>(codegen)) {}
+
+ // Default visitor for instructions not handled specifically below.
+ void VisitInstruction([[maybe_unused]] HInstruction*) override {
+ last_visited_latency_ = kArmIntegerOpLatency;
+ }
+
+// We add a second unused parameter to be able to use this macro like the others
+// defined in `nodes.h`.
+#define FOR_EACH_SCHEDULED_ARM_INSTRUCTION(M) \
+ M(ArrayGet, unused) \
+ M(ArrayLength, unused) \
+ M(ArraySet, unused) \
+ M(Add, unused) \
+ M(Sub, unused) \
+ M(And, unused) \
+ M(Or, unused) \
+ M(Ror, unused) \
+ M(Xor, unused) \
+ M(Shl, unused) \
+ M(Shr, unused) \
+ M(UShr, unused) \
+ M(Mul, unused) \
+ M(Div, unused) \
+ M(Condition, unused) \
+ M(Compare, unused) \
+ M(BoundsCheck, unused) \
+ M(InstanceFieldGet, unused) \
+ M(InstanceFieldSet, unused) \
+ M(InstanceOf, unused) \
+ M(Invoke, unused) \
+ M(LoadString, unused) \
+ M(NewArray, unused) \
+ M(NewInstance, unused) \
+ M(Rem, unused) \
+ M(StaticFieldGet, unused) \
+ M(StaticFieldSet, unused) \
+ M(SuspendCheck, unused) \
+ M(TypeConversion, unused)
+
+#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
+ M(BitwiseNegatedRight, unused) \
+ M(MultiplyAccumulate, unused) \
+ M(IntermediateAddress, unused) \
+ M(IntermediateAddressIndex, unused) \
+ M(DataProcWithShifterOp, unused)
+
+#define DECLARE_VISIT_INSTRUCTION(type, unused) \
+ void Visit##type(H##type* instruction) override;
+
+ FOR_EACH_SCHEDULED_ARM_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+ FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+ FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION)
+
+#undef DECLARE_VISIT_INSTRUCTION
+
+ private:
+ bool CanGenerateTest(HCondition* cond);
+ void HandleGenerateConditionWithZero(IfCondition cond);
+ void HandleGenerateLongTestConstant(HCondition* cond);
+ void HandleGenerateLongTest(HCondition* cond);
+ void HandleGenerateLongComparesAndJumps();
+ void HandleGenerateTest(HCondition* cond);
+ void HandleGenerateConditionGeneric(HCondition* cond);
+ void HandleGenerateEqualLong(HCondition* cond);
+ void HandleGenerateConditionLong(HCondition* cond);
+ void HandleGenerateConditionIntegralOrNonPrimitive(HCondition* cond);
+ void HandleCondition(HCondition* instr);
+ void HandleBinaryOperationLantencies(HBinaryOperation* instr);
+ void HandleBitwiseOperationLantencies(HBinaryOperation* instr);
+ void HandleShiftLatencies(HBinaryOperation* instr);
+ void HandleDivRemConstantIntegralLatencies(int32_t imm);
+ void HandleFieldSetLatencies(HInstruction* instruction, const FieldInfo& field_info);
+ void HandleFieldGetLatencies(HInstruction* instruction, const FieldInfo& field_info);
+ void HandleGenerateDataProcInstruction(bool internal_latency = false);
+ void HandleGenerateDataProc(HDataProcWithShifterOp* instruction);
+ void HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction);
+
+ // The latency setting for each HInstruction depends on how CodeGenerator may generate code,
+ // latency visitors may query CodeGenerator for such information for accurate latency settings.
+ CodeGeneratorARMVIXL* codegen_;
+};
+
void SchedulingLatencyVisitorARM::HandleBinaryOperationLantencies(HBinaryOperation* instr) {
switch (instr->GetResultType()) {
case DataType::Type::kInt64:
@@ -1153,5 +1264,28 @@ void SchedulingLatencyVisitorARM::VisitTypeConversion(HTypeConversion* instr) {
}
}
+bool HSchedulerARM::IsSchedulable(const HInstruction* instruction) const {
+ switch (instruction->GetKind()) {
+#define SCHEDULABLE_CASE(type, unused) \
+ case HInstruction::InstructionKind::k##type: \
+ return true;
+ FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(SCHEDULABLE_CASE)
+ FOR_EACH_CONCRETE_INSTRUCTION_ARM(SCHEDULABLE_CASE)
+#undef SCHEDULABLE_CASE
+
+ default:
+ return HScheduler::IsSchedulable(instruction);
+ }
+}
+
+std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> HSchedulerARM::BuildSchedulingGraph(
+ HBasicBlock* block,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector) {
+ SchedulingLatencyVisitorARM latency_visitor(codegen_);
+ return HScheduler::BuildSchedulingGraph(
+ block, allocator, heap_location_collector, &latency_visitor);
+}
+
} // namespace arm
} // namespace art
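
The `IsSchedulable()` switch above is generated by reusing the FOR_EACH_* X-macros: each list entry is stamped into a `case ...: return true;` label, and anything not listed falls through to `HScheduler::IsSchedulable()`. A self-contained toy version of that expansion technique follows; the enum values, list contents, and single-parameter macro are made up for illustration (the real lists enumerate `HInstruction` kinds and take a second unused parameter).

#include <iostream>

enum class Kind { kAdd, kMul, kDiv, kOther };

// Toy X-macro list; the real lists enumerate HInstruction kinds.
#define FOR_EACH_SCHEDULED_DEMO_INSTRUCTION(M) \
  M(Add)                                       \
  M(Mul)                                       \
  M(Div)

bool IsSchedulableDemo(Kind kind) {
  switch (kind) {
#define SCHEDULABLE_CASE(type) \
    case Kind::k##type:        \
      return true;
    FOR_EACH_SCHEDULED_DEMO_INSTRUCTION(SCHEDULABLE_CASE)
#undef SCHEDULABLE_CASE
    default:
      return false;  // the real code defers to HScheduler::IsSchedulable()
  }
}

int main() {
  std::cout << IsSchedulableDemo(Kind::kMul) << " "
            << IsSchedulableDemo(Kind::kOther) << "\n";  // prints "1 0"
  return 0;
}
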
diff --git a/compiler/optimizing/scheduler_arm.h b/compiler/optimizing/scheduler_arm.h
index cf00fa12a3..25eac1b2c4 100644
--- a/compiler/optimizing/scheduler_arm.h
+++ b/compiler/optimizing/scheduler_arm.h
@@ -18,144 +18,32 @@
#define ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_
#include "base/macros.h"
-#include "code_generator_arm_vixl.h"
#include "scheduler.h"
namespace art HIDDEN {
-namespace arm {
-// AArch32 instruction latencies.
-// We currently assume that all ARM CPUs share the same instruction latency list.
-// The following latencies were tuned based on performance experiments and
-// automatic tuning using differential evolution approach on various benchmarks.
-static constexpr uint32_t kArmIntegerOpLatency = 2;
-static constexpr uint32_t kArmFloatingPointOpLatency = 11;
-static constexpr uint32_t kArmDataProcWithShifterOpLatency = 4;
-static constexpr uint32_t kArmMulIntegerLatency = 6;
-static constexpr uint32_t kArmMulFloatingPointLatency = 11;
-static constexpr uint32_t kArmDivIntegerLatency = 10;
-static constexpr uint32_t kArmDivFloatLatency = 20;
-static constexpr uint32_t kArmDivDoubleLatency = 25;
-static constexpr uint32_t kArmTypeConversionFloatingPointIntegerLatency = 11;
-static constexpr uint32_t kArmMemoryLoadLatency = 9;
-static constexpr uint32_t kArmMemoryStoreLatency = 9;
-static constexpr uint32_t kArmMemoryBarrierLatency = 6;
-static constexpr uint32_t kArmBranchLatency = 4;
-static constexpr uint32_t kArmCallLatency = 5;
-static constexpr uint32_t kArmCallInternalLatency = 29;
-static constexpr uint32_t kArmLoadStringInternalLatency = 10;
-static constexpr uint32_t kArmNopLatency = 2;
-static constexpr uint32_t kArmLoadWithBakerReadBarrierLatency = 18;
-static constexpr uint32_t kArmRuntimeTypeCheckLatency = 46;
-
-class SchedulingLatencyVisitorARM final : public SchedulingLatencyVisitor {
- public:
- explicit SchedulingLatencyVisitorARM(CodeGenerator* codegen)
- : codegen_(down_cast<CodeGeneratorARMVIXL*>(codegen)) {}
-
- // Default visitor for instructions not handled specifically below.
- void VisitInstruction([[maybe_unused]] HInstruction*) override {
- last_visited_latency_ = kArmIntegerOpLatency;
- }
-
-// We add a second unused parameter to be able to use this macro like the others
-// defined in `nodes.h`.
-#define FOR_EACH_SCHEDULED_ARM_INSTRUCTION(M) \
- M(ArrayGet, unused) \
- M(ArrayLength, unused) \
- M(ArraySet, unused) \
- M(Add, unused) \
- M(Sub, unused) \
- M(And, unused) \
- M(Or, unused) \
- M(Ror, unused) \
- M(Xor, unused) \
- M(Shl, unused) \
- M(Shr, unused) \
- M(UShr, unused) \
- M(Mul, unused) \
- M(Div, unused) \
- M(Condition, unused) \
- M(Compare, unused) \
- M(BoundsCheck, unused) \
- M(InstanceFieldGet, unused) \
- M(InstanceFieldSet, unused) \
- M(InstanceOf, unused) \
- M(Invoke, unused) \
- M(LoadString, unused) \
- M(NewArray, unused) \
- M(NewInstance, unused) \
- M(Rem, unused) \
- M(StaticFieldGet, unused) \
- M(StaticFieldSet, unused) \
- M(SuspendCheck, unused) \
- M(TypeConversion, unused)
-
-#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
- M(BitwiseNegatedRight, unused) \
- M(MultiplyAccumulate, unused) \
- M(IntermediateAddress, unused) \
- M(IntermediateAddressIndex, unused) \
- M(DataProcWithShifterOp, unused)
-
-#define DECLARE_VISIT_INSTRUCTION(type, unused) \
- void Visit##type(H##type* instruction) override;
- FOR_EACH_SCHEDULED_ARM_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
- FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
- FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION)
+class CodeGenerator;
-#undef DECLARE_VISIT_INSTRUCTION
-
- private:
- bool CanGenerateTest(HCondition* cond);
- void HandleGenerateConditionWithZero(IfCondition cond);
- void HandleGenerateLongTestConstant(HCondition* cond);
- void HandleGenerateLongTest(HCondition* cond);
- void HandleGenerateLongComparesAndJumps();
- void HandleGenerateTest(HCondition* cond);
- void HandleGenerateConditionGeneric(HCondition* cond);
- void HandleGenerateEqualLong(HCondition* cond);
- void HandleGenerateConditionLong(HCondition* cond);
- void HandleGenerateConditionIntegralOrNonPrimitive(HCondition* cond);
- void HandleCondition(HCondition* instr);
- void HandleBinaryOperationLantencies(HBinaryOperation* instr);
- void HandleBitwiseOperationLantencies(HBinaryOperation* instr);
- void HandleShiftLatencies(HBinaryOperation* instr);
- void HandleDivRemConstantIntegralLatencies(int32_t imm);
- void HandleFieldSetLatencies(HInstruction* instruction, const FieldInfo& field_info);
- void HandleFieldGetLatencies(HInstruction* instruction, const FieldInfo& field_info);
- void HandleGenerateDataProcInstruction(bool internal_latency = false);
- void HandleGenerateDataProc(HDataProcWithShifterOp* instruction);
- void HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction);
-
- // The latency setting for each HInstruction depends on how CodeGenerator may generate code,
- // latency visitors may query CodeGenerator for such information for accurate latency settings.
- CodeGeneratorARMVIXL* codegen_;
-};
+namespace arm {
-class HSchedulerARM : public HScheduler {
+class HSchedulerARM final : public HScheduler {
public:
- HSchedulerARM(SchedulingNodeSelector* selector,
- SchedulingLatencyVisitorARM* arm_latency_visitor)
- : HScheduler(arm_latency_visitor, selector) {}
+ HSchedulerARM(SchedulingNodeSelector* selector, CodeGenerator* codegen)
+ : HScheduler(selector), codegen_(codegen) {}
~HSchedulerARM() override {}
- bool IsSchedulable(const HInstruction* instruction) const override {
-#define CASE_INSTRUCTION_KIND(type, unused) case \
- HInstruction::InstructionKind::k##type:
- switch (instruction->GetKind()) {
- FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND)
- return true;
- FOR_EACH_CONCRETE_INSTRUCTION_ARM(CASE_INSTRUCTION_KIND)
- return true;
- default:
- return HScheduler::IsSchedulable(instruction);
- }
-#undef CASE_INSTRUCTION_KIND
- }
+ bool IsSchedulable(const HInstruction* instruction) const override;
+
+ protected:
+ std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> BuildSchedulingGraph(
+ HBasicBlock* block,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector) override;
private:
DISALLOW_COPY_AND_ASSIGN(HSchedulerARM);
+
+ CodeGenerator* const codegen_;
};
} // namespace arm
diff --git a/compiler/optimizing/scheduler_arm64.cc b/compiler/optimizing/scheduler_arm64.cc
index 5113cf446d..08b8a3fb78 100644
--- a/compiler/optimizing/scheduler_arm64.cc
+++ b/compiler/optimizing/scheduler_arm64.cc
@@ -23,6 +23,115 @@
namespace art HIDDEN {
namespace arm64 {
+static constexpr uint32_t kArm64MemoryLoadLatency = 5;
+static constexpr uint32_t kArm64MemoryStoreLatency = 3;
+
+static constexpr uint32_t kArm64CallInternalLatency = 10;
+static constexpr uint32_t kArm64CallLatency = 5;
+
+// AArch64 instruction latency.
+// We currently assume that all arm64 CPUs share the same instruction latency list.
+static constexpr uint32_t kArm64IntegerOpLatency = 2;
+static constexpr uint32_t kArm64FloatingPointOpLatency = 5;
+
+static constexpr uint32_t kArm64DataProcWithShifterOpLatency = 3;
+static constexpr uint32_t kArm64DivDoubleLatency = 30;
+static constexpr uint32_t kArm64DivFloatLatency = 15;
+static constexpr uint32_t kArm64DivIntegerLatency = 5;
+static constexpr uint32_t kArm64LoadStringInternalLatency = 7;
+static constexpr uint32_t kArm64MulFloatingPointLatency = 6;
+static constexpr uint32_t kArm64MulIntegerLatency = 6;
+static constexpr uint32_t kArm64TypeConversionFloatingPointIntegerLatency = 5;
+static constexpr uint32_t kArm64BranchLatency = kArm64IntegerOpLatency;
+
+static constexpr uint32_t kArm64SIMDFloatingPointOpLatency = 10;
+static constexpr uint32_t kArm64SIMDIntegerOpLatency = 6;
+static constexpr uint32_t kArm64SIMDMemoryLoadLatency = 10;
+static constexpr uint32_t kArm64SIMDMemoryStoreLatency = 6;
+static constexpr uint32_t kArm64SIMDMulFloatingPointLatency = 12;
+static constexpr uint32_t kArm64SIMDMulIntegerLatency = 12;
+static constexpr uint32_t kArm64SIMDReplicateOpLatency = 16;
+static constexpr uint32_t kArm64SIMDDivDoubleLatency = 60;
+static constexpr uint32_t kArm64SIMDDivFloatLatency = 30;
+static constexpr uint32_t kArm64SIMDTypeConversionInt2FPLatency = 10;
+
+class SchedulingLatencyVisitorARM64 final : public SchedulingLatencyVisitor {
+ public:
+ // Default visitor for instructions not handled specifically below.
+ void VisitInstruction([[maybe_unused]] HInstruction*) override {
+ last_visited_latency_ = kArm64IntegerOpLatency;
+ }
+
+// We add a second unused parameter to be able to use this macro like the others
+// defined in `nodes.h`.
+#define FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(M) \
+ M(ArrayGet , unused) \
+ M(ArrayLength , unused) \
+ M(ArraySet , unused) \
+ M(BoundsCheck , unused) \
+ M(Div , unused) \
+ M(InstanceFieldGet , unused) \
+ M(InstanceOf , unused) \
+ M(LoadString , unused) \
+ M(Mul , unused) \
+ M(NewArray , unused) \
+ M(NewInstance , unused) \
+ M(Rem , unused) \
+ M(StaticFieldGet , unused) \
+ M(SuspendCheck , unused) \
+ M(TypeConversion , unused) \
+ M(VecReplicateScalar , unused) \
+ M(VecExtractScalar , unused) \
+ M(VecReduce , unused) \
+ M(VecCnv , unused) \
+ M(VecNeg , unused) \
+ M(VecAbs , unused) \
+ M(VecNot , unused) \
+ M(VecAdd , unused) \
+ M(VecHalvingAdd , unused) \
+ M(VecSub , unused) \
+ M(VecMul , unused) \
+ M(VecDiv , unused) \
+ M(VecMin , unused) \
+ M(VecMax , unused) \
+ M(VecAnd , unused) \
+ M(VecAndNot , unused) \
+ M(VecOr , unused) \
+ M(VecXor , unused) \
+ M(VecShl , unused) \
+ M(VecShr , unused) \
+ M(VecUShr , unused) \
+ M(VecSetScalars , unused) \
+ M(VecMultiplyAccumulate, unused) \
+ M(VecLoad , unused) \
+ M(VecStore , unused)
+
+#define FOR_EACH_SCHEDULED_ABSTRACT_INSTRUCTION(M) \
+ M(BinaryOperation , unused) \
+ M(Invoke , unused)
+
+#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
+ M(BitwiseNegatedRight, unused) \
+ M(MultiplyAccumulate, unused) \
+ M(IntermediateAddress, unused) \
+ M(IntermediateAddressIndex, unused) \
+ M(DataProcWithShifterOp, unused)
+
+#define DECLARE_VISIT_INSTRUCTION(type, unused) \
+ void Visit##type(H##type* instruction) override;
+
+ FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+ FOR_EACH_SCHEDULED_ABSTRACT_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+ FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+ FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION)
+
+#undef DECLARE_VISIT_INSTRUCTION
+
+ private:
+ void HandleSimpleArithmeticSIMD(HVecOperation *instr);
+ void HandleVecAddress(HVecMemoryOperation* instruction, size_t size);
+};
+
void SchedulingLatencyVisitorARM64::VisitBinaryOperation(HBinaryOperation* instr) {
last_visited_latency_ = DataType::IsFloatingPointType(instr->GetResultType())
? kArm64FloatingPointOpLatency
@@ -348,5 +457,30 @@ void SchedulingLatencyVisitorARM64::VisitVecStore(HVecStore* instr) {
last_visited_latency_ = kArm64SIMDMemoryStoreLatency;
}
+bool HSchedulerARM64::IsSchedulable(const HInstruction* instruction) const {
+ switch (instruction->GetKind()) {
+#define SCHEDULABLE_CASE(type, unused) \
+ case HInstruction::InstructionKind::k##type: \
+ return true;
+ FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(SCHEDULABLE_CASE)
+ FOR_EACH_CONCRETE_INSTRUCTION_ARM64(SCHEDULABLE_CASE)
+ FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(SCHEDULABLE_CASE)
+#undef SCHEDULABLE_CASE
+
+ default:
+ return HScheduler::IsSchedulable(instruction);
+ }
+}
+
+std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>>
+HSchedulerARM64::BuildSchedulingGraph(
+ HBasicBlock* block,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector) {
+ SchedulingLatencyVisitorARM64 latency_visitor;
+ return HScheduler::BuildSchedulingGraph(
+ block, allocator, heap_location_collector, &latency_visitor);
+}
+
} // namespace arm64
} // namespace art
diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h
index 7ce00e00ab..044aa48a5a 100644
--- a/compiler/optimizing/scheduler_arm64.h
+++ b/compiler/optimizing/scheduler_arm64.h
@@ -23,137 +23,13 @@
namespace art HIDDEN {
namespace arm64 {
-static constexpr uint32_t kArm64MemoryLoadLatency = 5;
-static constexpr uint32_t kArm64MemoryStoreLatency = 3;
-
-static constexpr uint32_t kArm64CallInternalLatency = 10;
-static constexpr uint32_t kArm64CallLatency = 5;
-
-// AArch64 instruction latency.
-// We currently assume that all arm64 CPUs share the same instruction latency list.
-static constexpr uint32_t kArm64IntegerOpLatency = 2;
-static constexpr uint32_t kArm64FloatingPointOpLatency = 5;
-
-
-static constexpr uint32_t kArm64DataProcWithShifterOpLatency = 3;
-static constexpr uint32_t kArm64DivDoubleLatency = 30;
-static constexpr uint32_t kArm64DivFloatLatency = 15;
-static constexpr uint32_t kArm64DivIntegerLatency = 5;
-static constexpr uint32_t kArm64LoadStringInternalLatency = 7;
-static constexpr uint32_t kArm64MulFloatingPointLatency = 6;
-static constexpr uint32_t kArm64MulIntegerLatency = 6;
-static constexpr uint32_t kArm64TypeConversionFloatingPointIntegerLatency = 5;
-static constexpr uint32_t kArm64BranchLatency = kArm64IntegerOpLatency;
-
-static constexpr uint32_t kArm64SIMDFloatingPointOpLatency = 10;
-static constexpr uint32_t kArm64SIMDIntegerOpLatency = 6;
-static constexpr uint32_t kArm64SIMDMemoryLoadLatency = 10;
-static constexpr uint32_t kArm64SIMDMemoryStoreLatency = 6;
-static constexpr uint32_t kArm64SIMDMulFloatingPointLatency = 12;
-static constexpr uint32_t kArm64SIMDMulIntegerLatency = 12;
-static constexpr uint32_t kArm64SIMDReplicateOpLatency = 16;
-static constexpr uint32_t kArm64SIMDDivDoubleLatency = 60;
-static constexpr uint32_t kArm64SIMDDivFloatLatency = 30;
-static constexpr uint32_t kArm64SIMDTypeConversionInt2FPLatency = 10;
-
-class SchedulingLatencyVisitorARM64 final : public SchedulingLatencyVisitor {
- public:
- // Default visitor for instructions not handled specifically below.
- void VisitInstruction([[maybe_unused]] HInstruction*) override {
- last_visited_latency_ = kArm64IntegerOpLatency;
- }
-
-// We add a second unused parameter to be able to use this macro like the others
-// defined in `nodes.h`.
-#define FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(M) \
- M(ArrayGet , unused) \
- M(ArrayLength , unused) \
- M(ArraySet , unused) \
- M(BoundsCheck , unused) \
- M(Div , unused) \
- M(InstanceFieldGet , unused) \
- M(InstanceOf , unused) \
- M(LoadString , unused) \
- M(Mul , unused) \
- M(NewArray , unused) \
- M(NewInstance , unused) \
- M(Rem , unused) \
- M(StaticFieldGet , unused) \
- M(SuspendCheck , unused) \
- M(TypeConversion , unused) \
- M(VecReplicateScalar , unused) \
- M(VecExtractScalar , unused) \
- M(VecReduce , unused) \
- M(VecCnv , unused) \
- M(VecNeg , unused) \
- M(VecAbs , unused) \
- M(VecNot , unused) \
- M(VecAdd , unused) \
- M(VecHalvingAdd , unused) \
- M(VecSub , unused) \
- M(VecMul , unused) \
- M(VecDiv , unused) \
- M(VecMin , unused) \
- M(VecMax , unused) \
- M(VecAnd , unused) \
- M(VecAndNot , unused) \
- M(VecOr , unused) \
- M(VecXor , unused) \
- M(VecShl , unused) \
- M(VecShr , unused) \
- M(VecUShr , unused) \
- M(VecSetScalars , unused) \
- M(VecMultiplyAccumulate, unused) \
- M(VecLoad , unused) \
- M(VecStore , unused)
-
-#define FOR_EACH_SCHEDULED_ABSTRACT_INSTRUCTION(M) \
- M(BinaryOperation , unused) \
- M(Invoke , unused)
-
-#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
- M(BitwiseNegatedRight, unused) \
- M(MultiplyAccumulate, unused) \
- M(IntermediateAddress, unused) \
- M(IntermediateAddressIndex, unused) \
- M(DataProcWithShifterOp, unused)
-
-#define DECLARE_VISIT_INSTRUCTION(type, unused) \
- void Visit##type(H##type* instruction) override;
-
- FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
- FOR_EACH_SCHEDULED_ABSTRACT_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
- FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
- FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION)
-
-#undef DECLARE_VISIT_INSTRUCTION
-
- private:
- void HandleSimpleArithmeticSIMD(HVecOperation *instr);
- void HandleVecAddress(HVecMemoryOperation* instruction, size_t size);
-};
-
class HSchedulerARM64 : public HScheduler {
public:
explicit HSchedulerARM64(SchedulingNodeSelector* selector)
- : HScheduler(&arm64_latency_visitor_, selector) {}
+ : HScheduler(selector) {}
~HSchedulerARM64() override {}
- bool IsSchedulable(const HInstruction* instruction) const override {
-#define CASE_INSTRUCTION_KIND(type, unused) case \
- HInstruction::InstructionKind::k##type:
- switch (instruction->GetKind()) {
- FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND)
- return true;
- FOR_EACH_CONCRETE_INSTRUCTION_ARM64(CASE_INSTRUCTION_KIND)
- return true;
- FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(CASE_INSTRUCTION_KIND)
- return true;
- default:
- return HScheduler::IsSchedulable(instruction);
- }
-#undef CASE_INSTRUCTION_KIND
- }
+ bool IsSchedulable(const HInstruction* instruction) const override;
// Treat as scheduling barriers those vector instructions whose live ranges exceed the vectorized
// loop boundaries. This is a workaround for the lack of notion of SIMD register in the compiler;
@@ -169,8 +45,13 @@ class HSchedulerARM64 : public HScheduler {
instr->IsVecReplicateScalar();
}
+ protected:
+ std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> BuildSchedulingGraph(
+ HBasicBlock* block,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector) override;
+
private:
- SchedulingLatencyVisitorARM64 arm64_latency_visitor_;
DISALLOW_COPY_AND_ASSIGN(HSchedulerARM64);
};
diff --git a/compiler/optimizing/scheduler_test.cc b/compiler/optimizing/scheduler_test.cc
index c2b1fd6f7c..0b020f1460 100644
--- a/compiler/optimizing/scheduler_test.cc
+++ b/compiler/optimizing/scheduler_test.cc
@@ -406,15 +406,13 @@ TEST_F(SchedulerTest, ArrayAccessAliasingARM64) {
#if defined(ART_ENABLE_CODEGEN_arm)
TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM) {
CriticalPathSchedulingNodeSelector critical_path_selector;
- arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr);
- arm::HSchedulerARM scheduler(&critical_path_selector, &arm_latency_visitor);
+ arm::HSchedulerARM scheduler(&critical_path_selector, /*codegen=*/ nullptr);
TestBuildDependencyGraphAndSchedule(&scheduler);
}
TEST_F(SchedulerTest, ArrayAccessAliasingARM) {
CriticalPathSchedulingNodeSelector critical_path_selector;
- arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr);
- arm::HSchedulerARM scheduler(&critical_path_selector, &arm_latency_visitor);
+ arm::HSchedulerARM scheduler(&critical_path_selector, /*codegen=*/ nullptr);
TestDependencyGraphOnAliasingArrayAccesses(&scheduler);
}
#endif