diff options
author | 2018-02-15 14:43:48 +0000 | |
---|---|---|
committer | 2018-05-15 19:33:45 +0100 | |
commit | cf43fb6a1e676cc6bbc04c6591640f18643b1839 (patch) | |
tree | 2573ba1024307763c54df655333f1e2477d0ea82 /compiler | |
parent | 9076eb66ad173933d7fbd5ce328d31c7f97fd202 (diff) |
ART: Enable scalar loop peeling and unrolling.
Turn on scalar loop peeling and unrolling by default.
Test: 482-checker-loop-back-edge-use, 530-checker-peel-unroll
Test: test-art-host, test-art-target, boot-to-gui
Change-Id: Ibfe1b54f790a97b281e85396da2985e0f22c2834
Diffstat (limited to 'compiler')
-rw-r--r-- | compiler/optimizing/loop_analysis.cc | 68 | ||||
-rw-r--r-- | compiler/optimizing/loop_analysis.h | 23 | ||||
-rw-r--r-- | compiler/optimizing/loop_optimization.cc | 17 | ||||
-rw-r--r-- | compiler/optimizing/loop_optimization.h | 4 |
4 files changed, 73 insertions, 39 deletions
diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc index a0760eff69..a2124455e2 100644 --- a/compiler/optimizing/loop_analysis.cc +++ b/compiler/optimizing/loop_analysis.cc @@ -35,6 +35,9 @@ void LoopAnalysis::CalculateLoopBasicProperties(HLoopInformation* loop_info, for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { HInstruction* instruction = it.Current(); + if (it.Current()->GetType() == DataType::Type::kInt64) { + analysis_results->has_long_type_instructions_ = true; + } if (MakesScalarPeelingUnrollingNonBeneficial(instruction)) { analysis_results->has_instructions_preventing_scalar_peeling_ = true; analysis_results->has_instructions_preventing_scalar_unrolling_ = true; @@ -61,34 +64,29 @@ bool LoopAnalysis::HasLoopAtLeastOneInvariantExit(HLoopInformation* loop_info) { return false; } -class Arm64LoopHelper : public ArchDefaultLoopHelper { +// Default implementation of loop helper; used for all targets unless a custom implementation +// is provided. Enables scalar loop peeling and unrolling with the most conservative heuristics. +class ArchDefaultLoopHelper : public ArchNoOptsLoopHelper { public: // Scalar loop unrolling parameters and heuristics. // // Maximum possible unrolling factor. - static constexpr uint32_t kArm64ScalarMaxUnrollFactor = 2; + static constexpr uint32_t kScalarMaxUnrollFactor = 2; // Loop's maximum instruction count. Loops with higher count will not be peeled/unrolled. - static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeInstr = 40; + static constexpr uint32_t kScalarHeuristicMaxBodySizeInstr = 17; // Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled. - static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeBlocks = 8; + static constexpr uint32_t kScalarHeuristicMaxBodySizeBlocks = 6; - // SIMD loop unrolling parameters and heuristics. - // - // Maximum possible unrolling factor. - static constexpr uint32_t kArm64SimdMaxUnrollFactor = 8; - // Loop's maximum instruction count. Loops with higher count will not be unrolled. - static constexpr uint32_t kArm64SimdHeuristicMaxBodySizeInstr = 50; - - bool IsLoopTooBigForScalarPeelingUnrolling(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE { - size_t instr_num = loop_analysis_info->GetNumberOfInstructions(); - size_t bb_num = loop_analysis_info->GetNumberOfBasicBlocks(); - return (instr_num >= kArm64ScalarHeuristicMaxBodySizeInstr || - bb_num >= kArm64ScalarHeuristicMaxBodySizeBlocks); + bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE { + return loop_analysis_info->HasLongTypeInstructions() || + IsLoopTooBig(loop_analysis_info, + kScalarHeuristicMaxBodySizeInstr, + kScalarHeuristicMaxBodySizeBlocks); } uint32_t GetScalarUnrollingFactor(HLoopInformation* loop_info ATTRIBUTE_UNUSED, uint64_t trip_count) const OVERRIDE { - uint32_t desired_unrolling_factor = kArm64ScalarMaxUnrollFactor; + uint32_t desired_unrolling_factor = kScalarMaxUnrollFactor; if (trip_count < desired_unrolling_factor || trip_count % desired_unrolling_factor != 0) { return kNoUnrollingFactor; } @@ -98,6 +96,38 @@ class Arm64LoopHelper : public ArchDefaultLoopHelper { bool IsLoopPeelingEnabled() const OVERRIDE { return true; } + protected: + bool IsLoopTooBig(LoopAnalysisInfo* loop_analysis_info, + size_t instr_threshold, + size_t bb_threshold) const { + size_t instr_num = loop_analysis_info->GetNumberOfInstructions(); + size_t bb_num = loop_analysis_info->GetNumberOfBasicBlocks(); + return (instr_num >= instr_threshold || bb_num >= bb_threshold); + } +}; + +// Custom implementation of loop helper for arm64 target. Enables heuristics for scalar loop +// peeling and unrolling and supports SIMD loop unrolling. +class Arm64LoopHelper : public ArchDefaultLoopHelper { + public: + // SIMD loop unrolling parameters and heuristics. + // + // Maximum possible unrolling factor. + static constexpr uint32_t kArm64SimdMaxUnrollFactor = 8; + // Loop's maximum instruction count. Loops with higher count will not be unrolled. + static constexpr uint32_t kArm64SimdHeuristicMaxBodySizeInstr = 50; + + // Loop's maximum instruction count. Loops with higher count will not be peeled/unrolled. + static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeInstr = 40; + // Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled. + static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeBlocks = 8; + + bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE { + return IsLoopTooBig(loop_analysis_info, + kArm64ScalarHeuristicMaxBodySizeInstr, + kArm64ScalarHeuristicMaxBodySizeBlocks); + } + uint32_t GetSIMDUnrollingFactor(HBasicBlock* block, int64_t trip_count, uint32_t max_peel, @@ -126,8 +156,8 @@ class Arm64LoopHelper : public ArchDefaultLoopHelper { } }; -ArchDefaultLoopHelper* ArchDefaultLoopHelper::Create(InstructionSet isa, - ArenaAllocator* allocator) { +ArchNoOptsLoopHelper* ArchNoOptsLoopHelper::Create(InstructionSet isa, + ArenaAllocator* allocator) { switch (isa) { case InstructionSet::kArm64: { return new (allocator) Arm64LoopHelper; diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h index ece9858136..c09d3ff00f 100644 --- a/compiler/optimizing/loop_analysis.h +++ b/compiler/optimizing/loop_analysis.h @@ -35,6 +35,7 @@ class LoopAnalysisInfo : public ValueObject { exits_num_(0), has_instructions_preventing_scalar_peeling_(false), has_instructions_preventing_scalar_unrolling_(false), + has_long_type_instructions_(false), loop_info_(loop_info) {} size_t GetNumberOfBasicBlocks() const { return bb_num_; } @@ -49,6 +50,10 @@ class LoopAnalysisInfo : public ValueObject { return has_instructions_preventing_scalar_unrolling_; } + bool HasLongTypeInstructions() const { + return has_long_type_instructions_; + } + const HLoopInformation* GetLoopInfo() const { return loop_info_; } private: @@ -62,6 +67,9 @@ class LoopAnalysisInfo : public ValueObject { bool has_instructions_preventing_scalar_peeling_; // Whether the loop has instructions which make scalar loop unrolling non-beneficial. bool has_instructions_preventing_scalar_unrolling_; + // Whether the loop has instructions of primitive long type; unrolling these loop will + // likely introduce spill/fills on 32-bit targets. + bool has_long_type_instructions_; // Corresponding HLoopInformation. const HLoopInformation* loop_info_; @@ -117,22 +125,21 @@ class LoopAnalysis : public ValueObject { // To support peeling/unrolling for a new architecture one needs to create new helper class, // inherit it from this and add implementation for the following methods. // -class ArchDefaultLoopHelper : public ArenaObject<kArenaAllocOptimization> { +class ArchNoOptsLoopHelper : public ArenaObject<kArenaAllocOptimization> { public: - virtual ~ArchDefaultLoopHelper() {} + virtual ~ArchNoOptsLoopHelper() {} // Creates an instance of specialised helper for the target or default helper if the target // doesn't support loop peeling and unrolling. - static ArchDefaultLoopHelper* Create(InstructionSet isa, ArenaAllocator* allocator); + static ArchNoOptsLoopHelper* Create(InstructionSet isa, ArenaAllocator* allocator); - // Returns whether the loop is too big for loop peeling/unrolling by checking its total number of - // basic blocks and instructions. + // Returns whether the loop is not beneficial for loop peeling/unrolling. // - // If the loop body has too many instructions then peeling/unrolling optimization will not bring - // any noticeable performance improvement however will increase the code size. + // For example, if the loop body has too many instructions then peeling/unrolling optimization + // will not bring any noticeable performance improvement however will increase the code size. // // Returns 'true' by default, should be overridden by particular target loop helper. - virtual bool IsLoopTooBigForScalarPeelingUnrolling( + virtual bool IsLoopNonBeneficialForScalarOpts( LoopAnalysisInfo* loop_analysis_info ATTRIBUTE_UNUSED) const { return true; } // Returns optimal scalar unrolling factor for the loop. diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 1ce3524bd6..eda6bd1e86 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -33,9 +33,6 @@ namespace art { // Enables vectorization (SIMDization) in the loop optimizer. static constexpr bool kEnableVectorization = true; -// Enables scalar loop unrolling in the loop optimizer. -static constexpr bool kEnableScalarPeelingUnrolling = false; - // // Static helpers. // @@ -457,7 +454,7 @@ HLoopOptimization::HLoopOptimization(HGraph* graph, vector_header_(nullptr), vector_body_(nullptr), vector_index_(nullptr), - arch_loop_helper_(ArchDefaultLoopHelper::Create(compiler_driver_ != nullptr + arch_loop_helper_(ArchNoOptsLoopHelper::Create(compiler_driver_ != nullptr ? compiler_driver_->GetInstructionSet() : InstructionSet::kNone, global_allocator_)) { @@ -761,7 +758,7 @@ bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* node) { // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests) // as InstructionSet is needed. - if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) { + if (compiler_driver_ == nullptr) { return false; } @@ -781,9 +778,9 @@ bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* node) { LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info); // Check "IsLoopClonable" last as it can be time-consuming. - if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) || + if (loop_analysis_info.HasInstructionsPreventingScalarUnrolling() || + arch_loop_helper_->IsLoopNonBeneficialForScalarOpts(&loop_analysis_info) || (loop_analysis_info.GetNumberOfExits() > 1) || - loop_analysis_info.HasInstructionsPreventingScalarUnrolling() || !PeelUnrollHelper::IsLoopClonable(loop_info)) { return false; } @@ -807,7 +804,7 @@ bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* node) { bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopNode* node) { // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests) // as InstructionSet is needed. - if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) { + if (compiler_driver_ == nullptr) { return false; } @@ -821,8 +818,8 @@ bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopNode* nod LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info); // Check "IsLoopClonable" last as it can be time-consuming. - if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) || - loop_analysis_info.HasInstructionsPreventingScalarPeeling() || + if (loop_analysis_info.HasInstructionsPreventingScalarPeeling() || + arch_loop_helper_->IsLoopNonBeneficialForScalarOpts(&loop_analysis_info) || !LoopAnalysis::HasLoopAtLeastOneInvariantExit(loop_info) || !PeelUnrollHelper::IsLoopClonable(loop_info)) { return false; diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 7807da15ed..191a93da26 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -28,7 +28,7 @@ namespace art { class CompilerDriver; -class ArchDefaultLoopHelper; +class ArchNoOptsLoopHelper; /** * Loop optimizations. Builds a loop hierarchy and applies optimizations to @@ -308,7 +308,7 @@ class HLoopOptimization : public HOptimization { HInstruction* vector_index_; // normalized index of the new loop // Helper for target-specific behaviour for loop optimizations. - ArchDefaultLoopHelper* arch_loop_helper_; + ArchNoOptsLoopHelper* arch_loop_helper_; friend class LoopOptimizationTest; |