ART: Enable scalar loop peeling and unrolling.
Turn on scalar loop peeling and unrolling by default.
Test: 482-checker-loop-back-edge-use, 530-checker-peel-unroll
Test: test-art-host, test-art-target, boot-to-gui
Change-Id: Ibfe1b54f790a97b281e85396da2985e0f22c2834
diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc
index a0760ef..a212445 100644
--- a/compiler/optimizing/loop_analysis.cc
+++ b/compiler/optimizing/loop_analysis.cc
@@ -35,6 +35,9 @@
for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
HInstruction* instruction = it.Current();
+ if (it.Current()->GetType() == DataType::Type::kInt64) {
+ analysis_results->has_long_type_instructions_ = true;
+ }
if (MakesScalarPeelingUnrollingNonBeneficial(instruction)) {
analysis_results->has_instructions_preventing_scalar_peeling_ = true;
analysis_results->has_instructions_preventing_scalar_unrolling_ = true;
@@ -61,34 +64,29 @@
return false;
}
-class Arm64LoopHelper : public ArchDefaultLoopHelper {
+// Default implementation of loop helper; used for all targets unless a custom implementation
+// is provided. Enables scalar loop peeling and unrolling with the most conservative heuristics.
+class ArchDefaultLoopHelper : public ArchNoOptsLoopHelper {
public:
// Scalar loop unrolling parameters and heuristics.
//
// Maximum possible unrolling factor.
- static constexpr uint32_t kArm64ScalarMaxUnrollFactor = 2;
+ static constexpr uint32_t kScalarMaxUnrollFactor = 2;
// Loop's maximum instruction count. Loops with higher count will not be peeled/unrolled.
- static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeInstr = 40;
+ static constexpr uint32_t kScalarHeuristicMaxBodySizeInstr = 17;
// Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled.
- static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeBlocks = 8;
+ static constexpr uint32_t kScalarHeuristicMaxBodySizeBlocks = 6;
- // SIMD loop unrolling parameters and heuristics.
- //
- // Maximum possible unrolling factor.
- static constexpr uint32_t kArm64SimdMaxUnrollFactor = 8;
- // Loop's maximum instruction count. Loops with higher count will not be unrolled.
- static constexpr uint32_t kArm64SimdHeuristicMaxBodySizeInstr = 50;
-
- bool IsLoopTooBigForScalarPeelingUnrolling(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE {
- size_t instr_num = loop_analysis_info->GetNumberOfInstructions();
- size_t bb_num = loop_analysis_info->GetNumberOfBasicBlocks();
- return (instr_num >= kArm64ScalarHeuristicMaxBodySizeInstr ||
- bb_num >= kArm64ScalarHeuristicMaxBodySizeBlocks);
+ bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE {
+ return loop_analysis_info->HasLongTypeInstructions() ||
+ IsLoopTooBig(loop_analysis_info,
+ kScalarHeuristicMaxBodySizeInstr,
+ kScalarHeuristicMaxBodySizeBlocks);
}
uint32_t GetScalarUnrollingFactor(HLoopInformation* loop_info ATTRIBUTE_UNUSED,
uint64_t trip_count) const OVERRIDE {
- uint32_t desired_unrolling_factor = kArm64ScalarMaxUnrollFactor;
+ uint32_t desired_unrolling_factor = kScalarMaxUnrollFactor;
if (trip_count < desired_unrolling_factor || trip_count % desired_unrolling_factor != 0) {
return kNoUnrollingFactor;
}
@@ -98,6 +96,38 @@
bool IsLoopPeelingEnabled() const OVERRIDE { return true; }
+ protected:
+ bool IsLoopTooBig(LoopAnalysisInfo* loop_analysis_info,
+ size_t instr_threshold,
+ size_t bb_threshold) const {
+ size_t instr_num = loop_analysis_info->GetNumberOfInstructions();
+ size_t bb_num = loop_analysis_info->GetNumberOfBasicBlocks();
+ return (instr_num >= instr_threshold || bb_num >= bb_threshold);
+ }
+};
+
+// Custom implementation of loop helper for arm64 target. Enables heuristics for scalar loop
+// peeling and unrolling and supports SIMD loop unrolling.
+class Arm64LoopHelper : public ArchDefaultLoopHelper {
+ public:
+ // SIMD loop unrolling parameters and heuristics.
+ //
+ // Maximum possible unrolling factor.
+ static constexpr uint32_t kArm64SimdMaxUnrollFactor = 8;
+ // Loop's maximum instruction count. Loops with higher count will not be unrolled.
+ static constexpr uint32_t kArm64SimdHeuristicMaxBodySizeInstr = 50;
+
+ // Loop's maximum instruction count. Loops with higher count will not be peeled/unrolled.
+ static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeInstr = 40;
+ // Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled.
+ static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeBlocks = 8;
+
+ bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE {
+ return IsLoopTooBig(loop_analysis_info,
+ kArm64ScalarHeuristicMaxBodySizeInstr,
+ kArm64ScalarHeuristicMaxBodySizeBlocks);
+ }
+
uint32_t GetSIMDUnrollingFactor(HBasicBlock* block,
int64_t trip_count,
uint32_t max_peel,
@@ -126,8 +156,8 @@
}
};
-ArchDefaultLoopHelper* ArchDefaultLoopHelper::Create(InstructionSet isa,
- ArenaAllocator* allocator) {
+ArchNoOptsLoopHelper* ArchNoOptsLoopHelper::Create(InstructionSet isa,
+ ArenaAllocator* allocator) {
switch (isa) {
case InstructionSet::kArm64: {
return new (allocator) Arm64LoopHelper;
diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h
index ece9858..c09d3ff 100644
--- a/compiler/optimizing/loop_analysis.h
+++ b/compiler/optimizing/loop_analysis.h
@@ -35,6 +35,7 @@
exits_num_(0),
has_instructions_preventing_scalar_peeling_(false),
has_instructions_preventing_scalar_unrolling_(false),
+ has_long_type_instructions_(false),
loop_info_(loop_info) {}
size_t GetNumberOfBasicBlocks() const { return bb_num_; }
@@ -49,6 +50,10 @@
return has_instructions_preventing_scalar_unrolling_;
}
+ bool HasLongTypeInstructions() const {
+ return has_long_type_instructions_;
+ }
+
const HLoopInformation* GetLoopInfo() const { return loop_info_; }
private:
@@ -62,6 +67,9 @@
bool has_instructions_preventing_scalar_peeling_;
// Whether the loop has instructions which make scalar loop unrolling non-beneficial.
bool has_instructions_preventing_scalar_unrolling_;
+ // Whether the loop has instructions of primitive long type; unrolling such loops will
+ // likely introduce spills/fills on 32-bit targets.
+ bool has_long_type_instructions_;
// Corresponding HLoopInformation.
const HLoopInformation* loop_info_;
@@ -117,22 +125,21 @@
// To support peeling/unrolling for a new architecture one needs to create new helper class,
// inherit it from this and add implementation for the following methods.
//
-class ArchDefaultLoopHelper : public ArenaObject<kArenaAllocOptimization> {
+class ArchNoOptsLoopHelper : public ArenaObject<kArenaAllocOptimization> {
public:
- virtual ~ArchDefaultLoopHelper() {}
+ virtual ~ArchNoOptsLoopHelper() {}
// Creates an instance of specialised helper for the target or default helper if the target
// doesn't support loop peeling and unrolling.
- static ArchDefaultLoopHelper* Create(InstructionSet isa, ArenaAllocator* allocator);
+ static ArchNoOptsLoopHelper* Create(InstructionSet isa, ArenaAllocator* allocator);
- // Returns whether the loop is too big for loop peeling/unrolling by checking its total number of
- // basic blocks and instructions.
+ // Returns whether the loop is not beneficial for loop peeling/unrolling.
//
- // If the loop body has too many instructions then peeling/unrolling optimization will not bring
- // any noticeable performance improvement however will increase the code size.
+ // For example, if the loop body has too many instructions then peeling/unrolling optimization
+ // will not bring any noticeable performance improvement however will increase the code size.
//
// Returns 'true' by default, should be overridden by particular target loop helper.
- virtual bool IsLoopTooBigForScalarPeelingUnrolling(
+ virtual bool IsLoopNonBeneficialForScalarOpts(
LoopAnalysisInfo* loop_analysis_info ATTRIBUTE_UNUSED) const { return true; }
// Returns optimal scalar unrolling factor for the loop.
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 1ce3524..eda6bd1 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -33,9 +33,6 @@
// Enables vectorization (SIMDization) in the loop optimizer.
static constexpr bool kEnableVectorization = true;
-// Enables scalar loop unrolling in the loop optimizer.
-static constexpr bool kEnableScalarPeelingUnrolling = false;
-
//
// Static helpers.
//
@@ -457,7 +454,7 @@
vector_header_(nullptr),
vector_body_(nullptr),
vector_index_(nullptr),
- arch_loop_helper_(ArchDefaultLoopHelper::Create(compiler_driver_ != nullptr
+ arch_loop_helper_(ArchNoOptsLoopHelper::Create(compiler_driver_ != nullptr
? compiler_driver_->GetInstructionSet()
: InstructionSet::kNone,
global_allocator_)) {
@@ -761,7 +758,7 @@
bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* node) {
// Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
// as InstructionSet is needed.
- if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) {
+ if (compiler_driver_ == nullptr) {
return false;
}
@@ -781,9 +778,9 @@
LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
// Check "IsLoopClonable" last as it can be time-consuming.
- if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) ||
+ if (loop_analysis_info.HasInstructionsPreventingScalarUnrolling() ||
+ arch_loop_helper_->IsLoopNonBeneficialForScalarOpts(&loop_analysis_info) ||
(loop_analysis_info.GetNumberOfExits() > 1) ||
- loop_analysis_info.HasInstructionsPreventingScalarUnrolling() ||
!PeelUnrollHelper::IsLoopClonable(loop_info)) {
return false;
}
@@ -807,7 +804,7 @@
bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopNode* node) {
// Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
// as InstructionSet is needed.
- if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) {
+ if (compiler_driver_ == nullptr) {
return false;
}
@@ -821,8 +818,8 @@
LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
// Check "IsLoopClonable" last as it can be time-consuming.
- if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) ||
- loop_analysis_info.HasInstructionsPreventingScalarPeeling() ||
+ if (loop_analysis_info.HasInstructionsPreventingScalarPeeling() ||
+ arch_loop_helper_->IsLoopNonBeneficialForScalarOpts(&loop_analysis_info) ||
!LoopAnalysis::HasLoopAtLeastOneInvariantExit(loop_info) ||
!PeelUnrollHelper::IsLoopClonable(loop_info)) {
return false;
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 7807da1..191a93d 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -28,7 +28,7 @@
namespace art {
class CompilerDriver;
-class ArchDefaultLoopHelper;
+class ArchNoOptsLoopHelper;
/**
* Loop optimizations. Builds a loop hierarchy and applies optimizations to
@@ -308,7 +308,7 @@
HInstruction* vector_index_; // normalized index of the new loop
// Helper for target-specific behaviour for loop optimizations.
- ArchDefaultLoopHelper* arch_loop_helper_;
+ ArchNoOptsLoopHelper* arch_loop_helper_;
friend class LoopOptimizationTest;