Diffstat (limited to 'compiler')
-rw-r--r--  compiler/driver/compiler_options.cc              |  59
-rw-r--r--  compiler/driver/compiler_options.h               |  34
-rw-r--r--  compiler/jit/jit_compiler.cc                     |  28
-rw-r--r--  compiler/optimizing/code_generator_arm.cc        |   9
-rw-r--r--  compiler/optimizing/code_generator_arm64.cc      |   7
-rw-r--r--  compiler/optimizing/code_generator_arm_vixl.cc   |   9
-rw-r--r--  compiler/optimizing/graph_visualizer.cc          |  12
-rw-r--r--  compiler/optimizing/induction_var_range.cc       |  10
-rw-r--r--  compiler/optimizing/induction_var_range.h        |   1
-rw-r--r--  compiler/optimizing/induction_var_range_test.cc  |  12
-rw-r--r--  compiler/optimizing/loop_optimization.cc         | 291
-rw-r--r--  compiler/optimizing/loop_optimization.h          |  28
-rw-r--r--  compiler/optimizing/nodes.cc                     |   1
-rw-r--r--  compiler/optimizing/nodes.h                      |  14
-rw-r--r--  compiler/optimizing/scheduler.cc                 |   8
-rw-r--r--  compiler/optimizing/ssa_liveness_analysis.cc     |   2
16 files changed, 308 insertions, 217 deletions
diff --git a/compiler/driver/compiler_options.cc b/compiler/driver/compiler_options.cc index a4e2083fe4..76f0ae9202 100644 --- a/compiler/driver/compiler_options.cc +++ b/compiler/driver/compiler_options.cc @@ -40,7 +40,7 @@ CompilerOptions::CompilerOptions() implicit_so_checks_(true), implicit_suspend_checks_(false), compile_pic_(false), - verbose_methods_(nullptr), + verbose_methods_(), abort_on_hard_verifier_failure_(false), init_failure_output_(nullptr), dump_cfg_file_name_(""), @@ -55,58 +55,6 @@ CompilerOptions::~CompilerOptions() { // because we don't want to include the PassManagerOptions definition from the header file. } -CompilerOptions::CompilerOptions(CompilerFilter::Filter compiler_filter, - size_t huge_method_threshold, - size_t large_method_threshold, - size_t small_method_threshold, - size_t tiny_method_threshold, - size_t num_dex_methods_threshold, - size_t inline_max_code_units, - const std::vector<const DexFile*>* no_inline_from, - double top_k_profile_threshold, - bool debuggable, - bool generate_debug_info, - bool implicit_null_checks, - bool implicit_so_checks, - bool implicit_suspend_checks, - bool compile_pic, - const std::vector<std::string>* verbose_methods, - std::ostream* init_failure_output, - bool abort_on_hard_verifier_failure, - const std::string& dump_cfg_file_name, - bool dump_cfg_append, - bool force_determinism, - RegisterAllocator::Strategy regalloc_strategy, - const std::vector<std::string>* passes_to_run) - : compiler_filter_(compiler_filter), - huge_method_threshold_(huge_method_threshold), - large_method_threshold_(large_method_threshold), - small_method_threshold_(small_method_threshold), - tiny_method_threshold_(tiny_method_threshold), - num_dex_methods_threshold_(num_dex_methods_threshold), - inline_max_code_units_(inline_max_code_units), - no_inline_from_(no_inline_from), - boot_image_(false), - app_image_(false), - top_k_profile_threshold_(top_k_profile_threshold), - debuggable_(debuggable), - generate_debug_info_(generate_debug_info), - generate_mini_debug_info_(kDefaultGenerateMiniDebugInfo), - generate_build_id_(false), - implicit_null_checks_(implicit_null_checks), - implicit_so_checks_(implicit_so_checks), - implicit_suspend_checks_(implicit_suspend_checks), - compile_pic_(compile_pic), - verbose_methods_(verbose_methods), - abort_on_hard_verifier_failure_(abort_on_hard_verifier_failure), - init_failure_output_(init_failure_output), - dump_cfg_file_name_(dump_cfg_file_name), - dump_cfg_append_(dump_cfg_append), - force_determinism_(force_determinism), - register_allocation_strategy_(regalloc_strategy), - passes_to_run_(passes_to_run) { -} - void CompilerOptions::ParseHugeMethodMax(const StringPiece& option, UsageFn Usage) { ParseUintOption(option, "--huge-method-max", &huge_method_threshold_, Usage); } @@ -204,6 +152,11 @@ bool CompilerOptions::ParseCompilerOption(const StringPiece& option, UsageFn Usa dump_cfg_append_ = true; } else if (option.starts_with("--register-allocation-strategy=")) { ParseRegisterAllocationStrategy(option, Usage); + } else if (option.starts_with("--verbose-methods=")) { + // TODO: rather than switch off compiler logging, make all VLOG(compiler) messages + // conditional on having verbose methods. + gLogVerbosity.compiler = false; + Split(option.substr(strlen("--verbose-methods=")).ToString(), ',', &verbose_methods_); } else { // Option not recognized. 
return false; diff --git a/compiler/driver/compiler_options.h b/compiler/driver/compiler_options.h index 89c2537476..b99263db0e 100644 --- a/compiler/driver/compiler_options.h +++ b/compiler/driver/compiler_options.h @@ -52,30 +52,6 @@ class CompilerOptions FINAL { CompilerOptions(); ~CompilerOptions(); - CompilerOptions(CompilerFilter::Filter compiler_filter, - size_t huge_method_threshold, - size_t large_method_threshold, - size_t small_method_threshold, - size_t tiny_method_threshold, - size_t num_dex_methods_threshold, - size_t inline_max_code_units, - const std::vector<const DexFile*>* no_inline_from, - double top_k_profile_threshold, - bool debuggable, - bool generate_debug_info, - bool implicit_null_checks, - bool implicit_so_checks, - bool implicit_suspend_checks, - bool compile_pic, - const std::vector<std::string>* verbose_methods, - std::ostream* init_failure_output, - bool abort_on_hard_verifier_failure, - const std::string& dump_cfg_file_name, - bool dump_cfg_append, - bool force_determinism, - RegisterAllocator::Strategy regalloc_strategy, - const std::vector<std::string>* passes_to_run); - CompilerFilter::Filter GetCompilerFilter() const { return compiler_filter_; } @@ -163,6 +139,10 @@ class CompilerOptions FINAL { return debuggable_; } + void SetDebuggable(bool value) { + debuggable_ = value; + } + bool GetNativeDebuggable() const { return GetDebuggable() && GetGenerateDebugInfo(); } @@ -211,11 +191,11 @@ class CompilerOptions FINAL { } bool HasVerboseMethods() const { - return verbose_methods_ != nullptr && !verbose_methods_->empty(); + return !verbose_methods_.empty(); } bool IsVerboseMethod(const std::string& pretty_method) const { - for (const std::string& cur_method : *verbose_methods_) { + for (const std::string& cur_method : verbose_methods_) { if (pretty_method.find(cur_method) != std::string::npos) { return true; } @@ -299,7 +279,7 @@ class CompilerOptions FINAL { bool compile_pic_; // Vector of methods to have verbose output enabled for. - const std::vector<std::string>* verbose_methods_; + std::vector<std::string> verbose_methods_; // Abort compilation with an error if we find a class that fails verification with a hard // failure. diff --git a/compiler/jit/jit_compiler.cc b/compiler/jit/jit_compiler.cc index 66135414f7..715d97379e 100644 --- a/compiler/jit/jit_compiler.cc +++ b/compiler/jit/jit_compiler.cc @@ -90,36 +90,16 @@ NO_RETURN static void Usage(const char* fmt, ...) 
{ } JitCompiler::JitCompiler() { - compiler_options_.reset(new CompilerOptions( - CompilerFilter::kDefaultCompilerFilter, - CompilerOptions::kDefaultHugeMethodThreshold, - CompilerOptions::kDefaultLargeMethodThreshold, - CompilerOptions::kDefaultSmallMethodThreshold, - CompilerOptions::kDefaultTinyMethodThreshold, - CompilerOptions::kDefaultNumDexMethodsThreshold, - CompilerOptions::kDefaultInlineMaxCodeUnits, - /* no_inline_from */ nullptr, - CompilerOptions::kDefaultTopKProfileThreshold, - Runtime::Current()->IsJavaDebuggable(), - CompilerOptions::kDefaultGenerateDebugInfo, - /* implicit_null_checks */ true, - /* implicit_so_checks */ true, - /* implicit_suspend_checks */ false, - /* pic */ false, - /* verbose_methods */ nullptr, - /* init_failure_output */ nullptr, - /* abort_on_hard_verifier_failure */ false, - /* dump_cfg_file_name */ "", - /* dump_cfg_append */ false, - /* force_determinism */ false, - RegisterAllocator::kRegisterAllocatorDefault, - /* passes_to_run */ nullptr)); + compiler_options_.reset(new CompilerOptions()); for (const std::string& argument : Runtime::Current()->GetCompilerOptions()) { compiler_options_->ParseCompilerOption(argument, Usage); } // JIT is never PIC, no matter what the runtime compiler options specify. compiler_options_->SetNonPic(); + // Set debuggability based on the runtime value. + compiler_options_->SetDebuggable(Runtime::Current()->IsJavaDebuggable()); + const InstructionSet instruction_set = kRuntimeISA; for (const StringPiece option : Runtime::Current()->GetCompilerOptions()) { VLOG(compiler) << "JIT compiler option " << option; diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index 0b3ac204ff..6b9f232e8f 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -2875,21 +2875,28 @@ void InstructionCodeGeneratorARM::GenerateCompareTestAndBranch(HCondition* condi if (CanGenerateTest(condition, codegen_->GetAssembler())) { Label* non_fallthrough_target; bool invert; + bool emit_both_branches; if (true_target_in == nullptr) { + // The true target is fallthrough. DCHECK(false_target_in != nullptr); non_fallthrough_target = false_target_in; invert = true; + emit_both_branches = false; } else { + // Either the false target is fallthrough, or there is no fallthrough + // and both branches must be emitted. non_fallthrough_target = true_target_in; invert = false; + emit_both_branches = (false_target_in != nullptr); } const auto cond = GenerateTest(condition, invert, codegen_); __ b(non_fallthrough_target, cond.first); - if (false_target_in != nullptr && false_target_in != non_fallthrough_target) { + if (emit_both_branches) { + // No target falls through, we need to branch. __ b(false_target_in); } diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 34397e66bc..2561ed0762 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -3608,9 +3608,6 @@ void InstructionCodeGeneratorARM64::GenerateTestAndBranch(HInstruction* instruct size_t condition_input_index, vixl::aarch64::Label* true_target, vixl::aarch64::Label* false_target) { - // FP branching requires both targets to be explicit. If either of the targets - // is nullptr (fallthrough) use and bind `fallthrough_target` instead. 
- vixl::aarch64::Label fallthrough_target; HInstruction* cond = instruction->InputAt(condition_input_index); if (true_target == nullptr && false_target == nullptr) { @@ -3711,10 +3708,6 @@ void InstructionCodeGeneratorARM64::GenerateTestAndBranch(HInstruction* instruct if (true_target != nullptr && false_target != nullptr) { __ B(false_target); } - - if (fallthrough_target.IsLinked()) { - __ Bind(&fallthrough_target); - } } void LocationsBuilderARM64::VisitIf(HIf* if_instr) { diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index a8b00c358b..9a2402be04 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -2964,21 +2964,28 @@ void InstructionCodeGeneratorARMVIXL::GenerateCompareTestAndBranch(HCondition* c if (CanGenerateTest(condition, codegen_->GetAssembler())) { vixl32::Label* non_fallthrough_target; bool invert; + bool emit_both_branches; if (true_target_in == nullptr) { + // The true target is fallthrough. DCHECK(false_target_in != nullptr); non_fallthrough_target = false_target_in; invert = true; + emit_both_branches = false; } else { non_fallthrough_target = true_target_in; invert = false; + // Either the false target is fallthrough, or there is no fallthrough + // and both branches must be emitted. + emit_both_branches = (false_target_in != nullptr); } const auto cond = GenerateTest(condition, invert, codegen_); __ B(cond.first, non_fallthrough_target, is_far_target); - if (false_target_in != nullptr && false_target_in != non_fallthrough_target) { + if (emit_both_branches) { + // No target falls through, we need to branch. __ B(false_target_in); } diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index 7dcf2440b2..a20ec3c0db 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -451,8 +451,16 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { void VisitInvoke(HInvoke* invoke) OVERRIDE { StartAttributeStream("dex_file_index") << invoke->GetDexMethodIndex(); - StartAttributeStream("method_name") << GetGraph()->GetDexFile().PrettyMethod( - invoke->GetDexMethodIndex(), /* with_signature */ false); + ArtMethod* method = invoke->GetResolvedMethod(); + // We don't print signatures, which conflict with c1visualizer format. + static constexpr bool kWithSignature = false; + // Note that we can only use the graph's dex file for the unresolved case. The + // other invokes might be coming from inlined methods. + ScopedObjectAccess soa(Thread::Current()); + std::string method_name = (method == nullptr) + ? 
GetGraph()->GetDexFile().PrettyMethod(invoke->GetDexMethodIndex(), kWithSignature) + : method->PrettyMethod(kWithSignature); + StartAttributeStream("method_name") << method_name; } void VisitInvokeUnresolved(HInvokeUnresolved* invoke) OVERRIDE { diff --git a/compiler/optimizing/induction_var_range.cc b/compiler/optimizing/induction_var_range.cc index c0ec58f824..f35aace3a9 100644 --- a/compiler/optimizing/induction_var_range.cc +++ b/compiler/optimizing/induction_var_range.cc @@ -373,21 +373,23 @@ bool InductionVarRange::IsFinite(HLoopInformation* loop, /*out*/ int64_t* tc) co bool InductionVarRange::IsUnitStride(HInstruction* context, HInstruction* instruction, + HGraph* graph, /*out*/ HInstruction** offset) const { HLoopInformation* loop = nullptr; HInductionVarAnalysis::InductionInfo* info = nullptr; HInductionVarAnalysis::InductionInfo* trip = nullptr; if (HasInductionInfo(context, instruction, &loop, &info, &trip)) { if (info->induction_class == HInductionVarAnalysis::kLinear && - info->op_b->operation == HInductionVarAnalysis::kFetch && !HInductionVarAnalysis::IsNarrowingLinear(info)) { int64_t stride_value = 0; if (IsConstant(info->op_a, kExact, &stride_value) && stride_value == 1) { int64_t off_value = 0; - if (IsConstant(info->op_b, kExact, &off_value) && off_value == 0) { - *offset = nullptr; - } else { + if (IsConstant(info->op_b, kExact, &off_value)) { + *offset = graph->GetConstant(info->op_b->type, off_value); + } else if (info->op_b->operation == HInductionVarAnalysis::kFetch) { *offset = info->op_b->fetch; + } else { + return false; } return true; } diff --git a/compiler/optimizing/induction_var_range.h b/compiler/optimizing/induction_var_range.h index a8ee829d08..ab1772bf15 100644 --- a/compiler/optimizing/induction_var_range.h +++ b/compiler/optimizing/induction_var_range.h @@ -163,6 +163,7 @@ class InductionVarRange { */ bool IsUnitStride(HInstruction* context, HInstruction* instruction, + HGraph* graph, /*out*/ HInstruction** offset) const; /** diff --git a/compiler/optimizing/induction_var_range_test.cc b/compiler/optimizing/induction_var_range_test.cc index d01d3146fc..67d2093829 100644 --- a/compiler/optimizing/induction_var_range_test.cc +++ b/compiler/optimizing/induction_var_range_test.cc @@ -770,8 +770,8 @@ TEST_F(InductionVarRangeTest, ConstantTripCountUp) { EXPECT_TRUE(range_.IsFinite(loop_header_->GetLoopInformation(), &tc)); EXPECT_EQ(1000, tc); HInstruction* offset = nullptr; - EXPECT_TRUE(range_.IsUnitStride(phi, phi, &offset)); - EXPECT_TRUE(offset == nullptr); + EXPECT_TRUE(range_.IsUnitStride(phi, phi, graph_, &offset)); + ExpectInt(0, offset); HInstruction* tce = range_.GenerateTripCount( loop_header_->GetLoopInformation(), graph_, loop_preheader_); ASSERT_TRUE(tce != nullptr); @@ -826,7 +826,7 @@ TEST_F(InductionVarRangeTest, ConstantTripCountDown) { EXPECT_TRUE(range_.IsFinite(loop_header_->GetLoopInformation(), &tc)); EXPECT_EQ(1000, tc); HInstruction* offset = nullptr; - EXPECT_FALSE(range_.IsUnitStride(phi, phi, &offset)); + EXPECT_FALSE(range_.IsUnitStride(phi, phi, graph_, &offset)); HInstruction* tce = range_.GenerateTripCount( loop_header_->GetLoopInformation(), graph_, loop_preheader_); ASSERT_TRUE(tce != nullptr); @@ -908,8 +908,8 @@ TEST_F(InductionVarRangeTest, SymbolicTripCountUp) { EXPECT_TRUE(range_.IsFinite(loop_header_->GetLoopInformation(), &tc)); EXPECT_EQ(0, tc); // unknown HInstruction* offset = nullptr; - EXPECT_TRUE(range_.IsUnitStride(phi, phi, &offset)); - EXPECT_TRUE(offset == nullptr); + EXPECT_TRUE(range_.IsUnitStride(phi, 
phi, graph_, &offset)); + ExpectInt(0, offset); HInstruction* tce = range_.GenerateTripCount( loop_header_->GetLoopInformation(), graph_, loop_preheader_); ASSERT_TRUE(tce != nullptr); @@ -994,7 +994,7 @@ TEST_F(InductionVarRangeTest, SymbolicTripCountDown) { EXPECT_TRUE(range_.IsFinite(loop_header_->GetLoopInformation(), &tc)); EXPECT_EQ(0, tc); // unknown HInstruction* offset = nullptr; - EXPECT_FALSE(range_.IsUnitStride(phi, phi, &offset)); + EXPECT_FALSE(range_.IsUnitStride(phi, phi, graph_, &offset)); HInstruction* tce = range_.GenerateTripCount( loop_header_->GetLoopInformation(), graph_, loop_preheader_); ASSERT_TRUE(tce != nullptr); diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index d2493137fe..b61d7b80d1 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -31,6 +31,9 @@ namespace art { // Enables vectorization (SIMDization) in the loop optimizer. static constexpr bool kEnableVectorization = true; +// All current SIMD targets want 16-byte alignment. +static constexpr size_t kAlignedBase = 16; + // Remove the instruction from the graph. A bit more elaborate than the usual // instruction removal, since there may be a cycle in the use structure. static void RemoveFromCycle(HInstruction* instruction) { @@ -283,6 +286,9 @@ HLoopOptimization::HLoopOptimization(HGraph* graph, simplified_(false), vector_length_(0), vector_refs_(nullptr), + vector_peeling_candidate_(nullptr), + vector_runtime_test_a_(nullptr), + vector_runtime_test_b_(nullptr), vector_map_(nullptr) { } @@ -422,23 +428,6 @@ void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) { // Optimization. // -bool HLoopOptimization::CanRemoveCycle() { - for (HInstruction* i : *iset_) { - // We can never remove instructions that have environment - // uses when we compile 'debuggable'. - if (i->HasEnvironmentUses() && graph_->IsDebuggable()) { - return false; - } - // A deoptimization should never have an environment input removed. - for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) { - if (use.GetUser()->GetHolder()->IsDeoptimize()) { - return false; - } - } - } - return true; -} - void HLoopOptimization::SimplifyInduction(LoopNode* node) { HBasicBlock* header = node->loop_info->GetHeader(); HBasicBlock* preheader = node->loop_info->GetPreHeader(); @@ -565,7 +554,7 @@ void HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { if (kEnableVectorization) { iset_->clear(); // prepare phi induction if (TrySetSimpleLoopHeader(header) && - CanVectorize(node, body, trip_count) && + ShouldVectorize(node, body, trip_count) && TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) { Vectorize(node, body, exit, trip_count); graph_->SetHasSIMD(true); // flag SIMD usage @@ -580,10 +569,11 @@ void HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { // Intel Press, June, 2004 (http://www.aartbik.com/). // -bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) { +bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) { // Reset vector bookkeeping. vector_length_ = 0; vector_refs_->clear(); + vector_peeling_candidate_ = nullptr; vector_runtime_test_a_ = vector_runtime_test_b_= nullptr; @@ -600,12 +590,9 @@ bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t } } - // Heuristics. Does vectorization seem profitable? 
- // TODO: refine - if (vector_length_ == 0) { - return false; // nothing found - } else if (0 < trip_count && trip_count < vector_length_) { - return false; // insufficient iterations + // Does vectorization seem profitable? + if (!IsVectorizationProfitable(trip_count)) { + return false; } // Data dependence analysis. Find each pair of references with same type, where @@ -633,18 +620,24 @@ bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t // Conservatively assume a potential loop-carried data dependence otherwise, avoided by // generating an explicit a != b disambiguation runtime test on the two references. if (x != y) { - // For now, we reject after one test to avoid excessive overhead. - if (vector_runtime_test_a_ != nullptr) { - return false; + // To avoid excessive overhead, we only accept one a != b test. + if (vector_runtime_test_a_ == nullptr) { + // First test found. + vector_runtime_test_a_ = a; + vector_runtime_test_b_ = b; + } else if ((vector_runtime_test_a_ != a || vector_runtime_test_b_ != b) && + (vector_runtime_test_a_ != b || vector_runtime_test_b_ != a)) { + return false; // second test would be needed } - vector_runtime_test_a_ = a; - vector_runtime_test_b_ = b; } } } } } + // Consider dynamic loop peeling for alignment. + SetPeelingCandidate(trip_count); + // Success! return true; } @@ -657,28 +650,52 @@ void HLoopOptimization::Vectorize(LoopNode* node, HBasicBlock* header = node->loop_info->GetHeader(); HBasicBlock* preheader = node->loop_info->GetPreHeader(); - // A cleanup is needed for any unknown trip count or for a known trip count - // with remainder iterations after vectorization. - bool needs_cleanup = trip_count == 0 || (trip_count % vector_length_) != 0; + // Pick a loop unrolling factor for the vector loop. + uint32_t unroll = GetUnrollingFactor(block, trip_count); + uint32_t chunk = vector_length_ * unroll; + + // A cleanup loop is needed, at least, for any unknown trip count or + // for a known trip count with remainder iterations after vectorization. + bool needs_cleanup = trip_count == 0 || (trip_count % chunk) != 0; // Adjust vector bookkeeping. iset_->clear(); // prepare phi induction bool is_simple_loop_header = TrySetSimpleLoopHeader(header); // fills iset_ DCHECK(is_simple_loop_header); + vector_header_ = header; + vector_body_ = block; + + // Generate dynamic loop peeling trip count, if needed: + // ptc = <peeling-needed-for-candidate> + HInstruction* ptc = nullptr; + if (vector_peeling_candidate_ != nullptr) { + DCHECK_LT(vector_length_, trip_count) << "dynamic peeling currently requires known trip count"; + // + // TODO: Implement this. Compute address of first access memory location and + // compute peeling factor to obtain kAlignedBase alignment. 
+ // + needs_cleanup = true; + } - // Generate preheader: + // Generate loop control: // stc = <trip-count>; - // vtc = stc - stc % VL; + // vtc = stc - (stc - ptc) % chunk; + // i = 0; HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader); HInstruction* vtc = stc; if (needs_cleanup) { - DCHECK(IsPowerOfTwo(vector_length_)); + DCHECK(IsPowerOfTwo(chunk)); + HInstruction* diff = stc; + if (ptc != nullptr) { + diff = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, ptc)); + } HInstruction* rem = Insert( preheader, new (global_allocator_) HAnd(induc_type, - stc, - graph_->GetIntConstant(vector_length_ - 1))); + diff, + graph_->GetIntConstant(chunk - 1))); vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem)); } + vector_index_ = graph_->GetIntConstant(0); // Generate runtime disambiguation test: // vtc = a != b ? vtc : 0; @@ -691,16 +708,31 @@ void HLoopOptimization::Vectorize(LoopNode* node, needs_cleanup = true; } - // Generate vector loop: - // for (i = 0; i < vtc; i += VL) + // Generate dynamic peeling loop for alignment, if needed: + // for ( ; i < ptc; i += 1) + // <loop-body> + if (ptc != nullptr) { + vector_mode_ = kSequential; + GenerateNewLoop(node, + block, + graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit), + vector_index_, + ptc, + graph_->GetIntConstant(1), + /*unroll*/ 1); + } + + // Generate vector loop, possibly further unrolled: + // for ( ; i < vtc; i += chunk) // <vectorized-loop-body> vector_mode_ = kVector; GenerateNewLoop(node, block, - graph_->TransformLoopForVectorization(header, block, exit), - graph_->GetIntConstant(0), + graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit), + vector_index_, vtc, - graph_->GetIntConstant(vector_length_)); + graph_->GetIntConstant(vector_length_), // increment per unroll + unroll); HLoopInformation* vloop = vector_header_->GetLoopInformation(); // Generate cleanup loop, if needed: @@ -711,9 +743,10 @@ void HLoopOptimization::Vectorize(LoopNode* node, GenerateNewLoop(node, block, graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit), - vector_phi_, + vector_index_, stc, - graph_->GetIntConstant(1)); + graph_->GetIntConstant(1), + /*unroll*/ 1); } // Remove the original loop by disconnecting the body block @@ -722,8 +755,9 @@ void HLoopOptimization::Vectorize(LoopNode* node, while (!header->GetFirstInstruction()->IsGoto()) { header->RemoveInstruction(header->GetFirstInstruction()); } - // Update loop hierarchy: the old header now resides in the - // same outer loop as the old preheader. + // Update loop hierarchy: the old header now resides in the same outer loop + // as the old preheader. Note that we don't bother putting sequential + // loops back in the hierarchy at this point. header->SetLoopInformation(preheader->GetLoopInformation()); // outward node->loop_info = vloop; } @@ -733,44 +767,64 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node, HBasicBlock* new_preheader, HInstruction* lo, HInstruction* hi, - HInstruction* step) { + HInstruction* step, + uint32_t unroll) { + DCHECK(unroll == 1 || vector_mode_ == kVector); Primitive::Type induc_type = Primitive::kPrimInt; // Prepare new loop. 
- vector_map_->clear(); vector_preheader_ = new_preheader, vector_header_ = vector_preheader_->GetSingleSuccessor(); vector_body_ = vector_header_->GetSuccessors()[1]; - vector_phi_ = new (global_allocator_) HPhi(global_allocator_, - kNoRegNumber, - 0, - HPhi::ToPhiType(induc_type)); + HPhi* phi = new (global_allocator_) HPhi(global_allocator_, + kNoRegNumber, + 0, + HPhi::ToPhiType(induc_type)); // Generate header and prepare body. // for (i = lo; i < hi; i += step) // <loop-body> - HInstruction* cond = new (global_allocator_) HAboveOrEqual(vector_phi_, hi); - vector_header_->AddPhi(vector_phi_); + HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi); + vector_header_->AddPhi(phi); vector_header_->AddInstruction(cond); vector_header_->AddInstruction(new (global_allocator_) HIf(cond)); - for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { - bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true); - DCHECK(vectorized_def); - } - // Generate body from the instruction map, but in original program order. - HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment(); - for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { - auto i = vector_map_->find(it.Current()); - if (i != vector_map_->end() && !i->second->IsInBlock()) { - Insert(vector_body_, i->second); - // Deal with instructions that need an environment, such as the scalar intrinsics. - if (i->second->NeedsEnvironment()) { - i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_); + vector_index_ = phi; + for (uint32_t u = 0; u < unroll; u++) { + // Clear map, leaving loop invariants setup during unrolling. + if (u == 0) { + vector_map_->clear(); + } else { + for (auto i = vector_map_->begin(); i != vector_map_->end(); ) { + if (i->second->IsVecReplicateScalar()) { + DCHECK(node->loop_info->IsDefinedOutOfTheLoop(i->first)); + ++i; + } else { + i = vector_map_->erase(i); + } + } + } + // Generate instruction map. + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true); + DCHECK(vectorized_def); + } + // Generate body from the instruction map, but in original program order. + HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment(); + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + auto i = vector_map_->find(it.Current()); + if (i != vector_map_->end() && !i->second->IsInBlock()) { + Insert(vector_body_, i->second); + // Deal with instructions that need an environment, such as the scalar intrinsics. + if (i->second->NeedsEnvironment()) { + i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_); + } } } + vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step); + Insert(vector_body_, vector_index_); } - // Finalize increment and phi. - HInstruction* inc = new (global_allocator_) HAdd(induc_type, vector_phi_, step); - vector_phi_->AddInput(lo); - vector_phi_->AddInput(Insert(vector_body_, inc)); + // Finalize phi for the loop index. + phi->AddInput(lo); + phi->AddInput(vector_index_); + vector_index_ = phi; } // TODO: accept reductions at left-hand-side, mixed-type store idioms, etc. 
@@ -791,11 +845,11 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node, HInstruction* offset = nullptr; if (TrySetVectorType(type, &restrictions) && node->loop_info->IsDefinedOutOfTheLoop(base) && - induction_range_.IsUnitStride(instruction, index, &offset) && + induction_range_.IsUnitStride(instruction, index, graph_, &offset) && VectorizeUse(node, value, generate_code, type, restrictions)) { if (generate_code) { GenerateVecSub(index, offset); - GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), type); + GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), offset, type); } else { vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ true)); } @@ -849,10 +903,10 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, HInstruction* offset = nullptr; if (type == instruction->GetType() && node->loop_info->IsDefinedOutOfTheLoop(base) && - induction_range_.IsUnitStride(instruction, index, &offset)) { + induction_range_.IsUnitStride(instruction, index, graph_, &offset)) { if (generate_code) { GenerateVecSub(index, offset); - GenerateVecMem(instruction, vector_map_->Get(index), nullptr, type); + GenerateVecMem(instruction, vector_map_->Get(index), nullptr, offset, type); } else { vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ false)); } @@ -1164,8 +1218,9 @@ void HLoopOptimization::GenerateVecInv(HInstruction* org, Primitive::Type type) void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset) { if (vector_map_->find(org) == vector_map_->end()) { - HInstruction* subscript = vector_phi_; - if (offset != nullptr) { + HInstruction* subscript = vector_index_; + int64_t value = 0; + if (!IsInt64AndGet(offset, &value) || value != 0) { subscript = new (global_allocator_) HAdd(Primitive::kPrimInt, subscript, offset); if (org->IsPhi()) { Insert(vector_body_, subscript); // lacks layout placeholder @@ -1178,17 +1233,27 @@ void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset) void HLoopOptimization::GenerateVecMem(HInstruction* org, HInstruction* opa, HInstruction* opb, + HInstruction* offset, Primitive::Type type) { HInstruction* vector = nullptr; if (vector_mode_ == kVector) { // Vector store or load. + HInstruction* base = org->InputAt(0); if (opb != nullptr) { vector = new (global_allocator_) HVecStore( - global_allocator_, org->InputAt(0), opa, opb, type, vector_length_); + global_allocator_, base, opa, opb, type, vector_length_); } else { bool is_string_char_at = org->AsArrayGet()->IsStringCharAt(); vector = new (global_allocator_) HVecLoad( - global_allocator_, org->InputAt(0), opa, type, vector_length_, is_string_char_at); + global_allocator_, base, opa, type, vector_length_, is_string_char_at); + } + // Known dynamically enforced alignment? + // TODO: detect offset + constant differences. + // TODO: long run, static alignment analysis? + if (vector_peeling_candidate_ != nullptr && + vector_peeling_candidate_->base == base && + vector_peeling_candidate_->offset == offset) { + vector->AsVecMemoryOperation()->SetAlignment(Alignment(kAlignedBase, 0)); } } else { // Scalar store or load. @@ -1444,10 +1509,57 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, } // +// Vectorization heuristics. +// + +bool HLoopOptimization::IsVectorizationProfitable(int64_t trip_count) { + // Current heuristic: non-empty body with sufficient number + // of iterations (if known). + // TODO: refine by looking at e.g. operation count, alignment, etc. 
+ if (vector_length_ == 0) { + return false; // nothing found + } else if (0 < trip_count && trip_count < vector_length_) { + return false; // insufficient iterations + } + return true; +} + +void HLoopOptimization::SetPeelingCandidate(int64_t trip_count ATTRIBUTE_UNUSED) { + // Current heuristic: none. + // TODO: implement +} + +uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) { + // Current heuristic: unroll by 2 on ARM64/X86 for large known trip + // counts and small loop bodies. + // TODO: refine with operation count, remaining iterations, etc. + // Artem had some really cool ideas for this already. + switch (compiler_driver_->GetInstructionSet()) { + case kArm64: + case kX86: + case kX86_64: { + size_t num_instructions = block->GetInstructions().CountSize(); + if (num_instructions <= 10 && trip_count >= 4 * vector_length_) { + return 2; + } + return 1; + } + default: + return 1; + } +} + +// // Helpers. // bool HLoopOptimization::TrySetPhiInduction(HPhi* phi, bool restrict_uses) { + // Special case Phis that have equivalent in a debuggable setup. Our graph checker isn't + // smart enough to follow strongly connected components (and it's probably not worth + // it to make it so). See b/33775412. + if (graph_->IsDebuggable() && phi->HasEquivalentPhi()) { + return false; + } DCHECK(iset_->empty()); ArenaSet<HInstruction*>* set = induction_range_.LookupCycle(phi); if (set != nullptr) { @@ -1576,8 +1688,8 @@ bool HLoopOptimization::TryReplaceWithLastValue(HLoopInformation* loop_info, size_t index = it->GetIndex(); ++it; // increment before replacing if (iset_->find(user->GetHolder()) == iset_->end()) { // not excluded? - HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation(); // Only update environment uses after the loop. + HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation(); if (other_loop_info == nullptr || !other_loop_info->IsIn(*loop_info)) { user->RemoveAsUserOfInput(index); user->SetRawEnvAt(index, replacement); @@ -1614,4 +1726,21 @@ void HLoopOptimization::RemoveDeadInstructions(const HInstructionList& list) { } } +bool HLoopOptimization::CanRemoveCycle() { + for (HInstruction* i : *iset_) { + // We can never remove instructions that have environment + // uses when we compile 'debuggable'. + if (i->HasEnvironmentUses() && graph_->IsDebuggable()) { + return false; + } + // A deoptimization should never have an environment input removed. + for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) { + if (use.GetUser()->GetHolder()->IsDeoptimize()) { + return false; + } + } + } + return true; +} + } // namespace art diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index cc6343aeb5..de4bd85fc8 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -116,14 +116,15 @@ class HLoopOptimization : public HOptimization { void OptimizeInnerLoop(LoopNode* node); // Vectorization analysis and synthesis. 
- bool CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count); + bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count); void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count); void GenerateNewLoop(LoopNode* node, HBasicBlock* block, HBasicBlock* new_preheader, HInstruction* lo, HInstruction* hi, - HInstruction* step); + HInstruction* step, + uint32_t unroll); bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code); bool VectorizeUse(LoopNode* node, HInstruction* instruction, @@ -133,10 +134,11 @@ class HLoopOptimization : public HOptimization { bool TrySetVectorType(Primitive::Type type, /*out*/ uint64_t* restrictions); bool TrySetVectorLength(uint32_t length); void GenerateVecInv(HInstruction* org, Primitive::Type type); - void GenerateVecSub(HInstruction* org, HInstruction* off); + void GenerateVecSub(HInstruction* org, HInstruction* offset); void GenerateVecMem(HInstruction* org, HInstruction* opa, HInstruction* opb, + HInstruction* offset, Primitive::Type type); void GenerateVecOp(HInstruction* org, HInstruction* opa, @@ -151,6 +153,11 @@ class HLoopOptimization : public HOptimization { Primitive::Type type, uint64_t restrictions); + // Vectorization heuristics. + bool IsVectorizationProfitable(int64_t trip_count); + void SetPeelingCandidate(int64_t trip_count); + uint32_t GetUnrollingFactor(HBasicBlock* block, int64_t trip_count); + // Helpers. bool TrySetPhiInduction(HPhi* phi, bool restrict_uses); bool TrySetSimpleLoopHeader(HBasicBlock* block); @@ -208,20 +215,25 @@ class HLoopOptimization : public HOptimization { // Contents reside in phase-local heap memory. ArenaSet<ArrayReference>* vector_refs_; + // Dynamic loop peeling candidate for alignment. + const ArrayReference* vector_peeling_candidate_; + + // Dynamic data dependence test of the form a != b. + HInstruction* vector_runtime_test_a_; + HInstruction* vector_runtime_test_b_; + // Mapping used during vectorization synthesis for both the scalar peeling/cleanup - // loop (simd_ is false) and the actual vector loop (simd_ is true). The data + // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data // structure maps original instructions into the new instructions. // Contents reside in phase-local heap memory. ArenaSafeMap<HInstruction*, HInstruction*>* vector_map_; // Temporary vectorization bookkeeping. 
+ VectorMode vector_mode_; // synthesis mode HBasicBlock* vector_preheader_; // preheader of the new loop HBasicBlock* vector_header_; // header of the new loop HBasicBlock* vector_body_; // body of the new loop - HInstruction* vector_runtime_test_a_; - HInstruction* vector_runtime_test_b_; // defines a != b runtime test - HPhi* vector_phi_; // the Phi representing the normalized loop index - VectorMode vector_mode_; // selects synthesis mode + HInstruction* vector_index_; // normalized index of the new loop friend class LoopOptimizationTest; diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index d0047c54f2..4ca833707b 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -967,6 +967,7 @@ void HInstructionList::AddInstruction(HInstruction* instruction) { DCHECK(last_instruction_ == nullptr); first_instruction_ = last_instruction_ = instruction; } else { + DCHECK(last_instruction_ != nullptr); last_instruction_->next_ = instruction; instruction->previous_ = last_instruction_; last_instruction_ = instruction; diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index ffa16dd787..5e072cdb67 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -421,7 +421,7 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { void SimplifyLoop(HBasicBlock* header); int32_t GetNextInstructionId() { - DCHECK_NE(current_instruction_id_, INT32_MAX); + CHECK_NE(current_instruction_id_, INT32_MAX); return current_instruction_id_++; } @@ -430,7 +430,7 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { } void SetCurrentInstructionId(int32_t id) { - DCHECK_GE(id, current_instruction_id_); + CHECK_GE(id, current_instruction_id_); current_instruction_id_ = id; } @@ -2612,6 +2612,16 @@ class HPhi FINAL : public HVariableInputSizeInstruction { && other->AsPhi()->GetRegNumber() == GetRegNumber(); } + bool HasEquivalentPhi() const { + if (GetPrevious() != nullptr && GetPrevious()->AsPhi()->GetRegNumber() == GetRegNumber()) { + return true; + } + if (GetNext() != nullptr && GetNext()->AsPhi()->GetRegNumber() == GetRegNumber()) { + return true; + } + return false; + } + // Returns the next equivalent phi (starting from the current one) or null if there is none. // An equivalent phi is a phi having the same dex register and type. // It assumes that phis with the same dex register are adjacent. diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc index 320f01a727..147fa1c05a 100644 --- a/compiler/optimizing/scheduler.cc +++ b/compiler/optimizing/scheduler.cc @@ -109,6 +109,10 @@ void SchedulingGraph::AddDependencies(HInstruction* instruction, bool is_schedul // barrier depend on it. 
for (HInstruction* other = instruction->GetNext(); other != nullptr; other = other->GetNext()) { SchedulingNode* other_node = GetNode(other); + CHECK(other_node != nullptr) + << other->DebugName() + << " is in block " << other->GetBlock()->GetBlockId() + << ", and expected in block " << instruction->GetBlock()->GetBlockId(); bool other_is_barrier = other_node->IsSchedulingBarrier(); if (is_scheduling_barrier || other_is_barrier) { AddOtherDependency(other_node, instruction_node); @@ -377,6 +381,10 @@ void HScheduler::Schedule(HBasicBlock* block) { scheduling_graph_.Clear(); for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { HInstruction* instruction = it.Current(); + CHECK_EQ(instruction->GetBlock(), block) + << instruction->DebugName() + << " is in block " << instruction->GetBlock()->GetBlockId() + << ", and expected in block " << block->GetBlockId(); SchedulingNode* node = scheduling_graph_.AddNode(instruction, IsSchedulingBarrier(instruction)); CalculateLatency(node); scheduling_nodes.push_back(node); diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc index 7b7495bf3b..185303bc8c 100644 --- a/compiler/optimizing/ssa_liveness_analysis.cc +++ b/compiler/optimizing/ssa_liveness_analysis.cc @@ -197,7 +197,7 @@ void SsaLivenessAnalysis::ComputeLiveRanges() { HInstruction* instruction = environment->GetInstructionAt(i); bool should_be_live = ShouldBeLiveForEnvironment(current, instruction); if (should_be_live) { - DCHECK(instruction->HasSsaIndex()); + CHECK(instruction->HasSsaIndex()) << instruction->DebugName(); live_in->SetBit(instruction->GetSsaIndex()); } if (instruction != nullptr) { |
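
The loop_optimization.cc changes above make Vectorize() emit up to three loops from one source loop: an optional sequential peeling loop for alignment, a vector loop that advances by chunk = vector_length * unroll elements per iteration, and a sequential cleanup loop for the remaining iterations, with the vector trip count computed as vtc = stc - (stc - ptc) % chunk. The following is a standalone sketch of that trip-count arithmetic only, not ART code; the function name, the printf, and the zero peeling trip count are illustrative assumptions.

#include <cstdint>
#include <cstdio>

void RunLoops(int32_t stc /* trip count */, int32_t vl /* vector length */, int32_t unroll) {
  int32_t chunk = vl * unroll;  // elements consumed per vector-loop iteration
  int32_t ptc = 0;              // peeling trip count (0: no alignment peeling, as currently)
  // vtc = stc - (stc - ptc) % chunk; the '%' is emitted as HAnd(diff, chunk - 1),
  // which is why the code DCHECKs that chunk is a power of two.
  int32_t vtc = stc - ((stc - ptc) & (chunk - 1));
  int32_t i = 0;
  for (; i < ptc; i += 1)     { /* sequential peeling loop (alignment) */ }
  for (; i < vtc; i += chunk) { /* vector loop, possibly unrolled */ }
  for (; i < stc; i += 1)     { /* sequential cleanup loop */ }
  printf("stc=%d chunk=%d vtc=%d\n", stc, chunk, vtc);
}

For example, stc = 103, vl = 4, unroll = 2 gives chunk = 8 and vtc = 96: the vector loop covers i = 0..95 and the cleanup loop covers i = 96..102.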