author     2018-07-26 14:42:17 +0100
committer  2018-08-02 15:47:02 +0100
commit     ced04835d8e3cd3f68576cfffc1d21283ca151b4 (patch)
tree       125ddd1d222f4fb1710e17c76803ad3e92572a5c /compiler
parent     350b6a312222b9b27bfee0e72ce261a45cb60e1c (diff)
Reuse arena memory for each block in scheduler.
This reduces the peak memory used for large methods with
multiple blocks to schedule.
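In outline, the scheduler now opens a fresh ScopedArenaAllocator on the graph's
ArenaStack for every block, so the scheduling graph and candidate list built for
one block are released before the next block is processed. A minimal sketch of
the pattern, using the types from the patch (the wrapper function name is
illustrative only, not part of the change):

    void ScheduleBlocksOneArenaScopeEach(HGraph* graph) {
      for (HBasicBlock* block : graph->GetReversePostOrder()) {
        // One arena scope per block: everything allocated from `allocator` is
        // popped off the ArenaStack when it goes out of scope, so peak usage is
        // bounded by the largest block instead of the sum over all blocks.
        ScopedArenaAllocator allocator(graph->GetArenaStack());
        ScopedArenaVector<SchedulingNode*> nodes(allocator.Adapter(kArenaAllocScheduler));
        // ... build the per-block SchedulingGraph from `allocator` and schedule `block` ...
      }  // Arena memory is reused for the next block from here on.
    }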
Compiling the aosp_taimen-userdebug boot image, the most memory-hungry
method, BatteryStats.dumpLocked, has the Scheduler's memory allocations
in the ArenaStack hidden by the register allocator's:
- before:
  MEM: used: 8300224, allocated: 9175040, lost: 197360
    Scheduler     8300224
- after:
  MEM: used: 5914296, allocated: 7864320, lost: 78200
    SsaLiveness   5532840
    RegAllocator   144968
    RegAllocVldt   236488
The total arena memory used, including the ArenaAllocator
not listed above, goes from 44333648 to 41950324 (-5.4%).
(Measured with kArenaAllocatorCountAllocations=true,
kArenaAllocatorPreciseTracking=false.)
Also remove one unnecessary -Wframe-larger-than= workaround and add one
workaround for a large frame that shows up with the above arena
allocation tracking flags enabled.
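The added workaround is the arena-allocated StackMapStream in CreateJniStackMap()
(see the optimizing_compiler.cc hunk below): rather than putting the large object
on the stack, where the tracking flags push the frame over the -Wframe-larger-than=
limit, it is placement-new'd on the ScopedArenaAllocator and owned by a
std::unique_ptr, which is valid once StackMapStream derives from
DeletableArenaObject. A condensed sketch of that pattern, with placeholder
argument names standing in for the real call sites:

    ScopedArenaAllocator allocator(arena_stack);
    // StackMapStream is quite large, so keep it off the frame and on the arena.
    std::unique_ptr<StackMapStream> stream(
        new (&allocator) StackMapStream(&allocator, instruction_set));
    stream->BeginMethod(frame_size, core_spill_mask, fp_spill_mask, /* num_dex_registers */ 0);
    stream->EndMethod();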
Test: m test-art-host-gtest
Test: testrunner.py --host
Bug: 34053922
Change-Id: I7fd8d90dcc13b184b1e5bd0bcac072388710a129
Diffstat (limited to 'compiler')
-rw-r--r--  compiler/driver/compiler_options.cc         5
-rw-r--r--  compiler/optimizing/optimizing_compiler.cc  13
-rw-r--r--  compiler/optimizing/scheduler.cc            49
-rw-r--r--  compiler/optimizing/scheduler.h             42
-rw-r--r--  compiler/optimizing/scheduler_arm.h          5
-rw-r--r--  compiler/optimizing/scheduler_arm64.h        4
-rw-r--r--  compiler/optimizing/scheduler_test.cc       15
-rw-r--r--  compiler/optimizing/stack_map_stream.h       3
8 files changed, 63 insertions, 73 deletions
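Because the scheduling graph and candidate list are now block-local (see the
scheduler.cc and scheduler.h hunks below), a selector that survives across blocks
must not keep pointers into the previous block's arena, so the patch adds a
Reset() hook that HScheduler calls before scheduling each block. Roughly, as
taken from the header change, with unrelated members omitted:

    class SchedulingNodeSelector : public ArenaObject<kArenaAllocScheduler> {
     public:
      // Called once per block before any node is popped, so selectors can drop
      // cached pointers into the previous block's (already released) graph.
      virtual void Reset() {}
      // (other virtuals unchanged)
    };

    class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
     public:
      // prev_select_ pointed into the per-block arena, so clear it here.
      void Reset() OVERRIDE { prev_select_ = nullptr; }
      // (rest of the class unchanged)
    };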
diff --git a/compiler/driver/compiler_options.cc b/compiler/driver/compiler_options.cc
index 62d547de44..8cc6cf10f0 100644
--- a/compiler/driver/compiler_options.cc
+++ b/compiler/driver/compiler_options.cc
@@ -116,9 +116,6 @@ bool CompilerOptions::ParseRegisterAllocationStrategy(const std::string& option,
   return true;
 }
 
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wframe-larger-than="
-
 bool CompilerOptions::ParseCompilerOptions(const std::vector<std::string>& options,
                                            bool ignore_unrecognized,
                                            std::string* error_msg) {
@@ -133,8 +130,6 @@ bool CompilerOptions::ParseCompilerOptions(const std::vector<std::string>& optio
   return ReadCompilerOptions(args, this, error_msg);
 }
 
-#pragma GCC diagnostic pop
-
 bool CompilerOptions::IsImageClass(const char* descriptor) const {
   // Historical note: We used to hold the set indirectly and there was a distinction between an
   // empty set and a null, null meaning to include all classes. However, the distiction has been
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index d96746fdd7..b2733ee1f2 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -1101,15 +1101,18 @@ static void CreateJniStackMap(ArenaStack* arena_stack,
                               const JniCompiledMethod& jni_compiled_method,
                               /* out */ ArenaVector<uint8_t>* stack_map) {
   ScopedArenaAllocator allocator(arena_stack);
-  StackMapStream stack_map_stream(&allocator, jni_compiled_method.GetInstructionSet());
-  stack_map_stream.BeginMethod(
+  // StackMapStream is quite large, so allocate it using the ScopedArenaAllocator
+  // to stay clear of the frame size limit.
+  std::unique_ptr<StackMapStream> stack_map_stream(
+      new (&allocator) StackMapStream(&allocator, jni_compiled_method.GetInstructionSet()));
+  stack_map_stream->BeginMethod(
       jni_compiled_method.GetFrameSize(),
       jni_compiled_method.GetCoreSpillMask(),
       jni_compiled_method.GetFpSpillMask(),
       /* num_dex_registers */ 0);
-  stack_map_stream.EndMethod();
-  stack_map->resize(stack_map_stream.PrepareForFillIn());
-  stack_map_stream.FillInCodeInfo(MemoryRegion(stack_map->data(), stack_map->size()));
+  stack_map_stream->EndMethod();
+  stack_map->resize(stack_map_stream->PrepareForFillIn());
+  stack_map_stream->FillInCodeInfo(MemoryRegion(stack_map->data(), stack_map->size()));
 }
 
 CompiledMethod* OptimizingCompiler::JniCompile(uint32_t access_flags,
diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc
index 588ea03d69..1aa16f45bc 100644
--- a/compiler/optimizing/scheduler.cc
+++ b/compiler/optimizing/scheduler.cc
@@ -545,60 +545,67 @@ SchedulingNode* CriticalPathSchedulingNodeSelector::GetHigherPrioritySchedulingN
 void HScheduler::Schedule(HGraph* graph) {
   // We run lsa here instead of in a separate pass to better control whether we
   // should run the analysis or not.
+  const HeapLocationCollector* heap_location_collector = nullptr;
   LoadStoreAnalysis lsa(graph);
   if (!only_optimize_loop_blocks_ || graph->HasLoops()) {
     lsa.Run();
-    scheduling_graph_.SetHeapLocationCollector(lsa.GetHeapLocationCollector());
+    heap_location_collector = &lsa.GetHeapLocationCollector();
   }
 
   for (HBasicBlock* block : graph->GetReversePostOrder()) {
     if (IsSchedulable(block)) {
-      Schedule(block);
+      Schedule(block, heap_location_collector);
     }
   }
 }
 
-void HScheduler::Schedule(HBasicBlock* block) {
-  ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator_->Adapter(kArenaAllocScheduler));
+void HScheduler::Schedule(HBasicBlock* block,
+                          const HeapLocationCollector* heap_location_collector) {
+  ScopedArenaAllocator allocator(block->GetGraph()->GetArenaStack());
+  ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator.Adapter(kArenaAllocScheduler));
 
   // Build the scheduling graph.
-  scheduling_graph_.Clear();
+  SchedulingGraph scheduling_graph(this, &allocator, heap_location_collector);
   for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
     HInstruction* instruction = it.Current();
     CHECK_EQ(instruction->GetBlock(), block)
         << instruction->DebugName()
         << " is in block " << instruction->GetBlock()->GetBlockId()
        << ", and expected in block " << block->GetBlockId();
-    SchedulingNode* node = scheduling_graph_.AddNode(instruction, IsSchedulingBarrier(instruction));
+    SchedulingNode* node = scheduling_graph.AddNode(instruction, IsSchedulingBarrier(instruction));
     CalculateLatency(node);
     scheduling_nodes.push_back(node);
   }
 
-  if (scheduling_graph_.Size() <= 1) {
-    scheduling_graph_.Clear();
+  if (scheduling_graph.Size() <= 1) {
     return;
   }
 
   cursor_ = block->GetLastInstruction();
 
+  // The list of candidates for scheduling. A node becomes a candidate when all
+  // its predecessors have been scheduled.
+  ScopedArenaVector<SchedulingNode*> candidates(allocator.Adapter(kArenaAllocScheduler));
+
   // Find the initial candidates for scheduling.
-  candidates_.clear();
   for (SchedulingNode* node : scheduling_nodes) {
     if (!node->HasUnscheduledSuccessors()) {
       node->MaybeUpdateCriticalPath(node->GetLatency());
-      candidates_.push_back(node);
+      candidates.push_back(node);
     }
   }
 
-  ScopedArenaVector<SchedulingNode*> initial_candidates(allocator_->Adapter(kArenaAllocScheduler));
+  ScopedArenaVector<SchedulingNode*> initial_candidates(allocator.Adapter(kArenaAllocScheduler));
   if (kDumpDotSchedulingGraphs) {
     // Remember the list of initial candidates for debug output purposes.
-    initial_candidates.assign(candidates_.begin(), candidates_.end());
+    initial_candidates.assign(candidates.begin(), candidates.end());
   }
 
   // Schedule all nodes.
-  while (!candidates_.empty()) {
-    Schedule(selector_->PopHighestPriorityNode(&candidates_, scheduling_graph_));
+  selector_->Reset();
+  while (!candidates.empty()) {
+    SchedulingNode* node = selector_->PopHighestPriorityNode(&candidates, scheduling_graph);
+    Schedule(node, &candidates);
   }
 
   if (kDumpDotSchedulingGraphs) {
@@ -607,11 +614,12 @@ void HScheduler::Schedule(HBasicBlock* block) {
     std::stringstream description;
     description << graph->GetDexFile().PrettyMethod(graph->GetMethodIdx())
         << " B" << block->GetBlockId();
-    scheduling_graph_.DumpAsDotGraph(description.str(), initial_candidates);
+    scheduling_graph.DumpAsDotGraph(description.str(), initial_candidates);
   }
 }
 
-void HScheduler::Schedule(SchedulingNode* scheduling_node) {
+void HScheduler::Schedule(SchedulingNode* scheduling_node,
+                          /*inout*/ ScopedArenaVector<SchedulingNode*>* candidates) {
   // Check whether any of the node's predecessors will be valid candidates after
   // this node is scheduled.
   uint32_t path_to_node = scheduling_node->GetCriticalPath();
@@ -620,7 +628,7 @@ void HScheduler::Schedule(SchedulingNode* scheduling_node) {
         path_to_node + predecessor->GetInternalLatency() + predecessor->GetLatency());
     predecessor->DecrementNumberOfUnscheduledSuccessors();
     if (!predecessor->HasUnscheduledSuccessors()) {
-      candidates_.push_back(predecessor);
+      candidates->push_back(predecessor);
     }
   }
   for (SchedulingNode* predecessor : scheduling_node->GetOtherPredecessors()) {
@@ -630,7 +638,7 @@ void HScheduler::Schedule(SchedulingNode* scheduling_node) {
     // correctness. So we do not use them to compute the critical path.
     predecessor->DecrementNumberOfUnscheduledSuccessors();
     if (!predecessor->HasUnscheduledSuccessors()) {
-      candidates_.push_back(predecessor);
+      candidates->push_back(predecessor);
     }
   }
 
@@ -779,7 +787,6 @@ bool HInstructionScheduling::Run(bool only_optimize_loop_blocks,
 #if defined(ART_ENABLE_CODEGEN_arm64) || defined(ART_ENABLE_CODEGEN_arm)
   // Phase-local allocator that allocates scheduler internal data structures like
   // scheduling nodes, internel nodes map, dependencies, etc.
-  ScopedArenaAllocator allocator(graph_->GetArenaStack());
   CriticalPathSchedulingNodeSelector critical_path_selector;
   RandomSchedulingNodeSelector random_selector;
   SchedulingNodeSelector* selector = schedule_randomly
@@ -795,7 +802,7 @@ bool HInstructionScheduling::Run(bool only_optimize_loop_blocks,
   switch (instruction_set_) {
 #ifdef ART_ENABLE_CODEGEN_arm64
     case InstructionSet::kArm64: {
-      arm64::HSchedulerARM64 scheduler(&allocator, selector);
+      arm64::HSchedulerARM64 scheduler(selector);
       scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
       scheduler.Schedule(graph_);
       break;
@@ -805,7 +812,7 @@ bool HInstructionScheduling::Run(bool only_optimize_loop_blocks,
    case InstructionSet::kThumb2:
    case InstructionSet::kArm: {
      arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen_);
-      arm::HSchedulerARM scheduler(&allocator, selector, &arm_latency_visitor);
+      arm::HSchedulerARM scheduler(selector, &arm_latency_visitor);
      scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
      scheduler.Schedule(graph_);
      break;
diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h
index c7683e04a7..fd48d844e6 100644
--- a/compiler/optimizing/scheduler.h
+++ b/compiler/optimizing/scheduler.h
@@ -251,12 +251,14 @@ class SchedulingNode : public DeletableArenaObject<kArenaAllocScheduler> {
  */
 class SchedulingGraph : public ValueObject {
  public:
-  SchedulingGraph(const HScheduler* scheduler, ScopedArenaAllocator* allocator)
+  SchedulingGraph(const HScheduler* scheduler,
+                  ScopedArenaAllocator* allocator,
+                  const HeapLocationCollector* heap_location_collector)
       : scheduler_(scheduler),
         allocator_(allocator),
         contains_scheduling_barrier_(false),
         nodes_map_(allocator_->Adapter(kArenaAllocScheduler)),
-        heap_location_collector_(nullptr) {}
+        heap_location_collector_(heap_location_collector) {}
 
   SchedulingNode* AddNode(HInstruction* instr, bool is_scheduling_barrier = false) {
     std::unique_ptr<SchedulingNode> node(
@@ -268,15 +270,6 @@ class SchedulingGraph : public ValueObject {
     return result;
   }
 
-  void Clear() {
-    nodes_map_.clear();
-    contains_scheduling_barrier_ = false;
-  }
-
-  void SetHeapLocationCollector(const HeapLocationCollector& heap_location_collector) {
-    heap_location_collector_ = &heap_location_collector;
-  }
-
   SchedulingNode* GetNode(const HInstruction* instr) const {
     auto it = nodes_map_.find(instr);
     if (it == nodes_map_.end()) {
@@ -329,7 +322,7 @@ class SchedulingGraph : public ValueObject {
 
   ScopedArenaHashMap<const HInstruction*, std::unique_ptr<SchedulingNode>> nodes_map_;
 
-  const HeapLocationCollector* heap_location_collector_;
+  const HeapLocationCollector* const heap_location_collector_;
 };
 
 /*
@@ -377,6 +370,7 @@ class SchedulingLatencyVisitor : public HGraphDelegateVisitor {
 class SchedulingNodeSelector : public ArenaObject<kArenaAllocScheduler> {
  public:
+  virtual void Reset() {}
   virtual SchedulingNode* PopHighestPriorityNode(ScopedArenaVector<SchedulingNode*>* nodes,
                                                  const SchedulingGraph& graph) = 0;
   virtual ~SchedulingNodeSelector() {}
@@ -418,6 +412,7 @@ class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
 public:
  CriticalPathSchedulingNodeSelector() : prev_select_(nullptr) {}
 
+  void Reset() OVERRIDE { prev_select_ = nullptr; }
  SchedulingNode* PopHighestPriorityNode(ScopedArenaVector<SchedulingNode*>* nodes,
                                         const SchedulingGraph& graph) OVERRIDE;
 
@@ -434,16 +429,11 @@ class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
 
 class HScheduler {
  public:
-  HScheduler(ScopedArenaAllocator* allocator,
-             SchedulingLatencyVisitor* latency_visitor,
-             SchedulingNodeSelector* selector)
-      : allocator_(allocator),
-        latency_visitor_(latency_visitor),
+  HScheduler(SchedulingLatencyVisitor* latency_visitor, SchedulingNodeSelector* selector)
+      : latency_visitor_(latency_visitor),
         selector_(selector),
         only_optimize_loop_blocks_(true),
-        scheduling_graph_(this, allocator),
-        cursor_(nullptr),
-        candidates_(allocator_->Adapter(kArenaAllocScheduler)) {}
+        cursor_(nullptr) {}
   virtual ~HScheduler() {}
 
   void Schedule(HGraph* graph);
@@ -454,8 +444,9 @@ class HScheduler {
   virtual bool IsSchedulingBarrier(const HInstruction* instruction) const;
 
  protected:
-  void Schedule(HBasicBlock* block);
-  void Schedule(SchedulingNode* scheduling_node);
+  void Schedule(HBasicBlock* block, const HeapLocationCollector* heap_location_collector);
+  void Schedule(SchedulingNode* scheduling_node,
+                /*inout*/ ScopedArenaVector<SchedulingNode*>* candidates);
   void Schedule(HInstruction* instruction);
 
   // Any instruction returning `false` via this method will prevent its
@@ -476,19 +467,12 @@ class HScheduler {
     node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency());
   }
 
-  ScopedArenaAllocator* const allocator_;
   SchedulingLatencyVisitor* const latency_visitor_;
   SchedulingNodeSelector* const selector_;
   bool only_optimize_loop_blocks_;
 
-  // We instantiate the members below as part of this class to avoid
-  // instantiating them locally for every chunk scheduled.
-  SchedulingGraph scheduling_graph_;
   // A pointer indicating where the next instruction to be scheduled will be inserted.
   HInstruction* cursor_;
-  // The list of candidates for scheduling. A node becomes a candidate when all
-  // its predecessors have been scheduled.
-  ScopedArenaVector<SchedulingNode*> candidates_;
 
 private:
  DISALLOW_COPY_AND_ASSIGN(HScheduler);
diff --git a/compiler/optimizing/scheduler_arm.h b/compiler/optimizing/scheduler_arm.h
index 0cb8684376..2f369486b3 100644
--- a/compiler/optimizing/scheduler_arm.h
+++ b/compiler/optimizing/scheduler_arm.h
@@ -137,10 +137,9 @@ class SchedulingLatencyVisitorARM : public SchedulingLatencyVisitor {
 
 class HSchedulerARM : public HScheduler {
  public:
-  HSchedulerARM(ScopedArenaAllocator* allocator,
-                SchedulingNodeSelector* selector,
+  HSchedulerARM(SchedulingNodeSelector* selector,
                 SchedulingLatencyVisitorARM* arm_latency_visitor)
-      : HScheduler(allocator, arm_latency_visitor, selector) {}
+      : HScheduler(arm_latency_visitor, selector) {}
   ~HSchedulerARM() OVERRIDE {}
 
   bool IsSchedulable(const HInstruction* instruction) const OVERRIDE {
diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h
index 4f394d5e16..0d2f8d9fa0 100644
--- a/compiler/optimizing/scheduler_arm64.h
+++ b/compiler/optimizing/scheduler_arm64.h
@@ -134,8 +134,8 @@ class SchedulingLatencyVisitorARM64 : public SchedulingLatencyVisitor {
 
 class HSchedulerARM64 : public HScheduler {
  public:
-  HSchedulerARM64(ScopedArenaAllocator* allocator, SchedulingNodeSelector* selector)
-      : HScheduler(allocator, &arm64_latency_visitor_, selector) {}
+  explicit HSchedulerARM64(SchedulingNodeSelector* selector)
+      : HScheduler(&arm64_latency_visitor_, selector) {}
   ~HSchedulerARM64() OVERRIDE {}
 
   bool IsSchedulable(const HInstruction* instruction) const OVERRIDE {
diff --git a/compiler/optimizing/scheduler_test.cc b/compiler/optimizing/scheduler_test.cc
index 7079e07ae1..fe23fb4cff 100644
--- a/compiler/optimizing/scheduler_test.cc
+++ b/compiler/optimizing/scheduler_test.cc
@@ -146,7 +146,9 @@ class SchedulerTest : public OptimizingUnitTest {
     environment->SetRawEnvAt(1, mul);
     mul->AddEnvUseAt(div_check->GetEnvironment(), 1);
 
-    SchedulingGraph scheduling_graph(scheduler, GetScopedAllocator());
+    SchedulingGraph scheduling_graph(scheduler,
+                                     GetScopedAllocator(),
+                                     /* heap_location_collector */ nullptr);
     // Instructions must be inserted in reverse order into the scheduling graph.
     for (HInstruction* instr : ReverseRange(block_instructions)) {
       scheduling_graph.AddNode(instr);
@@ -276,11 +278,10 @@ class SchedulerTest : public OptimizingUnitTest {
       entry->AddInstruction(instr);
     }
 
-    SchedulingGraph scheduling_graph(scheduler, GetScopedAllocator());
     HeapLocationCollector heap_location_collector(graph_);
     heap_location_collector.VisitBasicBlock(entry);
     heap_location_collector.BuildAliasingMatrix();
-    scheduling_graph.SetHeapLocationCollector(heap_location_collector);
+    SchedulingGraph scheduling_graph(scheduler, GetScopedAllocator(), &heap_location_collector);
 
     for (HInstruction* instr : ReverseRange(block_instructions)) {
       // Build scheduling graph with memory access aliasing information
@@ -354,13 +355,13 @@ class SchedulerTest : public OptimizingUnitTest {
 #if defined(ART_ENABLE_CODEGEN_arm64)
 TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM64) {
   CriticalPathSchedulingNodeSelector critical_path_selector;
-  arm64::HSchedulerARM64 scheduler(GetScopedAllocator(), &critical_path_selector);
+  arm64::HSchedulerARM64 scheduler(&critical_path_selector);
   TestBuildDependencyGraphAndSchedule(&scheduler);
 }
 
 TEST_F(SchedulerTest, ArrayAccessAliasingARM64) {
   CriticalPathSchedulingNodeSelector critical_path_selector;
-  arm64::HSchedulerARM64 scheduler(GetScopedAllocator(), &critical_path_selector);
+  arm64::HSchedulerARM64 scheduler(&critical_path_selector);
   TestDependencyGraphOnAliasingArrayAccesses(&scheduler);
 }
 #endif
@@ -369,14 +370,14 @@ TEST_F(SchedulerTest, ArrayAccessAliasingARM64) {
 TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM) {
   CriticalPathSchedulingNodeSelector critical_path_selector;
   arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr);
-  arm::HSchedulerARM scheduler(GetScopedAllocator(), &critical_path_selector, &arm_latency_visitor);
+  arm::HSchedulerARM scheduler(&critical_path_selector, &arm_latency_visitor);
   TestBuildDependencyGraphAndSchedule(&scheduler);
 }
 
 TEST_F(SchedulerTest, ArrayAccessAliasingARM) {
   CriticalPathSchedulingNodeSelector critical_path_selector;
   arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr);
-  arm::HSchedulerARM scheduler(GetScopedAllocator(), &critical_path_selector, &arm_latency_visitor);
+  arm::HSchedulerARM scheduler(&critical_path_selector, &arm_latency_visitor);
   TestDependencyGraphOnAliasingArrayAccesses(&scheduler);
 }
 #endif
diff --git a/compiler/optimizing/stack_map_stream.h b/compiler/optimizing/stack_map_stream.h
index de79f4921e..9d4598d3d9 100644
--- a/compiler/optimizing/stack_map_stream.h
+++ b/compiler/optimizing/stack_map_stream.h
@@ -34,7 +34,7 @@ namespace art {
  * Collects and builds stack maps for a method. All the stack maps
  * for a method are placed in a CodeInfo object.
  */
-class StackMapStream : public ValueObject {
+class StackMapStream : public DeletableArenaObject<kArenaAllocStackMapStream> {
 public:
  explicit StackMapStream(ScopedArenaAllocator* allocator, InstructionSet instruction_set)
      : instruction_set_(instruction_set),
@@ -53,6 +53,7 @@ class StackMapStream : public ValueObject {
        current_dex_registers_(allocator->Adapter(kArenaAllocStackMapStream)),
        previous_dex_registers_(allocator->Adapter(kArenaAllocStackMapStream)),
        dex_register_timestamp_(allocator->Adapter(kArenaAllocStackMapStream)),
+       expected_num_dex_registers_(0u),
        temp_dex_register_mask_(allocator, 32, true, kArenaAllocStackMapStream),
        temp_dex_register_map_(allocator->Adapter(kArenaAllocStackMapStream)) {
  }