Reuse arena memory for each block in scheduler.
This reduces the peak memory used for large methods with
multiple blocks to schedule.
Compiling the aosp_taimen-userdebug boot image, the most
memory hungry method BatteryStats.dumpLocked has the
Scheduler memory allocations in ArenaStack hidden by the
register allocator:
- before:
MEM: used: 8300224, allocated: 9175040, lost: 197360
Scheduler 8300224
- after:
MEM: used: 5914296, allocated: 7864320, lost: 78200
SsaLiveness 5532840
RegAllocator 144968
RegAllocVldt 236488
The total arena memory used, including the ArenaAllocator
not listed above, goes from 44333648 to 41950324 (-5.4%).
(Measured with kArenaAllocatorCountAllocations=true,
kArenaAllocatorPreciseTracking=false.)
Also remove one unnecessary -Wframe-larger-than= workaround
and add one workaround for large frame with the above arena
alloc tracking flags.
Test: m test-art-host-gtest
Test: testrunner.py --host
Bug: 34053922
Change-Id: I7fd8d90dcc13b184b1e5bd0bcac072388710a129
diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc
index 588ea03..1aa16f4 100644
--- a/compiler/optimizing/scheduler.cc
+++ b/compiler/optimizing/scheduler.cc
@@ -545,60 +545,67 @@
void HScheduler::Schedule(HGraph* graph) {
// We run lsa here instead of in a separate pass to better control whether we
// should run the analysis or not.
+ const HeapLocationCollector* heap_location_collector = nullptr;
LoadStoreAnalysis lsa(graph);
if (!only_optimize_loop_blocks_ || graph->HasLoops()) {
lsa.Run();
- scheduling_graph_.SetHeapLocationCollector(lsa.GetHeapLocationCollector());
+ heap_location_collector = &lsa.GetHeapLocationCollector();
}
for (HBasicBlock* block : graph->GetReversePostOrder()) {
if (IsSchedulable(block)) {
- Schedule(block);
+ Schedule(block, heap_location_collector);
}
}
}
-void HScheduler::Schedule(HBasicBlock* block) {
- ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator_->Adapter(kArenaAllocScheduler));
+void HScheduler::Schedule(HBasicBlock* block,
+ const HeapLocationCollector* heap_location_collector) {
+ ScopedArenaAllocator allocator(block->GetGraph()->GetArenaStack());
+ ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator.Adapter(kArenaAllocScheduler));
// Build the scheduling graph.
- scheduling_graph_.Clear();
+ SchedulingGraph scheduling_graph(this, &allocator, heap_location_collector);
for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
HInstruction* instruction = it.Current();
CHECK_EQ(instruction->GetBlock(), block)
<< instruction->DebugName()
<< " is in block " << instruction->GetBlock()->GetBlockId()
<< ", and expected in block " << block->GetBlockId();
- SchedulingNode* node = scheduling_graph_.AddNode(instruction, IsSchedulingBarrier(instruction));
+ SchedulingNode* node = scheduling_graph.AddNode(instruction, IsSchedulingBarrier(instruction));
CalculateLatency(node);
scheduling_nodes.push_back(node);
}
- if (scheduling_graph_.Size() <= 1) {
- scheduling_graph_.Clear();
+ if (scheduling_graph.Size() <= 1) {
return;
}
cursor_ = block->GetLastInstruction();
+ // The list of candidates for scheduling. A node becomes a candidate when all
+ // its predecessors have been scheduled.
+ ScopedArenaVector<SchedulingNode*> candidates(allocator.Adapter(kArenaAllocScheduler));
+
// Find the initial candidates for scheduling.
- candidates_.clear();
for (SchedulingNode* node : scheduling_nodes) {
if (!node->HasUnscheduledSuccessors()) {
node->MaybeUpdateCriticalPath(node->GetLatency());
- candidates_.push_back(node);
+ candidates.push_back(node);
}
}
- ScopedArenaVector<SchedulingNode*> initial_candidates(allocator_->Adapter(kArenaAllocScheduler));
+ ScopedArenaVector<SchedulingNode*> initial_candidates(allocator.Adapter(kArenaAllocScheduler));
if (kDumpDotSchedulingGraphs) {
// Remember the list of initial candidates for debug output purposes.
- initial_candidates.assign(candidates_.begin(), candidates_.end());
+ initial_candidates.assign(candidates.begin(), candidates.end());
}
// Schedule all nodes.
- while (!candidates_.empty()) {
- Schedule(selector_->PopHighestPriorityNode(&candidates_, scheduling_graph_));
+ selector_->Reset();
+ while (!candidates.empty()) {
+ SchedulingNode* node = selector_->PopHighestPriorityNode(&candidates, scheduling_graph);
+ Schedule(node, &candidates);
}
if (kDumpDotSchedulingGraphs) {
@@ -607,11 +614,12 @@
std::stringstream description;
description << graph->GetDexFile().PrettyMethod(graph->GetMethodIdx())
<< " B" << block->GetBlockId();
- scheduling_graph_.DumpAsDotGraph(description.str(), initial_candidates);
+ scheduling_graph.DumpAsDotGraph(description.str(), initial_candidates);
}
}
-void HScheduler::Schedule(SchedulingNode* scheduling_node) {
+void HScheduler::Schedule(SchedulingNode* scheduling_node,
+ /*inout*/ ScopedArenaVector<SchedulingNode*>* candidates) {
// Check whether any of the node's predecessors will be valid candidates after
// this node is scheduled.
uint32_t path_to_node = scheduling_node->GetCriticalPath();
@@ -620,7 +628,7 @@
path_to_node + predecessor->GetInternalLatency() + predecessor->GetLatency());
predecessor->DecrementNumberOfUnscheduledSuccessors();
if (!predecessor->HasUnscheduledSuccessors()) {
- candidates_.push_back(predecessor);
+ candidates->push_back(predecessor);
}
}
for (SchedulingNode* predecessor : scheduling_node->GetOtherPredecessors()) {
@@ -630,7 +638,7 @@
// correctness. So we do not use them to compute the critical path.
predecessor->DecrementNumberOfUnscheduledSuccessors();
if (!predecessor->HasUnscheduledSuccessors()) {
- candidates_.push_back(predecessor);
+ candidates->push_back(predecessor);
}
}
@@ -779,7 +787,6 @@
#if defined(ART_ENABLE_CODEGEN_arm64) || defined(ART_ENABLE_CODEGEN_arm)
// Phase-local allocator that allocates scheduler internal data structures like
// scheduling nodes, internel nodes map, dependencies, etc.
- ScopedArenaAllocator allocator(graph_->GetArenaStack());
CriticalPathSchedulingNodeSelector critical_path_selector;
RandomSchedulingNodeSelector random_selector;
SchedulingNodeSelector* selector = schedule_randomly
@@ -795,7 +802,7 @@
switch (instruction_set_) {
#ifdef ART_ENABLE_CODEGEN_arm64
case InstructionSet::kArm64: {
- arm64::HSchedulerARM64 scheduler(&allocator, selector);
+ arm64::HSchedulerARM64 scheduler(selector);
scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
scheduler.Schedule(graph_);
break;
@@ -805,7 +812,7 @@
case InstructionSet::kThumb2:
case InstructionSet::kArm: {
arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen_);
- arm::HSchedulerARM scheduler(&allocator, selector, &arm_latency_visitor);
+ arm::HSchedulerARM scheduler(selector, &arm_latency_visitor);
scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
scheduler.Schedule(graph_);
break;