Reuse arena memory for each block in scheduler.
This reduces the peak memory used for large methods with
multiple blocks to schedule.
Compiling the aosp_taimen-userdebug boot image, the most
memory hungry method BatteryStats.dumpLocked has the
Scheduler memory allocations in ArenaStack hidden by the
register allocator:
- before:
MEM: used: 8300224, allocated: 9175040, lost: 197360
Scheduler 8300224
- after:
MEM: used: 5914296, allocated: 7864320, lost: 78200
SsaLiveness 5532840
RegAllocator 144968
RegAllocVldt 236488
The total arena memory used, including the ArenaAllocator
not listed above, goes from 44333648 to 41950324 (-5.4%).
(Measured with kArenaAllocatorCountAllocations=true,
kArenaAllocatorPreciseTracking=false.)
Also remove one unnecessary -Wframe-larger-than= workaround
and add one workaround for large frame with the above arena
alloc tracking flags.
Test: m test-art-host-gtest
Test: testrunner.py --host
Bug: 34053922
Change-Id: I7fd8d90dcc13b184b1e5bd0bcac072388710a129
diff --git a/compiler/driver/compiler_options.cc b/compiler/driver/compiler_options.cc
index 62d547d..8cc6cf1 100644
--- a/compiler/driver/compiler_options.cc
+++ b/compiler/driver/compiler_options.cc
@@ -116,9 +116,6 @@
return true;
}
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wframe-larger-than="
-
bool CompilerOptions::ParseCompilerOptions(const std::vector<std::string>& options,
bool ignore_unrecognized,
std::string* error_msg) {
@@ -133,8 +130,6 @@
return ReadCompilerOptions(args, this, error_msg);
}
-#pragma GCC diagnostic pop
-
bool CompilerOptions::IsImageClass(const char* descriptor) const {
// Historical note: We used to hold the set indirectly and there was a distinction between an
// empty set and a null, null meaning to include all classes. However, the distiction has been
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index d96746f..b2733ee 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -1101,15 +1101,18 @@
const JniCompiledMethod& jni_compiled_method,
/* out */ ArenaVector<uint8_t>* stack_map) {
ScopedArenaAllocator allocator(arena_stack);
- StackMapStream stack_map_stream(&allocator, jni_compiled_method.GetInstructionSet());
- stack_map_stream.BeginMethod(
+ // StackMapStream is quite large, so allocate it using the ScopedArenaAllocator
+ // to stay clear of the frame size limit.
+ std::unique_ptr<StackMapStream> stack_map_stream(
+ new (&allocator) StackMapStream(&allocator, jni_compiled_method.GetInstructionSet()));
+ stack_map_stream->BeginMethod(
jni_compiled_method.GetFrameSize(),
jni_compiled_method.GetCoreSpillMask(),
jni_compiled_method.GetFpSpillMask(),
/* num_dex_registers */ 0);
- stack_map_stream.EndMethod();
- stack_map->resize(stack_map_stream.PrepareForFillIn());
- stack_map_stream.FillInCodeInfo(MemoryRegion(stack_map->data(), stack_map->size()));
+ stack_map_stream->EndMethod();
+ stack_map->resize(stack_map_stream->PrepareForFillIn());
+ stack_map_stream->FillInCodeInfo(MemoryRegion(stack_map->data(), stack_map->size()));
}
CompiledMethod* OptimizingCompiler::JniCompile(uint32_t access_flags,
diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc
index 588ea03..1aa16f4 100644
--- a/compiler/optimizing/scheduler.cc
+++ b/compiler/optimizing/scheduler.cc
@@ -545,60 +545,67 @@
void HScheduler::Schedule(HGraph* graph) {
// We run lsa here instead of in a separate pass to better control whether we
// should run the analysis or not.
+ const HeapLocationCollector* heap_location_collector = nullptr;
LoadStoreAnalysis lsa(graph);
if (!only_optimize_loop_blocks_ || graph->HasLoops()) {
lsa.Run();
- scheduling_graph_.SetHeapLocationCollector(lsa.GetHeapLocationCollector());
+ heap_location_collector = &lsa.GetHeapLocationCollector();
}
for (HBasicBlock* block : graph->GetReversePostOrder()) {
if (IsSchedulable(block)) {
- Schedule(block);
+ Schedule(block, heap_location_collector);
}
}
}
-void HScheduler::Schedule(HBasicBlock* block) {
- ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator_->Adapter(kArenaAllocScheduler));
+void HScheduler::Schedule(HBasicBlock* block,
+ const HeapLocationCollector* heap_location_collector) {
+ ScopedArenaAllocator allocator(block->GetGraph()->GetArenaStack());
+ ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator.Adapter(kArenaAllocScheduler));
// Build the scheduling graph.
- scheduling_graph_.Clear();
+ SchedulingGraph scheduling_graph(this, &allocator, heap_location_collector);
for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
HInstruction* instruction = it.Current();
CHECK_EQ(instruction->GetBlock(), block)
<< instruction->DebugName()
<< " is in block " << instruction->GetBlock()->GetBlockId()
<< ", and expected in block " << block->GetBlockId();
- SchedulingNode* node = scheduling_graph_.AddNode(instruction, IsSchedulingBarrier(instruction));
+ SchedulingNode* node = scheduling_graph.AddNode(instruction, IsSchedulingBarrier(instruction));
CalculateLatency(node);
scheduling_nodes.push_back(node);
}
- if (scheduling_graph_.Size() <= 1) {
- scheduling_graph_.Clear();
+ if (scheduling_graph.Size() <= 1) {
return;
}
cursor_ = block->GetLastInstruction();
+ // The list of candidates for scheduling. A node becomes a candidate when all
+ // its predecessors have been scheduled.
+ ScopedArenaVector<SchedulingNode*> candidates(allocator.Adapter(kArenaAllocScheduler));
+
// Find the initial candidates for scheduling.
- candidates_.clear();
for (SchedulingNode* node : scheduling_nodes) {
if (!node->HasUnscheduledSuccessors()) {
node->MaybeUpdateCriticalPath(node->GetLatency());
- candidates_.push_back(node);
+ candidates.push_back(node);
}
}
- ScopedArenaVector<SchedulingNode*> initial_candidates(allocator_->Adapter(kArenaAllocScheduler));
+ ScopedArenaVector<SchedulingNode*> initial_candidates(allocator.Adapter(kArenaAllocScheduler));
if (kDumpDotSchedulingGraphs) {
// Remember the list of initial candidates for debug output purposes.
- initial_candidates.assign(candidates_.begin(), candidates_.end());
+ initial_candidates.assign(candidates.begin(), candidates.end());
}
// Schedule all nodes.
- while (!candidates_.empty()) {
- Schedule(selector_->PopHighestPriorityNode(&candidates_, scheduling_graph_));
+ selector_->Reset();
+ while (!candidates.empty()) {
+ SchedulingNode* node = selector_->PopHighestPriorityNode(&candidates, scheduling_graph);
+ Schedule(node, &candidates);
}
if (kDumpDotSchedulingGraphs) {
@@ -607,11 +614,12 @@
std::stringstream description;
description << graph->GetDexFile().PrettyMethod(graph->GetMethodIdx())
<< " B" << block->GetBlockId();
- scheduling_graph_.DumpAsDotGraph(description.str(), initial_candidates);
+ scheduling_graph.DumpAsDotGraph(description.str(), initial_candidates);
}
}
-void HScheduler::Schedule(SchedulingNode* scheduling_node) {
+void HScheduler::Schedule(SchedulingNode* scheduling_node,
+ /*inout*/ ScopedArenaVector<SchedulingNode*>* candidates) {
// Check whether any of the node's predecessors will be valid candidates after
// this node is scheduled.
uint32_t path_to_node = scheduling_node->GetCriticalPath();
@@ -620,7 +628,7 @@
path_to_node + predecessor->GetInternalLatency() + predecessor->GetLatency());
predecessor->DecrementNumberOfUnscheduledSuccessors();
if (!predecessor->HasUnscheduledSuccessors()) {
- candidates_.push_back(predecessor);
+ candidates->push_back(predecessor);
}
}
for (SchedulingNode* predecessor : scheduling_node->GetOtherPredecessors()) {
@@ -630,7 +638,7 @@
// correctness. So we do not use them to compute the critical path.
predecessor->DecrementNumberOfUnscheduledSuccessors();
if (!predecessor->HasUnscheduledSuccessors()) {
- candidates_.push_back(predecessor);
+ candidates->push_back(predecessor);
}
}
@@ -779,7 +787,6 @@
#if defined(ART_ENABLE_CODEGEN_arm64) || defined(ART_ENABLE_CODEGEN_arm)
// Phase-local allocator that allocates scheduler internal data structures like
// scheduling nodes, internel nodes map, dependencies, etc.
- ScopedArenaAllocator allocator(graph_->GetArenaStack());
CriticalPathSchedulingNodeSelector critical_path_selector;
RandomSchedulingNodeSelector random_selector;
SchedulingNodeSelector* selector = schedule_randomly
@@ -795,7 +802,7 @@
switch (instruction_set_) {
#ifdef ART_ENABLE_CODEGEN_arm64
case InstructionSet::kArm64: {
- arm64::HSchedulerARM64 scheduler(&allocator, selector);
+ arm64::HSchedulerARM64 scheduler(selector);
scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
scheduler.Schedule(graph_);
break;
@@ -805,7 +812,7 @@
case InstructionSet::kThumb2:
case InstructionSet::kArm: {
arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen_);
- arm::HSchedulerARM scheduler(&allocator, selector, &arm_latency_visitor);
+ arm::HSchedulerARM scheduler(selector, &arm_latency_visitor);
scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
scheduler.Schedule(graph_);
break;
diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h
index c7683e0..fd48d84 100644
--- a/compiler/optimizing/scheduler.h
+++ b/compiler/optimizing/scheduler.h
@@ -251,12 +251,14 @@
*/
class SchedulingGraph : public ValueObject {
public:
- SchedulingGraph(const HScheduler* scheduler, ScopedArenaAllocator* allocator)
+ SchedulingGraph(const HScheduler* scheduler,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector)
: scheduler_(scheduler),
allocator_(allocator),
contains_scheduling_barrier_(false),
nodes_map_(allocator_->Adapter(kArenaAllocScheduler)),
- heap_location_collector_(nullptr) {}
+ heap_location_collector_(heap_location_collector) {}
SchedulingNode* AddNode(HInstruction* instr, bool is_scheduling_barrier = false) {
std::unique_ptr<SchedulingNode> node(
@@ -268,15 +270,6 @@
return result;
}
- void Clear() {
- nodes_map_.clear();
- contains_scheduling_barrier_ = false;
- }
-
- void SetHeapLocationCollector(const HeapLocationCollector& heap_location_collector) {
- heap_location_collector_ = &heap_location_collector;
- }
-
SchedulingNode* GetNode(const HInstruction* instr) const {
auto it = nodes_map_.find(instr);
if (it == nodes_map_.end()) {
@@ -329,7 +322,7 @@
ScopedArenaHashMap<const HInstruction*, std::unique_ptr<SchedulingNode>> nodes_map_;
- const HeapLocationCollector* heap_location_collector_;
+ const HeapLocationCollector* const heap_location_collector_;
};
/*
@@ -377,6 +370,7 @@
class SchedulingNodeSelector : public ArenaObject<kArenaAllocScheduler> {
public:
+ virtual void Reset() {}
virtual SchedulingNode* PopHighestPriorityNode(ScopedArenaVector<SchedulingNode*>* nodes,
const SchedulingGraph& graph) = 0;
virtual ~SchedulingNodeSelector() {}
@@ -418,6 +412,7 @@
public:
CriticalPathSchedulingNodeSelector() : prev_select_(nullptr) {}
+ void Reset() OVERRIDE { prev_select_ = nullptr; }
SchedulingNode* PopHighestPriorityNode(ScopedArenaVector<SchedulingNode*>* nodes,
const SchedulingGraph& graph) OVERRIDE;
@@ -434,16 +429,11 @@
class HScheduler {
public:
- HScheduler(ScopedArenaAllocator* allocator,
- SchedulingLatencyVisitor* latency_visitor,
- SchedulingNodeSelector* selector)
- : allocator_(allocator),
- latency_visitor_(latency_visitor),
+ HScheduler(SchedulingLatencyVisitor* latency_visitor, SchedulingNodeSelector* selector)
+ : latency_visitor_(latency_visitor),
selector_(selector),
only_optimize_loop_blocks_(true),
- scheduling_graph_(this, allocator),
- cursor_(nullptr),
- candidates_(allocator_->Adapter(kArenaAllocScheduler)) {}
+ cursor_(nullptr) {}
virtual ~HScheduler() {}
void Schedule(HGraph* graph);
@@ -454,8 +444,9 @@
virtual bool IsSchedulingBarrier(const HInstruction* instruction) const;
protected:
- void Schedule(HBasicBlock* block);
- void Schedule(SchedulingNode* scheduling_node);
+ void Schedule(HBasicBlock* block, const HeapLocationCollector* heap_location_collector);
+ void Schedule(SchedulingNode* scheduling_node,
+ /*inout*/ ScopedArenaVector<SchedulingNode*>* candidates);
void Schedule(HInstruction* instruction);
// Any instruction returning `false` via this method will prevent its
@@ -476,19 +467,12 @@
node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency());
}
- ScopedArenaAllocator* const allocator_;
SchedulingLatencyVisitor* const latency_visitor_;
SchedulingNodeSelector* const selector_;
bool only_optimize_loop_blocks_;
- // We instantiate the members below as part of this class to avoid
- // instantiating them locally for every chunk scheduled.
- SchedulingGraph scheduling_graph_;
// A pointer indicating where the next instruction to be scheduled will be inserted.
HInstruction* cursor_;
- // The list of candidates for scheduling. A node becomes a candidate when all
- // its predecessors have been scheduled.
- ScopedArenaVector<SchedulingNode*> candidates_;
private:
DISALLOW_COPY_AND_ASSIGN(HScheduler);
diff --git a/compiler/optimizing/scheduler_arm.h b/compiler/optimizing/scheduler_arm.h
index 0cb8684..2f36948 100644
--- a/compiler/optimizing/scheduler_arm.h
+++ b/compiler/optimizing/scheduler_arm.h
@@ -137,10 +137,9 @@
class HSchedulerARM : public HScheduler {
public:
- HSchedulerARM(ScopedArenaAllocator* allocator,
- SchedulingNodeSelector* selector,
+ HSchedulerARM(SchedulingNodeSelector* selector,
SchedulingLatencyVisitorARM* arm_latency_visitor)
- : HScheduler(allocator, arm_latency_visitor, selector) {}
+ : HScheduler(arm_latency_visitor, selector) {}
~HSchedulerARM() OVERRIDE {}
bool IsSchedulable(const HInstruction* instruction) const OVERRIDE {
diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h
index 4f394d5..0d2f8d9 100644
--- a/compiler/optimizing/scheduler_arm64.h
+++ b/compiler/optimizing/scheduler_arm64.h
@@ -134,8 +134,8 @@
class HSchedulerARM64 : public HScheduler {
public:
- HSchedulerARM64(ScopedArenaAllocator* allocator, SchedulingNodeSelector* selector)
- : HScheduler(allocator, &arm64_latency_visitor_, selector) {}
+ explicit HSchedulerARM64(SchedulingNodeSelector* selector)
+ : HScheduler(&arm64_latency_visitor_, selector) {}
~HSchedulerARM64() OVERRIDE {}
bool IsSchedulable(const HInstruction* instruction) const OVERRIDE {
diff --git a/compiler/optimizing/scheduler_test.cc b/compiler/optimizing/scheduler_test.cc
index 7079e07..fe23fb4 100644
--- a/compiler/optimizing/scheduler_test.cc
+++ b/compiler/optimizing/scheduler_test.cc
@@ -146,7 +146,9 @@
environment->SetRawEnvAt(1, mul);
mul->AddEnvUseAt(div_check->GetEnvironment(), 1);
- SchedulingGraph scheduling_graph(scheduler, GetScopedAllocator());
+ SchedulingGraph scheduling_graph(scheduler,
+ GetScopedAllocator(),
+ /* heap_location_collector */ nullptr);
// Instructions must be inserted in reverse order into the scheduling graph.
for (HInstruction* instr : ReverseRange(block_instructions)) {
scheduling_graph.AddNode(instr);
@@ -276,11 +278,10 @@
entry->AddInstruction(instr);
}
- SchedulingGraph scheduling_graph(scheduler, GetScopedAllocator());
HeapLocationCollector heap_location_collector(graph_);
heap_location_collector.VisitBasicBlock(entry);
heap_location_collector.BuildAliasingMatrix();
- scheduling_graph.SetHeapLocationCollector(heap_location_collector);
+ SchedulingGraph scheduling_graph(scheduler, GetScopedAllocator(), &heap_location_collector);
for (HInstruction* instr : ReverseRange(block_instructions)) {
// Build scheduling graph with memory access aliasing information
@@ -354,13 +355,13 @@
#if defined(ART_ENABLE_CODEGEN_arm64)
TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM64) {
CriticalPathSchedulingNodeSelector critical_path_selector;
- arm64::HSchedulerARM64 scheduler(GetScopedAllocator(), &critical_path_selector);
+ arm64::HSchedulerARM64 scheduler(&critical_path_selector);
TestBuildDependencyGraphAndSchedule(&scheduler);
}
TEST_F(SchedulerTest, ArrayAccessAliasingARM64) {
CriticalPathSchedulingNodeSelector critical_path_selector;
- arm64::HSchedulerARM64 scheduler(GetScopedAllocator(), &critical_path_selector);
+ arm64::HSchedulerARM64 scheduler(&critical_path_selector);
TestDependencyGraphOnAliasingArrayAccesses(&scheduler);
}
#endif
@@ -369,14 +370,14 @@
TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM) {
CriticalPathSchedulingNodeSelector critical_path_selector;
arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr);
- arm::HSchedulerARM scheduler(GetScopedAllocator(), &critical_path_selector, &arm_latency_visitor);
+ arm::HSchedulerARM scheduler(&critical_path_selector, &arm_latency_visitor);
TestBuildDependencyGraphAndSchedule(&scheduler);
}
TEST_F(SchedulerTest, ArrayAccessAliasingARM) {
CriticalPathSchedulingNodeSelector critical_path_selector;
arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr);
- arm::HSchedulerARM scheduler(GetScopedAllocator(), &critical_path_selector, &arm_latency_visitor);
+ arm::HSchedulerARM scheduler(&critical_path_selector, &arm_latency_visitor);
TestDependencyGraphOnAliasingArrayAccesses(&scheduler);
}
#endif
diff --git a/compiler/optimizing/stack_map_stream.h b/compiler/optimizing/stack_map_stream.h
index de79f49..9d4598d 100644
--- a/compiler/optimizing/stack_map_stream.h
+++ b/compiler/optimizing/stack_map_stream.h
@@ -34,7 +34,7 @@
* Collects and builds stack maps for a method. All the stack maps
* for a method are placed in a CodeInfo object.
*/
-class StackMapStream : public ValueObject {
+class StackMapStream : public DeletableArenaObject<kArenaAllocStackMapStream> {
public:
explicit StackMapStream(ScopedArenaAllocator* allocator, InstructionSet instruction_set)
: instruction_set_(instruction_set),
@@ -53,6 +53,7 @@
current_dex_registers_(allocator->Adapter(kArenaAllocStackMapStream)),
previous_dex_registers_(allocator->Adapter(kArenaAllocStackMapStream)),
dex_register_timestamp_(allocator->Adapter(kArenaAllocStackMapStream)),
+ expected_num_dex_registers_(0u),
temp_dex_register_mask_(allocator, 32, true, kArenaAllocStackMapStream),
temp_dex_register_map_(allocator->Adapter(kArenaAllocStackMapStream)) {
}