summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Vladimir Marko <vmarko@google.com> 2018-07-26 14:42:17 +0100
committer Vladimir Marko <vmarko@google.com> 2018-08-02 15:47:02 +0100
commit ced04835d8e3cd3f68576cfffc1d21283ca151b4 (patch)
tree 125ddd1d222f4fb1710e17c76803ad3e92572a5c
parent 350b6a312222b9b27bfee0e72ce261a45cb60e1c (diff)
Reuse arena memory for each block in scheduler.

This reduces the peak memory used for large methods with multiple blocks to schedule.

Compiling the aosp_taimen-userdebug boot image, the most memory hungry method BatteryStats.dumpLocked has the Scheduler memory allocations in ArenaStack hidden by the register allocator:
  - before:
    MEM: used: 8300224, allocated: 9175040, lost: 197360
    Scheduler 8300224
  - after:
    MEM: used: 5914296, allocated: 7864320, lost: 78200
    SsaLiveness 5532840
    RegAllocator 144968
    RegAllocVldt 236488
The total arena memory used, including the ArenaAllocator not listed above, goes from 44333648 to 41950324 (-5.4%). (Measured with kArenaAllocatorCountAllocations=true, kArenaAllocatorPreciseTracking=false.)

Also remove one unnecessary -Wframe-larger-than= workaround and add one workaround for large frame with the above arena alloc tracking flags.

Test: m test-art-host-gtest
Test: testrunner.py --host
Bug: 34053922
Change-Id: I7fd8d90dcc13b184b1e5bd0bcac072388710a129

-rw-r--r-- compiler/driver/compiler_options.cc 5
-rw-r--r-- compiler/optimizing/optimizing_compiler.cc 13
-rw-r--r-- compiler/optimizing/scheduler.cc 49
-rw-r--r-- compiler/optimizing/scheduler.h 42
-rw-r--r-- compiler/optimizing/scheduler_arm.h 5
-rw-r--r-- compiler/optimizing/scheduler_arm64.h 4
-rw-r--r-- compiler/optimizing/scheduler_test.cc 15
-rw-r--r-- compiler/optimizing/stack_map_stream.h 3
8 files changed, 63 insertions, 73 deletions
diff --git a/compiler/driver/compiler_options.cc b/compiler/driver/compiler_options.cc
index 62d547de44..8cc6cf10f0 100644
--- a/compiler/driver/compiler_options.cc
+++ b/compiler/driver/compiler_options.cc
@@ -116,9 +116,6 @@ bool CompilerOptions::ParseRegisterAllocationStrategy(const std::string& option,
return true;
}
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wframe-larger-than="
-
bool CompilerOptions::ParseCompilerOptions(const std::vector<std::string>& options,
bool ignore_unrecognized,
std::string* error_msg) {
@@ -133,8 +130,6 @@ bool CompilerOptions::ParseCompilerOptions(const std::vector<std::string>& optio
return ReadCompilerOptions(args, this, error_msg);
}
-#pragma GCC diagnostic pop
-
bool CompilerOptions::IsImageClass(const char* descriptor) const {
// Historical note: We used to hold the set indirectly and there was a distinction between an
// empty set and a null, null meaning to include all classes. However, the distiction has been
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index d96746fdd7..b2733ee1f2 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -1101,15 +1101,18 @@ static void CreateJniStackMap(ArenaStack* arena_stack,
const JniCompiledMethod& jni_compiled_method,
/* out */ ArenaVector<uint8_t>* stack_map) {
ScopedArenaAllocator allocator(arena_stack);
- StackMapStream stack_map_stream(&allocator, jni_compiled_method.GetInstructionSet());
- stack_map_stream.BeginMethod(
+ // StackMapStream is quite large, so allocate it using the ScopedArenaAllocator
+ // to stay clear of the frame size limit.
+ std::unique_ptr<StackMapStream> stack_map_stream(
+ new (&allocator) StackMapStream(&allocator, jni_compiled_method.GetInstructionSet()));
+ stack_map_stream->BeginMethod(
jni_compiled_method.GetFrameSize(),
jni_compiled_method.GetCoreSpillMask(),
jni_compiled_method.GetFpSpillMask(),
/* num_dex_registers */ 0);
- stack_map_stream.EndMethod();
- stack_map->resize(stack_map_stream.PrepareForFillIn());
- stack_map_stream.FillInCodeInfo(MemoryRegion(stack_map->data(), stack_map->size()));
+ stack_map_stream->EndMethod();
+ stack_map->resize(stack_map_stream->PrepareForFillIn());
+ stack_map_stream->FillInCodeInfo(MemoryRegion(stack_map->data(), stack_map->size()));
}
CompiledMethod* OptimizingCompiler::JniCompile(uint32_t access_flags,
diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc
index 588ea03d69..1aa16f45bc 100644
--- a/compiler/optimizing/scheduler.cc
+++ b/compiler/optimizing/scheduler.cc
@@ -545,60 +545,67 @@ SchedulingNode* CriticalPathSchedulingNodeSelector::GetHigherPrioritySchedulingN
void HScheduler::Schedule(HGraph* graph) {
// We run lsa here instead of in a separate pass to better control whether we
// should run the analysis or not.
+ const HeapLocationCollector* heap_location_collector = nullptr;
LoadStoreAnalysis lsa(graph);
if (!only_optimize_loop_blocks_ || graph->HasLoops()) {
lsa.Run();
- scheduling_graph_.SetHeapLocationCollector(lsa.GetHeapLocationCollector());
+ heap_location_collector = &lsa.GetHeapLocationCollector();
}
for (HBasicBlock* block : graph->GetReversePostOrder()) {
if (IsSchedulable(block)) {
- Schedule(block);
+ Schedule(block, heap_location_collector);
}
}
}
-void HScheduler::Schedule(HBasicBlock* block) {
- ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator_->Adapter(kArenaAllocScheduler));
+void HScheduler::Schedule(HBasicBlock* block,
+ const HeapLocationCollector* heap_location_collector) {
+ ScopedArenaAllocator allocator(block->GetGraph()->GetArenaStack());
+ ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator.Adapter(kArenaAllocScheduler));
// Build the scheduling graph.
- scheduling_graph_.Clear();
+ SchedulingGraph scheduling_graph(this, &allocator, heap_location_collector);
for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
HInstruction* instruction = it.Current();
CHECK_EQ(instruction->GetBlock(), block)
<< instruction->DebugName()
<< " is in block " << instruction->GetBlock()->GetBlockId()
<< ", and expected in block " << block->GetBlockId();
- SchedulingNode* node = scheduling_graph_.AddNode(instruction, IsSchedulingBarrier(instruction));
+ SchedulingNode* node = scheduling_graph.AddNode(instruction, IsSchedulingBarrier(instruction));
CalculateLatency(node);
scheduling_nodes.push_back(node);
}
- if (scheduling_graph_.Size() <= 1) {
- scheduling_graph_.Clear();
+ if (scheduling_graph.Size() <= 1) {
return;
}
cursor_ = block->GetLastInstruction();
+ // The list of candidates for scheduling. A node becomes a candidate when all
+ // its predecessors have been scheduled.
+ ScopedArenaVector<SchedulingNode*> candidates(allocator.Adapter(kArenaAllocScheduler));
+
// Find the initial candidates for scheduling.
- candidates_.clear();
for (SchedulingNode* node : scheduling_nodes) {
if (!node->HasUnscheduledSuccessors()) {
node->MaybeUpdateCriticalPath(node->GetLatency());
- candidates_.push_back(node);
+ candidates.push_back(node);
}
}
- ScopedArenaVector<SchedulingNode*> initial_candidates(allocator_->Adapter(kArenaAllocScheduler));
+ ScopedArenaVector<SchedulingNode*> initial_candidates(allocator.Adapter(kArenaAllocScheduler));
if (kDumpDotSchedulingGraphs) {
// Remember the list of initial candidates for debug output purposes.
- initial_candidates.assign(candidates_.begin(), candidates_.end());
+ initial_candidates.assign(candidates.begin(), candidates.end());
}
// Schedule all nodes.
- while (!candidates_.empty()) {
- Schedule(selector_->PopHighestPriorityNode(&candidates_, scheduling_graph_));
+ selector_->Reset();
+ while (!candidates.empty()) {
+ SchedulingNode* node = selector_->PopHighestPriorityNode(&candidates, scheduling_graph);
+ Schedule(node, &candidates);
}
if (kDumpDotSchedulingGraphs) {
@@ -607,11 +614,12 @@ void HScheduler::Schedule(HBasicBlock* block) {
std::stringstream description;
description << graph->GetDexFile().PrettyMethod(graph->GetMethodIdx())
<< " B" << block->GetBlockId();
- scheduling_graph_.DumpAsDotGraph(description.str(), initial_candidates);
+ scheduling_graph.DumpAsDotGraph(description.str(), initial_candidates);
}
}
-void HScheduler::Schedule(SchedulingNode* scheduling_node) {
+void HScheduler::Schedule(SchedulingNode* scheduling_node,
+ /*inout*/ ScopedArenaVector<SchedulingNode*>* candidates) {
// Check whether any of the node's predecessors will be valid candidates after
// this node is scheduled.
uint32_t path_to_node = scheduling_node->GetCriticalPath();
@@ -620,7 +628,7 @@ void HScheduler::Schedule(SchedulingNode* scheduling_node) {
path_to_node + predecessor->GetInternalLatency() + predecessor->GetLatency());
predecessor->DecrementNumberOfUnscheduledSuccessors();
if (!predecessor->HasUnscheduledSuccessors()) {
- candidates_.push_back(predecessor);
+ candidates->push_back(predecessor);
}
}
for (SchedulingNode* predecessor : scheduling_node->GetOtherPredecessors()) {
@@ -630,7 +638,7 @@ void HScheduler::Schedule(SchedulingNode* scheduling_node) {
// correctness. So we do not use them to compute the critical path.
predecessor->DecrementNumberOfUnscheduledSuccessors();
if (!predecessor->HasUnscheduledSuccessors()) {
- candidates_.push_back(predecessor);
+ candidates->push_back(predecessor);
}
}
@@ -779,7 +787,6 @@ bool HInstructionScheduling::Run(bool only_optimize_loop_blocks,
#if defined(ART_ENABLE_CODEGEN_arm64) || defined(ART_ENABLE_CODEGEN_arm)
// Phase-local allocator that allocates scheduler internal data structures like
// scheduling nodes, internel nodes map, dependencies, etc.
- ScopedArenaAllocator allocator(graph_->GetArenaStack());
CriticalPathSchedulingNodeSelector critical_path_selector;
RandomSchedulingNodeSelector random_selector;
SchedulingNodeSelector* selector = schedule_randomly
@@ -795,7 +802,7 @@ bool HInstructionScheduling::Run(bool only_optimize_loop_blocks,
switch (instruction_set_) {
#ifdef ART_ENABLE_CODEGEN_arm64
case InstructionSet::kArm64: {
- arm64::HSchedulerARM64 scheduler(&allocator, selector);
+ arm64::HSchedulerARM64 scheduler(selector);
scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
scheduler.Schedule(graph_);
break;
@@ -805,7 +812,7 @@ bool HInstructionScheduling::Run(bool only_optimize_loop_blocks,
case InstructionSet::kThumb2:
case InstructionSet::kArm: {
arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen_);
- arm::HSchedulerARM scheduler(&allocator, selector, &arm_latency_visitor);
+ arm::HSchedulerARM scheduler(selector, &arm_latency_visitor);
scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
scheduler.Schedule(graph_);
break;
diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h
index c7683e04a7..fd48d844e6 100644
--- a/compiler/optimizing/scheduler.h
+++ b/compiler/optimizing/scheduler.h
@@ -251,12 +251,14 @@ class SchedulingNode : public DeletableArenaObject<kArenaAllocScheduler> {
*/
class SchedulingGraph : public ValueObject {
public:
- SchedulingGraph(const HScheduler* scheduler, ScopedArenaAllocator* allocator)
+ SchedulingGraph(const HScheduler* scheduler,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector)
: scheduler_(scheduler),
allocator_(allocator),
contains_scheduling_barrier_(false),
nodes_map_(allocator_->Adapter(kArenaAllocScheduler)),
- heap_location_collector_(nullptr) {}
+ heap_location_collector_(heap_location_collector) {}
SchedulingNode* AddNode(HInstruction* instr, bool is_scheduling_barrier = false) {
std::unique_ptr<SchedulingNode> node(
@@ -268,15 +270,6 @@ class SchedulingGraph : public ValueObject {
return result;
}
- void Clear() {
- nodes_map_.clear();
- contains_scheduling_barrier_ = false;
- }
-
- void SetHeapLocationCollector(const HeapLocationCollector& heap_location_collector) {
- heap_location_collector_ = &heap_location_collector;
- }
-
SchedulingNode* GetNode(const HInstruction* instr) const {
auto it = nodes_map_.find(instr);
if (it == nodes_map_.end()) {
@@ -329,7 +322,7 @@ class SchedulingGraph : public ValueObject {
ScopedArenaHashMap<const HInstruction*, std::unique_ptr<SchedulingNode>> nodes_map_;
- const HeapLocationCollector* heap_location_collector_;
+ const HeapLocationCollector* const heap_location_collector_;
};
/*
@@ -377,6 +370,7 @@ class SchedulingLatencyVisitor : public HGraphDelegateVisitor {
class SchedulingNodeSelector : public ArenaObject<kArenaAllocScheduler> {
public:
+ virtual void Reset() {}
virtual SchedulingNode* PopHighestPriorityNode(ScopedArenaVector<SchedulingNode*>* nodes,
const SchedulingGraph& graph) = 0;
virtual ~SchedulingNodeSelector() {}
@@ -418,6 +412,7 @@ class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
public:
CriticalPathSchedulingNodeSelector() : prev_select_(nullptr) {}
+ void Reset() OVERRIDE { prev_select_ = nullptr; }
SchedulingNode* PopHighestPriorityNode(ScopedArenaVector<SchedulingNode*>* nodes,
const SchedulingGraph& graph) OVERRIDE;
@@ -434,16 +429,11 @@ class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
class HScheduler {
public:
- HScheduler(ScopedArenaAllocator* allocator,
- SchedulingLatencyVisitor* latency_visitor,
- SchedulingNodeSelector* selector)
- : allocator_(allocator),
- latency_visitor_(latency_visitor),
+ HScheduler(SchedulingLatencyVisitor* latency_visitor, SchedulingNodeSelector* selector)
+ : latency_visitor_(latency_visitor),
selector_(selector),
only_optimize_loop_blocks_(true),
- scheduling_graph_(this, allocator),
- cursor_(nullptr),
- candidates_(allocator_->Adapter(kArenaAllocScheduler)) {}
+ cursor_(nullptr) {}
virtual ~HScheduler() {}
void Schedule(HGraph* graph);
@@ -454,8 +444,9 @@ class HScheduler {
virtual bool IsSchedulingBarrier(const HInstruction* instruction) const;
protected:
- void Schedule(HBasicBlock* block);
- void Schedule(SchedulingNode* scheduling_node);
+ void Schedule(HBasicBlock* block, const HeapLocationCollector* heap_location_collector);
+ void Schedule(SchedulingNode* scheduling_node,
+ /*inout*/ ScopedArenaVector<SchedulingNode*>* candidates);
void Schedule(HInstruction* instruction);
// Any instruction returning `false` via this method will prevent its
@@ -476,19 +467,12 @@ class HScheduler {
node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency());
}
- ScopedArenaAllocator* const allocator_;
SchedulingLatencyVisitor* const latency_visitor_;
SchedulingNodeSelector* const selector_;
bool only_optimize_loop_blocks_;
- // We instantiate the members below as part of this class to avoid
- // instantiating them locally for every chunk scheduled.
- SchedulingGraph scheduling_graph_;
// A pointer indicating where the next instruction to be scheduled will be inserted.
HInstruction* cursor_;
- // The list of candidates for scheduling. A node becomes a candidate when all
- // its predecessors have been scheduled.
- ScopedArenaVector<SchedulingNode*> candidates_;
private:
DISALLOW_COPY_AND_ASSIGN(HScheduler);
diff --git a/compiler/optimizing/scheduler_arm.h b/compiler/optimizing/scheduler_arm.h
index 0cb8684376..2f369486b3 100644
--- a/compiler/optimizing/scheduler_arm.h
+++ b/compiler/optimizing/scheduler_arm.h
@@ -137,10 +137,9 @@ class SchedulingLatencyVisitorARM : public SchedulingLatencyVisitor {
class HSchedulerARM : public HScheduler {
public:
- HSchedulerARM(ScopedArenaAllocator* allocator,
- SchedulingNodeSelector* selector,
+ HSchedulerARM(SchedulingNodeSelector* selector,
SchedulingLatencyVisitorARM* arm_latency_visitor)
- : HScheduler(allocator, arm_latency_visitor, selector) {}
+ : HScheduler(arm_latency_visitor, selector) {}
~HSchedulerARM() OVERRIDE {}
bool IsSchedulable(const HInstruction* instruction) const OVERRIDE {
diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h
index 4f394d5e16..0d2f8d9fa0 100644
--- a/compiler/optimizing/scheduler_arm64.h
+++ b/compiler/optimizing/scheduler_arm64.h
@@ -134,8 +134,8 @@ class SchedulingLatencyVisitorARM64 : public SchedulingLatencyVisitor {
class HSchedulerARM64 : public HScheduler {
public:
- HSchedulerARM64(ScopedArenaAllocator* allocator, SchedulingNodeSelector* selector)
- : HScheduler(allocator, &arm64_latency_visitor_, selector) {}
+ explicit HSchedulerARM64(SchedulingNodeSelector* selector)
+ : HScheduler(&arm64_latency_visitor_, selector) {}
~HSchedulerARM64() OVERRIDE {}
bool IsSchedulable(const HInstruction* instruction) const OVERRIDE {
diff --git a/compiler/optimizing/scheduler_test.cc b/compiler/optimizing/scheduler_test.cc
index 7079e07ae1..fe23fb4cff 100644
--- a/compiler/optimizing/scheduler_test.cc
+++ b/compiler/optimizing/scheduler_test.cc
@@ -146,7 +146,9 @@ class SchedulerTest : public OptimizingUnitTest {
environment->SetRawEnvAt(1, mul);
mul->AddEnvUseAt(div_check->GetEnvironment(), 1);
- SchedulingGraph scheduling_graph(scheduler, GetScopedAllocator());
+ SchedulingGraph scheduling_graph(scheduler,
+ GetScopedAllocator(),
+ /* heap_location_collector */ nullptr);
// Instructions must be inserted in reverse order into the scheduling graph.
for (HInstruction* instr : ReverseRange(block_instructions)) {
scheduling_graph.AddNode(instr);
@@ -276,11 +278,10 @@ class SchedulerTest : public OptimizingUnitTest {
entry->AddInstruction(instr);
}
- SchedulingGraph scheduling_graph(scheduler, GetScopedAllocator());
HeapLocationCollector heap_location_collector(graph_);
heap_location_collector.VisitBasicBlock(entry);
heap_location_collector.BuildAliasingMatrix();
- scheduling_graph.SetHeapLocationCollector(heap_location_collector);
+ SchedulingGraph scheduling_graph(scheduler, GetScopedAllocator(), &heap_location_collector);
for (HInstruction* instr : ReverseRange(block_instructions)) {
// Build scheduling graph with memory access aliasing information
@@ -354,13 +355,13 @@ class SchedulerTest : public OptimizingUnitTest {
#if defined(ART_ENABLE_CODEGEN_arm64)
TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM64) {
CriticalPathSchedulingNodeSelector critical_path_selector;
- arm64::HSchedulerARM64 scheduler(GetScopedAllocator(), &critical_path_selector);
+ arm64::HSchedulerARM64 scheduler(&critical_path_selector);
TestBuildDependencyGraphAndSchedule(&scheduler);
}
TEST_F(SchedulerTest, ArrayAccessAliasingARM64) {
CriticalPathSchedulingNodeSelector critical_path_selector;
- arm64::HSchedulerARM64 scheduler(GetScopedAllocator(), &critical_path_selector);
+ arm64::HSchedulerARM64 scheduler(&critical_path_selector);
TestDependencyGraphOnAliasingArrayAccesses(&scheduler);
}
#endif
@@ -369,14 +370,14 @@ TEST_F(SchedulerTest, ArrayAccessAliasingARM64) {
TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM) {
CriticalPathSchedulingNodeSelector critical_path_selector;
arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr);
- arm::HSchedulerARM scheduler(GetScopedAllocator(), &critical_path_selector, &arm_latency_visitor);
+ arm::HSchedulerARM scheduler(&critical_path_selector, &arm_latency_visitor);
TestBuildDependencyGraphAndSchedule(&scheduler);
}
TEST_F(SchedulerTest, ArrayAccessAliasingARM) {
CriticalPathSchedulingNodeSelector critical_path_selector;
arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr);
- arm::HSchedulerARM scheduler(GetScopedAllocator(), &critical_path_selector, &arm_latency_visitor);
+ arm::HSchedulerARM scheduler(&critical_path_selector, &arm_latency_visitor);
TestDependencyGraphOnAliasingArrayAccesses(&scheduler);
}
#endif
diff --git a/compiler/optimizing/stack_map_stream.h b/compiler/optimizing/stack_map_stream.h
index de79f4921e..9d4598d3d9 100644
--- a/compiler/optimizing/stack_map_stream.h
+++ b/compiler/optimizing/stack_map_stream.h
@@ -34,7 +34,7 @@ namespace art {
* Collects and builds stack maps for a method. All the stack maps
* for a method are placed in a CodeInfo object.
*/
-class StackMapStream : public ValueObject {
+class StackMapStream : public DeletableArenaObject<kArenaAllocStackMapStream> {
public:
explicit StackMapStream(ScopedArenaAllocator* allocator, InstructionSet instruction_set)
: instruction_set_(instruction_set),
@@ -53,6 +53,7 @@ class StackMapStream : public ValueObject {
current_dex_registers_(allocator->Adapter(kArenaAllocStackMapStream)),
previous_dex_registers_(allocator->Adapter(kArenaAllocStackMapStream)),
dex_register_timestamp_(allocator->Adapter(kArenaAllocStackMapStream)),
+ expected_num_dex_registers_(0u),
temp_dex_register_mask_(allocator, 32, true, kArenaAllocStackMapStream),
temp_dex_register_map_(allocator->Adapter(kArenaAllocStackMapStream)) {
}