author     2018-07-26 14:42:17 +0100
committer  2018-08-02 15:47:02 +0100
commit     ced04835d8e3cd3f68576cfffc1d21283ca151b4 (patch)
tree       125ddd1d222f4fb1710e17c76803ad3e92572a5c /compiler
parent     350b6a312222b9b27bfee0e72ce261a45cb60e1c (diff)
Reuse arena memory for each block in scheduler.
This reduces the peak memory used for large methods with
multiple blocks to schedule.
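In outline, the scheduler now opens a fresh ScopedArenaAllocator on the graph's
ArenaStack for every block, so the scheduling graph and candidate list built for
one block are released before the next block is processed. A minimal sketch of
the pattern, using the types from the patch (the wrapper function name is
illustrative only, not part of the change):

    void ScheduleBlocksOneArenaScopeEach(HGraph* graph) {
      for (HBasicBlock* block : graph->GetReversePostOrder()) {
        // One arena scope per block: everything allocated from `allocator` is
        // popped off the ArenaStack when it goes out of scope, so peak usage is
        // bounded by the largest block instead of the sum over all blocks.
        ScopedArenaAllocator allocator(graph->GetArenaStack());
        ScopedArenaVector<SchedulingNode*> nodes(allocator.Adapter(kArenaAllocScheduler));
        // ... build the per-block SchedulingGraph from `allocator` and schedule `block` ...
      }  // Arena memory is reused for the next block from here on.
    }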
Compiling the aosp_taimen-userdebug boot image, the most memory-hungry
method, BatteryStats.dumpLocked, has the Scheduler's memory allocations
in the ArenaStack hidden by the register allocator's:
- before:
  MEM: used: 8300224, allocated: 9175040, lost: 197360
    Scheduler     8300224
- after:
  MEM: used: 5914296, allocated: 7864320, lost: 78200
    SsaLiveness   5532840
    RegAllocator   144968
    RegAllocVldt   236488
The total arena memory used, including the ArenaAllocator
not listed above, goes from 44333648 to 41950324 (-5.4%).
(Measured with kArenaAllocatorCountAllocations=true,
kArenaAllocatorPreciseTracking=false.)
Also remove one unnecessary -Wframe-larger-than= workaround and add one
workaround for a large frame that shows up with the above arena
allocation tracking flags enabled.
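The added workaround is the arena-allocated StackMapStream in CreateJniStackMap()
(see the optimizing_compiler.cc hunk below): rather than putting the large object
on the stack, where the tracking flags push the frame over the -Wframe-larger-than=
limit, it is placement-new'd on the ScopedArenaAllocator and owned by a
std::unique_ptr, which is valid once StackMapStream derives from
DeletableArenaObject. A condensed sketch of that pattern, with placeholder
argument names standing in for the real call sites:

    ScopedArenaAllocator allocator(arena_stack);
    // StackMapStream is quite large, so keep it off the frame and on the arena.
    std::unique_ptr<StackMapStream> stream(
        new (&allocator) StackMapStream(&allocator, instruction_set));
    stream->BeginMethod(frame_size, core_spill_mask, fp_spill_mask, /* num_dex_registers */ 0);
    stream->EndMethod();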
Test: m test-art-host-gtest
Test: testrunner.py --host
Bug: 34053922
Change-Id: I7fd8d90dcc13b184b1e5bd0bcac072388710a129
Diffstat (limited to 'compiler')
-rw-r--r--  compiler/driver/compiler_options.cc         5
-rw-r--r--  compiler/optimizing/optimizing_compiler.cc  13
-rw-r--r--  compiler/optimizing/scheduler.cc            49
-rw-r--r--  compiler/optimizing/scheduler.h             42
-rw-r--r--  compiler/optimizing/scheduler_arm.h          5
-rw-r--r--  compiler/optimizing/scheduler_arm64.h        4
-rw-r--r--  compiler/optimizing/scheduler_test.cc       15
-rw-r--r--  compiler/optimizing/stack_map_stream.h       3
8 files changed, 63 insertions, 73 deletions
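Because the scheduling graph and candidate list are now block-local (see the
scheduler.cc and scheduler.h hunks below), a selector that survives across blocks
must not keep pointers into the previous block's arena, so the patch adds a
Reset() hook that HScheduler calls before scheduling each block. Roughly, as
taken from the header change, with unrelated members omitted:

    class SchedulingNodeSelector : public ArenaObject<kArenaAllocScheduler> {
     public:
      // Called once per block before any node is popped, so selectors can drop
      // cached pointers into the previous block's (already released) graph.
      virtual void Reset() {}
      // (other virtuals unchanged)
    };

    class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
     public:
      // prev_select_ pointed into the per-block arena, so clear it here.
      void Reset() OVERRIDE { prev_select_ = nullptr; }
      // (rest of the class unchanged)
    };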
diff --git a/compiler/driver/compiler_options.cc b/compiler/driver/compiler_options.cc
index 62d547de44..8cc6cf10f0 100644
--- a/compiler/driver/compiler_options.cc
+++ b/compiler/driver/compiler_options.cc
@@ -116,9 +116,6 @@ bool CompilerOptions::ParseRegisterAllocationStrategy(const std::string& option,
   return true;
 }
 
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wframe-larger-than="
-
 bool CompilerOptions::ParseCompilerOptions(const std::vector<std::string>& options,
                                            bool ignore_unrecognized,
                                            std::string* error_msg) {
@@ -133,8 +130,6 @@ bool CompilerOptions::ParseCompilerOptions(const std::vector<std::string>& optio
   return ReadCompilerOptions(args, this, error_msg);
 }
 
-#pragma GCC diagnostic pop
-
 bool CompilerOptions::IsImageClass(const char* descriptor) const {
   // Historical note: We used to hold the set indirectly and there was a distinction between an
   // empty set and a null, null meaning to include all classes. However, the distiction has been
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index d96746fdd7..b2733ee1f2 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -1101,15 +1101,18 @@ static void CreateJniStackMap(ArenaStack* arena_stack,
                               const JniCompiledMethod& jni_compiled_method,
                               /* out */ ArenaVector<uint8_t>* stack_map) {
   ScopedArenaAllocator allocator(arena_stack);
-  StackMapStream stack_map_stream(&allocator, jni_compiled_method.GetInstructionSet());
-  stack_map_stream.BeginMethod(
+  // StackMapStream is quite large, so allocate it using the ScopedArenaAllocator
+  // to stay clear of the frame size limit.
+  std::unique_ptr<StackMapStream> stack_map_stream(
+      new (&allocator) StackMapStream(&allocator, jni_compiled_method.GetInstructionSet()));
+  stack_map_stream->BeginMethod(
       jni_compiled_method.GetFrameSize(),
       jni_compiled_method.GetCoreSpillMask(),
       jni_compiled_method.GetFpSpillMask(),
       /* num_dex_registers */ 0);
-  stack_map_stream.EndMethod();
-  stack_map->resize(stack_map_stream.PrepareForFillIn());
-  stack_map_stream.FillInCodeInfo(MemoryRegion(stack_map->data(), stack_map->size()));
+  stack_map_stream->EndMethod();
+  stack_map->resize(stack_map_stream->PrepareForFillIn());
+  stack_map_stream->FillInCodeInfo(MemoryRegion(stack_map->data(), stack_map->size()));
 }
 
 CompiledMethod* OptimizingCompiler::JniCompile(uint32_t access_flags,
diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc
index 588ea03d69..1aa16f45bc 100644
--- a/compiler/optimizing/scheduler.cc
+++ b/compiler/optimizing/scheduler.cc
@@ -545,60 +545,67 @@ SchedulingNode* CriticalPathSchedulingNodeSelector::GetHigherPrioritySchedulingN
 void HScheduler::Schedule(HGraph* graph) {
   // We run lsa here instead of in a separate pass to better control whether we
   // should run the analysis or not.
+  const HeapLocationCollector* heap_location_collector = nullptr;
   LoadStoreAnalysis lsa(graph);
   if (!only_optimize_loop_blocks_ || graph->HasLoops()) {
     lsa.Run();
-    scheduling_graph_.SetHeapLocationCollector(lsa.GetHeapLocationCollector());
+    heap_location_collector = &lsa.GetHeapLocationCollector();
   }
 
   for (HBasicBlock* block : graph->GetReversePostOrder()) {
     if (IsSchedulable(block)) {
-      Schedule(block);
+      Schedule(block, heap_location_collector);
     }
   }
 }
 
-void HScheduler::Schedule(HBasicBlock* block) {
-  ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator_->Adapter(kArenaAllocScheduler));
+void HScheduler::Schedule(HBasicBlock* block,
+                          const HeapLocationCollector* heap_location_collector) {
+  ScopedArenaAllocator allocator(block->GetGraph()->GetArenaStack());
+  ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator.Adapter(kArenaAllocScheduler));
 
   // Build the scheduling graph.
-  scheduling_graph_.Clear();
+  SchedulingGraph scheduling_graph(this, &allocator, heap_location_collector);
   for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
     HInstruction* instruction = it.Current();
     CHECK_EQ(instruction->GetBlock(), block)
         << instruction->DebugName()
         << " is in block " << instruction->GetBlock()->GetBlockId()
        << ", and expected in block " << block->GetBlockId();
-    SchedulingNode* node = scheduling_graph_.AddNode(instruction, IsSchedulingBarrier(instruction));
+    SchedulingNode* node = scheduling_graph.AddNode(instruction, IsSchedulingBarrier(instruction));
     CalculateLatency(node);
     scheduling_nodes.push_back(node);
   }
 
-  if (scheduling_graph_.Size() <= 1) {
-    scheduling_graph_.Clear();
+  if (scheduling_graph.Size() <= 1) {
     return;
   }
 
   cursor_ = block->GetLastInstruction();
 
+  // The list of candidates for scheduling. A node becomes a candidate when all
+  // its predecessors have been scheduled.
+  ScopedArenaVector<SchedulingNode*> candidates(allocator.Adapter(kArenaAllocScheduler));
+
   // Find the initial candidates for scheduling.
-  candidates_.clear();
   for (SchedulingNode* node : scheduling_nodes) {
     if (!node->HasUnscheduledSuccessors()) {
       node->MaybeUpdateCriticalPath(node->GetLatency());
-      candidates_.push_back(node);
+      candidates.push_back(node);
     }
   }
 
-  ScopedArenaVector<SchedulingNode*> initial_candidates(allocator_->Adapter(kArenaAllocScheduler));
+  ScopedArenaVector<SchedulingNode*> initial_candidates(allocator.Adapter(kArenaAllocScheduler));
   if (kDumpDotSchedulingGraphs) {
     // Remember the list of initial candidates for debug output purposes.
-    initial_candidates.assign(candidates_.begin(), candidates_.end());
+    initial_candidates.assign(candidates.begin(), candidates.end());
   }
 
   // Schedule all nodes.
-  while (!candidates_.empty()) {
-    Schedule(selector_->PopHighestPriorityNode(&candidates_, scheduling_graph_));
+  selector_->Reset();
+  while (!candidates.empty()) {
+    SchedulingNode* node = selector_->PopHighestPriorityNode(&candidates, scheduling_graph);
+    Schedule(node, &candidates);
   }
 
   if (kDumpDotSchedulingGraphs) {
@@ -607,11 +614,12 @@ void HScheduler::Schedule(HBasicBlock* block) {
     std::stringstream description;
     description << graph->GetDexFile().PrettyMethod(graph->GetMethodIdx())
         << " B" << block->GetBlockId();
-    scheduling_graph_.DumpAsDotGraph(description.str(), initial_candidates);
+    scheduling_graph.DumpAsDotGraph(description.str(), initial_candidates);
   }
 }
 
-void HScheduler::Schedule(SchedulingNode* scheduling_node) {
+void HScheduler::Schedule(SchedulingNode* scheduling_node,
+                          /*inout*/ ScopedArenaVector<SchedulingNode*>* candidates) {
   // Check whether any of the node's predecessors will be valid candidates after
   // this node is scheduled.
   uint32_t path_to_node = scheduling_node->GetCriticalPath();
@@ -620,7 +628,7 @@ void HScheduler::Schedule(SchedulingNode* scheduling_node) {
         path_to_node + predecessor->GetInternalLatency() + predecessor->GetLatency());
     predecessor->DecrementNumberOfUnscheduledSuccessors();
     if (!predecessor->HasUnscheduledSuccessors()) {
-      candidates_.push_back(predecessor);
+      candidates->push_back(predecessor);
     }
   }
   for (SchedulingNode* predecessor : scheduling_node->GetOtherPredecessors()) {
@@ -630,7 +638,7 @@ void HScheduler::Schedule(SchedulingNode* scheduling_node) {
     // correctness. So we do not use them to compute the critical path.
     predecessor->DecrementNumberOfUnscheduledSuccessors();
     if (!predecessor->HasUnscheduledSuccessors()) {
-      candidates_.push_back(predecessor);
+      candidates->push_back(predecessor);
     }
   }
 
@@ -779,7 +787,6 @@ bool HInstructionScheduling::Run(bool only_optimize_loop_blocks,
 #if defined(ART_ENABLE_CODEGEN_arm64) || defined(ART_ENABLE_CODEGEN_arm)
   // Phase-local allocator that allocates scheduler internal data structures like
   // scheduling nodes, internel nodes map, dependencies, etc.
-  ScopedArenaAllocator allocator(graph_->GetArenaStack());
   CriticalPathSchedulingNodeSelector critical_path_selector;
   RandomSchedulingNodeSelector random_selector;
   SchedulingNodeSelector* selector = schedule_randomly
@@ -795,7 +802,7 @@ bool HInstructionScheduling::Run(bool only_optimize_loop_blocks,
   switch (instruction_set_) {
 #ifdef ART_ENABLE_CODEGEN_arm64
     case InstructionSet::kArm64: {
-      arm64::HSchedulerARM64 scheduler(&allocator, selector);
+      arm64::HSchedulerARM64 scheduler(selector);
       scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
       scheduler.Schedule(graph_);
       break;
@@ -805,7 +812,7 @@ bool HInstructionScheduling::Run(bool only_optimize_loop_blocks,
    case InstructionSet::kThumb2:
    case InstructionSet::kArm: {
      arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen_);
-      arm::HSchedulerARM scheduler(&allocator, selector, &arm_latency_visitor);
+      arm::HSchedulerARM scheduler(selector, &arm_latency_visitor);
      scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
      scheduler.Schedule(graph_);
      break;
diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h
index c7683e04a7..fd48d844e6 100644
--- a/compiler/optimizing/scheduler.h
+++ b/compiler/optimizing/scheduler.h
@@ -251,12 +251,14 @@ class SchedulingNode : public DeletableArenaObject<kArenaAllocScheduler> {
  */
 class SchedulingGraph : public ValueObject {
  public:
-  SchedulingGraph(const HScheduler* scheduler, ScopedArenaAllocator* allocator)
+  SchedulingGraph(const HScheduler* scheduler,
+                  ScopedArenaAllocator* allocator,
+                  const HeapLocationCollector* heap_location_collector)
       : scheduler_(scheduler),
         allocator_(allocator),
         contains_scheduling_barrier_(false),
         nodes_map_(allocator_->Adapter(kArenaAllocScheduler)),
-        heap_location_collector_(nullptr) {}
+        heap_location_collector_(heap_location_collector) {}
 
   SchedulingNode* AddNode(HInstruction* instr, bool is_scheduling_barrier = false) {
     std::unique_ptr<SchedulingNode> node(
@@ -268,15 +270,6 @@ class SchedulingGraph : public ValueObject {
     return result;
   }
 
-  void Clear() {
-    nodes_map_.clear();
-    contains_scheduling_barrier_ = false;
-  }
-
-  void SetHeapLocationCollector(const HeapLocationCollector& heap_location_collector) {
-    heap_location_collector_ = &heap_location_collector;
-  }
-
   SchedulingNode* GetNode(const HInstruction* instr) const {
     auto it = nodes_map_.find(instr);
     if (it == nodes_map_.end()) {
@@ -329,7 +322,7 @@ class SchedulingGraph : public ValueObject {
 
   ScopedArenaHashMap<const HInstruction*, std::unique_ptr<SchedulingNode>> nodes_map_;
 
-  const HeapLocationCollector* heap_location_collector_;
+  const HeapLocationCollector* const heap_location_collector_;
 };
 
 /*
@@ -377,6 +370,7 @@ class SchedulingLatencyVisitor : public HGraphDelegateVisitor {
 class SchedulingNodeSelector : public ArenaObject<kArenaAllocScheduler> {
  public:
+  virtual void Reset() {}
   virtual SchedulingNode* PopHighestPriorityNode(ScopedArenaVector<SchedulingNode*>* nodes,
                                                  const SchedulingGraph& graph) = 0;
   virtual ~SchedulingNodeSelector() {}
@@ -418,6 +412,7 @@ class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
 public:
  CriticalPathSchedulingNodeSelector() : prev_select_(nullptr) {}
 
+  void Reset() OVERRIDE { prev_select_ = nullptr; }
  SchedulingNode* PopHighestPriorityNode(ScopedArenaVector<SchedulingNode*>* nodes,
                                         const SchedulingGraph& graph) OVERRIDE;
 
@@ -434,16 +429,11 @@ class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
 
 class HScheduler {
  public:
-  HScheduler(ScopedArenaAllocator* allocator,
-             SchedulingLatencyVisitor* latency_visitor,
-             SchedulingNodeSelector* selector)
-      : allocator_(allocator),
-        latency_visitor_(latency_visitor),
+  HScheduler(SchedulingLatencyVisitor* latency_visitor, SchedulingNodeSelector* selector)
+      : latency_visitor_(latency_visitor),
         selector_(selector),
         only_optimize_loop_blocks_(true),
-        scheduling_graph_(this, allocator),
-        cursor_(nullptr),
-        candidates_(allocator_->Adapter(kArenaAllocScheduler)) {}
+        cursor_(nullptr) {}
   virtual ~HScheduler() {}
 
   void Schedule(HGraph* graph);
@@ -454,8 +444,9 @@ class HScheduler {
   virtual bool IsSchedulingBarrier(const HInstruction* instruction) const;
 
  protected:
-  void Schedule(HBasicBlock* block);
-  void Schedule(SchedulingNode* scheduling_node);
+  void Schedule(HBasicBlock* block, const HeapLocationCollector* heap_location_collector);
+  void Schedule(SchedulingNode* scheduling_node,
+                /*inout*/ ScopedArenaVector<SchedulingNode*>* candidates);
   void Schedule(HInstruction* instruction);
 
   // Any instruction returning `false` via this method will prevent its
@@ -476,19 +467,12 @@ class HScheduler {
     node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency());
   }
 
-  ScopedArenaAllocator* const allocator_;
   SchedulingLatencyVisitor* const latency_visitor_;
   SchedulingNodeSelector* const selector_;
   bool only_optimize_loop_blocks_;
 
-  // We instantiate the members below as part of this class to avoid
-  // instantiating them locally for every chunk scheduled.
-  SchedulingGraph scheduling_graph_;
   // A pointer indicating where the next instruction to be scheduled will be inserted.
   HInstruction* cursor_;
-  // The list of candidates for scheduling. A node becomes a candidate when all
-  // its predecessors have been scheduled.
-  ScopedArenaVector<SchedulingNode*> candidates_;
 
 private:
  DISALLOW_COPY_AND_ASSIGN(HScheduler);
diff --git a/compiler/optimizing/scheduler_arm.h b/compiler/optimizing/scheduler_arm.h
index 0cb8684376..2f369486b3 100644
--- a/compiler/optimizing/scheduler_arm.h
+++ b/compiler/optimizing/scheduler_arm.h
@@ -137,10 +137,9 @@ class SchedulingLatencyVisitorARM : public SchedulingLatencyVisitor {
 
 class HSchedulerARM : public HScheduler {
  public:
-  HSchedulerARM(ScopedArenaAllocator* allocator,
-                SchedulingNodeSelector* selector,
+  HSchedulerARM(SchedulingNodeSelector* selector,
                 SchedulingLatencyVisitorARM* arm_latency_visitor)
-      : HScheduler(allocator, arm_latency_visitor, selector) {}
+      : HScheduler(arm_latency_visitor, selector) {}
   ~HSchedulerARM() OVERRIDE {}
 
   bool IsSchedulable(const HInstruction* instruction) const OVERRIDE {
diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h
index 4f394d5e16..0d2f8d9fa0 100644
--- a/compiler/optimizing/scheduler_arm64.h
+++ b/compiler/optimizing/scheduler_arm64.h
@@ -134,8 +134,8 @@ class SchedulingLatencyVisitorARM64 : public SchedulingLatencyVisitor {
 
 class HSchedulerARM64 : public HScheduler {
  public:
-  HSchedulerARM64(ScopedArenaAllocator* allocator, SchedulingNodeSelector* selector)
-      : HScheduler(allocator, &arm64_latency_visitor_, selector) {}
+  explicit HSchedulerARM64(SchedulingNodeSelector* selector)
+      : HScheduler(&arm64_latency_visitor_, selector) {}
   ~HSchedulerARM64() OVERRIDE {}
 
   bool IsSchedulable(const HInstruction* instruction) const OVERRIDE {
diff --git a/compiler/optimizing/scheduler_test.cc b/compiler/optimizing/scheduler_test.cc
index 7079e07ae1..fe23fb4cff 100644
--- a/compiler/optimizing/scheduler_test.cc
+++ b/compiler/optimizing/scheduler_test.cc
@@ -146,7 +146,9 @@ class SchedulerTest : public OptimizingUnitTest {
     environment->SetRawEnvAt(1, mul);
     mul->AddEnvUseAt(div_check->GetEnvironment(), 1);
 
-    SchedulingGraph scheduling_graph(scheduler, GetScopedAllocator());
+    SchedulingGraph scheduling_graph(scheduler,
+                                     GetScopedAllocator(),
+                                     /* heap_location_collector */ nullptr);
     // Instructions must be inserted in reverse order into the scheduling graph.
     for (HInstruction* instr : ReverseRange(block_instructions)) {
       scheduling_graph.AddNode(instr);
@@ -276,11 +278,10 @@ class SchedulerTest : public OptimizingUnitTest {
       entry->AddInstruction(instr);
     }
 
-    SchedulingGraph scheduling_graph(scheduler, GetScopedAllocator());
     HeapLocationCollector heap_location_collector(graph_);
     heap_location_collector.VisitBasicBlock(entry);
     heap_location_collector.BuildAliasingMatrix();
-    scheduling_graph.SetHeapLocationCollector(heap_location_collector);
+    SchedulingGraph scheduling_graph(scheduler, GetScopedAllocator(), &heap_location_collector);
 
     for (HInstruction* instr : ReverseRange(block_instructions)) {
       // Build scheduling graph with memory access aliasing information
@@ -354,13 +355,13 @@ class SchedulerTest : public OptimizingUnitTest {
 #if defined(ART_ENABLE_CODEGEN_arm64)
 TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM64) {
   CriticalPathSchedulingNodeSelector critical_path_selector;
-  arm64::HSchedulerARM64 scheduler(GetScopedAllocator(), &critical_path_selector);
+  arm64::HSchedulerARM64 scheduler(&critical_path_selector);
   TestBuildDependencyGraphAndSchedule(&scheduler);
 }
 
 TEST_F(SchedulerTest, ArrayAccessAliasingARM64) {
   CriticalPathSchedulingNodeSelector critical_path_selector;
-  arm64::HSchedulerARM64 scheduler(GetScopedAllocator(), &critical_path_selector);
+  arm64::HSchedulerARM64 scheduler(&critical_path_selector);
   TestDependencyGraphOnAliasingArrayAccesses(&scheduler);
 }
 #endif
@@ -369,14 +370,14 @@ TEST_F(SchedulerTest, ArrayAccessAliasingARM64) {
 TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM) {
   CriticalPathSchedulingNodeSelector critical_path_selector;
   arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr);
-  arm::HSchedulerARM scheduler(GetScopedAllocator(), &critical_path_selector, &arm_latency_visitor);
+  arm::HSchedulerARM scheduler(&critical_path_selector, &arm_latency_visitor);
   TestBuildDependencyGraphAndSchedule(&scheduler);
 }
 
 TEST_F(SchedulerTest, ArrayAccessAliasingARM) {
   CriticalPathSchedulingNodeSelector critical_path_selector;
   arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr);
-  arm::HSchedulerARM scheduler(GetScopedAllocator(), &critical_path_selector, &arm_latency_visitor);
+  arm::HSchedulerARM scheduler(&critical_path_selector, &arm_latency_visitor);
   TestDependencyGraphOnAliasingArrayAccesses(&scheduler);
 }
 #endif
diff --git a/compiler/optimizing/stack_map_stream.h b/compiler/optimizing/stack_map_stream.h
index de79f4921e..9d4598d3d9 100644
--- a/compiler/optimizing/stack_map_stream.h
+++ b/compiler/optimizing/stack_map_stream.h
@@ -34,7 +34,7 @@ namespace art {
  * Collects and builds stack maps for a method. All the stack maps
  * for a method are placed in a CodeInfo object.
  */
-class StackMapStream : public ValueObject {
+class StackMapStream : public DeletableArenaObject<kArenaAllocStackMapStream> {
 public:
  explicit StackMapStream(ScopedArenaAllocator* allocator, InstructionSet instruction_set)
      : instruction_set_(instruction_set),
@@ -53,6 +53,7 @@ class StackMapStream : public ValueObject {
        current_dex_registers_(allocator->Adapter(kArenaAllocStackMapStream)),
        previous_dex_registers_(allocator->Adapter(kArenaAllocStackMapStream)),
        dex_register_timestamp_(allocator->Adapter(kArenaAllocStackMapStream)),
+       expected_num_dex_registers_(0u),
        temp_dex_register_mask_(allocator, 32, true, kArenaAllocStackMapStream),
        temp_dex_register_map_(allocator->Adapter(kArenaAllocStackMapStream)) {
  }