Reuse arena memory for each block in scheduler.

This reduces the peak memory used for large methods with
multiple blocks to schedule.
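
To illustrate the effect, here is a self-contained toy sketch (the
ToyArenaStack/ToyScopedArena types below are simplified stand-ins, not
ART's ArenaStack/ScopedArenaAllocator API): with one allocator kept
alive across the whole pass, the peak is the sum of all blocks'
scheduler data, while a per-block allocator releases each block's data
before the next block is scheduled, so the peak is only the largest
single block.

  #include <algorithm>
  #include <cstddef>
  #include <cstdio>
  #include <vector>

  // Toy stand-in for ART's ArenaStack: tracks current and peak usage.
  class ToyArenaStack {
   public:
    void Allocate(std::size_t bytes) {
      used_ += bytes;
      peak_ = std::max(peak_, used_);
    }
    void Release(std::size_t bytes) { used_ -= bytes; }
    std::size_t Peak() const { return peak_; }
   private:
    std::size_t used_ = 0;
    std::size_t peak_ = 0;
  };

  // RAII scope that returns everything it allocated when destroyed,
  // mirroring the role of a ScopedArenaAllocator.
  class ToyScopedArena {
   public:
    explicit ToyScopedArena(ToyArenaStack* stack) : stack_(stack) {}
    ~ToyScopedArena() { stack_->Release(allocated_); }
    void Alloc(std::size_t bytes) {
      stack_->Allocate(bytes);
      allocated_ += bytes;
    }
   private:
    ToyArenaStack* const stack_;
    std::size_t allocated_ = 0;
  };

  int main() {
    const std::vector<std::size_t> block_sizes = {100, 300, 200};

    // Before: one allocator lives across all blocks; peak = 100+300+200.
    ToyArenaStack whole_pass;
    {
      ToyScopedArena allocator(&whole_pass);
      for (std::size_t s : block_sizes) allocator.Alloc(s);
    }

    // After: a fresh allocator per block; peak = largest block = 300.
    ToyArenaStack per_block;
    for (std::size_t s : block_sizes) {
      ToyScopedArena allocator(&per_block);
      allocator.Alloc(s);
    }

    std::printf("peak before: %zu, after: %zu\n",
                whole_pass.Peak(), per_block.Peak());
    return 0;
  }

In the change below, the per-block allocator is the ScopedArenaAllocator
constructed at the top of HScheduler::Schedule(HBasicBlock*,
const HeapLocationCollector*); it goes out of scope, and releases the
block's scheduler data, before the next block is processed.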

Compiling the aosp_taimen-userdebug boot image, the most
memory hungry method, BatteryStats.dumpLocked, now has its
Scheduler allocations on the ArenaStack hidden by the
register allocator:
  - before:
    MEM: used: 8300224, allocated: 9175040, lost: 197360
    Scheduler       8300224
  - after:
    MEM: used: 5914296, allocated: 7864320, lost: 78200
    SsaLiveness     5532840
    RegAllocator     144968
    RegAllocVldt     236488
The total arena memory used, including allocations from the
ArenaAllocator not listed above, goes from 44333648 to
41950324 (-5.4%).
(Measured with kArenaAllocatorCountAllocations=true,
kArenaAllocatorPreciseTracking=false.)

Also remove one unnecessary -Wframe-larger-than= workaround
and add one workaround for a frame that becomes too large when
the above arena allocation tracking flags are enabled.
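
For reference, one common shape of such a workaround (purely
illustrative; the actual edit in this change may differ, and the
names below are made up) is to move an oversized local off the stack
so the function's frame stays under the -Wframe-larger-than= limit:

  #include <array>
  #include <memory>

  struct BigTrackingState {
    // Imagine per-allocation tracking data enabled by the flags above.
    std::array<char, 64 * 1024> counters{};
  };

  void ProcessWithSmallFrame() {
    // Heap-allocate instead of declaring `BigTrackingState state;` on
    // the stack, keeping the frame below the warning threshold.
    auto state = std::make_unique<BigTrackingState>();
    state->counters[0] = 1;
  }

  int main() {
    ProcessWithSmallFrame();
    return 0;
  }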

Test: m test-art-host-gtest
Test: testrunner.py --host
Bug: 34053922
Change-Id: I7fd8d90dcc13b184b1e5bd0bcac072388710a129
diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc
index 588ea03..1aa16f4 100644
--- a/compiler/optimizing/scheduler.cc
+++ b/compiler/optimizing/scheduler.cc
@@ -545,60 +545,67 @@
 void HScheduler::Schedule(HGraph* graph) {
   // We run lsa here instead of in a separate pass to better control whether we
   // should run the analysis or not.
+  const HeapLocationCollector* heap_location_collector = nullptr;
   LoadStoreAnalysis lsa(graph);
   if (!only_optimize_loop_blocks_ || graph->HasLoops()) {
     lsa.Run();
-    scheduling_graph_.SetHeapLocationCollector(lsa.GetHeapLocationCollector());
+    heap_location_collector = &lsa.GetHeapLocationCollector();
   }
 
   for (HBasicBlock* block : graph->GetReversePostOrder()) {
     if (IsSchedulable(block)) {
-      Schedule(block);
+      Schedule(block, heap_location_collector);
     }
   }
 }
 
-void HScheduler::Schedule(HBasicBlock* block) {
-  ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator_->Adapter(kArenaAllocScheduler));
+void HScheduler::Schedule(HBasicBlock* block,
+                          const HeapLocationCollector* heap_location_collector) {
+  ScopedArenaAllocator allocator(block->GetGraph()->GetArenaStack());
+  ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator.Adapter(kArenaAllocScheduler));
 
   // Build the scheduling graph.
-  scheduling_graph_.Clear();
+  SchedulingGraph scheduling_graph(this, &allocator, heap_location_collector);
   for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
     HInstruction* instruction = it.Current();
     CHECK_EQ(instruction->GetBlock(), block)
         << instruction->DebugName()
         << " is in block " << instruction->GetBlock()->GetBlockId()
         << ", and expected in block " << block->GetBlockId();
-    SchedulingNode* node = scheduling_graph_.AddNode(instruction, IsSchedulingBarrier(instruction));
+    SchedulingNode* node = scheduling_graph.AddNode(instruction, IsSchedulingBarrier(instruction));
     CalculateLatency(node);
     scheduling_nodes.push_back(node);
   }
 
-  if (scheduling_graph_.Size() <= 1) {
-    scheduling_graph_.Clear();
+  if (scheduling_graph.Size() <= 1) {
     return;
   }
 
   cursor_ = block->GetLastInstruction();
 
+  // The list of candidates for scheduling. A node becomes a candidate when all
+  // its successors, i.e. the nodes that depend on it, have been scheduled.
+  ScopedArenaVector<SchedulingNode*> candidates(allocator.Adapter(kArenaAllocScheduler));
+
   // Find the initial candidates for scheduling.
-  candidates_.clear();
   for (SchedulingNode* node : scheduling_nodes) {
     if (!node->HasUnscheduledSuccessors()) {
       node->MaybeUpdateCriticalPath(node->GetLatency());
-      candidates_.push_back(node);
+      candidates.push_back(node);
     }
   }
 
-  ScopedArenaVector<SchedulingNode*> initial_candidates(allocator_->Adapter(kArenaAllocScheduler));
+  ScopedArenaVector<SchedulingNode*> initial_candidates(allocator.Adapter(kArenaAllocScheduler));
   if (kDumpDotSchedulingGraphs) {
     // Remember the list of initial candidates for debug output purposes.
-    initial_candidates.assign(candidates_.begin(), candidates_.end());
+    initial_candidates.assign(candidates.begin(), candidates.end());
   }
 
   // Schedule all nodes.
-  while (!candidates_.empty()) {
-    Schedule(selector_->PopHighestPriorityNode(&candidates_, scheduling_graph_));
+  selector_->Reset();
+  while (!candidates.empty()) {
+    SchedulingNode* node = selector_->PopHighestPriorityNode(&candidates, scheduling_graph);
+    Schedule(node, &candidates);
   }
 
   if (kDumpDotSchedulingGraphs) {
@@ -607,11 +614,12 @@
     std::stringstream description;
     description << graph->GetDexFile().PrettyMethod(graph->GetMethodIdx())
         << " B" << block->GetBlockId();
-    scheduling_graph_.DumpAsDotGraph(description.str(), initial_candidates);
+    scheduling_graph.DumpAsDotGraph(description.str(), initial_candidates);
   }
 }
 
-void HScheduler::Schedule(SchedulingNode* scheduling_node) {
+void HScheduler::Schedule(SchedulingNode* scheduling_node,
+                          /*inout*/ ScopedArenaVector<SchedulingNode*>* candidates) {
   // Check whether any of the node's predecessors will be valid candidates after
   // this node is scheduled.
   uint32_t path_to_node = scheduling_node->GetCriticalPath();
@@ -620,7 +628,7 @@
         path_to_node + predecessor->GetInternalLatency() + predecessor->GetLatency());
     predecessor->DecrementNumberOfUnscheduledSuccessors();
     if (!predecessor->HasUnscheduledSuccessors()) {
-      candidates_.push_back(predecessor);
+      candidates->push_back(predecessor);
     }
   }
   for (SchedulingNode* predecessor : scheduling_node->GetOtherPredecessors()) {
@@ -630,7 +638,7 @@
     // correctness. So we do not use them to compute the critical path.
     predecessor->DecrementNumberOfUnscheduledSuccessors();
     if (!predecessor->HasUnscheduledSuccessors()) {
-      candidates_.push_back(predecessor);
+      candidates->push_back(predecessor);
     }
   }
 
@@ -779,7 +787,6 @@
 #if defined(ART_ENABLE_CODEGEN_arm64) || defined(ART_ENABLE_CODEGEN_arm)
   // Phase-local allocator that allocates scheduler internal data structures like
   // scheduling nodes, internel nodes map, dependencies, etc.
-  ScopedArenaAllocator allocator(graph_->GetArenaStack());
   CriticalPathSchedulingNodeSelector critical_path_selector;
   RandomSchedulingNodeSelector random_selector;
   SchedulingNodeSelector* selector = schedule_randomly
@@ -795,7 +802,7 @@
   switch (instruction_set_) {
 #ifdef ART_ENABLE_CODEGEN_arm64
     case InstructionSet::kArm64: {
-      arm64::HSchedulerARM64 scheduler(&allocator, selector);
+      arm64::HSchedulerARM64 scheduler(selector);
       scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
       scheduler.Schedule(graph_);
       break;
@@ -805,7 +812,7 @@
     case InstructionSet::kThumb2:
     case InstructionSet::kArm: {
       arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen_);
-      arm::HSchedulerARM scheduler(&allocator, selector, &arm_latency_visitor);
+      arm::HSchedulerARM scheduler(selector, &arm_latency_visitor);
       scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
       scheduler.Schedule(graph_);
       break;