LSE: Index load substitutes by load ids.

And address some late comments on
    https://android-review.googlesource.com/1355117 .

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Bug: 77906240
Change-Id: I3726f62125387b87f78857cbce8256ac2e94ba3f
diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc
index e08c585..d69442e 100644
--- a/compiler/optimizing/load_store_elimination.cc
+++ b/compiler/optimizing/load_store_elimination.cc
@@ -31,17 +31,19 @@
  * We use load-store analysis to collect a list of heap locations and perform
  * alias analysis of those heap locations. LSE then keeps track of a list of
  * heap values corresponding to the heap locations and stores that put those
- * values in these locations. In phase 1, we visit basic blocks in reverse post
- * order and for each basic block, visit instructions sequentially, recording
- * heap values and looking for loads and stores to eliminate without relying on
- * loop Phis. In phase, we look for loads that can be replaced by creating loop
- * Phis or using a loop-invariant value. In phase 3, we determine which stores
- * are dead and can be eliminated and based on that information we re-evaluate
- * whether some kept stores are storing the same value as the value in the heap
- * location; such stores are also marked for elimination. In phase 4, we commit
- * the changes, replacing loads marked for elimination in previous processing
- * and removing stores not marked for keeping. We also remove allocations that
- * are no longer needed.
+ * values in these locations.
+ *  - In phase 1, we visit basic blocks in reverse post order and for each basic
+ *    block, visit instructions sequentially, recording heap values and looking
+ *    for loads and stores to eliminate without relying on loop Phis.
+ *  - In phase 2, we look for loads that can be replaced by creating loop Phis
+ *    or using a loop-invariant value.
+ *  - In phase 3, we determine which stores are dead and can be eliminated and
+ *    based on that information we re-evaluate whether some kept stores are
+ *    storing the same value as the value in the heap location; such stores are
+ *    also marked for elimination.
+ *  - In phase 4, we commit the changes, replacing loads marked for elimination
+ *    in previous processing and removing stores not marked for keeping. We also
+ *    remove allocations that are no longer needed.
  *
  * 1. Walk over blocks and their instructions.
  *
@@ -213,12 +215,10 @@
  * The time complexity of this phase is
  *    O(instructions + instruction_uses) .
  *
- * FIXME: The time complexity described above assumes that FindSubstitute()
- * is O(1) but it is currently O(removed_loads_.size()); this can be fixed
- * by introducing a vector of replacements indexed by instruction id.
- * It also assumes that the HeapLocationCollector finds a heap location for
- * an instruction in O(1) time but it is currently O(heap_locations); this
- * can be fixed by adding a hash map to the HeapLocationCollector.
+ * FIXME: The time complexity described above assumes that the
+ * HeapLocationCollector finds a heap location for an instruction in O(1)
+ * time but it is currently O(heap_locations); this can be fixed by adding
+ * a hash map to the HeapLocationCollector.
  */
 
 namespace art {
@@ -481,18 +481,14 @@
   // Find an instruction's substitute if it's a removed load.
   // Return the same instruction if it should not be removed.
   HInstruction* FindSubstitute(HInstruction* instruction) const {
-    if (!IsLoad(instruction)) {
+    size_t id = static_cast<size_t>(instruction->GetId());
+    if (id >= substitute_instructions_for_loads_.size()) {
+      DCHECK(!IsLoad(instruction));  // New Phi (may not be in the graph yet) or default value.
       return instruction;
     }
-    for (size_t i = 0u, size = removed_loads_.size(); i != size; ++i) {
-      if (removed_loads_[i] == instruction) {
-        HInstruction* substitute = substitute_instructions_for_loads_[i];
-        // The substitute list is a flat hierarchy.
-        DCHECK_EQ(FindSubstitute(substitute), substitute);
-        return substitute;
-      }
-    }
-    return instruction;
+    HInstruction* substitute = substitute_instructions_for_loads_[id];
+    DCHECK(substitute == nullptr || IsLoad(instruction));
+    return (substitute != nullptr) ? substitute : instruction;
   }
 
   void AddRemovedLoad(HInstruction* load, HInstruction* heap_value) {
@@ -516,9 +512,8 @@
     HTypeConversion* type_conversion = FindOrAddTypeConversionIfNecessary(
         load, heap_value, load->GetType());
 
-    removed_loads_.push_back(load);
-    substitute_instructions_for_loads_.push_back(
-        type_conversion != nullptr ? type_conversion : heap_value);
+    substitute_instructions_for_loads_[load->GetId()] =
+        type_conversion != nullptr ? type_conversion : heap_value;
   }
 
   static bool IsLoad(HInstruction* instruction) {
@@ -921,11 +916,6 @@
   // One array of heap value records for each block.
   ScopedArenaVector<ScopedArenaVector<ValueRecord>> heap_values_for_;
 
-  // We record the instructions that should be eliminated but may be
-  // used by heap locations. They'll be removed in the end.
-  ScopedArenaVector<HInstruction*> removed_loads_;
-  ScopedArenaVector<HInstruction*> substitute_instructions_for_loads_;
-
   // We record loads and stores for re-processing when we find a loop Phi placeholder
   // with unknown value from a predecessor, and also for removing stores that are
   // found to be dead, i.e. not marked in `kept_stores_` at the end.
@@ -935,6 +925,11 @@
   };
   ScopedArenaVector<LoadStoreRecord> loads_and_stores_;
 
+  // We record the substitute instructions for loads that should be
+  // eliminated but may be used by heap locations. They'll be removed
+  // in the end. These are indexed by the load's id.
+  ScopedArenaVector<HInstruction*> substitute_instructions_for_loads_;
+
   // Record stores to keep in a bit vector indexed by instruction ID.
   ArenaBitVector kept_stores_;
   // When we need to keep all stores that feed a Phi placeholder, we just record the
@@ -1017,11 +1012,12 @@
       heap_values_for_(graph->GetBlocks().size(),
                        ScopedArenaVector<ValueRecord>(allocator_.Adapter(kArenaAllocLSE)),
                        allocator_.Adapter(kArenaAllocLSE)),
-      removed_loads_(allocator_.Adapter(kArenaAllocLSE)),
-      substitute_instructions_for_loads_(allocator_.Adapter(kArenaAllocLSE)),
       loads_and_stores_(allocator_.Adapter(kArenaAllocLSE)),
-      // We may add new instructions (default values, Phis) but we're not adding stores,
-      // so we do not need the following BitVector to be expandable.
+      // We may add new instructions (default values, Phis) but we're not adding loads
+      // or stores, so we shall not need to resize following vector and BitVector.
+      substitute_instructions_for_loads_(graph->GetCurrentInstructionId(),
+                                         nullptr,
+                                         allocator_.Adapter(kArenaAllocLSE)),
       kept_stores_(&allocator_,
                    /*start_bits=*/ graph->GetCurrentInstructionId(),
                    /*expandable=*/ false,
@@ -1889,13 +1885,10 @@
   }
 
   // Use local allocator to reduce peak memory usage.
-  // FIXME: This is currently incompatible with calls to AddRemovedLoad();
-  //        that function should change also for performance reasons as
-  //        searches in `removed_loads_` are O(n).
-  // ScopedArenaAllocator allocator(allocator_.GetArenaStack());
+  ScopedArenaAllocator allocator(allocator_.GetArenaStack());
   // Reuse one temporary vector for all remaining blocks.
   size_t num_heap_locations = heap_location_collector_.GetNumberOfHeapLocations();
-  ScopedArenaVector<Value> local_heap_values(allocator_.Adapter(kArenaAllocLSE));
+  ScopedArenaVector<Value> local_heap_values(allocator.Adapter(kArenaAllocLSE));
 
   auto get_initial_value = [this](HBasicBlock* block, size_t idx) {
     Value value;
@@ -1945,7 +1938,21 @@
         continue;  // This load or store never needed a loop Phi.
       }
       ValueRecord& record = it->second;
-      if (!is_store) {
+      if (is_store) {
+        // Process the store by updating `local_heap_values[idx]`. The last update shall
+        // be propagated to the `heap_values[idx].value` if it previously needed a loop Phi
+        // at the end of the block.
+        Value replacement = ReplacementOrValue(record.value);
+        if (replacement.NeedsLoopPhi()) {
+          // No replacement yet, use the Phi placeholder from the load.
+          DCHECK(record.value.NeedsLoopPhi());
+          local_heap_values[idx] = record.value;
+        } else {
+          // If the load fetched a known value, use it, otherwise use the load.
+          local_heap_values[idx] = Value::ForInstruction(
+              replacement.IsUnknown() ? stored_value : replacement.GetInstruction());
+        }
+      } else {
         // Process the load unless it has previously been marked unreplaceable.
         if (record.value.NeedsLoopPhi()) {
           if (local_heap_values[idx].IsInvalid()) {
@@ -1975,20 +1982,6 @@
             TryRemovingNullCheck(load_or_store);
           }
         }
-      } else {
-        // Process the store by updating `local_heap_values[idx]`. The last update shall
-        // be propagated to the `heap_values[idx].value` if it previously needed a loop Phi
-        // at the end of the block.
-        Value replacement = ReplacementOrValue(record.value);
-        if (replacement.NeedsLoopPhi()) {
-          // No replacement yet, use the Phi placeholder from the load.
-          DCHECK(record.value.NeedsLoopPhi());
-          local_heap_values[idx] = record.value;
-        } else {
-          // If the load fetched a known value, use it, otherwise use the load.
-          local_heap_values[idx] = Value::ForInstruction(
-              replacement.IsUnknown() ? stored_value : replacement.GetInstruction());
-        }
       }
     }
 
@@ -2217,15 +2210,16 @@
   // 4. Replace loads and remove unnecessary stores and singleton allocations.
 
   // Remove recorded load instructions that should be eliminated.
-  size_t size = removed_loads_.size();
-  DCHECK_EQ(size, substitute_instructions_for_loads_.size());
-  for (size_t i = 0; i != size; ++i) {
-    HInstruction* load = removed_loads_[i];
+  for (const LoadStoreRecord& record : loads_and_stores_) {
+    size_t id = dchecked_integral_cast<size_t>(record.load_or_store->GetId());
+    HInstruction* substitute = substitute_instructions_for_loads_[id];
+    if (substitute == nullptr) {
+      continue;
+    }
+    HInstruction* load = record.load_or_store;
     DCHECK(load != nullptr);
     DCHECK(IsLoad(load));
     DCHECK(load->GetBlock() != nullptr) << load->DebugName() << "@" << load->GetDexPc();
-    HInstruction* substitute = substitute_instructions_for_loads_[i];
-    DCHECK(substitute != nullptr);
     // We proactively retrieve the substitute for a removed load, so
     // a load that has a substitute should not be observed as a heap
     // location value.