Fix array location aliasing checks in LSE.

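LSE could mishandle array locations whose index differs from a loop Phi
only by a constant. LSA's `ComputeMayAlias()` does not take different
loop iterations into account, so it declares locations such as `a[i]`
and `a[i - 1]` non-aliasing; LSE could then, for example, replace such a
load with the default value 0, or fail a DCHECK() during the search for
kept stores. A minimal sketch of the affected pattern, simplified from
the new testLoop36 run-test (method name is illustrative only):

  static int example(int n) {
    int[] a = new int[n];
    int i = 0;
    for (; i < n; ++i) {
      a[i] = i;  // Store with an index defined inside the loop.
    }
    // LSA considers `a[i - 1]` non-aliasing with `a[i]`, but across loop
    // iterations the store above does write this location, so LSE must
    // not eliminate the load.
    return a[i - 1];
  }
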
Test: New tests in load_store_elimination_test.
Test: New tests in 530-checker-lse.
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Bug: 187487955
Change-Id: Iff66d5406cf1b36c3bebbce1d48117f83bb50553
diff --git a/compiler/optimizing/block_namer.cc b/compiler/optimizing/block_namer.cc
index 9936bf1..d30448c 100644
--- a/compiler/optimizing/block_namer.cc
+++ b/compiler/optimizing/block_namer.cc
@@ -21,7 +21,13 @@
 namespace art {
 
 std::ostream& BlockNamer::PrintName(std::ostream& os, HBasicBlock* blk) const {
-  return os << "B" << blk->GetBlockId();
+  os << "B";
+  if (blk != nullptr) {
+    os << blk->GetBlockId();
+  } else {
+    os << "<none>";
+  }
+  return os;
 }
 
 }  // namespace art
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 5a264b7..716fee4 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -656,9 +656,10 @@
     } else {
       StartAttributeStream("dex_pc") << "n/a";
     }
+    HBasicBlock* block = instruction->GetBlock();
     if (IsPass(kDebugDumpName)) {
       // Include block name for logcat use.
-      StartAttributeStream("block") << namer_.GetName(instruction->GetBlock());
+      StartAttributeStream("block") << namer_.GetName(block);
     }
     instruction->Accept(this);
     if (instruction->HasEnvironment()) {
@@ -710,7 +711,7 @@
       }
     }
 
-    HLoopInformation* loop_info = instruction->GetBlock()->GetLoopInformation();
+    HLoopInformation* loop_info = (block != nullptr) ? block->GetLoopInformation() : nullptr;
     if (loop_info == nullptr) {
       StartAttributeStream("loop") << "none";
     } else {
diff --git a/compiler/optimizing/load_store_analysis.h b/compiler/optimizing/load_store_analysis.h
index e815727..5fda8df 100644
--- a/compiler/optimizing/load_store_analysis.h
+++ b/compiler/optimizing/load_store_analysis.h
@@ -417,20 +417,7 @@
     }
   }
 
- private:
-  // An allocation cannot alias with a name which already exists at the point
-  // of the allocation, such as a parameter or a load happening before the allocation.
-  bool MayAliasWithPreexistenceChecking(ReferenceInfo* ref_info1, ReferenceInfo* ref_info2) const {
-    if (ref_info1->GetReference()->IsNewInstance() || ref_info1->GetReference()->IsNewArray()) {
-      // Any reference that can alias with the allocation must appear after it in the block/in
-      // the block's successors. In reverse post order, those instructions will be visited after
-      // the allocation.
-      return ref_info2->GetPosition() >= ref_info1->GetPosition();
-    }
-    return true;
-  }
-
-  bool CanReferencesAlias(ReferenceInfo* ref_info1, ReferenceInfo* ref_info2) const {
+  static bool CanReferencesAlias(ReferenceInfo* ref_info1, ReferenceInfo* ref_info2) {
     if (ref_info1 == ref_info2) {
       return true;
     } else if (ref_info1->IsSingleton()) {
@@ -444,6 +431,19 @@
     return true;
   }
 
+ private:
+  // An allocation cannot alias with a name which already exists at the point
+  // of the allocation, such as a parameter or a load happening before the allocation.
+  static bool MayAliasWithPreexistenceChecking(ReferenceInfo* ref_info1, ReferenceInfo* ref_info2) {
+    if (ref_info1->GetReference()->IsNewInstance() || ref_info1->GetReference()->IsNewArray()) {
+      // Any reference that can alias with the allocation must appear after it in the block/in
+      // the block's successors. In reverse post order, those instructions will be visited after
+      // the allocation.
+      return ref_info2->GetPosition() >= ref_info1->GetPosition();
+    }
+    return true;
+  }
+
   bool CanArrayElementsAlias(const HInstruction* idx1,
                              const size_t vector_length1,
                              const HInstruction* idx2,
diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc
index 6f8e441..fe2cdef 100644
--- a/compiler/optimizing/load_store_elimination.cc
+++ b/compiler/optimizing/load_store_elimination.cc
@@ -929,6 +929,8 @@
     kPartialElimination,
   };
 
+  bool MayAliasOnBackEdge(HBasicBlock* loop_header, size_t idx1, size_t idx2) const;
+
   bool TryReplacingLoopPhiPlaceholderWithDefault(
       PhiPlaceholder phi_placeholder,
       DataType::Type type,
@@ -1848,6 +1850,34 @@
   HGraphVisitor::VisitBasicBlock(block);
 }
 
+bool LSEVisitor::MayAliasOnBackEdge(HBasicBlock* loop_header, size_t idx1, size_t idx2) const {
+  DCHECK_NE(idx1, idx2);
+  DCHECK(loop_header->IsLoopHeader());
+  if (heap_location_collector_.MayAlias(idx1, idx2)) {
+    return true;
+  }
+  // For array locations with index defined inside the loop, include
+  // all other locations in the array, even those that LSA declares
+  // non-aliasing, such as `a[i]` and `a[i + 1]`, as they may actually
+  // refer to the same locations for different iterations. (LSA's
+  // `ComputeMayAlias()` does not consider different loop iterations.)
+  HeapLocation* loc1 = heap_location_collector_.GetHeapLocation(idx1);
+  HeapLocation* loc2 = heap_location_collector_.GetHeapLocation(idx2);
+  if (loc1->IsArray() &&
+      loc2->IsArray() &&
+      HeapLocationCollector::CanReferencesAlias(loc1->GetReferenceInfo(),
+                                                loc2->GetReferenceInfo())) {
+    HLoopInformation* loop_info = loop_header->GetLoopInformation();
+    if (loop_info->Contains(*loc1->GetIndex()->GetBlock()) ||
+        loop_info->Contains(*loc2->GetIndex()->GetBlock())) {
+      // Consider the locations aliasing. Do not optimize the case where both indexes
+      // are loop invariants defined inside the loop, rely on LICM to pull them out.
+      return true;
+    }
+  }
+  return false;
+}
+
 bool LSEVisitor::TryReplacingLoopPhiPlaceholderWithDefault(
     PhiPlaceholder phi_placeholder,
     DataType::Type type,
@@ -1972,6 +2002,12 @@
         replacement = value.GetInstruction();
       }
     }
+    // While `TryReplacingLoopPhiPlaceholderWithDefault()` has special treatment
+    // for back-edges, it is not needed here. When looking for a single input
+    // instruction coming from before the loop, the array index must also be
+    // defined before the loop and the aliasing analysis done by LSA is sufficient.
+    // Any writes of a different value with an index that is not loop invariant
+    // would invalidate the heap location in `VisitSetLocation()`.
   }
 
   // Record replacement and report success.
@@ -2033,6 +2069,7 @@
       // default values or Phis, i.e. for vector loads, as we can only replace the Phi
       // placeholder with a single instruction defined before the loop.
       if (!can_use_default_or_phi) {
+        DCHECK(index != nullptr);  // Vector operations are array operations.
         if (TryReplacingLoopPhiPlaceholderWithSingleInput(current_phi_placeholder,
                                                           phi_placeholders_to_materialize)) {
           continue;
@@ -2042,11 +2079,26 @@
       }
     }
     for (HBasicBlock* predecessor : current_block->GetPredecessors()) {
-      Value value = ReplacementOrValue(heap_values_for_[predecessor->GetBlockId()][idx].value);
+      ScopedArenaVector<ValueRecord>& heap_values = heap_values_for_[predecessor->GetBlockId()];
+      Value value = ReplacementOrValue(heap_values[idx].value);
       if (value.IsUnknown()) {
         // We cannot create a Phi for this loop Phi placeholder.
         return current_phi_placeholder;  // Report the loop Phi placeholder.
       }
+      // For arrays, the location may have been clobbered by writes to other locations
+      // in a loop that LSA does not consider aliasing, such as `a[i]` and `a[i + 1]`.
+      if (current_block->IsLoopHeader() &&
+          predecessor != current_block->GetLoopInformation()->GetPreHeader() &&
+          heap_location_collector_.GetHeapLocation(idx)->GetIndex() != nullptr) {
+        for (size_t i = 0, size = heap_values.size(); i != size; ++i) {
+          if (i != idx &&
+              !heap_values[i].stored_by.IsUnknown() &&
+              MayAliasOnBackEdge(current_block, idx, i)) {
+            // We cannot create a Phi for this loop Phi placeholder.
+            return current_phi_placeholder;
+          }
+        }
+      }
       if (value.NeedsLoopPhi()) {
         // Visit the predecessor Phi placeholder if it's not visited yet.
         if (!phi_placeholders_to_materialize->IsBitSet(PhiPlaceholderIndex(value))) {
@@ -2574,7 +2626,6 @@
       ScopedArenaVector<ValueRecord>& heap_values = heap_values_for_[predecessor->GetBlockId()];
       // For loop back-edges we must also preserve all stores to locations that
       // may alias with the location `idx`.
-      // TODO: Review whether we need to keep stores to aliased locations from pre-header.
       // TODO: Add tests cases around this.
       bool is_back_edge =
           block->IsLoopHeader() && predecessor != block->GetLoopInformation()->GetPreHeader();
@@ -2582,31 +2633,7 @@
       size_t end = is_back_edge ? heap_values.size() : idx + 1u;
       for (size_t i = start; i != end; ++i) {
         Value stored_by = heap_values[i].stored_by;
-        auto may_alias = [this, block, idx](size_t i) {
-            DCHECK_NE(i, idx);
-            DCHECK(block->IsLoopHeader());
-            if (heap_location_collector_.MayAlias(i, idx)) {
-              return true;
-            }
-            // For array locations with index defined inside the loop, include
-            // all other locations in the array, even those that LSA declares
-            // non-aliasing, such as `a[i]` and `a[i + 1]`, as they may actually
-            // refer to the same locations for different iterations. (LSA's
-            // `ComputeMayAlias()` does not consider different loop iterations.)
-            HeapLocation* heap_loc = heap_location_collector_.GetHeapLocation(idx);
-            HeapLocation* other_loc = heap_location_collector_.GetHeapLocation(i);
-            if (heap_loc->IsArray() &&
-                other_loc->IsArray() &&
-                heap_loc->GetReferenceInfo() == other_loc->GetReferenceInfo() &&
-                block->GetLoopInformation()->Contains(*heap_loc->GetIndex()->GetBlock())) {
-              // If one location has index defined inside and the other index defined outside
-              // of the loop, LSA considers them aliasing and we take an early return above.
-              DCHECK(block->GetLoopInformation()->Contains(*other_loc->GetIndex()->GetBlock()));
-              return true;
-            }
-            return false;
-        };
-        if (!stored_by.IsUnknown() && (i == idx || may_alias(i))) {
+        if (!stored_by.IsUnknown() && (i == idx || MayAliasOnBackEdge(block, idx, i))) {
           if (stored_by.NeedsPhi()) {
             size_t phi_placeholder_index = PhiPlaceholderIndex(stored_by);
             if (is_partial_kept_merged_unknown) {
diff --git a/compiler/optimizing/load_store_elimination_test.cc b/compiler/optimizing/load_store_elimination_test.cc
index 87ccd1a..56cc769 100644
--- a/compiler/optimizing/load_store_elimination_test.cc
+++ b/compiler/optimizing/load_store_elimination_test.cc
@@ -1679,6 +1679,196 @@
   EXPECT_INS_REMOVED(alloc_w);
 }
 
+// Regression test for b/187487955.
+// We previously failed to consider aliasing between an array location
+// with index `idx` defined in the loop (such as a loop Phi) and another
+// array location with index `idx + constant`. This could have led to
+// replacing the load with, for example, the default value 0.
+TEST_F(LoadStoreEliminationTest, ArrayLoopAliasing1) {
+  VariableSizedHandleScope vshs(Thread::Current());
+  CreateGraph(&vshs);
+  AdjacencyListGraph blocks(graph_,
+                            GetAllocator(),
+                            "entry",
+                            "exit",
+                            { { "entry", "preheader" },
+                              { "preheader", "loop" },
+                              { "loop", "body" },
+                              { "body", "loop" },
+                              { "loop", "ret" },
+                              { "ret", "exit" } });
+#define GET_BLOCK(name) HBasicBlock* name = blocks.Get(#name)
+  GET_BLOCK(entry);
+  GET_BLOCK(preheader);
+  GET_BLOCK(loop);
+  GET_BLOCK(body);
+  GET_BLOCK(ret);
+  GET_BLOCK(exit);
+#undef GET_BLOCK
+  HInstruction* n = MakeParam(DataType::Type::kInt32);
+  HInstruction* c0 = graph_->GetIntConstant(0);
+  HInstruction* c1 = graph_->GetIntConstant(1);
+
+  // entry
+  HInstruction* cls = MakeClassLoad();
+  HInstruction* array = new (GetAllocator()) HNewArray(
+      cls, n, /*dex_pc=*/ 0u, DataType::SizeShift(DataType::Type::kInt32));
+  HInstruction* entry_goto = new (GetAllocator()) HGoto();
+  entry->AddInstruction(cls);
+  entry->AddInstruction(array);
+  entry->AddInstruction(entry_goto);
+  ManuallyBuildEnvFor(cls, {});
+  ManuallyBuildEnvFor(array, {});
+
+  HInstruction* preheader_goto = new (GetAllocator()) HGoto();
+  preheader->AddInstruction(preheader_goto);
+
+  // loop
+  HPhi* i_phi = new (GetAllocator()) HPhi(GetAllocator(), 0, 0, DataType::Type::kInt32);
+  HInstruction* loop_suspend_check = new (GetAllocator()) HSuspendCheck();
+  HInstruction* loop_cond = new (GetAllocator()) HLessThan(i_phi, n);
+  HIf* loop_if = new (GetAllocator()) HIf(loop_cond);
+  loop->AddPhi(i_phi);
+  loop->AddInstruction(loop_suspend_check);
+  loop->AddInstruction(loop_cond);
+  loop->AddInstruction(loop_if);
+  CHECK(loop_if->IfTrueSuccessor() == body);
+  ManuallyBuildEnvFor(loop_suspend_check, {});
+
+  // body
+  HInstruction* body_set =
+      new (GetAllocator()) HArraySet(array, i_phi, i_phi, DataType::Type::kInt32, /*dex_pc=*/ 0u);
+  HInstruction* body_add = new (GetAllocator()) HAdd(DataType::Type::kInt32, i_phi, c1);
+  HInstruction* body_goto = new (GetAllocator()) HGoto();
+  body->AddInstruction(body_set);
+  body->AddInstruction(body_add);
+  body->AddInstruction(body_goto);
+
+  // i_phi inputs
+  i_phi->AddInput(c0);
+  i_phi->AddInput(body_add);
+
+  // ret
+  HInstruction* ret_sub = new (GetAllocator()) HSub(DataType::Type::kInt32, i_phi, c1);
+  HInstruction* ret_get =
+      new (GetAllocator()) HArrayGet(array, ret_sub, DataType::Type::kInt32, /*dex_pc=*/ 0);
+  HInstruction* ret_return = new (GetAllocator()) HReturn(ret_get);
+  ret->AddInstruction(ret_sub);
+  ret->AddInstruction(ret_get);
+  ret->AddInstruction(ret_return);
+
+  // exit
+  SetupExit(exit);
+
+  graph_->ClearDominanceInformation();
+  graph_->ClearLoopInformation();
+  PerformLSE();
+
+  EXPECT_INS_RETAINED(cls);
+  EXPECT_INS_RETAINED(array);
+  EXPECT_INS_RETAINED(body_set);
+  EXPECT_INS_RETAINED(ret_get);
+}
+
+// Regression test for b/187487955.
+// Similar to the `ArrayLoopAliasing1` test above but with an additional load
+// that marks a loop Phi placeholder as kept, which used to trigger a DCHECK() failure.
+// There is also an LSE run-test for this but it relies on BCE eliminating
+// BoundsCheck instructions and adds extra code in the loop body to avoid
+// loop unrolling. This gtest does not need to jump through those hoops
+// as we do not unnecessarily run those optimization passes.
+TEST_F(LoadStoreEliminationTest, ArrayLoopAliasing2) {
+  VariableSizedHandleScope vshs(Thread::Current());
+  CreateGraph(&vshs);
+  AdjacencyListGraph blocks(graph_,
+                            GetAllocator(),
+                            "entry",
+                            "exit",
+                            { { "entry", "preheader" },
+                              { "preheader", "loop" },
+                              { "loop", "body" },
+                              { "body", "loop" },
+                              { "loop", "ret" },
+                              { "ret", "exit" } });
+#define GET_BLOCK(name) HBasicBlock* name = blocks.Get(#name)
+  GET_BLOCK(entry);
+  GET_BLOCK(preheader);
+  GET_BLOCK(loop);
+  GET_BLOCK(body);
+  GET_BLOCK(ret);
+  GET_BLOCK(exit);
+#undef GET_BLOCK
+  HInstruction* n = MakeParam(DataType::Type::kInt32);
+  HInstruction* c0 = graph_->GetIntConstant(0);
+  HInstruction* c1 = graph_->GetIntConstant(1);
+
+  // entry
+  HInstruction* cls = MakeClassLoad();
+  HInstruction* array = new (GetAllocator()) HNewArray(
+      cls, n, /*dex_pc=*/ 0u, DataType::SizeShift(DataType::Type::kInt32));
+  HInstruction* entry_goto = new (GetAllocator()) HGoto();
+  entry->AddInstruction(cls);
+  entry->AddInstruction(array);
+  entry->AddInstruction(entry_goto);
+  ManuallyBuildEnvFor(cls, {});
+  ManuallyBuildEnvFor(array, {});
+
+  HInstruction* preheader_goto = new (GetAllocator()) HGoto();
+  preheader->AddInstruction(preheader_goto);
+
+  // loop
+  HPhi* i_phi = new (GetAllocator()) HPhi(GetAllocator(), 0, 0, DataType::Type::kInt32);
+  HInstruction* loop_suspend_check = new (GetAllocator()) HSuspendCheck();
+  HInstruction* loop_cond = new (GetAllocator()) HLessThan(i_phi, n);
+  HIf* loop_if = new (GetAllocator()) HIf(loop_cond);
+  loop->AddPhi(i_phi);
+  loop->AddInstruction(loop_suspend_check);
+  loop->AddInstruction(loop_cond);
+  loop->AddInstruction(loop_if);
+  CHECK(loop_if->IfTrueSuccessor() == body);
+  ManuallyBuildEnvFor(loop_suspend_check, {});
+
+  // body
+  HInstruction* body_set =
+      new (GetAllocator()) HArraySet(array, i_phi, i_phi, DataType::Type::kInt32, /*dex_pc=*/ 0u);
+  HInstruction* body_add = new (GetAllocator()) HAdd(DataType::Type::kInt32, i_phi, c1);
+  HInstruction* body_goto = new (GetAllocator()) HGoto();
+  body->AddInstruction(body_set);
+  body->AddInstruction(body_add);
+  body->AddInstruction(body_goto);
+
+  // i_phi inputs
+  i_phi->AddInput(c0);
+  i_phi->AddInput(body_add);
+
+  // ret
+  HInstruction* ret_sub = new (GetAllocator()) HSub(DataType::Type::kInt32, i_phi, c1);
+  HInstruction* ret_get1 =
+      new (GetAllocator()) HArrayGet(array, ret_sub, DataType::Type::kInt32, /*dex_pc=*/ 0);
+  HInstruction* ret_get2 =
+      new (GetAllocator()) HArrayGet(array, i_phi, DataType::Type::kInt32, /*dex_pc=*/ 0);
+  HInstruction* ret_add = new (GetAllocator()) HAdd(DataType::Type::kInt32, ret_get1, ret_get2);
+  HInstruction* ret_return = new (GetAllocator()) HReturn(ret_add);
+  ret->AddInstruction(ret_sub);
+  ret->AddInstruction(ret_get1);
+  ret->AddInstruction(ret_get2);
+  ret->AddInstruction(ret_add);
+  ret->AddInstruction(ret_return);
+
+  // exit
+  SetupExit(exit);
+
+  graph_->ClearDominanceInformation();
+  graph_->ClearLoopInformation();
+  PerformLSE();
+
+  EXPECT_INS_RETAINED(cls);
+  EXPECT_INS_RETAINED(array);
+  EXPECT_INS_RETAINED(body_set);
+  EXPECT_INS_RETAINED(ret_get1);
+  EXPECT_INS_RETAINED(ret_get2);
+}
+
 // // ENTRY
 // obj = new Obj();
 // // ALL should be kept
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 215fa3e..17080f0 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -27,6 +27,7 @@
 #include "base/bit_vector.h"
 #include "base/iteration_range.h"
 #include "base/logging.h"
+#include "base/malloc_arena_pool.h"
 #include "base/scoped_arena_allocator.h"
 #include "base/scoped_arena_containers.h"
 #include "base/stl_util.h"
@@ -1314,14 +1315,25 @@
 }
 
 std::ostream& HInstruction::Dump(std::ostream& os, bool dump_args) {
-  HGraph* graph = GetBlock()->GetGraph();
+  // Note: Handle the case where the instruction has been removed from
+  // the graph to support debugging output for failed gtests.
+  HGraph* graph = (GetBlock() != nullptr) ? GetBlock()->GetGraph() : nullptr;
   HGraphVisualizer::DumpInstruction(&os, graph, this);
   if (dump_args) {
     // Allocate memory from local ScopedArenaAllocator.
-    ScopedArenaAllocator allocator(graph->GetArenaStack());
+    std::optional<MallocArenaPool> local_arena_pool;
+    std::optional<ArenaStack> local_arena_stack;
+    if (UNLIKELY(graph == nullptr)) {
+      local_arena_pool.emplace();
+      local_arena_stack.emplace(&local_arena_pool.value());
+    }
+    ScopedArenaAllocator allocator(
+        graph != nullptr ? graph->GetArenaStack() : &local_arena_stack.value());
     // Instructions that we already visited. We print each instruction only once.
-    ArenaBitVector visited(
-        &allocator, graph->GetCurrentInstructionId(), /* expandable= */ false, kArenaAllocMisc);
+    ArenaBitVector visited(&allocator,
+                           (graph != nullptr) ? graph->GetCurrentInstructionId() : 0u,
+                           /* expandable= */ (graph == nullptr),
+                           kArenaAllocMisc);
     visited.ClearAllBits();
     visited.SetBit(GetId());
     // Keep a queue of instructions with their indentations.
diff --git a/test/530-checker-lse/src/Main.java b/test/530-checker-lse/src/Main.java
index 23e1d53..35f1dc2 100644
--- a/test/530-checker-lse/src/Main.java
+++ b/test/530-checker-lse/src/Main.java
@@ -3455,6 +3455,134 @@
     return sum;
   }
 
+  /// CHECK-START: int Main.testLoop36(int) load_store_elimination (before)
+  /// CHECK-DAG:                 ArraySet
+  /// CHECK-DAG:                 Deoptimize
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+
+  /// CHECK-START: int Main.testLoop36(int) load_store_elimination (before)
+  /// CHECK-NOT:                 BoundsCheck
+
+  /// CHECK-START: int Main.testLoop36(int) load_store_elimination (after)
+  /// CHECK-DAG:                 ArraySet
+  /// CHECK-DAG:                 Deoptimize
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+
+  // Regression test for b/187487955.
+  // We previously failed a DCHECK() during the search for kept stores when
+  // we encountered two array locations for the same array, considered
+  // non-aliasing by LSA, where only one of the array locations had its index
+  // defined inside the loop. Note that this situation requires that BCE
+  // eliminates BoundsCheck instructions, otherwise LSA considers those
+  // locations aliasing.
+  private static int testLoop36(int n) {
+    int[] a = new int[n];
+    int zero = 0;
+    int i = 0;
+    for (; i < n; ++i) {
+      a[i] = i;
+      // Extra instructions to avoid loop unrolling.
+      zero = (((zero ^ 1) + 2) ^ 1) - 2;
+      zero = (((zero ^ 4) + 8) ^ 4) - 8;
+    }
+    // Use 4 loads with consecutive fixed offsets from the loop Phi for `i`.
+    // BCE shall replace BoundsChecks with Deoptimize, so that indexes here are
+    // the Phi plus/minus a constant, something that LSA considers non-aliasing
+    // with the Phi (LSA does not take different loop iterations into account)
+    // but LSE must consider aliasing across different loop iterations.
+    return a[i - 1] + a[i - 2] + a[i - 3] + a[i - 4] + zero;
+  }
+
+  /// CHECK-START: int Main.testLoop37(int) load_store_elimination (before)
+  /// CHECK-DAG:                 ArraySet
+  /// CHECK-DAG:                 Deoptimize
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+
+  /// CHECK-START: int Main.testLoop37(int) load_store_elimination (before)
+  /// CHECK-NOT:                 BoundsCheck
+
+  /// CHECK-START: int Main.testLoop37(int) load_store_elimination (after)
+  /// CHECK-DAG:                 ArraySet
+  /// CHECK-DAG:                 Deoptimize
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+
+  // Similar to testLoop36 but the writes are done via a different reference to the same array.
+  // We previously used a reference comparison for back-edge aliasing analysis but this test
+  // has different references and therefore needs `HeapLocationCollector::CanReferencesAlias()`.
+  private static int testLoop37(int n) {
+    int[] a = new int[n];
+    int[] b = $noinline$returnArg(a);
+    int zero = 0;
+    int i = 0;
+    for (; i < n; ++i) {
+      b[i] = i;
+      // Extra instructions to avoid loop unrolling.
+      zero = (((zero ^ 1) + 2) ^ 1) - 2;
+      zero = (((zero ^ 4) + 8) ^ 4) - 8;
+    }
+    // Use 4 loads with consecutive fixed offsets from the loop Phi for `i`.
+    // BCE shall replace BoundsChecks with Deoptimize, so that indexes here are
+    // the Phi plus/minus a constant, something that LSA considers non-aliasing
+    // with the Phi (LSA does not take different loop iterations into account)
+    // but LSE must consider aliasing across different loop iterations.
+    return a[i - 1] + a[i - 2] + a[i - 3] + a[i - 4] + zero;
+  }
+
+  private static int[] $noinline$returnArg(int[] a) {
+    return a;
+  }
+
+  /// CHECK-START: int Main.testLoop38(int, int[]) load_store_elimination (before)
+  /// CHECK-DAG:                 ArraySet
+  /// CHECK-DAG:                 Deoptimize
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+
+  /// CHECK-START: int Main.testLoop38(int, int[]) load_store_elimination (before)
+  /// CHECK-NOT:                 BoundsCheck
+
+  /// CHECK-START: int Main.testLoop38(int, int[]) load_store_elimination (after)
+  /// CHECK-DAG:                 ArraySet
+  /// CHECK-DAG:                 Deoptimize
+
+  /// CHECK-START: int Main.testLoop38(int, int[]) load_store_elimination (after)
+  /// CHECK-NOT:                 ArrayGet
+
+  // Similar to testLoop37 but writing to a different array that exists before allocating `a`,
+  // so that `HeapLocationCollector::CanReferencesAlias()` returns false and all the ArrayGet
+  // instructions are actually eliminated.
+  private static int testLoop38(int n, int[] b) {
+    int[] a = new int[n];
+    int zero = 0;
+    int i = 0;
+    for (; i < n; ++i) {
+      b[i] = i;
+      // Extra instructions to avoid loop unrolling.
+      zero = (((zero ^ 1) + 2) ^ 1) - 2;
+      zero = (((zero ^ 4) + 8) ^ 4) - 8;
+    }
+    // Use 4 loads with consecutive fixed offsets from the loop Phi for `i`.
+    // BCE shall replace BoundsChecks with Deoptimize, so that indexes here are
+    // the Phi plus/minus a constant, something that LSA considers non-aliasing
+    // with the Phi (LSA does not take different loop iterations into account)
+    // but LSE must consider aliasing across different loop iterations.
+    return a[i - 1] + a[i - 2] + a[i - 3] + a[i - 4] + zero;
+  }
+
   /// CHECK-START: int Main.testNestedLoop1(TestClass, int) load_store_elimination (before)
   /// CHECK-DAG:                 InstanceFieldSet
   /// CHECK-DAG:                 InstanceFieldGet
@@ -4445,6 +4573,9 @@
     assertIntEquals(testLoop35(1), 1);
     assertIntEquals(testLoop35(2), 3);
     assertIntEquals(testLoop35(3), 6);
+    assertIntEquals(testLoop36(4), 6);
+    assertIntEquals(testLoop37(4), 6);
+    assertIntEquals(testLoop38(4, new int[4]), 0);
 
     assertIntEquals(testNestedLoop1(new TestClass(), 0), 1);
     assertIntEquals(testNestedLoop1(new TestClass(), 1), 1);