Fix LSE-array overlap issue

In cases where an array has overlapping accesses on separate
iterations LSE could get confused and incorrectly believe removing
array stores is safe even when later iterations rely on the store
occurring. For example consider the following code:

```
int do_cal(int len) {
  if (len < 5) {
    return -1;
  }
  int w[] = new w[len];
  int t = 0;
  for (int i = 5; i < w.length; i++) {
    w[i] = please_interleave(w[i - 1], w[i - 5]);
    t = please_select(w[i], i);
  }
  return t;
}
```

We would either need to materialize 5 PHIs to hold the values
`w[i - 1], ..., w[i - 5]` or avoid removing the write to `w[i]`. Our
LSE pass is unable to do the former and would (incorrectly) fail to
recognize that, by not being able to determine the values of `w[i -
1]` and `w[i-5]` that the later store to `w[i]` must be preserved.

Bug: 168446366
Test: ./test.py --host
Change-Id: I89772c8bf49ebf6de70f86bd68484e14bd189406
diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc
index d69442e..9427a70 100644
--- a/compiler/optimizing/load_store_elimination.cc
+++ b/compiler/optimizing/load_store_elimination.cc
@@ -23,6 +23,7 @@
 #include "base/scoped_arena_containers.h"
 #include "escape.h"
 #include "load_store_analysis.h"
+#include "optimizing/optimizing_compiler_stats.h"
 #include "reference_type_propagation.h"
 
 /**
@@ -2051,13 +2052,13 @@
     work_queue.pop_back();
     size_t idx = phi_placeholder->GetHeapLocation();
     HBasicBlock* block = blocks[phi_placeholder->GetBlockId()];
+    HeapLocation* heap_loc = heap_location_collector_.GetHeapLocation(idx);
+    bool unknown_array_stores = false;
     for (HBasicBlock* predecessor : block->GetPredecessors()) {
       ScopedArenaVector<ValueRecord>& heap_values = heap_values_for_[predecessor->GetBlockId()];
       // For loop back-edges we must also preserve all stores to locations that may alias
       // with the location `idx`.
       // TODO: Review whether we need to keep stores to aliased locations from pre-header.
-      // TODO: Review in light of testLoop29(); may need to handle locations that LSA considers
-      //       non-aliasing.
       // TODO: Add tests cases around this.
       bool is_back_edge =
           block->IsLoopHeader() && predecessor != block->GetLoopInformation()->GetPreHeader();
@@ -2076,6 +2077,51 @@
             DCHECK(IsStore(stored_by.GetInstruction()));
             kept_stores_.SetBit(stored_by.GetInstruction()->GetId());
           }
+        } else if (stored_by.IsUnknown() && heap_loc->IsArray()) {
+          // See comment below. We are an array load where we don't know how the
+          // value got there. Need to ensure that writes aren't omitted (in
+          // ways that are observable) since this looks like a read from an
+          // arbitrary point in the array.
+          unknown_array_stores = true;
+        }
+      }
+    }
+    if (unknown_array_stores) {
+      // Since we are an array, actually ended up here, and don't have a known
+      // set this must be from a previous iteration (otherwise we'd either not
+      // reach this because stores were kept from escapes or just known from
+      // earlier). We need to keep all stores that could affect this read.
+      //
+      // This is sufficient since
+      // (1) If we are not an array then all loads and stores are at known,
+      //     exact offsets (fields and such).
+      // (2) If we didn't end up here then we either had no chance to turn
+      //     this load into a phi or we know what the actual value is (above
+      //     branch).
+      //
+      // Since none of these are true we are reading from some location in
+      // the array and we have no idea where/if it was written. This means
+      // we need to be super conservative and preserve all writes that could
+      // possibly occur before this read, in case one of them is the write
+      // that supplies the value. This means keeping the last store for
+      // every heap location touching this array in every block.
+      //
+      // TODO We can do better and do partial LSE around this but we don't
+      // have the data for that at the moment.
+      // TODO Can we be more selective w/o partial LSE data?
+      // TODO Do some analysis to see how much we are leaving on the table
+      // with this.
+      for (uint32_t j = 0; j < heap_location_collector_.GetNumberOfHeapLocations(); ++j) {
+        auto candidate_heap_location = heap_location_collector_.GetHeapLocation(j);
+        // If the heap-location is part of this same array.
+        if (candidate_heap_location->GetReferenceInfo() == heap_loc->GetReferenceInfo()) {
+          for (const auto& hv : heap_values_for_) {
+            if (hv.size() > j && hv[j].stored_by.IsInstruction()) {
+              // This is the last store in this block. We can't tell if it's
+              // to the right location or not so we'll need to keep it.
+              kept_stores_.SetBit(hv[j].stored_by.GetInstruction()->GetId());
+            }
+          }
         }
       }
     }
diff --git a/compiler/optimizing/load_store_elimination_test.cc b/compiler/optimizing/load_store_elimination_test.cc
index 6c5eec8..f71a473 100644
--- a/compiler/optimizing/load_store_elimination_test.cc
+++ b/compiler/optimizing/load_store_elimination_test.cc
@@ -900,4 +900,546 @@
   ASSERT_FALSE(IsRemoved(vstore2));
 }
 
+// void DO_CAL() {
+//   int i = 1;
+//   int[] w = new int[80];
+//   int t = 0;
+//   while (i < 80) {
+//     w[i] = PLEASE_INTERLEAVE(w[i - 1], 1)
+//     t = PLEASE_SELECT(w[i], t);
+//     i++;
+//   }
+//   return t;
+// }
+TEST_F(LoadStoreEliminationTest, ArrayLoopOverlap) {
+  CreateGraph();
+  AdjacencyListGraph blocks(graph_,
+                            GetAllocator(),
+                            "entry",
+                            "exit",
+                            { { "entry", "loop_pre_header" },
+                              { "loop_pre_header", "loop_entry" },
+                              { "loop_entry", "loop_body" },
+                              { "loop_entry", "loop_post" },
+                              { "loop_body", "loop_entry" },
+                              { "loop_post", "exit" } });
+#define GET_BLOCK(name) HBasicBlock* name = blocks.Get(#name)
+  GET_BLOCK(entry);
+  GET_BLOCK(loop_pre_header);
+  GET_BLOCK(loop_entry);
+  GET_BLOCK(loop_body);
+  GET_BLOCK(loop_post);
+  GET_BLOCK(exit);
+#undef GET_BLOCK
+
+  HInstruction* zero_const = graph_->GetConstant(DataType::Type::kInt32, 0);
+  HInstruction* one_const = graph_->GetConstant(DataType::Type::kInt32, 1);
+  HInstruction* eighty_const = graph_->GetConstant(DataType::Type::kInt32, 80);
+  HInstruction* entry_goto = new (GetAllocator()) HGoto();
+  entry->AddInstruction(entry_goto);
+
+  HInstruction* alloc_w = new (GetAllocator()) HNewArray(zero_const, eighty_const, 0, 0);
+  HInstruction* pre_header_goto = new (GetAllocator()) HGoto();
+  loop_pre_header->AddInstruction(alloc_w);
+  loop_pre_header->AddInstruction(pre_header_goto);
+  // environment
+  ArenaVector<HInstruction*> alloc_locals({}, GetAllocator()->Adapter(kArenaAllocInstruction));
+  ManuallyBuildEnvFor(alloc_w, &alloc_locals);
+
+  // loop-start
+  HPhi* i_phi = new (GetAllocator()) HPhi(GetAllocator(), 0, 0, DataType::Type::kInt32);
+  HPhi* t_phi = new (GetAllocator()) HPhi(GetAllocator(), 1, 0, DataType::Type::kInt32);
+  HInstruction* suspend = new (GetAllocator()) HSuspendCheck();
+  HInstruction* i_cmp_top = new (GetAllocator()) HGreaterThanOrEqual(i_phi, eighty_const);
+  HInstruction* loop_start_branch = new (GetAllocator()) HIf(i_cmp_top);
+  loop_entry->AddPhi(i_phi);
+  loop_entry->AddPhi(t_phi);
+  loop_entry->AddInstruction(suspend);
+  loop_entry->AddInstruction(i_cmp_top);
+  loop_entry->AddInstruction(loop_start_branch);
+  CHECK_EQ(loop_entry->GetSuccessors().size(), 2u);
+  if (loop_entry->GetNormalSuccessors()[1] != loop_body) {
+    loop_entry->SwapSuccessors();
+  }
+  CHECK_EQ(loop_entry->GetPredecessors().size(), 2u);
+  if (loop_entry->GetPredecessors()[0] != loop_pre_header) {
+    loop_entry->SwapPredecessors();
+  }
+  i_phi->AddInput(one_const);
+  t_phi->AddInput(zero_const);
+
+  // environment
+  ArenaVector<HInstruction*> suspend_locals({ alloc_w, i_phi, t_phi },
+                                            GetAllocator()->Adapter(kArenaAllocInstruction));
+  ManuallyBuildEnvFor(suspend, &suspend_locals);
+
+  // BODY
+  HInstruction* last_i = new (GetAllocator()) HSub(DataType::Type::kInt32, i_phi, one_const);
+  HInstruction* last_get =
+      new (GetAllocator()) HArrayGet(alloc_w, last_i, DataType::Type::kInt32, 0);
+  HInvoke* body_value = new (GetAllocator())
+      HInvokeStaticOrDirect(GetAllocator(),
+                            2,
+                            DataType::Type::kInt32,
+                            0,
+                            { nullptr, 0 },
+                            nullptr,
+                            {},
+                            InvokeType::kStatic,
+                            { nullptr, 0 },
+                            HInvokeStaticOrDirect::ClinitCheckRequirement::kNone);
+  body_value->SetRawInputAt(0, last_get);
+  body_value->SetRawInputAt(1, one_const);
+  HInstruction* body_set =
+      new (GetAllocator()) HArraySet(alloc_w, i_phi, body_value, DataType::Type::kInt32, 0);
+  HInstruction* body_get =
+      new (GetAllocator()) HArrayGet(alloc_w, i_phi, DataType::Type::kInt32, 0);
+  HInvoke* t_next = new (GetAllocator())
+      HInvokeStaticOrDirect(GetAllocator(),
+                            2,
+                            DataType::Type::kInt32,
+                            0,
+                            { nullptr, 0 },
+                            nullptr,
+                            {},
+                            InvokeType::kStatic,
+                            { nullptr, 0 },
+                            HInvokeStaticOrDirect::ClinitCheckRequirement::kNone);
+  t_next->SetRawInputAt(0, body_get);
+  t_next->SetRawInputAt(1, t_phi);
+  HInstruction* i_next = new (GetAllocator()) HAdd(DataType::Type::kInt32, i_phi, one_const);
+  HInstruction* body_goto = new (GetAllocator()) HGoto();
+  loop_body->AddInstruction(last_i);
+  loop_body->AddInstruction(last_get);
+  loop_body->AddInstruction(body_value);
+  loop_body->AddInstruction(body_set);
+  loop_body->AddInstruction(body_get);
+  loop_body->AddInstruction(t_next);
+  loop_body->AddInstruction(i_next);
+  loop_body->AddInstruction(body_goto);
+  body_value->CopyEnvironmentFrom(suspend->GetEnvironment());
+
+  i_phi->AddInput(i_next);
+  t_phi->AddInput(t_next);
+  t_next->CopyEnvironmentFrom(suspend->GetEnvironment());
+
+  // loop-post
+  HInstruction* return_inst = new (GetAllocator()) HReturn(t_phi);
+  loop_post->AddInstruction(return_inst);
+
+  // exit
+  HInstruction* exit_inst = new (GetAllocator()) HExit();
+  exit->AddInstruction(exit_inst);
+
+  graph_->ClearDominanceInformation();
+  graph_->ClearLoopInformation();
+  PerformLSE();
+
+  // TODO Technically this is optimizable. LSE just needs to add phis to keep
+  // track of the last `N` values set where `N` is how many locations we can go
+  // back into the array.
+  if (IsRemoved(last_get)) {
+    // If we were able to remove the previous read the entire array should be removable.
+    EXPECT_TRUE(IsRemoved(body_set));
+    EXPECT_TRUE(IsRemoved(alloc_w));
+  } else {
+    // This is the branch we actually take for now. If we rely on being able to
+    // read the array we'd better remember to write to it as well.
+    EXPECT_FALSE(IsRemoved(body_set));
+  }
+  // The last 'get' should always be removable.
+  EXPECT_TRUE(IsRemoved(body_get));
+}
+
+
+// void DO_CAL2() {
+//   int i = 1;
+//   int[] w = new int[80];
+//   int t = 0;
+//   while (i < 80) {
+//     w[i] = PLEASE_INTERLEAVE(w[i - 1], 1) // <-- removed
+//     t = PLEASE_SELECT(w[i], t);
+//     w[i] = PLEASE_INTERLEAVE(w[i - 1], 1) // <-- removed
+//     t = PLEASE_SELECT(w[i], t);
+//     w[i] = PLEASE_INTERLEAVE(w[i - 1], 1) // <-- kept
+//     t = PLEASE_SELECT(w[i], t);
+//     i++;
+//   }
+//   return t;
+// }
+TEST_F(LoadStoreEliminationTest, ArrayLoopOverlap2) {
+  CreateGraph();
+  AdjacencyListGraph blocks(graph_,
+                            GetAllocator(),
+                            "entry",
+                            "exit",
+                            { { "entry", "loop_pre_header" },
+                              { "loop_pre_header", "loop_entry" },
+                              { "loop_entry", "loop_body" },
+                              { "loop_entry", "loop_post" },
+                              { "loop_body", "loop_entry" },
+                              { "loop_post", "exit" } });
+#define GET_BLOCK(name) HBasicBlock* name = blocks.Get(#name)
+  GET_BLOCK(entry);
+  GET_BLOCK(loop_pre_header);
+  GET_BLOCK(loop_entry);
+  GET_BLOCK(loop_body);
+  GET_BLOCK(loop_post);
+  GET_BLOCK(exit);
+#undef GET_BLOCK
+
+  HInstruction* zero_const = graph_->GetConstant(DataType::Type::kInt32, 0);
+  HInstruction* one_const = graph_->GetConstant(DataType::Type::kInt32, 1);
+  HInstruction* eighty_const = graph_->GetConstant(DataType::Type::kInt32, 80);
+  HInstruction* entry_goto = new (GetAllocator()) HGoto();
+  entry->AddInstruction(entry_goto);
+
+  HInstruction* alloc_w = new (GetAllocator()) HNewArray(zero_const, eighty_const, 0, 0);
+  HInstruction* pre_header_goto = new (GetAllocator()) HGoto();
+  loop_pre_header->AddInstruction(alloc_w);
+  loop_pre_header->AddInstruction(pre_header_goto);
+  // environment
+  ArenaVector<HInstruction*> alloc_locals({}, GetAllocator()->Adapter(kArenaAllocInstruction));
+  ManuallyBuildEnvFor(alloc_w, &alloc_locals);
+
+  // loop-start
+  HPhi* i_phi = new (GetAllocator()) HPhi(GetAllocator(), 0, 0, DataType::Type::kInt32);
+  HPhi* t_phi = new (GetAllocator()) HPhi(GetAllocator(), 1, 0, DataType::Type::kInt32);
+  HInstruction* suspend = new (GetAllocator()) HSuspendCheck();
+  HInstruction* i_cmp_top = new (GetAllocator()) HGreaterThanOrEqual(i_phi, eighty_const);
+  HInstruction* loop_start_branch = new (GetAllocator()) HIf(i_cmp_top);
+  loop_entry->AddPhi(i_phi);
+  loop_entry->AddPhi(t_phi);
+  loop_entry->AddInstruction(suspend);
+  loop_entry->AddInstruction(i_cmp_top);
+  loop_entry->AddInstruction(loop_start_branch);
+  CHECK_EQ(loop_entry->GetSuccessors().size(), 2u);
+  if (loop_entry->GetNormalSuccessors()[1] != loop_body) {
+    loop_entry->SwapSuccessors();
+  }
+  CHECK_EQ(loop_entry->GetPredecessors().size(), 2u);
+  if (loop_entry->GetPredecessors()[0] != loop_pre_header) {
+    loop_entry->SwapPredecessors();
+  }
+  i_phi->AddInput(one_const);
+  t_phi->AddInput(zero_const);
+
+  // environment
+  ArenaVector<HInstruction*> suspend_locals({ alloc_w, i_phi, t_phi },
+                                            GetAllocator()->Adapter(kArenaAllocInstruction));
+  ManuallyBuildEnvFor(suspend, &suspend_locals);
+
+  // BODY
+  HInstruction* last_i = new (GetAllocator()) HSub(DataType::Type::kInt32, i_phi, one_const);
+  HInstruction* last_get_1, *last_get_2, *last_get_3;
+  HInstruction* body_value_1, *body_value_2, *body_value_3;
+  HInstruction* body_set_1, *body_set_2, *body_set_3;
+  HInstruction* body_get_1, *body_get_2, *body_get_3;
+  HInstruction* t_next_1, *t_next_2, *t_next_3;
+  auto make_instructions = [&](HInstruction* last_t_value) {
+    HInstruction* last_get =
+        new (GetAllocator()) HArrayGet(alloc_w, last_i, DataType::Type::kInt32, 0);
+    HInvoke* body_value = new (GetAllocator())
+        HInvokeStaticOrDirect(GetAllocator(),
+                              2,
+                              DataType::Type::kInt32,
+                              0,
+                              { nullptr, 0 },
+                              nullptr,
+                              {},
+                              InvokeType::kStatic,
+                              { nullptr, 0 },
+                              HInvokeStaticOrDirect::ClinitCheckRequirement::kNone);
+    body_value->SetRawInputAt(0, last_get);
+    body_value->SetRawInputAt(1, one_const);
+    HInstruction* body_set =
+        new (GetAllocator()) HArraySet(alloc_w, i_phi, body_value, DataType::Type::kInt32, 0);
+    HInstruction* body_get =
+        new (GetAllocator()) HArrayGet(alloc_w, i_phi, DataType::Type::kInt32, 0);
+    HInvoke* t_next = new (GetAllocator())
+        HInvokeStaticOrDirect(GetAllocator(),
+                              2,
+                              DataType::Type::kInt32,
+                              0,
+                              { nullptr, 0 },
+                              nullptr,
+                              {},
+                              InvokeType::kStatic,
+                              { nullptr, 0 },
+                              HInvokeStaticOrDirect::ClinitCheckRequirement::kNone);
+    t_next->SetRawInputAt(0, body_get);
+    t_next->SetRawInputAt(1, last_t_value);
+    loop_body->AddInstruction(last_get);
+    loop_body->AddInstruction(body_value);
+    loop_body->AddInstruction(body_set);
+    loop_body->AddInstruction(body_get);
+    loop_body->AddInstruction(t_next);
+    return std::make_tuple(last_get, body_value, body_set, body_get, t_next);
+  };
+  std::tie(last_get_1, body_value_1, body_set_1, body_get_1, t_next_1) = make_instructions(t_phi);
+  std::tie(last_get_2, body_value_2, body_set_2, body_get_2, t_next_2) =
+      make_instructions(t_next_1);
+  std::tie(last_get_3, body_value_3, body_set_3, body_get_3, t_next_3) =
+      make_instructions(t_next_2);
+  HInstruction* i_next = new (GetAllocator()) HAdd(DataType::Type::kInt32, i_phi, one_const);
+  HInstruction* body_goto = new (GetAllocator()) HGoto();
+  loop_body->InsertInstructionBefore(last_i, last_get_1);
+  loop_body->AddInstruction(i_next);
+  loop_body->AddInstruction(body_goto);
+  body_value_1->CopyEnvironmentFrom(suspend->GetEnvironment());
+  body_value_2->CopyEnvironmentFrom(suspend->GetEnvironment());
+  body_value_3->CopyEnvironmentFrom(suspend->GetEnvironment());
+
+  i_phi->AddInput(i_next);
+  t_phi->AddInput(t_next_3);
+  t_next_1->CopyEnvironmentFrom(suspend->GetEnvironment());
+  t_next_2->CopyEnvironmentFrom(suspend->GetEnvironment());
+  t_next_3->CopyEnvironmentFrom(suspend->GetEnvironment());
+
+  // loop-post
+  HInstruction* return_inst = new (GetAllocator()) HReturn(t_phi);
+  loop_post->AddInstruction(return_inst);
+
+  // exit
+  HInstruction* exit_inst = new (GetAllocator()) HExit();
+  exit->AddInstruction(exit_inst);
+
+  graph_->ClearDominanceInformation();
+  graph_->ClearLoopInformation();
+  PerformLSE();
+
+  // TODO Technically this is optimizable. LSE just needs to add phis to keep
+  // track of the last `N` values set where `N` is how many locations we can go
+  // back into the array.
+  if (IsRemoved(last_get_1)) {
+    // If we were able to remove the previous read the entire array should be removable.
+    EXPECT_TRUE(IsRemoved(body_set_1));
+    EXPECT_TRUE(IsRemoved(body_set_2));
+    EXPECT_TRUE(IsRemoved(body_set_3));
+    EXPECT_TRUE(IsRemoved(last_get_1));
+    EXPECT_TRUE(IsRemoved(last_get_2));
+    EXPECT_TRUE(IsRemoved(alloc_w));
+  } else {
+    // This is the branch we actually take for now. If we rely on being able to
+    // read the array we'd better remember to write to it as well.
+    EXPECT_FALSE(IsRemoved(body_set_3));
+  }
+  // The last 'get' should always be removable.
+  EXPECT_TRUE(IsRemoved(body_get_1));
+  EXPECT_TRUE(IsRemoved(body_get_2));
+  EXPECT_TRUE(IsRemoved(body_get_3));
+  // shadowed writes should always be removed
+  EXPECT_TRUE(IsRemoved(body_set_1));
+  EXPECT_TRUE(IsRemoved(body_set_2));
+}
+
+TEST_F(LoadStoreEliminationTest, ArrayNonLoopPhi) {
+  CreateGraph();
+  AdjacencyListGraph blocks(graph_,
+                            GetAllocator(),
+                            "entry",
+                            "exit",
+                            { { "entry", "start" },
+                              { "start", "left" },
+                              { "start", "right" },
+                              { "left", "ret" },
+                              { "right", "ret" },
+                              { "ret", "exit" } });
+#define GET_BLOCK(name) HBasicBlock* name = blocks.Get(#name)
+  GET_BLOCK(entry);
+  GET_BLOCK(start);
+  GET_BLOCK(left);
+  GET_BLOCK(right);
+  GET_BLOCK(ret);
+  GET_BLOCK(exit);
+#undef GET_BLOCK
+
+  HInstruction* zero_const = graph_->GetConstant(DataType::Type::kInt32, 0);
+  HInstruction* one_const = graph_->GetConstant(DataType::Type::kInt32, 1);
+  HInstruction* two_const = graph_->GetConstant(DataType::Type::kInt32, 2);
+  HInstruction* param = new (GetAllocator())
+      HParameterValue(graph_->GetDexFile(), dex::TypeIndex(1), 0, DataType::Type::kBool);
+  HInstruction* entry_goto = new (GetAllocator()) HGoto();
+  entry->AddInstruction(param);
+  entry->AddInstruction(entry_goto);
+
+  HInstruction* alloc_w = new (GetAllocator()) HNewArray(zero_const, two_const, 0, 0);
+  HInstruction* branch = new (GetAllocator()) HIf(param);
+  start->AddInstruction(alloc_w);
+  start->AddInstruction(branch);
+  // environment
+  ArenaVector<HInstruction*> alloc_locals({}, GetAllocator()->Adapter(kArenaAllocInstruction));
+  ManuallyBuildEnvFor(alloc_w, &alloc_locals);
+
+  // left
+  HInvoke* left_value = new (GetAllocator())
+      HInvokeStaticOrDirect(GetAllocator(),
+                            1,
+                            DataType::Type::kInt32,
+                            0,
+                            { nullptr, 0 },
+                            nullptr,
+                            {},
+                            InvokeType::kStatic,
+                            { nullptr, 0 },
+                            HInvokeStaticOrDirect::ClinitCheckRequirement::kNone);
+  left_value->SetRawInputAt(0, zero_const);
+  HInstruction* left_set_1 =
+      new (GetAllocator()) HArraySet(alloc_w, zero_const, left_value, DataType::Type::kInt32, 0);
+  HInstruction* left_set_2 =
+      new (GetAllocator()) HArraySet(alloc_w, one_const, zero_const, DataType::Type::kInt32, 0);
+  HInstruction* left_goto = new (GetAllocator()) HGoto();
+  left->AddInstruction(left_value);
+  left->AddInstruction(left_set_1);
+  left->AddInstruction(left_set_2);
+  left->AddInstruction(left_goto);
+  ArenaVector<HInstruction*> left_locals({ alloc_w },
+                                         GetAllocator()->Adapter(kArenaAllocInstruction));
+  ManuallyBuildEnvFor(left_value, &alloc_locals);
+
+  // right
+  HInvoke* right_value = new (GetAllocator())
+      HInvokeStaticOrDirect(GetAllocator(),
+                            1,
+                            DataType::Type::kInt32,
+                            0,
+                            { nullptr, 0 },
+                            nullptr,
+                            {},
+                            InvokeType::kStatic,
+                            { nullptr, 0 },
+                            HInvokeStaticOrDirect::ClinitCheckRequirement::kNone);
+  right_value->SetRawInputAt(0, one_const);
+  HInstruction* right_set_1 =
+      new (GetAllocator()) HArraySet(alloc_w, zero_const, right_value, DataType::Type::kInt32, 0);
+  HInstruction* right_set_2 =
+      new (GetAllocator()) HArraySet(alloc_w, one_const, zero_const, DataType::Type::kInt32, 0);
+  HInstruction* right_goto = new (GetAllocator()) HGoto();
+  right->AddInstruction(right_value);
+  right->AddInstruction(right_set_1);
+  right->AddInstruction(right_set_2);
+  right->AddInstruction(right_goto);
+  ArenaVector<HInstruction*> right_locals({ alloc_w },
+                                          GetAllocator()->Adapter(kArenaAllocInstruction));
+  ManuallyBuildEnvFor(right_value, &alloc_locals);
+
+  // ret
+  HInstruction* read_1 =
+      new (GetAllocator()) HArrayGet(alloc_w, zero_const, DataType::Type::kInt32, 0);
+  HInstruction* read_2 =
+      new (GetAllocator()) HArrayGet(alloc_w, one_const, DataType::Type::kInt32, 0);
+  HInstruction* add = new (GetAllocator()) HAdd(DataType::Type::kInt32, read_1, read_2);
+  HInstruction* return_inst = new (GetAllocator()) HReturn(add);
+  ret->AddInstruction(read_1);
+  ret->AddInstruction(read_2);
+  ret->AddInstruction(add);
+  ret->AddInstruction(return_inst);
+
+  // exit
+  HInstruction* exit_inst = new (GetAllocator()) HExit();
+  exit->AddInstruction(exit_inst);
+
+  graph_->ClearDominanceInformation();
+  graph_->ClearLoopInformation();
+  PerformLSE();
+
+  EXPECT_TRUE(IsRemoved(read_1));
+  EXPECT_TRUE(IsRemoved(read_2));
+  EXPECT_TRUE(IsRemoved(left_set_1));
+  EXPECT_TRUE(IsRemoved(left_set_2));
+  EXPECT_TRUE(IsRemoved(right_set_1));
+  EXPECT_TRUE(IsRemoved(right_set_2));
+  EXPECT_TRUE(IsRemoved(alloc_w));
+
+  EXPECT_FALSE(IsRemoved(left_value));
+  EXPECT_FALSE(IsRemoved(right_value));
+}
+
+TEST_F(LoadStoreEliminationTest, ArrayMergeDefault) {
+  CreateGraph();
+  AdjacencyListGraph blocks(graph_,
+                            GetAllocator(),
+                            "entry",
+                            "exit",
+                            { { "entry", "start" },
+                              { "start", "left" },
+                              { "start", "right" },
+                              { "left", "ret" },
+                              { "right", "ret" },
+                              { "ret", "exit" } });
+#define GET_BLOCK(name) HBasicBlock* name = blocks.Get(#name)
+  GET_BLOCK(entry);
+  GET_BLOCK(start);
+  GET_BLOCK(left);
+  GET_BLOCK(right);
+  GET_BLOCK(ret);
+  GET_BLOCK(exit);
+#undef GET_BLOCK
+
+  HInstruction* zero_const = graph_->GetConstant(DataType::Type::kInt32, 0);
+  HInstruction* one_const = graph_->GetConstant(DataType::Type::kInt32, 1);
+  HInstruction* two_const = graph_->GetConstant(DataType::Type::kInt32, 2);
+  HInstruction* param = new (GetAllocator())
+      HParameterValue(graph_->GetDexFile(), dex::TypeIndex(1), 0, DataType::Type::kBool);
+  HInstruction* entry_goto = new (GetAllocator()) HGoto();
+  entry->AddInstruction(param);
+  entry->AddInstruction(entry_goto);
+
+  HInstruction* alloc_w = new (GetAllocator()) HNewArray(zero_const, two_const, 0, 0);
+  HInstruction* branch = new (GetAllocator()) HIf(param);
+  start->AddInstruction(alloc_w);
+  start->AddInstruction(branch);
+  // environment
+  ArenaVector<HInstruction*> alloc_locals({}, GetAllocator()->Adapter(kArenaAllocInstruction));
+  ManuallyBuildEnvFor(alloc_w, &alloc_locals);
+
+  // left
+  HInstruction* left_set_1 =
+      new (GetAllocator()) HArraySet(alloc_w, zero_const, one_const, DataType::Type::kInt32, 0);
+  HInstruction* left_set_2 =
+      new (GetAllocator()) HArraySet(alloc_w, zero_const, zero_const, DataType::Type::kInt32, 0);
+  HInstruction* left_goto = new (GetAllocator()) HGoto();
+  left->AddInstruction(left_set_1);
+  left->AddInstruction(left_set_2);
+  left->AddInstruction(left_goto);
+
+  // right
+  HInstruction* right_set_1 =
+      new (GetAllocator()) HArraySet(alloc_w, one_const, one_const, DataType::Type::kInt32, 0);
+  HInstruction* right_set_2 =
+      new (GetAllocator()) HArraySet(alloc_w, one_const, zero_const, DataType::Type::kInt32, 0);
+  HInstruction* right_goto = new (GetAllocator()) HGoto();
+  right->AddInstruction(right_set_1);
+  right->AddInstruction(right_set_2);
+  right->AddInstruction(right_goto);
+
+  // ret
+  HInstruction* read_1 =
+      new (GetAllocator()) HArrayGet(alloc_w, zero_const, DataType::Type::kInt32, 0);
+  HInstruction* read_2 =
+      new (GetAllocator()) HArrayGet(alloc_w, one_const, DataType::Type::kInt32, 0);
+  HInstruction* add = new (GetAllocator()) HAdd(DataType::Type::kInt32, read_1, read_2);
+  HInstruction* return_inst = new (GetAllocator()) HReturn(add);
+  ret->AddInstruction(read_1);
+  ret->AddInstruction(read_2);
+  ret->AddInstruction(add);
+  ret->AddInstruction(return_inst);
+
+  // exit
+  HInstruction* exit_inst = new (GetAllocator()) HExit();
+  exit->AddInstruction(exit_inst);
+
+  graph_->ClearDominanceInformation();
+  graph_->ClearLoopInformation();
+  PerformLSE();
+
+  EXPECT_TRUE(IsRemoved(read_1));
+  EXPECT_TRUE(IsRemoved(read_2));
+  EXPECT_TRUE(IsRemoved(left_set_1));
+  EXPECT_TRUE(IsRemoved(left_set_2));
+  EXPECT_TRUE(IsRemoved(right_set_1));
+  EXPECT_TRUE(IsRemoved(right_set_2));
+  EXPECT_TRUE(IsRemoved(alloc_w));
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/optimizing_unit_test.h b/compiler/optimizing/optimizing_unit_test.h
index 484e444..792c9db 100644
--- a/compiler/optimizing/optimizing_unit_test.h
+++ b/compiler/optimizing/optimizing_unit_test.h
@@ -292,6 +292,60 @@
   return instruction->GetBlock() == nullptr;
 }
 
+class AdjacencyListGraph {
+ public:
+  using Edge = std::pair<const std::string_view, const std::string_view>;
+  AdjacencyListGraph(
+      HGraph* graph,
+      ArenaAllocator* alloc,
+      const std::string_view entry_name,
+      const std::string_view exit_name,
+      const std::vector<Edge>& adj) : graph_(graph) {
+    auto create_block = [&]() {
+      HBasicBlock* blk = new (alloc) HBasicBlock(graph_);
+      graph_->AddBlock(blk);
+      return blk;
+    };
+    HBasicBlock* entry = create_block();
+    HBasicBlock* exit = create_block();
+    graph_->SetEntryBlock(entry);
+    graph_->SetExitBlock(exit);
+    name_to_block_.Put(entry_name, entry);
+    name_to_block_.Put(exit_name, exit);
+    for (const auto& [src, dest] : adj) {
+      HBasicBlock* src_blk = name_to_block_.GetOrCreate(src, create_block);
+      HBasicBlock* dest_blk = name_to_block_.GetOrCreate(dest, create_block);
+      src_blk->AddSuccessor(dest_blk);
+    }
+    graph_->ComputeDominanceInformation();
+    for (auto [name, blk] : name_to_block_) {
+      block_to_name_.Put(blk, name);
+    }
+  }
+
+  bool HasBlock(const HBasicBlock* blk) {
+    return block_to_name_.find(blk) != block_to_name_.end();
+  }
+
+  std::string_view GetName(const HBasicBlock* blk) {
+    return block_to_name_.Get(blk);
+  }
+
+  HBasicBlock* Get(const std::string_view& sv) {
+    return name_to_block_.Get(sv);
+  }
+
+  AdjacencyListGraph(AdjacencyListGraph&&) = default;
+  AdjacencyListGraph(const AdjacencyListGraph&) = default;
+  AdjacencyListGraph& operator=(AdjacencyListGraph&&) = default;
+  AdjacencyListGraph& operator=(const AdjacencyListGraph&) = default;
+
+ private:
+  HGraph* graph_;
+  SafeMap<const std::string_view, HBasicBlock*> name_to_block_;
+  SafeMap<const HBasicBlock*, const std::string_view> block_to_name_;
+};
+
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_OPTIMIZING_UNIT_TEST_H_
diff --git a/test/530-checker-lse/src/Main.java b/test/530-checker-lse/src/Main.java
index 7bc6e4b..0bc2e56 100644
--- a/test/530-checker-lse/src/Main.java
+++ b/test/530-checker-lse/src/Main.java
@@ -3627,8 +3627,48 @@
     return obj.next.i;
   }
 
+
+  /// CHECK-START: long Main.testOverlapLoop(int) load_store_elimination (before)
+  /// CHECK-DAG:                 NewArray
+  /// CHECK-DAG:                 ArraySet
+  /// CHECK-DAG:                 If
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArraySet
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 Goto
+
+  /// CHECK-START: long Main.testOverlapLoop(int) load_store_elimination (after)
+  /// CHECK-DAG:                 NewArray
+  /// CHECK-DAG:                 ArraySet
+  /// CHECK-DAG:                 If
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArrayGet
+  /// CHECK-DAG:                 ArraySet
+  /// CHECK-DAG:                 Goto
+  /// CHECK-NOT:                 ArrayGet
+
+  // Test that we don't incorrectly remove writes needed by later loop iterations
+  // NB This is fibonacci numbers
+  private static long testOverlapLoop(int cnt) {
+    long[] w = new long[cnt];
+    w[1] = 1;
+    long t = 1;
+    for (int i = 2; i < cnt; ++i) {
+      w[i] = w[i - 1] + w[i - 2];
+      t = w[i];
+    }
+    return t;
+  }
+
   private static void $noinline$clobberObservables() {}
 
+  static void assertLongEquals(long result, long expected) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
   static void assertIntEquals(int result, int expected) {
     if (expected != result) {
       throw new Error("Expected: " + expected + ", found: " + result);
@@ -4012,5 +4052,7 @@
     assertIntEquals(testNestedLoop8(new TestClass(), 1), 0);
     assertIntEquals(testNestedLoop8(new TestClass(), 2), 0);
     assertIntEquals(testNestedLoop8(new TestClass(), 3), 0);
+    assertLongEquals(testOverlapLoop(10), 34l);
+    assertLongEquals(testOverlapLoop(50), 7778742049l);
   }
 }