Support VecLoad and VecStore in LSE.

Test: test-art-host
Test: test-art-target
Test: load_store_elimination_test

Change-Id: I95ac13207a0f57225bf0bc62b1e57a629ae24e02
diff --git a/compiler/Android.bp b/compiler/Android.bp
index 8599471..1a2d9aa 100644
--- a/compiler/Android.bp
+++ b/compiler/Android.bp
@@ -355,6 +355,7 @@
         "jni/jni_cfi_test.cc",
         "optimizing/codegen_test.cc",
         "optimizing/load_store_analysis_test.cc",
+        "optimizing/load_store_elimination_test.cc",
         "optimizing/optimizing_cfi_test.cc",
         "optimizing/scheduler_test.cc",
     ],
diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc
index 8678fab..66806d8 100644
--- a/compiler/optimizing/load_store_elimination.cc
+++ b/compiler/optimizing/load_store_elimination.cc
@@ -83,7 +83,8 @@
       DCHECK(load != nullptr);
       DCHECK(load->IsInstanceFieldGet() ||
              load->IsStaticFieldGet() ||
-             load->IsArrayGet());
+             load->IsArrayGet() ||
+             load->IsVecLoad());
       HInstruction* substitute = substitute_instructions_for_loads_[i];
       DCHECK(substitute != nullptr);
       // Keep tracing substitute till one that's not removed.
@@ -98,7 +99,10 @@
 
     // At this point, stores in possibly_removed_stores_ can be safely removed.
     for (HInstruction* store : possibly_removed_stores_) {
-      DCHECK(store->IsInstanceFieldSet() || store->IsStaticFieldSet() || store->IsArraySet());
+      DCHECK(store->IsInstanceFieldSet() ||
+             store->IsStaticFieldSet() ||
+             store->IsArraySet() ||
+             store->IsVecStore());
       store->GetBlock()->RemoveInstruction(store);
     }
 
@@ -137,7 +141,9 @@
   void KeepIfIsStore(HInstruction* heap_value) {
     if (heap_value == kDefaultHeapValue ||
         heap_value == kUnknownHeapValue ||
-        !(heap_value->IsInstanceFieldSet() || heap_value->IsArraySet())) {
+        !(heap_value->IsInstanceFieldSet() ||
+          heap_value->IsArraySet() ||
+          heap_value->IsVecStore())) {
       return;
     }
     auto idx = std::find(possibly_removed_stores_.begin(),
@@ -320,7 +326,9 @@
       return;
     }
     if (heap_value != kUnknownHeapValue) {
-      if (heap_value->IsInstanceFieldSet() || heap_value->IsArraySet()) {
+      if (heap_value->IsInstanceFieldSet() ||
+          heap_value->IsArraySet() ||
+          heap_value->IsVecStore()) {
         HInstruction* store = heap_value;
         // This load must be from a singleton since it's from the same
         // field/element that a "removed" store puts the value. That store
@@ -416,7 +424,9 @@
 
     if (!same_value) {
       if (possibly_redundant) {
-        DCHECK(instruction->IsInstanceFieldSet() || instruction->IsArraySet());
+        DCHECK(instruction->IsInstanceFieldSet() ||
+               instruction->IsArraySet() ||
+               instruction->IsVecStore());
         // Put the store as the heap value. If the value is loaded from heap
         // by a load later, this store isn't really redundant.
         heap_values[idx] = instruction;
@@ -429,8 +439,24 @@
       if (i == idx) {
         continue;
       }
-      if (heap_values[i] == value) {
-        // Same value should be kept even if aliasing happens.
+      if (heap_values[i] == value && !instruction->IsVecOperation()) {
+        // For field/array, same value should be kept even if aliasing happens.
+        //
+        // For vector values , this is NOT safe. For example:
+        //   packed_data = [0xA, 0xB, 0xC, 0xD];            <-- Different values in each lane.
+        //   VecStore array[i  ,i+1,i+2,i+3] = packed_data;
+        //   VecStore array[i+1,i+2,i+3,i+4] = packed_data; <-- We are here (partial overlap).
+        //   VecLoad  vx = array[i,i+1,i+2,i+3];            <-- Cannot be eliminated.
+        //
+        // TODO: to allow such 'same value' optimization on vector data,
+        // LSA needs to report more fine-grain MAY alias information:
+        // (1) May alias due to two vector data partial overlap.
+        //     e.g. a[i..i+3] and a[i+1,..,i+4].
+        // (2) May alias due to two vector data may complete overlap each other.
+        //     e.g. a[i..i+3] and b[i..i+3].
+        // (3) May alias but the exact relationship between two locations is unknown.
+        //     e.g. a[i..i+3] and b[j..j+3], where values of a,b,i,j are all unknown.
+        // This 'same value' optimization can apply only on case (2).
         continue;
       }
       if (heap_values[i] == kUnknownHeapValue) {
@@ -520,6 +546,32 @@
                      value);
   }
 
+  void VisitVecLoad(HVecLoad* instruction) OVERRIDE {
+    HInstruction* array = instruction->InputAt(0);
+    HInstruction* index = instruction->InputAt(1);
+    size_t vector_length = instruction->GetVectorLength();
+    VisitGetLocation(instruction,
+                     array,
+                     HeapLocation::kInvalidFieldOffset,
+                     index,
+                     vector_length,
+                     HeapLocation::kDeclaringClassDefIndexForArrays);
+  }
+
+  void VisitVecStore(HVecStore* instruction) OVERRIDE {
+    HInstruction* array = instruction->InputAt(0);
+    HInstruction* index = instruction->InputAt(1);
+    HInstruction* value = instruction->InputAt(2);
+    size_t vector_length = instruction->GetVectorLength();
+    VisitSetLocation(instruction,
+                     array,
+                     HeapLocation::kInvalidFieldOffset,
+                     index,
+                     vector_length,
+                     HeapLocation::kDeclaringClassDefIndexForArrays,
+                     value);
+  }
+
   void VisitDeoptimize(HDeoptimize* instruction) {
     const ScopedArenaVector<HInstruction*>& heap_values =
         heap_values_for_[instruction->GetBlock()->GetBlockId()];
@@ -529,7 +581,9 @@
         continue;
       }
       // A store is kept as the heap value for possibly removed stores.
-      if (heap_value->IsInstanceFieldSet() || heap_value->IsArraySet()) {
+      if (heap_value->IsInstanceFieldSet() ||
+          heap_value->IsArraySet() ||
+          heap_value->IsVecStore()) {
         // Check whether the reference for a store is used by an environment local of
         // HDeoptimize.
         HInstruction* reference = heap_value->InputAt(0);
@@ -687,11 +741,6 @@
     return;
   }
 
-  // TODO: analyze VecLoad/VecStore better.
-  if (graph_->HasSIMD()) {
-    return;
-  }
-
   LSEVisitor lse_visitor(graph_, heap_location_collector, side_effects_, stats_);
   for (HBasicBlock* block : graph_->GetReversePostOrder()) {
     lse_visitor.VisitBasicBlock(block);
diff --git a/compiler/optimizing/load_store_elimination_test.cc b/compiler/optimizing/load_store_elimination_test.cc
new file mode 100644
index 0000000..6f42d96
--- /dev/null
+++ b/compiler/optimizing/load_store_elimination_test.cc
@@ -0,0 +1,406 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "side_effects_analysis.h"
+#include "load_store_analysis.h"
+#include "load_store_elimination.h"
+#include "nodes.h"
+#include "optimizing_unit_test.h"
+
+#include "gtest/gtest.h"
+
+namespace art {
+
+class LoadStoreEliminationTest : public OptimizingUnitTest {
+ public:
+  LoadStoreEliminationTest() : pool_() {}
+
+  void PerformLSE() {
+    graph_->BuildDominatorTree();
+    SideEffectsAnalysis side_effects(graph_);
+    side_effects.Run();
+    LoadStoreAnalysis lsa(graph_);
+    lsa.Run();
+    LoadStoreElimination lse(graph_, side_effects, lsa, nullptr);
+    lse.Run();
+  }
+
+  void CreateTestControlFlowGraph() {
+    graph_ = CreateGraph();
+
+    entry_ = new (GetAllocator()) HBasicBlock(graph_);
+    pre_header_ = new (GetAllocator()) HBasicBlock(graph_);
+    loop_header_ = new (GetAllocator()) HBasicBlock(graph_);
+    loop_body_ = new (GetAllocator()) HBasicBlock(graph_);
+    exit_ = new (GetAllocator()) HBasicBlock(graph_);
+
+    graph_->AddBlock(entry_);
+    graph_->AddBlock(pre_header_);
+    graph_->AddBlock(loop_header_);
+    graph_->AddBlock(loop_body_);
+    graph_->AddBlock(exit_);
+
+    graph_->SetEntryBlock(entry_);
+
+    // This common CFG has been used by all cases in this load_store_elimination_test.
+    //   entry
+    //     |
+    // pre_header
+    //     |
+    // loop_header <--+
+    //     |          |
+    // loop_body -----+
+    //     |
+    //    exit
+
+    entry_->AddSuccessor(pre_header_);
+    pre_header_->AddSuccessor(loop_header_);
+    loop_header_->AddSuccessor(exit_);       // true successor
+    loop_header_->AddSuccessor(loop_body_);  // false successor
+    loop_body_->AddSuccessor(loop_header_);
+
+    HInstruction* c0 = graph_->GetIntConstant(0);
+    HInstruction* c1 = graph_->GetIntConstant(1);
+    HInstruction* c4 = graph_->GetIntConstant(4);
+    HInstruction* c128 = graph_->GetIntConstant(128);
+
+    // entry block has following instructions:
+    // array, i, j, i+1, i+4.
+    array_ = new (GetAllocator()) HParameterValue(graph_->GetDexFile(),
+                                               dex::TypeIndex(0),
+                                               0,
+                                               DataType::Type::kReference);
+    i_ = new (GetAllocator()) HParameterValue(graph_->GetDexFile(),
+                                           dex::TypeIndex(1),
+                                           1,
+                                           DataType::Type::kInt32);
+    j_ = new (GetAllocator()) HParameterValue(graph_->GetDexFile(),
+                                           dex::TypeIndex(1),
+                                           2,
+                                           DataType::Type::kInt32);
+    i_add1_ = new (GetAllocator()) HAdd(DataType::Type::kInt32, i_, c1);
+    i_add4_ = new (GetAllocator()) HAdd(DataType::Type::kInt32, i_, c4);
+    entry_->AddInstruction(array_);
+    entry_->AddInstruction(i_);
+    entry_->AddInstruction(j_);
+    entry_->AddInstruction(i_add1_);
+    entry_->AddInstruction(i_add4_);
+    entry_->AddInstruction(new (GetAllocator()) HGoto());
+
+    // pre_header block
+    pre_header_->AddInstruction(new (GetAllocator()) HGoto());
+
+    // loop header block has following instructions:
+    // phi = 0;
+    // if (phi >= 128);
+    phi_ = new (GetAllocator()) HPhi(GetAllocator(), 0, 0, DataType::Type::kInt32);
+    cmp_ = new (GetAllocator()) HGreaterThanOrEqual(phi_, c128);
+    if_ = new (GetAllocator()) HIf(cmp_);
+    loop_header_->AddPhi(phi_);
+    loop_header_->AddInstruction(cmp_);
+    loop_header_->AddInstruction(if_);
+    phi_->AddInput(c0);
+
+    // loop body block has following instructions:
+    // phi++;
+    HInstruction* inc_phi = new (GetAllocator()) HAdd(DataType::Type::kInt32, phi_, c1);
+    loop_body_->AddInstruction(inc_phi);
+    loop_body_->AddInstruction(new (GetAllocator()) HGoto());
+    phi_->AddInput(inc_phi);
+
+    // exit block
+    exit_->AddInstruction(new (GetAllocator()) HExit());
+  }
+
+  // To avoid tedious HIR assembly in test functions.
+  HInstruction* AddVecLoad(HBasicBlock* block, HInstruction* array, HInstruction* index) {
+    DCHECK(block != nullptr);
+    DCHECK(array != nullptr);
+    DCHECK(index != nullptr);
+    HInstruction* vload = new (GetAllocator()) HVecLoad(
+        GetAllocator(),
+        array,
+        index,
+        DataType::Type::kInt32,
+        SideEffects::ArrayReadOfType(DataType::Type::kInt32),
+        4,
+        /*is_string_char_at*/ false,
+        kNoDexPc);
+    block->InsertInstructionBefore(vload, block->GetLastInstruction());
+    return vload;
+  }
+
+  HInstruction* AddVecStore(HBasicBlock* block,
+                            HInstruction* array,
+                            HInstruction* index,
+                            HVecOperation* vdata = nullptr) {
+    DCHECK(block != nullptr);
+    DCHECK(array != nullptr);
+    DCHECK(index != nullptr);
+    if (vdata == nullptr) {
+      HInstruction* c1 = graph_->GetIntConstant(1);
+      vdata = new (GetAllocator()) HVecReplicateScalar(GetAllocator(),
+                                                       c1,
+                                                       DataType::Type::kInt32,
+                                                       4,
+                                                       kNoDexPc);
+      block->InsertInstructionBefore(vdata, block->GetLastInstruction());
+    }
+    HInstruction* vstore = new (GetAllocator()) HVecStore(
+        GetAllocator(),
+        array,
+        index,
+        vdata,
+        DataType::Type::kInt32,
+        SideEffects::ArrayWriteOfType(DataType::Type::kInt32),
+        4,
+        kNoDexPc);
+    block->InsertInstructionBefore(vstore, block->GetLastInstruction());
+    return vstore;
+  }
+
+  HInstruction* AddArrayGet(HBasicBlock* block, HInstruction* array, HInstruction* index) {
+    DCHECK(block != nullptr);
+    DCHECK(array != nullptr);
+    DCHECK(index != nullptr);
+    HInstruction* get = new (GetAllocator()) HArrayGet(array, index, DataType::Type::kInt32, 0);
+    block->InsertInstructionBefore(get, block->GetLastInstruction());
+    return get;
+  }
+
+  HInstruction* AddArraySet(HBasicBlock* block,
+                            HInstruction* array,
+                            HInstruction* index,
+                            HInstruction* data = nullptr) {
+    DCHECK(block != nullptr);
+    DCHECK(array != nullptr);
+    DCHECK(index != nullptr);
+    if (data == nullptr) {
+      data = graph_->GetIntConstant(1);
+    }
+    HInstruction* store = new (GetAllocator()) HArraySet(array,
+                                                         index,
+                                                         data,
+                                                         DataType::Type::kInt32,
+                                                         0);
+    block->InsertInstructionBefore(store, block->GetLastInstruction());
+    return store;
+  }
+
+  ArenaPool pool_;
+
+  HGraph* graph_;
+  HBasicBlock* entry_;
+  HBasicBlock* pre_header_;
+  HBasicBlock* loop_header_;
+  HBasicBlock* loop_body_;
+  HBasicBlock* exit_;
+
+  HInstruction* array_;
+  HInstruction* i_;
+  HInstruction* j_;
+  HInstruction* i_add1_;
+  HInstruction* i_add4_;
+
+  HPhi* phi_;
+  HInstruction* cmp_;
+  HInstruction* if_;
+};
+
+TEST_F(LoadStoreEliminationTest, ArrayGetSetElimination) {
+  CreateTestControlFlowGraph();
+
+  HInstruction* c1 = graph_->GetIntConstant(1);
+  HInstruction* c2 = graph_->GetIntConstant(2);
+  HInstruction* c3 = graph_->GetIntConstant(3);
+
+  // array[1] = 1;
+  // x = array[1];  <--- Remove.
+  // y = array[2];
+  // array[1] = 1;  <--- Remove, since it stores same value.
+  // array[i] = 3;  <--- MAY alias.
+  // array[1] = 1;  <--- Cannot remove, even if it stores the same value.
+  AddArraySet(entry_, array_, c1, c1);
+  HInstruction* load1 = AddArrayGet(entry_, array_, c1);
+  HInstruction* load2 = AddArrayGet(entry_, array_, c2);
+  HInstruction* store1 = AddArraySet(entry_, array_, c1, c1);
+  AddArraySet(entry_, array_, i_, c3);
+  HInstruction* store2 = AddArraySet(entry_, array_, c1, c1);
+
+  PerformLSE();
+
+  ASSERT_TRUE(IsRemoved(load1));
+  ASSERT_FALSE(IsRemoved(load2));
+  ASSERT_TRUE(IsRemoved(store1));
+  ASSERT_FALSE(IsRemoved(store2));
+}
+
+TEST_F(LoadStoreEliminationTest, SameHeapValue) {
+  CreateTestControlFlowGraph();
+
+  HInstruction* c1 = graph_->GetIntConstant(1);
+  HInstruction* c2 = graph_->GetIntConstant(2);
+
+  // Test LSE handling same value stores on array.
+  // array[1] = 1;
+  // array[2] = 1;
+  // array[1] = 1;  <--- Can remove.
+  // array[1] = 2;  <--- Can NOT remove.
+  AddArraySet(entry_, array_, c1, c1);
+  AddArraySet(entry_, array_, c2, c1);
+  HInstruction* store1 = AddArraySet(entry_, array_, c1, c1);
+  HInstruction* store2 = AddArraySet(entry_, array_, c1, c2);
+
+  // Test LSE handling same value stores on vector.
+  // vdata = [0x1, 0x2, 0x3, 0x4, ...]
+  // VecStore array[i...] = vdata;
+  // VecStore array[j...] = vdata;  <--- MAY ALIAS.
+  // VecStore array[i...] = vdata;  <--- Cannot Remove, even if it's same value.
+  AddVecStore(entry_, array_, i_);
+  AddVecStore(entry_, array_, j_);
+  HInstruction* vstore1 = AddVecStore(entry_, array_, i_);
+
+  // VecStore array[i...] = vdata;
+  // VecStore array[i+1...] = vdata;  <--- MAY alias due to partial overlap.
+  // VecStore array[i...] = vdata;    <--- Cannot remove, even if it's same value.
+  AddVecStore(entry_, array_, i_);
+  AddVecStore(entry_, array_, i_add1_);
+  HInstruction* vstore2 = AddVecStore(entry_, array_, i_);
+
+  PerformLSE();
+
+  ASSERT_TRUE(IsRemoved(store1));
+  ASSERT_FALSE(IsRemoved(store2));
+  ASSERT_FALSE(IsRemoved(vstore1));
+  ASSERT_FALSE(IsRemoved(vstore2));
+}
+
+TEST_F(LoadStoreEliminationTest, OverlappingLoadStore) {
+  CreateTestControlFlowGraph();
+
+  HInstruction* c1 = graph_->GetIntConstant(1);
+
+  // Test LSE handling array LSE when there is vector store in between.
+  // a[i] = 1;
+  // .. = a[i];                <-- Remove.
+  // a[i,i+1,i+2,i+3] = data;  <-- PARTIAL OVERLAP !
+  // .. = a[i];                <-- Cannot remove.
+  AddArraySet(entry_, array_, i_, c1);
+  HInstruction* load1 = AddArrayGet(entry_, array_, i_);
+  AddVecStore(entry_, array_, i_);
+  HInstruction* load2 = AddArrayGet(entry_, array_, i_);
+
+  // Test LSE handling vector load/store partial overlap.
+  // a[i,i+1,i+2,i+3] = data;
+  // a[i+4,i+5,i+6,i+7] = data;
+  // .. = a[i,i+1,i+2,i+3];
+  // .. = a[i+4,i+5,i+6,i+7];
+  // a[i+1,i+2,i+3,i+4] = data;  <-- PARTIAL OVERLAP !
+  // .. = a[i,i+1,i+2,i+3];
+  // .. = a[i+4,i+5,i+6,i+7];
+  AddVecStore(entry_, array_, i_);
+  AddVecStore(entry_, array_, i_add4_);
+  HInstruction* vload1 = AddVecLoad(entry_, array_, i_);
+  HInstruction* vload2 = AddVecLoad(entry_, array_, i_add4_);
+  AddVecStore(entry_, array_, i_add1_);
+  HInstruction* vload3 = AddVecLoad(entry_, array_, i_);
+  HInstruction* vload4 = AddVecLoad(entry_, array_, i_add4_);
+
+  // Test LSE handling vector LSE when there is array store in between.
+  // a[i,i+1,i+2,i+3] = data;
+  // a[i+1] = 1;                 <-- PARTIAL OVERLAP !
+  // .. = a[i,i+1,i+2,i+3];
+  AddVecStore(entry_, array_, i_);
+  AddArraySet(entry_, array_, i_, c1);
+  HInstruction* vload5 = AddVecLoad(entry_, array_, i_);
+
+  PerformLSE();
+
+  ASSERT_TRUE(IsRemoved(load1));
+  ASSERT_FALSE(IsRemoved(load2));
+
+  ASSERT_TRUE(IsRemoved(vload1));
+  ASSERT_TRUE(IsRemoved(vload2));
+  ASSERT_FALSE(IsRemoved(vload3));
+  ASSERT_FALSE(IsRemoved(vload4));
+
+  ASSERT_FALSE(IsRemoved(vload5));
+}
+
+// function (int[] a, int j) {
+// a[j] = 1;
+// for (int i=0; i<128; i++) {
+//    /* doesn't do any write */
+// }
+// a[j] = 1;
+TEST_F(LoadStoreEliminationTest, Loop1) {
+  CreateTestControlFlowGraph();
+
+  HInstruction* c1 = graph_->GetIntConstant(1);
+
+  // a[j] = 1
+  AddArraySet(pre_header_, array_, j_, c1);
+
+  // LOOP BODY:
+  // .. = a[i,i+1,i+2,i+3];
+  AddVecLoad(loop_body_, array_, phi_);
+
+  // a[j] = 1;
+  HInstruction* array_set = AddArraySet(exit_, array_, j_, c1);
+
+  PerformLSE();
+
+  ASSERT_TRUE(IsRemoved(array_set));
+}
+
+// function (int[] a, int index) {
+//   a[index] = 1;
+//   int[] b = new int[128];
+//   for (int i=0; i<128; i++) {
+//     a[i,i+1,i+2,i+3] = vdata;
+//     b[i,i+1,i+2,i+3] = a[i,i+1,i+2,i+3];
+//   }
+//   a[index] = 1;
+// }
+TEST_F(LoadStoreEliminationTest, Loop2) {
+  CreateTestControlFlowGraph();
+
+  HInstruction* c0 = graph_->GetIntConstant(0);
+  HInstruction* c1 = graph_->GetIntConstant(1);
+  HInstruction* c128 = graph_->GetIntConstant(128);
+
+  HInstruction* array_b = new (GetAllocator()) HNewArray(c0, c128, 0);
+  entry_->AddInstruction(array_b);
+
+  // a[index] = 1;
+  AddArraySet(pre_header_, array_, i_, c1);
+
+  // a[i,i+1,i+2,i+3] = vdata;
+  // b[i,i+1,i+2,i+3] = a[i,i+1,i+2,i+3];
+  AddVecStore(loop_body_, array_, phi_);
+  HInstruction* vload = AddVecLoad(loop_body_, array_, phi_);
+  AddVecStore(loop_body_, array_b, phi_, vload->AsVecLoad());
+
+  // a[index] = 1;
+  HInstruction* a_set = AddArraySet(exit_, array_, i_, c1);
+
+  PerformLSE();
+
+  ASSERT_TRUE(IsRemoved(vload));
+  ASSERT_FALSE(IsRemoved(a_set));   // Cannot remove due to side effect in loop.
+}
+
+}  // namespace art