ARM64: Share address computation across SIMD LDRs/STRs.

For array accesses the element address has the following structure:
Address = CONST_OFFSET + base_addr + (index << ELEM_SHIFT)

Given the ARM64 LDR/STR addressing modes, the address part
(CONST_OFFSET + (index << ELEM_SHIFT)) can be shared across array
accesses with the same data type and index.

For example, in the following loop all 5 accesses can share the
address computation:

void foo(int[] a, int[] b, int[] c) {
  for (i...) {
    a[i] = a[i] + 5;
    b[i] = b[i] + c[i];
  }
}
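
With this change the shared part (data_offset + (i << 2) for int[]) is
materialized by a single ADD and consumed through the register-offset
addressing mode of the vector loads/stores. An illustrative sketch of the
per-iteration code (register numbers are arbitrary, not the exact compiler
output):

  add w4, w16, w3, lsl #2   // w16 = data offset, w3 = i; shared by all 5 accesses
  ldr q0, [x0, x4]          // load  a[i..]
  ldr q1, [x1, x4]          // load  b[i..]
  ldr q2, [x2, x4]          // load  c[i..]
  str q3, [x0, x4]          // store a[i..] = a[i..] + 5
  str q4, [x1, x4]          // store b[i..] = b[i..] + c[i..]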

Test: test-art-host, test-art-target

Change-Id: I46af3b4e4a55004336672cdba3296b7622d815ca
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index d7cc577..7601125 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -6260,6 +6260,15 @@
   }
 }
 
+void LocationsBuilderARM::VisitIntermediateAddressIndex(HIntermediateAddressIndex* instruction) {
+  LOG(FATAL) << "Unreachable " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorARM::VisitIntermediateAddressIndex(
+    HIntermediateAddressIndex* instruction) {
+  LOG(FATAL) << "Unreachable " << instruction->GetId();
+}
+
 void LocationsBuilderARM::VisitBoundsCheck(HBoundsCheck* instruction) {
   RegisterSet caller_saves = RegisterSet::Empty();
   InvokeRuntimeCallingConvention calling_convention;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index eee832a..9f2272b 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -2644,6 +2644,38 @@
          Operand(InputOperandAt(instruction, 1)));
 }
 
+void LocationsBuilderARM64::VisitIntermediateAddressIndex(HIntermediateAddressIndex* instruction) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+
+  HIntConstant* shift = instruction->GetShift()->AsIntConstant();
+
+  locations->SetInAt(0, Location::RequiresRegister());
+  // For the byte case we don't need to shift the index, so we can encode the data offset into
+  // the ADD instruction. For other cases we prefer the data_offset to be in a register; that
+  // hoists the data offset constant generation out of the loop and reduces the critical path
+  // length in the loop.
+  locations->SetInAt(1, shift->GetValue() == 0
+                        ? Location::ConstantLocation(instruction->GetOffset()->AsIntConstant())
+                        : Location::RequiresRegister());
+  locations->SetInAt(2, Location::ConstantLocation(shift));
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+}
+
+void InstructionCodeGeneratorARM64::VisitIntermediateAddressIndex(
+    HIntermediateAddressIndex* instruction) {
+  Register index_reg = InputRegisterAt(instruction, 0);
+  uint32_t shift = Int64ConstantFrom(instruction->GetLocations()->InAt(2));
+  uint32_t offset = instruction->GetOffset()->AsIntConstant()->GetValue();
+
+  if (shift == 0) {
+    __ Add(OutputRegister(instruction), index_reg, offset);
+  } else {
+    Register offset_reg = InputRegisterAt(instruction, 1);
+    __ Add(OutputRegister(instruction), offset_reg, Operand(index_reg, LSL, shift));
+  }
+}
+
 void LocationsBuilderARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instr, LocationSummary::kNoCall);
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index b6678b0..23a3477 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -6299,6 +6299,16 @@
   }
 }
 
+void LocationsBuilderARMVIXL::VisitIntermediateAddressIndex(
+    HIntermediateAddressIndex* instruction) {
+  LOG(FATAL) << "Unreachable " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitIntermediateAddressIndex(
+    HIntermediateAddressIndex* instruction) {
+  LOG(FATAL) << "Unreachable " << instruction->GetId();
+}
+
 void LocationsBuilderARMVIXL::VisitBoundsCheck(HBoundsCheck* instruction) {
   RegisterSet caller_saves = RegisterSet::Empty();
   InvokeRuntimeCallingConventionARMVIXL calling_convention;
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 57f7e6b..478bd24 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -783,6 +783,12 @@
     /*out*/ Register* scratch) {
   LocationSummary* locations = instruction->GetLocations();
   Register base = InputRegisterAt(instruction, 0);
+
+  if (instruction->InputAt(1)->IsIntermediateAddressIndex()) {
+    DCHECK(!is_string_char_at);
+    return MemOperand(base.X(), InputRegisterAt(instruction, 1).X());
+  }
+
   Location index = locations->InAt(1);
   uint32_t offset = is_string_char_at
       ? mirror::String::ValueOffset().Uint32Value()
diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc
index f16e372..311be1f 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.cc
+++ b/compiler/optimizing/instruction_simplifier_arm64.cc
@@ -216,5 +216,18 @@
   }
 }
 
+void InstructionSimplifierArm64Visitor::VisitVecLoad(HVecLoad* instruction) {
+  if (!instruction->IsStringCharAt()
+      && TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) {
+    RecordSimplification();
+  }
+}
+
+void InstructionSimplifierArm64Visitor::VisitVecStore(HVecStore* instruction) {
+  if (TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) {
+    RecordSimplification();
+  }
+}
+
 }  // namespace arm64
 }  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h
index eec4e49..8596f6a 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.h
+++ b/compiler/optimizing/instruction_simplifier_arm64.h
@@ -75,6 +75,8 @@
   void VisitUShr(HUShr* instruction) OVERRIDE;
   void VisitXor(HXor* instruction) OVERRIDE;
   void VisitVecMul(HVecMul* instruction) OVERRIDE;
+  void VisitVecLoad(HVecLoad* instruction) OVERRIDE;
+  void VisitVecStore(HVecStore* instruction) OVERRIDE;
 
   OptimizingCompilerStats* stats_;
 };
diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc
index c39e5f4..e5a8499 100644
--- a/compiler/optimizing/instruction_simplifier_shared.cc
+++ b/compiler/optimizing/instruction_simplifier_shared.cc
@@ -16,6 +16,8 @@
 
 #include "instruction_simplifier_shared.h"
 
+#include "mirror/array-inl.h"
+
 namespace art {
 
 namespace {
@@ -346,4 +348,59 @@
   return false;
 }
 
+bool TryExtractVecArrayAccessAddress(HVecMemoryOperation* access, HInstruction* index) {
+  if (index->IsConstant()) {
+    // If the index is a constant, the whole address calculation can often be done by the
+    // LDR/STR instructions themselves. TODO: Handle the case of a non-embeddable constant.
+    return false;
+  }
+
+  HGraph* graph = access->GetBlock()->GetGraph();
+  ArenaAllocator* arena = graph->GetArena();
+  Primitive::Type packed_type = access->GetPackedType();
+  uint32_t data_offset = mirror::Array::DataOffset(
+      Primitive::ComponentSize(packed_type)).Uint32Value();
+  size_t component_shift = Primitive::ComponentSizeShift(packed_type);
+
+  bool is_extracting_beneficial = false;
+  // Extracting the intermediate address is only beneficial if the index has at least 2 users.
+  for (const HUseListNode<HInstruction*>& use : index->GetUses()) {
+    HInstruction* user = use.GetUser();
+    if (user->IsVecMemoryOperation() && user != access) {
+      HVecMemoryOperation* another_access = user->AsVecMemoryOperation();
+      Primitive::Type another_packed_type = another_access->GetPackedType();
+      uint32_t another_data_offset = mirror::Array::DataOffset(
+          Primitive::ComponentSize(another_packed_type)).Uint32Value();
+      size_t another_component_shift = Primitive::ComponentSizeShift(another_packed_type);
+      if (another_data_offset == data_offset && another_component_shift == component_shift) {
+        is_extracting_beneficial = true;
+        break;
+      }
+    } else if (user->IsIntermediateAddressIndex()) {
+      HIntermediateAddressIndex* another_access = user->AsIntermediateAddressIndex();
+      uint32_t another_data_offset = another_access->GetOffset()->AsIntConstant()->GetValue();
+      size_t another_component_shift = another_access->GetShift()->AsIntConstant()->GetValue();
+      if (another_data_offset == data_offset && another_component_shift == component_shift) {
+        is_extracting_beneficial = true;
+        break;
+      }
+    }
+  }
+
+  if (!is_extracting_beneficial) {
+    return false;
+  }
+
+  // Proceed to extract the index + data_offset address computation.
+  HIntConstant* offset = graph->GetIntConstant(data_offset);
+  HIntConstant* shift = graph->GetIntConstant(component_shift);
+  HIntermediateAddressIndex* address =
+      new (arena) HIntermediateAddressIndex(index, offset, shift, kNoDexPc);
+
+  access->GetBlock()->InsertInstructionBefore(address, access);
+  access->ReplaceInput(address, 1);
+
+  return true;
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h
index 2ea103a..371619f 100644
--- a/compiler/optimizing/instruction_simplifier_shared.h
+++ b/compiler/optimizing/instruction_simplifier_shared.h
@@ -59,6 +59,7 @@
                                   size_t data_offset);
 
 bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa);
+bool TryExtractVecArrayAccessAddress(HVecMemoryOperation* access, HInstruction* index);
 
 }  // namespace art
 
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 36c7df7..00d2988 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1396,7 +1396,8 @@
   M(BitwiseNegatedRight, Instruction)                                   \
   M(DataProcWithShifterOp, Instruction)                                 \
   M(MultiplyAccumulate, Instruction)                                    \
-  M(IntermediateAddress, Instruction)
+  M(IntermediateAddress, Instruction)                                   \
+  M(IntermediateAddressIndex, Instruction)
 #endif
 
 #ifndef ART_ENABLE_CODEGEN_arm
diff --git a/compiler/optimizing/nodes_shared.h b/compiler/optimizing/nodes_shared.h
index c6bfbcc..075a816 100644
--- a/compiler/optimizing/nodes_shared.h
+++ b/compiler/optimizing/nodes_shared.h
@@ -150,6 +150,49 @@
   DISALLOW_COPY_AND_ASSIGN(HIntermediateAddress);
 };
 
+// This instruction computes part of the array access address (the data offset plus the scaled index).
+//
+// For array accesses the element address has the following structure:
+// Address = CONST_OFFSET + base_addr + (index << ELEM_SHIFT). Given the LDR/STR addressing
+// modes, the (CONST_OFFSET + (index << ELEM_SHIFT)) part can be shared across array accesses
+// with the same data type and index. For example, in the following loop all 5 accesses can
+// share the address computation:
+//
+// void foo(int[] a, int[] b, int[] c) {
+//   for (i...) {
+//     a[i] = a[i] + 5;
+//     b[i] = b[i] + c[i];
+//   }
+// }
+//
+// Note: since the instruction does not involve the base array address in its computation, it has
+// no side effects (in contrast to HIntermediateAddress).
+class HIntermediateAddressIndex FINAL : public HExpression<3> {
+ public:
+  HIntermediateAddressIndex(
+      HInstruction* index, HInstruction* offset, HInstruction* shift, uint32_t dex_pc)
+      : HExpression(Primitive::kPrimInt, SideEffects::None(), dex_pc) {
+    SetRawInputAt(0, index);
+    SetRawInputAt(1, offset);
+    SetRawInputAt(2, shift);
+  }
+
+  bool CanBeMoved() const OVERRIDE { return true; }
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+    return true;
+  }
+  bool IsActualObject() const OVERRIDE { return false; }
+
+  HInstruction* GetIndex() const { return InputAt(0); }
+  HInstruction* GetOffset() const { return InputAt(1); }
+  HInstruction* GetShift() const { return InputAt(2); }
+
+  DECLARE_INSTRUCTION(IntermediateAddressIndex);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HIntermediateAddressIndex);
+};
+
 class HDataProcWithShifterOp FINAL : public HExpression<2> {
  public:
   enum OpKind {
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index 52c247b..92fe9bf 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -178,12 +178,17 @@
                       size_t vector_length,
                       uint32_t dex_pc)
       : HVecOperation(arena, packed_type, side_effects, number_of_inputs, vector_length, dex_pc),
-        alignment_(Primitive::ComponentSize(packed_type), 0) { }
+        alignment_(Primitive::ComponentSize(packed_type), 0) {
+    DCHECK_GE(number_of_inputs, 2u);
+  }
 
   void SetAlignment(Alignment alignment) { alignment_ = alignment; }
 
   Alignment GetAlignment() const { return alignment_; }
 
+  HInstruction* GetArray() const { return InputAt(0); }
+  HInstruction* GetIndex() const { return InputAt(1); }
+
   DECLARE_ABSTRACT_INSTRUCTION(VecMemoryOperation);
 
  private:
diff --git a/test/527-checker-array-access-simd/expected.txt b/test/527-checker-array-access-simd/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/527-checker-array-access-simd/expected.txt
diff --git a/test/527-checker-array-access-simd/info.txt b/test/527-checker-array-access-simd/info.txt
new file mode 100644
index 0000000..f147943
--- /dev/null
+++ b/test/527-checker-array-access-simd/info.txt
@@ -0,0 +1 @@
+Test the arm- and arm64-specific array access optimization for SIMD loops.
diff --git a/test/527-checker-array-access-simd/src/Main.java b/test/527-checker-array-access-simd/src/Main.java
new file mode 100644
index 0000000..8af5465
--- /dev/null
+++ b/test/527-checker-array-access-simd/src/Main.java
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  public static void assertIntEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.checkIntCase(int[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Index>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkIntCase(int[]) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Const2:i\d+>>        IntConstant 2
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Address1>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-DAG:             <<Address2:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Address2>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkIntCase(int[]) GVN$after_arch (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Const2:i\d+>>        IntConstant 2
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Address1>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-NOT:                                    IntermediateAddress
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Address1>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkIntCase(int[]) disassembly (after)
+  /// CHECK:                                        IntermediateAddressIndex
+  /// CHECK-NEXT:                                   add w{{[0-9]+}}, w{{[0-9]+}}, w{{[0-9]+}}, lsl #2
+  public static void checkIntCase(int[] a) {
+    for (int i = 0; i < 128; i++) {
+      a[i] += 5;
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Index>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const0:i\d+>>        IntConstant 0
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const0>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Address1>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-DAG:             <<Address2:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const0>>]
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Address2>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) GVN$after_arch (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const0:i\d+>>        IntConstant 0
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const0>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Address1>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-NOT:                                    IntermediateAddress
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Address1>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) disassembly (after)
+  /// CHECK:                                        IntermediateAddressIndex
+  /// CHECK-NEXT:                                   add w{{[0-9]+}}, w{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK:                                        VecLoad
+  /// CHECK-NEXT:                                   ldr q{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}]
+  /// CHECK:                                        VecStore
+  /// CHECK-NEXT:                                   str q{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}]
+  public static void checkByteCase(byte[] a) {
+    for (int i = 0; i < 128; i++) {
+      a[i] += 5;
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.checkSingleAccess(int[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Index>>,<<Repl>>]
+
+  /// CHECK-START-ARM64: void Main.checkSingleAccess(int[]) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const0:i\d+>>        IntConstant 0
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Index>>,<<Repl>>]
+  /// CHECK-NOT:                                    IntermediateAddress
+  public static void checkSingleAccess(int[] a) {
+    for (int i = 0; i < 128; i++) {
+      a[i] = 5;
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:             <<Array1:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<Array2:l\d+>>        ParameterValue
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array1>>,<<Index>>]
+  /// CHECK-DAG:             <<Cnv:d\d+>>           VecCnv [<<Load>>]
+  /// CHECK-DAG:                                    VecStore [<<Array2>>,<<Index>>,<<Cnv>>]
+
+  /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:             <<Array1:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<Array2:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Const2:i\d+>>        IntConstant 2
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array1>>,<<Address1>>]
+  /// CHECK-DAG:             <<Cnv:d\d+>>           VecCnv [<<Load>>]
+  /// CHECK-DAG:             <<Address2:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:                                    VecStore [<<Array2>>,<<Address2>>,<<Cnv>>]
+
+  /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) GVN$after_arch (after)
+  /// CHECK-DAG:             <<Array1:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<Array2:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Const2:i\d+>>        IntConstant 2
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array1>>,<<Address1>>]
+  /// CHECK-DAG:             <<Cnv:d\d+>>           VecCnv [<<Load>>]
+  /// CHECK-NOT:                                    IntermediateAddress
+  /// CHECK-DAG:                                    VecStore [<<Array2>>,<<Address1>>,<<Cnv>>]
+
+  /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) disassembly (after)
+  /// CHECK:                                        IntermediateAddressIndex
+  /// CHECK-NEXT:                                   add w{{[0-9]+}}, w{{[0-9]+}}, w{{[0-9]+}}, lsl #2
+  public static void checkInt2Float(int[] a, float[] b) {
+    for (int i = 0; i < 128; i++) {
+      b[i] = (float) a[i];
+    }
+  }
+
+  public static final int ARRAY_SIZE = 1024;
+
+  public static int calcArraySum(int[] a, byte[] b, float[] c) {
+    int sum = 0;
+    for (int i = 0; i < 128; i++) {
+      sum += a[i] + b[i] + (int) c[i];
+    }
+    return sum;
+  }
+
+  public static void main(String[] args) {
+    byte[] ba = new byte[ARRAY_SIZE];
+    int[] ia = new int[ARRAY_SIZE];
+    float[] fa = new float[ARRAY_SIZE];
+
+    checkSingleAccess(ia);
+    checkIntCase(ia);
+    checkByteCase(ba);
+    checkInt2Float(ia, fa);
+
+    assertIntEquals(3200, calcArraySum(ia, ba, fa));
+  }
+}