ARM: Support SIMD reduction for 32-bit backend.

Support SIMD reduction (add, min, max) and SAD (for int->int only)
idioms for arm (32-bit) backend.

Test: test-art-target, test-art-host
Test: 661-checker-simd-reduc, 660-checker-simd-sad-int

Change-Id: Ic6121f5d781a9bcedc33041b6c4ecafad9b0420a
diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc
index f84408d..cc470dd 100644
--- a/compiler/optimizing/code_generator_vector_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc
@@ -28,6 +28,7 @@
 using helpers::InputDRegisterAt;
 using helpers::InputRegisterAt;
 using helpers::OutputDRegister;
+using helpers::OutputRegister;
 using helpers::RegisterFrom;
 
 #define __ GetVIXLAssembler()->
@@ -76,11 +77,30 @@
 }
 
 void LocationsBuilderARMVIXL::VisitVecExtractScalar(HVecExtractScalar* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kInt32:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresRegister());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void InstructionCodeGeneratorARMVIXL::VisitVecExtractScalar(HVecExtractScalar* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  vixl32::DRegister src = DRegisterFrom(locations->InAt(0));
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kInt32:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Vmov(OutputRegister(instruction), DRegisterLane(src, 0));
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 // Helper to set up locations for vector unary operations.
@@ -112,7 +132,28 @@
 }
 
 void InstructionCodeGeneratorARMVIXL::VisitVecReduce(HVecReduce* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  vixl32::DRegister src = DRegisterFrom(locations->InAt(0));
+  vixl32::DRegister dst = DRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kInt32:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      switch (instruction->GetKind()) {
+        case HVecReduce::kSum:
+          __ Vpadd(DataTypeValue::I32, dst, src, src);
+          break;
+        case HVecReduce::kMin:
+          __ Vpmin(DataTypeValue::S32, dst, src, src);
+          break;
+        case HVecReduce::kMax:
+          __ Vpmax(DataTypeValue::S32, dst, src, src);
+          break;
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderARMVIXL::VisitVecCnv(HVecCnv* instruction) {
@@ -635,11 +676,49 @@
 }
 
 void LocationsBuilderARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+
+  DCHECK_EQ(1u, instruction->InputCount());  // only one input currently implemented
+
+  HInstruction* input = instruction->InputAt(0);
+  bool is_zero = IsZeroBitPattern(input);
+
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kInt32:
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 void InstructionCodeGeneratorARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  vixl32::DRegister dst = DRegisterFrom(locations->Out());
+
+  DCHECK_EQ(1u, instruction->InputCount());  // only one input currently implemented
+
+  // Zero out all other elements first.
+  __ Vmov(I32, dst, 0);
+
+  // Shorthand for any type of zero.
+  if (IsZeroBitPattern(instruction->InputAt(0))) {
+    return;
+  }
+
+  // Set required elements.
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kInt32:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Vmov(Untyped32, DRegisterLane(dst, 0), InputRegisterAt(instruction, 0));
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 // Helper to set up locations for vector accumulations.
@@ -676,7 +755,39 @@
 }
 
 void InstructionCodeGeneratorARMVIXL::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  LocationSummary* locations = instruction->GetLocations();
+  vixl32::DRegister acc = DRegisterFrom(locations->InAt(0));
+  vixl32::DRegister left = DRegisterFrom(locations->InAt(1));
+  vixl32::DRegister right = DRegisterFrom(locations->InAt(2));
+
+  DCHECK(locations->InAt(0).Equals(locations->Out()));
+
+  // Handle all feasible acc_T += sad(a_S, b_S) type combinations (T x S).
+  HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+  HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+  DCHECK_EQ(a->GetPackedType(), b->GetPackedType());
+  switch (a->GetPackedType()) {
+    case DataType::Type::kInt32:
+      DCHECK_EQ(2u, a->GetVectorLength());
+      switch (instruction->GetPackedType()) {
+        case DataType::Type::kInt32: {
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          UseScratchRegisterScope temps(GetVIXLAssembler());
+          vixl32::DRegister tmp = temps.AcquireD();
+          __ Vsub(DataTypeValue::I32, tmp, left, right);
+          __ Vabs(DataTypeValue::S32, tmp, tmp);
+          __ Vadd(DataTypeValue::I32, acc, acc, tmp);
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 // Return whether the vector memory access operation is guaranteed to be word-aligned (ARM word
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index d87861b..645915e 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -1359,7 +1359,7 @@
           *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
           return TrySetVectorLength(4);
         case DataType::Type::kInt32:
-          *restrictions |= kNoDiv | kNoReduction;
+          *restrictions |= kNoDiv | kNoWideSAD;
           return TrySetVectorLength(2);
         default:
           break;
@@ -1968,7 +1968,9 @@
     return false;
   }
   // Try same/narrower type and deal with vector restrictions.
-  if (!TrySetVectorType(sub_type, &restrictions) || HasVectorRestrictions(restrictions, kNoSAD)) {
+  if (!TrySetVectorType(sub_type, &restrictions) ||
+      HasVectorRestrictions(restrictions, kNoSAD) ||
+      (reduction_type != sub_type && HasVectorRestrictions(restrictions, kNoWideSAD))) {
     return false;
   }
   // Accept SAD idiom for vectorizable operands. Vectorized code uses the shorthand
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index b1b3d11..768fe55 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -78,6 +78,7 @@
     kNoStringCharAt  = 1 << 9,   // no StringCharAt
     kNoReduction     = 1 << 10,  // no reduction
     kNoSAD           = 1 << 11,  // no sum of absolute differences (SAD)
+    kNoWideSAD       = 1 << 12,  // no sum of absolute differences (SAD) with operand widening
   };
 
   /*
diff --git a/test/660-checker-simd-sad-int/src/Main.java b/test/660-checker-simd-sad-int/src/Main.java
index 0daeedd..338e841 100644
--- a/test/660-checker-simd-sad-int/src/Main.java
+++ b/test/660-checker-simd-sad-int/src/Main.java
@@ -31,6 +31,17 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
+  /// CHECK-START-ARM: int Main.sadInt2Int(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]       loop:<<Loop>>      outer_loop:none
+  //
   /// CHECK-START-ARM64: int Main.sadInt2Int(int[], int[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                  loop:none
@@ -65,6 +76,9 @@
   //
   // No ABS? No SAD!
   //
+  /// CHECK-START-ARM: int Main.sadInt2IntAlt(int[], int[]) loop_optimization (after)
+  /// CHECK-NOT: VecSADAccumulate
+  //
   /// CHECK-START-ARM64: int Main.sadInt2IntAlt(int[], int[]) loop_optimization (after)
   /// CHECK-NOT: VecSADAccumulate
   private static int sadInt2IntAlt(int[] x, int[] y) {
@@ -90,6 +104,17 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
+  /// CHECK-START-ARM: int Main.sadInt2IntAlt2(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]       loop:<<Loop>>      outer_loop:none
+  //
   /// CHECK-START-ARM64: int Main.sadInt2IntAlt2(int[], int[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                  loop:none
diff --git a/test/661-checker-simd-reduc/src/Main.java b/test/661-checker-simd-reduc/src/Main.java
index bcfa968..b7d8250 100644
--- a/test/661-checker-simd-reduc/src/Main.java
+++ b/test/661-checker-simd-reduc/src/Main.java
@@ -61,6 +61,18 @@
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Return [<<Phi2>>]             loop:none
   //
+  /// CHECK-START-ARM: int Main.reductionInt(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]     loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 VecAdd [<<Phi2>>,<<Load>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
+  //
   /// CHECK-START-ARM64: int Main.reductionInt(int[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
   /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
@@ -97,6 +109,30 @@
   //
   /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   //
+  /// CHECK-START-ARM: int Main.reductionIntChain() loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-DAG: <<Set1:d\d+>>   VecSetScalars [<<Cons1>>]     loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set1>>,{{d\d+}}]       loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:                 VecAdd [<<Phi2>>,<<Load1>>]   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]      loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Red1:d\d+>>   VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr1:i\d+>>  VecExtractScalar [<<Red1>>]   loop:none
+  /// CHECK-DAG: <<Set2:d\d+>>   VecSetScalars [<<Extr1>>]     loop:none
+  /// CHECK-DAG: <<Phi3:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi4:d\d+>>   Phi [<<Set2>>,{{d\d+}}]       loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi3>>]   loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG:                 VecAdd [<<Phi4>>,<<Load2>>]   loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi3>>,<<Cons2>>]      loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: <<Red2:d\d+>>   VecReduce [<<Phi4>>]          loop:none
+  /// CHECK-DAG: <<Extr2:i\d+>>  VecExtractScalar [<<Red2>>]   loop:none
+  /// CHECK-DAG:                 Return [<<Extr2>>]            loop:none
+  //
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
+  //
   /// CHECK-START-ARM64: int Main.reductionIntChain() loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
   /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
@@ -147,6 +183,23 @@
   //
   /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   //
+  /// CHECK-START-ARM: int Main.reductionIntToLoop(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]     loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:                 VecAdd [<<Phi2>>,<<Load1>>]   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]      loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
+  /// CHECK-DAG: <<Phi3:i\d+>>   Phi [<<Extr>>,{{i\d+}}]       loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi4:i\d+>>   Phi [<<Extr>>,{{i\d+}}]       loop:<<Loop2>>      outer_loop:none
+  //
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
+  //
   /// CHECK-START-ARM64: int Main.reductionIntToLoop(int[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
   /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
@@ -241,6 +294,19 @@
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Return [<<Phi2>>]             loop:none
   //
+  /// CHECK-START-ARM: int Main.reductionIntM1(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<ConsM1:i\d+>> IntConstant -1                loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsM1>>]    loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 VecAdd [<<Phi2>>,<<Load>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
+  //
   /// CHECK-START-ARM64: int Main.reductionIntM1(int[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
   /// CHECK-DAG: <<ConsM1:i\d+>> IntConstant -1                loop:none
@@ -326,6 +392,18 @@
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Return [<<Phi2>>]             loop:none
   //
+  /// CHECK-START-ARM: int Main.reductionMinusInt(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]     loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 VecSub [<<Phi2>>,<<Load>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
+  //
   /// CHECK-START-ARM64: int Main.reductionMinusInt(int[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
   /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
@@ -411,6 +489,19 @@
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Return [<<Phi2>>]             loop:none
   //
+  /// CHECK-START-ARM: int Main.reductionMinInt(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<ConsM:i\d+>>  IntConstant 2147483647        loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsM>>]     loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 VecMin [<<Phi2>>,<<Load>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
+  //
   /// CHECK-START-ARM64: int Main.reductionMinInt(int[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
   /// CHECK-DAG: <<ConsM:i\d+>>  IntConstant 2147483647        loop:none
@@ -474,6 +565,19 @@
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Return [<<Phi2>>]             loop:none
   //
+  /// CHECK-START-ARM: int Main.reductionMaxInt(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<ConsM:i\d+>>  IntConstant -2147483648       loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsM>>]     loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 VecMax [<<Phi2>>,<<Load>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
+  //
   /// CHECK-START-ARM64: int Main.reductionMaxInt(int[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
   /// CHECK-DAG: <<ConsM:i\d+>>  IntConstant -2147483648       loop:none