Implement halving add idiom (with checker tests).
Rationale:
First of several idioms that map to very efficient SIMD instructions.
Note that the is-zero-ext and is-sign-ext are general-purpose utilities
that will be widely used in the vectorizer to detect low precision
idioms, so expect that code to be shared with many CLs to come.
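For illustration, the kind of scalar loop the new recognizer targets
(a hypothetical C++ transliteration of the Java pattern; the checker
tests exercise the real Java loops, and the "& 0xff" masks are the
zero extensions the new utilities detect):

  // Rounded unsigned halving add over bytes: operands are widened,
  // averaged with rounding, then narrowed back to a byte. Maps to
  // URHADD.16B on ARM64 and PAVGB on x86/x86-64.
  void RoundedHalvingAddU8(const int8_t* a, const int8_t* b,
                           int8_t* out, int n) {
    for (int i = 0; i < n; i++) {
      out[i] = static_cast<int8_t>(((a[i] & 0xff) + (b[i] & 0xff) + 1) >> 1);
    }
  }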
Test: test-art-host, test-art-target
Change-Id: If7dc2926c72a2e4b5cea15c44ef68cf5503e9be9
diff --git a/compiler/optimizing/code_generator_vector_arm.cc b/compiler/optimizing/code_generator_vector_arm.cc
index e7f7b30..6e82123 100644
--- a/compiler/optimizing/code_generator_vector_arm.cc
+++ b/compiler/optimizing/code_generator_vector_arm.cc
@@ -124,6 +124,14 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderARM::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARM::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
void LocationsBuilderARM::VisitVecSub(HVecSub* instruction) {
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
}
@@ -148,6 +156,22 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderARM::VisitVecMin(HVecMin* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARM::VisitVecMin(HVecMin* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderARM::VisitVecMax(HVecMax* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARM::VisitVecMax(HVecMax* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
void LocationsBuilderARM::VisitVecAnd(HVecAnd* instruction) {
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
}
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 0923920..2dfccff 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -318,6 +318,47 @@
}
}
+void LocationsBuilderARM64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARM64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ VRegister lhs = VRegisterFrom(locations->InAt(0));
+ VRegister rhs = VRegisterFrom(locations->InAt(1));
+ VRegister dst = VRegisterFrom(locations->Out());
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimByte:
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ if (instruction->IsUnsigned()) {
+ instruction->IsRounded()
+ ? __ Urhadd(dst.V16B(), lhs.V16B(), rhs.V16B())
+ : __ Uhadd(dst.V16B(), lhs.V16B(), rhs.V16B());
+ } else {
+ instruction->IsRounded()
+ ? __ Srhadd(dst.V16B(), lhs.V16B(), rhs.V16B())
+ : __ Shadd(dst.V16B(), lhs.V16B(), rhs.V16B());
+ }
+ break;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ if (instruction->IsUnsigned()) {
+ instruction->IsRounded()
+ ? __ Urhadd(dst.V8H(), lhs.V8H(), rhs.V8H())
+ : __ Uhadd(dst.V8H(), lhs.V8H(), rhs.V8H());
+ } else {
+ instruction->IsRounded()
+ ? __ Srhadd(dst.V8H(), lhs.V8H(), rhs.V8H())
+ : __ Shadd(dst.V8H(), lhs.V8H(), rhs.V8H());
+ }
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderARM64::VisitVecSub(HVecSub* instruction) {
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
}
@@ -420,6 +461,22 @@
}
}
+void LocationsBuilderARM64::VisitVecMin(HVecMin* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARM64::VisitVecMin(HVecMin* instruction) {
+ LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+}
+
+void LocationsBuilderARM64::VisitVecMax(HVecMax* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARM64::VisitVecMax(HVecMax* instruction) {
+ LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+}
+
void LocationsBuilderARM64::VisitVecAnd(HVecAnd* instruction) {
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
}
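As a reference for the four variants selected above, the per-lane semantics
of SHADD/SRHADD/UHADD/URHADD can be modeled in scalar code as follows (a
sketch for byte lanes only; the hardware keeps its intermediate in lane
width plus one extra bit, so the wider C++ type is merely a modeling device):

  #include <cstdint>

  int8_t ShaddLane(int8_t x, int8_t y)     { return static_cast<int8_t>((int32_t{x} + y) >> 1); }
  int8_t SrhaddLane(int8_t x, int8_t y)    { return static_cast<int8_t>((int32_t{x} + y + 1) >> 1); }
  uint8_t UhaddLane(uint8_t x, uint8_t y)  { return static_cast<uint8_t>((uint32_t{x} + y) >> 1); }
  uint8_t UrhaddLane(uint8_t x, uint8_t y) { return static_cast<uint8_t>((uint32_t{x} + y + 1) >> 1); }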
diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc
index 74fa584..990178b 100644
--- a/compiler/optimizing/code_generator_vector_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc
@@ -124,6 +124,14 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderARMVIXL::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
void LocationsBuilderARMVIXL::VisitVecSub(HVecSub* instruction) {
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
}
@@ -148,6 +156,22 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderARMVIXL::VisitVecMin(HVecMin* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecMin(HVecMin* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderARMVIXL::VisitVecMax(HVecMax* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecMax(HVecMax* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
void LocationsBuilderARMVIXL::VisitVecAnd(HVecAnd* instruction) {
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
}
diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc
index 6969abd..8ea1ca7 100644
--- a/compiler/optimizing/code_generator_vector_mips.cc
+++ b/compiler/optimizing/code_generator_vector_mips.cc
@@ -124,6 +124,14 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderMIPS::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
void LocationsBuilderMIPS::VisitVecSub(HVecSub* instruction) {
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
}
@@ -148,6 +156,22 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderMIPS::VisitVecMin(HVecMin* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecMin(HVecMin* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderMIPS::VisitVecMax(HVecMax* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecMax(HVecMax* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
void LocationsBuilderMIPS::VisitVecAnd(HVecAnd* instruction) {
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
}
diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc
index 87118ce..a484bb4 100644
--- a/compiler/optimizing/code_generator_vector_mips64.cc
+++ b/compiler/optimizing/code_generator_vector_mips64.cc
@@ -124,6 +124,14 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderMIPS64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
void LocationsBuilderMIPS64::VisitVecSub(HVecSub* instruction) {
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
}
@@ -148,6 +156,22 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderMIPS64::VisitVecMin(HVecMin* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecMin(HVecMin* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderMIPS64::VisitVecMax(HVecMax* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecMax(HVecMax* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
void LocationsBuilderMIPS64::VisitVecAnd(HVecAnd* instruction) {
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
}
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 8dabb4d..a86d060 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -350,6 +350,35 @@
}
}
+void LocationsBuilderX86::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ DCHECK(locations->InAt(0).Equals(locations->Out()));
+ XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+
+ DCHECK(instruction->IsRounded());
+ DCHECK(instruction->IsUnsigned());
+
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimByte:
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ __ pavgb(dst, src);
+ return;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ __ pavgw(dst, src);
+ return;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderX86::VisitVecSub(HVecSub* instruction) {
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
}
@@ -448,6 +477,22 @@
}
}
+void LocationsBuilderX86::VisitVecMin(HVecMin* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecMin(HVecMin* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderX86::VisitVecMax(HVecMax* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecMax(HVecMax* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
void LocationsBuilderX86::VisitVecAnd(HVecAnd* instruction) {
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
}
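The DCHECKs above encode that PAVGB/PAVGW implement exactly one of the four
halving-add variants, the rounded unsigned one; the new
kNoSignedHAdd/kNoUnroundedHAdd restrictions in loop_optimization.cc keep the
other variants away from this backend. The per-lane behavior can be checked
exhaustively against the wide-precision definition (an illustrative sketch,
not part of this CL):

  #include <cassert>
  #include <cstdint>

  // PAVGB per-lane model: (x + y + 1) >> 1 with a 9-bit unsigned intermediate.
  uint8_t PavgbLane(uint8_t x, uint8_t y) {
    return static_cast<uint8_t>((uint32_t{x} + y + 1) >> 1);
  }

  int main() {
    for (int x = 0; x < 256; x++) {
      for (int y = 0; y < 256; y++) {
        assert(PavgbLane(x, y) == (x + y + 1) / 2);  // max intermediate is 511
      }
    }
    return 0;
  }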
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index e956088..6967353 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -343,6 +343,36 @@
}
}
+void LocationsBuilderX86_64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ DCHECK(locations->InAt(0).Equals(locations->Out()));
+ XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+
+ DCHECK(instruction->IsRounded());
+ DCHECK(instruction->IsUnsigned());
+
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimByte:
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ __ pavgb(dst, src);
+ return;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ __ pavgw(dst, src);
+ return;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderX86_64::VisitVecSub(HVecSub* instruction) {
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
}
@@ -441,6 +466,22 @@
}
}
+void LocationsBuilderX86_64::VisitVecMin(HVecMin* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecMin(HVecMin* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderX86_64::VisitVecMax(HVecMax* instruction) {
+ CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecMax(HVecMax* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
void LocationsBuilderX86_64::VisitVecAnd(HVecAnd* instruction) {
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction);
}
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index cc3c143..1b2b9f8 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -509,6 +509,11 @@
StartAttributeStream("kind") << deoptimize->GetKind();
}
+ void VisitVecHalvingAdd(HVecHalvingAdd* hadd) OVERRIDE {
+ StartAttributeStream("unsigned") << std::boolalpha << hadd->IsUnsigned() << std::noboolalpha;
+ StartAttributeStream("rounded") << std::boolalpha << hadd->IsRounded() << std::noboolalpha;
+ }
+
#if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64)
void VisitMultiplyAccumulate(HMultiplyAccumulate* instruction) OVERRIDE {
StartAttributeStream("kind") << instruction->GetOpKind();
diff --git a/compiler/optimizing/induction_var_range.cc b/compiler/optimizing/induction_var_range.cc
index 1c8674d..7c833cf 100644
--- a/compiler/optimizing/induction_var_range.cc
+++ b/compiler/optimizing/induction_var_range.cc
@@ -45,18 +45,6 @@
return c2 != 0 && CanLongValueFitIntoInt(static_cast<int64_t>(c1) / static_cast<int64_t>(c2));
}
-/** Returns true for 32/64-bit constant instruction. */
-static bool IsIntAndGet(HInstruction* instruction, int64_t* value) {
- if (instruction->IsIntConstant()) {
- *value = instruction->AsIntConstant()->GetValue();
- return true;
- } else if (instruction->IsLongConstant()) {
- *value = instruction->AsLongConstant()->GetValue();
- return true;
- }
- return false;
-}
-
/** Computes a * b for a,b > 0 (at least until first overflow happens). */
static int64_t SafeMul(int64_t a, int64_t b, /*out*/ bool* overflow) {
if (a > 0 && b > 0 && a > (std::numeric_limits<int64_t>::max() / b)) {
@@ -106,7 +94,7 @@
}
}
int64_t value = -1;
- return IsIntAndGet(instruction, &value) && value >= 0;
+ return IsInt64AndGet(instruction, &value) && value >= 0;
}
/** Hunts "under the hood" for a suitable instruction at the hint. */
@@ -149,7 +137,7 @@
int64_t value;
if (v.instruction->IsDiv() &&
v.instruction->InputAt(0)->IsArrayLength() &&
- IsIntAndGet(v.instruction->InputAt(1), &value) && v.a_constant == value) {
+ IsInt64AndGet(v.instruction->InputAt(1), &value) && v.a_constant == value) {
return InductionVarRange::Value(v.instruction->InputAt(0), 1, v.b_constant);
}
// If a == 1, the most suitable one suffices as maximum value.
@@ -444,7 +432,7 @@
// any of the three requests (kExact, kAtMost, and kAtLeast).
if (info->induction_class == HInductionVarAnalysis::kInvariant &&
info->operation == HInductionVarAnalysis::kFetch) {
- if (IsIntAndGet(info->fetch, value)) {
+ if (IsInt64AndGet(info->fetch, value)) {
return true;
}
}
@@ -635,7 +623,7 @@
int64_t f = 0;
if (IsConstant(info->op_a, kExact, &a) &&
CanLongValueFitIntoInt(a) &&
- IsIntAndGet(info->fetch, &f) && f >= 1) {
+ IsInt64AndGet(info->fetch, &f) && f >= 1) {
// Conservative bounds on a * f^-i + b with f >= 1 can be computed without
// trip count. Other forms would require a much more elaborate evaluation.
const bool is_min_a = a >= 0 ? is_min : !is_min;
@@ -663,7 +651,7 @@
// Unless at a constant or hint, chase the instruction a bit deeper into the HIR tree, so that
// it becomes more likely range analysis will compare the same instructions as terminal nodes.
int64_t value;
- if (IsIntAndGet(instruction, &value) && CanLongValueFitIntoInt(value)) {
+ if (IsInt64AndGet(instruction, &value) && CanLongValueFitIntoInt(value)) {
// Proper constant reveals best information.
return Value(static_cast<int32_t>(value));
} else if (instruction == chase_hint_) {
@@ -671,10 +659,10 @@
return Value(instruction, 1, 0);
} else if (instruction->IsAdd()) {
// Incorporate suitable constants in the chased value.
- if (IsIntAndGet(instruction->InputAt(0), &value) && CanLongValueFitIntoInt(value)) {
+ if (IsInt64AndGet(instruction->InputAt(0), &value) && CanLongValueFitIntoInt(value)) {
return AddValue(Value(static_cast<int32_t>(value)),
GetFetch(instruction->InputAt(1), trip, in_body, is_min));
- } else if (IsIntAndGet(instruction->InputAt(1), &value) && CanLongValueFitIntoInt(value)) {
+ } else if (IsInt64AndGet(instruction->InputAt(1), &value) && CanLongValueFitIntoInt(value)) {
return AddValue(GetFetch(instruction->InputAt(0), trip, in_body, is_min),
Value(static_cast<int32_t>(value)));
}
@@ -1074,7 +1062,7 @@
// Detect known base and trip count (always taken).
int64_t f = 0;
int64_t m = 0;
- if (IsIntAndGet(info->fetch, &f) && f >= 1 && IsConstant(trip->op_a, kExact, &m) && m >= 1) {
+ if (IsInt64AndGet(info->fetch, &f) && f >= 1 && IsConstant(trip->op_a, kExact, &m) && m >= 1) {
HInstruction* opa = nullptr;
HInstruction* opb = nullptr;
if (GenerateCode(info->op_a, nullptr, graph, block, &opa, false, false) &&
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 8e88c1e..5a95abd 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -63,12 +63,122 @@
return false;
}
+// Detect a sign extension from the given type. Returns the promoted operand on success.
+static bool IsSignExtensionAndGet(HInstruction* instruction,
+ Primitive::Type type,
+ /*out*/ HInstruction** operand) {
+ // Accept any already wider constant that would be handled properly by sign
+ // extension when represented in the *width* of the given narrower data type
+ // (the fact that char normally zero extends does not matter here).
+ int64_t value = 0;
+ if (IsInt64AndGet(instruction, &value)) {
+ switch (type) {
+ case Primitive::kPrimByte:
+ if (std::numeric_limits<int8_t>::min() <= value &&
+ std::numeric_limits<int8_t>::max() >= value) {
+ *operand = instruction;
+ return true;
+ }
+ return false;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ if (std::numeric_limits<int16_t>::min() <= value &&
+ std::numeric_limits<int16_t>::max() >= value) {
+ *operand = instruction;
+ return true;
+ }
+ return false;
+ default:
+ return false;
+ }
+ }
+ // An implicit widening conversion of a signed integer to an integral type sign-extends
+ // the two's-complement representation of the integer value to fill the wider format.
+ if (instruction->GetType() == type && (instruction->IsArrayGet() ||
+ instruction->IsStaticFieldGet() ||
+ instruction->IsInstanceFieldGet())) {
+ switch (type) {
+ case Primitive::kPrimByte:
+ case Primitive::kPrimShort:
+ *operand = instruction;
+ return true;
+ default:
+ return false;
+ }
+ }
+ // TODO: perhaps explicit conversions later too?
+ // (this may return something different from instruction)
+ return false;
+}
+
+// Detect a zero extension from the given type. Returns the promoted operand on success.
+static bool IsZeroExtensionAndGet(HInstruction* instruction,
+ Primitive::Type type,
+ /*out*/ HInstruction** operand) {
+ // Accept any already wider constant that would be handled properly by zero
+ // extension when represented in the *width* of the given narrower data type
+ // (the fact that byte/short normally sign extend does not matter here).
+ int64_t value = 0;
+ if (IsInt64AndGet(instruction, &value)) {
+ switch (type) {
+ case Primitive::kPrimByte:
+ if (std::numeric_limits<uint8_t>::min() <= value &&
+ std::numeric_limits<uint8_t>::max() >= value) {
+ *operand = instruction;
+ return true;
+ }
+ return false;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ if (std::numeric_limits<uint16_t>::min() <= value &&
+ std::numeric_limits<uint16_t>::max() >= value) {
+ *operand = instruction;
+ return true;
+ }
+ return false;
+ default:
+ return false;
+ }
+ }
+ // An implicit widening conversion of a char to an integral type zero-extends
+ // the representation of the char value to fill the wider format.
+ if (instruction->GetType() == type && (instruction->IsArrayGet() ||
+ instruction->IsStaticFieldGet() ||
+ instruction->IsInstanceFieldGet())) {
+ if (type == Primitive::kPrimChar) {
+ *operand = instruction;
+ return true;
+ }
+ }
+ // A sign (or zero) extension followed by an explicit removal of just the
+ // higher sign bits is equivalent to a zero extension of the underlying operand.
+ if (instruction->IsAnd()) {
+ int64_t mask = 0;
+ HInstruction* a = instruction->InputAt(0);
+ HInstruction* b = instruction->InputAt(1);
+ // In (a & b) find (mask & b) or (a & mask) with sign or zero extension on the non-mask.
+ if ((IsInt64AndGet(a, /*out*/ &mask) && (IsSignExtensionAndGet(b, type, /*out*/ operand) ||
+ IsZeroExtensionAndGet(b, type, /*out*/ operand))) ||
+ (IsInt64AndGet(b, /*out*/ &mask) && (IsSignExtensionAndGet(a, type, /*out*/ operand) ||
+ IsZeroExtensionAndGet(a, type, /*out*/ operand)))) {
+ switch ((*operand)->GetType()) {
+ case Primitive::kPrimByte: return mask == std::numeric_limits<uint8_t>::max();
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort: return mask == std::numeric_limits<uint16_t>::max();
+ default: return false;
+ }
+ }
+ }
+ // TODO: perhaps explicit conversions later too?
+ return false;
+}
+
// Test vector restrictions.
static bool HasVectorRestrictions(uint64_t restrictions, uint64_t tested) {
return (restrictions & tested) != 0;
}
-// Inserts an instruction.
+// Insert an instruction.
static HInstruction* Insert(HBasicBlock* block, HInstruction* instruction) {
DCHECK(block != nullptr);
DCHECK(instruction != nullptr);
@@ -713,6 +823,10 @@
return true;
}
} else if (instruction->IsShl() || instruction->IsShr() || instruction->IsUShr()) {
+ // Recognize vectorization idioms.
+ if (VectorizeHalvingAddIdiom(node, instruction, generate_code, type, restrictions)) {
+ return true;
+ }
// Deal with vector restrictions.
if ((HasVectorRestrictions(restrictions, kNoShift)) ||
(instruction->IsShr() && HasVectorRestrictions(restrictions, kNoShr))) {
@@ -806,11 +920,11 @@
switch (type) {
case Primitive::kPrimBoolean:
case Primitive::kPrimByte:
- *restrictions |= kNoMul | kNoDiv | kNoShift | kNoAbs;
+ *restrictions |= kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd;
return TrySetVectorLength(16);
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- *restrictions |= kNoDiv | kNoAbs;
+ *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd;
return TrySetVectorLength(8);
case Primitive::kPrimInt:
*restrictions |= kNoDiv;
@@ -1039,6 +1153,90 @@
#undef GENERATE_VEC
//
+// Vectorization idioms.
+//
+
+// Method recognizes the following idioms:
+// rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
+// regular halving add (a + b) >> 1 for unsigned/signed operands a, b
+// Provided that the operands are promoted to a wider form to do the arithmetic and
+// then cast back to narrower form, the idioms can be mapped into efficient SIMD
+// implementation that operates directly in narrower form (plus one extra bit).
+// TODO: current version recognizes implicit byte/short/char widening only;
+// explicit widening from int to long could be added later.
+bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
+ HInstruction* instruction,
+ bool generate_code,
+ Primitive::Type type,
+ uint64_t restrictions) {
+ // Test for top level arithmetic shift right x >> 1 or logical shift right x >>> 1
+ // (note whether the sign bit in higher precision is shifted in has no effect
+ // on the narrow precision computed by the idiom).
+ int64_t value = 0;
+ if ((instruction->IsShr() ||
+ instruction->IsUShr()) &&
+ IsInt64AndGet(instruction->InputAt(1), &value) && value == 1) {
+ //
+ // TODO: make following code less sensitive to associativity and commutativity differences.
+ //
+ HInstruction* x = instruction->InputAt(0);
+ // Test for an optional rounding part (x + 1) >> 1.
+ bool is_rounded = false;
+ if (x->IsAdd() && IsInt64AndGet(x->InputAt(1), &value) && value == 1) {
+ x = x->InputAt(0);
+ is_rounded = true;
+ }
+ // Test for a core addition (a + b) >> 1 (possibly rounded), either unsigned or signed.
+ if (x->IsAdd()) {
+ HInstruction* a = x->InputAt(0);
+ HInstruction* b = x->InputAt(1);
+ HInstruction* r = nullptr;
+ HInstruction* s = nullptr;
+ bool is_unsigned = false;
+ if (IsZeroExtensionAndGet(a, type, &r) && IsZeroExtensionAndGet(b, type, &s)) {
+ is_unsigned = true;
+ } else if (IsSignExtensionAndGet(a, type, &r) && IsSignExtensionAndGet(b, type, &s)) {
+ is_unsigned = false;
+ } else {
+ return false;
+ }
+ // Deal with vector restrictions.
+ if ((!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
+ (!is_rounded && HasVectorRestrictions(restrictions, kNoUnroundedHAdd))) {
+ return false;
+ }
+ // Accept recognized halving add for vectorizable operands. Vectorized code uses the
+ // shorthand idiomatic operation. Sequential code uses the original scalar expressions.
+ DCHECK(r != nullptr && s != nullptr);
+ if (VectorizeUse(node, r, generate_code, type, restrictions) &&
+ VectorizeUse(node, s, generate_code, type, restrictions)) {
+ if (generate_code) {
+ if (vector_mode_ == kVector) {
+ vector_map_->Put(instruction, new (global_allocator_) HVecHalvingAdd(
+ global_allocator_,
+ vector_map_->Get(r),
+ vector_map_->Get(s),
+ type,
+ vector_length_,
+ is_unsigned,
+ is_rounded));
+ } else {
+ VectorizeUse(node, instruction->InputAt(0), generate_code, type, restrictions);
+ VectorizeUse(node, instruction->InputAt(1), generate_code, type, restrictions);
+ GenerateVecOp(instruction,
+ vector_map_->Get(instruction->InputAt(0)),
+ vector_map_->Get(instruction->InputAt(1)),
+ type);
+ }
+ }
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+//
// Helpers.
//
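To make the And-mask path in IsZeroExtensionAndGet concrete: a Java byte
load sign-extends to int, and a following "& 0xff" keeps only the low eight
bits, which is exactly a zero extension of the underlying byte; that is why
a masked sign (or zero) extension is accepted as an unsigned operand. A
scalar sketch of the equivalence (illustrative only):

  #include <cassert>
  #include <cstdint>

  int main() {
    for (int v = -128; v <= 127; v++) {
      int sign_extended = static_cast<int8_t>(v);   // what a byte ArrayGet yields
      int masked = sign_extended & 0xff;            // the explicit HAnd in the HIR
      int zero_extended = static_cast<uint8_t>(v);  // the zero extension the idiom needs
      assert(masked == zero_extended);
    }
    return 0;
  }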
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index d8f50aa..4a7da86 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -62,13 +62,15 @@
* Vectorization restrictions (bit mask).
*/
enum VectorRestrictions {
- kNone = 0, // no restrictions
- kNoMul = 1, // no multiplication
- kNoDiv = 2, // no division
- kNoShift = 4, // no shift
- kNoShr = 8, // no arithmetic shift right
- kNoHiBits = 16, // "wider" operations cannot bring in higher order bits
- kNoAbs = 32, // no absolute value
+ kNone = 0, // no restrictions
+ kNoMul = 1, // no multiplication
+ kNoDiv = 2, // no division
+ kNoShift = 4, // no shift
+ kNoShr = 8, // no arithmetic shift right
+ kNoHiBits = 16, // "wider" operations cannot bring in higher order bits
+ kNoSignedHAdd = 32, // no signed halving add
+ kNoUnroundedHAdd = 64, // no unrounded halving add
+ kNoAbs = 128, // no absolute value
};
/*
@@ -136,6 +138,13 @@
Primitive::Type type);
void GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, Primitive::Type type);
+ // Vectorization idioms.
+ bool VectorizeHalvingAddIdiom(LoopNode* node,
+ HInstruction* instruction,
+ bool generate_code,
+ Primitive::Type type,
+ uint64_t restrictions);
+
// Helpers.
bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
bool TrySetSimpleLoopHeader(HBasicBlock* block);
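The two new bits gate the halving-add variants independently, kNoSignedHAdd
rejecting the signed forms and kNoUnroundedHAdd the non-rounding forms. A
minimal sketch of the gating performed in VectorizeHalvingAddIdiom (the
helper name is hypothetical; the enum values are the ones above):

  bool HAddAllowed(uint64_t restrictions, bool is_unsigned, bool is_rounded) {
    if (!is_unsigned && (restrictions & kNoSignedHAdd) != 0) return false;
    if (!is_rounded && (restrictions & kNoUnroundedHAdd) != 0) return false;
    return true;
  }

On x86/x86-64, where both bits are set for byte and halfword vectors, only
the rounded unsigned combination survives, matching PAVGB/PAVGW; ARM64
leaves both bits clear, so all four variants vectorize there.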
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index c109369..6be237e 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1369,9 +1369,12 @@
M(VecAbs, VecUnaryOperation) \
M(VecNot, VecUnaryOperation) \
M(VecAdd, VecBinaryOperation) \
+ M(VecHalvingAdd, VecBinaryOperation) \
M(VecSub, VecBinaryOperation) \
M(VecMul, VecBinaryOperation) \
M(VecDiv, VecBinaryOperation) \
+ M(VecMin, VecBinaryOperation) \
+ M(VecMax, VecBinaryOperation) \
M(VecAnd, VecBinaryOperation) \
M(VecAndNot, VecBinaryOperation) \
M(VecOr, VecBinaryOperation) \
@@ -6845,6 +6848,7 @@
DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopReversePostOrderIterator);
};
+// Returns the int64_t value of a properly typed constant.
inline int64_t Int64FromConstant(HConstant* constant) {
if (constant->IsIntConstant()) {
return constant->AsIntConstant()->GetValue();
@@ -6856,6 +6860,18 @@
}
}
+// Returns true iff instruction is an integral constant (and sets value on success).
+inline bool IsInt64AndGet(HInstruction* instruction, /*out*/ int64_t* value) {
+ if (instruction->IsIntConstant()) {
+ *value = instruction->AsIntConstant()->GetValue();
+ return true;
+ } else if (instruction->IsLongConstant()) {
+ *value = instruction->AsLongConstant()->GetValue();
+ return true;
+ }
+ return false;
+}
+
#define INSTRUCTION_TYPE_CHECK(type, super) \
inline bool HInstruction::Is##type() const { return GetKind() == k##type; } \
inline const H##type* HInstruction::As##type() const { \
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index 0cbbf2a..bff58d0 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -338,6 +338,42 @@
DISALLOW_COPY_AND_ASSIGN(HVecAdd);
};
+// Performs halving add on every component in the two vectors, viz.
+// rounded [ x1, .. , xn ] hradd [ y1, .. , yn ] = [ (x1 + y1 + 1) >> 1, .. , (xn + yn + 1) >> 1 ]
+// or [ x1, .. , xn ] hadd [ y1, .. , yn ] = [ (x1 + y1) >> 1, .. , (xn + yn ) >> 1 ]
+// for signed operands x, y (sign extension) or unsigned operands x, y (zero extension).
+class HVecHalvingAdd FINAL : public HVecBinaryOperation {
+ public:
+ HVecHalvingAdd(ArenaAllocator* arena,
+ HInstruction* left,
+ HInstruction* right,
+ Primitive::Type packed_type,
+ size_t vector_length,
+ bool is_unsigned,
+ bool is_rounded,
+ uint32_t dex_pc = kNoDexPc)
+ : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc),
+ is_unsigned_(is_unsigned),
+ is_rounded_(is_rounded) {
+ DCHECK(left->IsVecOperation() && right->IsVecOperation());
+ DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type);
+ DCHECK_EQ(right->AsVecOperation()->GetPackedType(), packed_type);
+ SetRawInputAt(0, left);
+ SetRawInputAt(1, right);
+ }
+
+ bool IsUnsigned() const { return is_unsigned_; }
+ bool IsRounded() const { return is_rounded_; }
+
+ DECLARE_INSTRUCTION(VecHalvingAdd);
+
+ private:
+ bool is_unsigned_;
+ bool is_rounded_;
+
+ DISALLOW_COPY_AND_ASSIGN(HVecHalvingAdd);
+};
+
// Subtracts every component in the two vectors,
// viz. [ x1, .. , xn ] - [ y1, .. , yn ] = [ x1 - y1, .. , xn - yn ].
class HVecSub FINAL : public HVecBinaryOperation {
@@ -404,6 +440,54 @@
DISALLOW_COPY_AND_ASSIGN(HVecDiv);
};
+// Takes minimum of every component in the two vectors,
+// viz. MIN( [ x1, .. , xn ] , [ y1, .. , yn ]) = [ min(x1, y1), .. , min(xn, yn) ].
+class HVecMin FINAL : public HVecBinaryOperation {
+ public:
+ HVecMin(ArenaAllocator* arena,
+ HInstruction* left,
+ HInstruction* right,
+ Primitive::Type packed_type,
+ size_t vector_length,
+ uint32_t dex_pc = kNoDexPc)
+ : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) {
+ DCHECK(left->IsVecOperation() && right->IsVecOperation());
+ DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type);
+ DCHECK_EQ(right->AsVecOperation()->GetPackedType(), packed_type);
+ SetRawInputAt(0, left);
+ SetRawInputAt(1, right);
+ }
+
+ DECLARE_INSTRUCTION(VecMin);
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(HVecMin);
+};
+
+// Takes maximum of every component in the two vectors,
+// viz. MAX( [ x1, .. , xn ] , [ y1, .. , yn ]) = [ max(x1, y1), .. , max(xn, yn) ].
+class HVecMax FINAL : public HVecBinaryOperation {
+ public:
+ HVecMax(ArenaAllocator* arena,
+ HInstruction* left,
+ HInstruction* right,
+ Primitive::Type packed_type,
+ size_t vector_length,
+ uint32_t dex_pc = kNoDexPc)
+ : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) {
+ DCHECK(left->IsVecOperation() && right->IsVecOperation());
+ DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type);
+ DCHECK_EQ(right->AsVecOperation()->GetPackedType(), packed_type);
+ SetRawInputAt(0, left);
+ SetRawInputAt(1, right);
+ }
+
+ DECLARE_INSTRUCTION(VecMax);
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(HVecMax);
+};
+
// Bitwise-ands every component in the two vectors,
// viz. [ x1, .. , xn ] & [ y1, .. , yn ] = [ x1 & y1, .. , xn & yn ].
class HVecAnd FINAL : public HVecBinaryOperation {
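The "narrower form (plus one extra bit)" remark in the HVecHalvingAdd
comment can also be seen via the classic carry-save identities, which
compute both halving adds entirely in lane width with no widening at all
(an illustrative check, not how the hardware is specified):

  #include <cassert>
  #include <cstdint>

  int main() {
    for (int x = 0; x < 256; x++) {
      for (int y = 0; y < 256; y++) {
        uint8_t a = static_cast<uint8_t>(x);
        uint8_t b = static_cast<uint8_t>(y);
        uint8_t hadd = static_cast<uint8_t>((a & b) + ((a ^ b) >> 1));   // (a + b) >> 1
        uint8_t rhadd = static_cast<uint8_t>((a | b) - ((a ^ b) >> 1));  // (a + b + 1) >> 1
        assert(hadd == (x + y) / 2);
        assert(rhadd == (x + y + 1) / 2);
      }
    }
    return 0;
  }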