Implement Sum-of-Abs-Differences idiom recognition.
Rationale:
Currently just on ARM64 (x86 lacks proper support),
using the SAD idiom yields great speedup on loops
that compute the sum-of-abs-difference operation.
Also includes some refinements around type conversions.
Speedup ExoPlayerAudio (golem run):
1.3x on ARM64
1.1x on x86
Test: test-art-host test-art-target
Bug: 64091002
Change-Id: Ia2b711d2bc23609a2ed50493dfe6719eedfe0130
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index baa0453..6f8743b 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -71,10 +71,13 @@
return false;
}
-// Detect a sign extension from the given type. Returns the promoted operand on success.
+// Detect a sign extension in instruction from the given type. The to64 parameter
+// denotes if result is long, and thus sign extension from int can be included.
+// Returns the promoted operand on success.
static bool IsSignExtensionAndGet(HInstruction* instruction,
Primitive::Type type,
- /*out*/ HInstruction** operand) {
+ /*out*/ HInstruction** operand,
+ bool to64 = false) {
// Accept any already wider constant that would be handled properly by sign
// extension when represented in the *width* of the given narrower data type
// (the fact that char normally zero extends does not matter here).
@@ -82,20 +85,24 @@
if (IsInt64AndGet(instruction, /*out*/ &value)) {
switch (type) {
case Primitive::kPrimByte:
- if (std::numeric_limits<int8_t>::min() <= value &&
- std::numeric_limits<int8_t>::max() >= value) {
+ if (IsInt<8>(value)) {
*operand = instruction;
return true;
}
return false;
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- if (std::numeric_limits<int16_t>::min() <= value &&
- std::numeric_limits<int16_t>::max() <= value) {
+ if (IsInt<16>(value)) {
*operand = instruction;
return true;
}
return false;
+ case Primitive::kPrimInt:
+ if (IsInt<32>(value)) {
+ *operand = instruction;
+ return to64;
+ }
+ return false;
default:
return false;
}
@@ -110,40 +117,52 @@
case Primitive::kPrimShort:
*operand = instruction;
return true;
+ case Primitive::kPrimInt:
+ *operand = instruction;
+ return to64;
default:
return false;
}
}
- // TODO: perhaps explicit conversions later too?
- // (this may return something different from instruction)
+ // Explicit type conversion to long.
+ if (instruction->IsTypeConversion() && instruction->GetType() == Primitive::kPrimLong) {
+ return IsSignExtensionAndGet(instruction->InputAt(0), type, /*out*/ operand, /*to64*/ true);
+ }
return false;
}
-// Detect a zero extension from the given type. Returns the promoted operand on success.
+// Detect a zero extension in instruction from the given type. The to64 parameter
+// denotes if result is long, and thus zero extension from int can be included.
+// Returns the promoted operand on success.
static bool IsZeroExtensionAndGet(HInstruction* instruction,
Primitive::Type type,
- /*out*/ HInstruction** operand) {
+ /*out*/ HInstruction** operand,
+ bool to64 = false) {
// Accept any already wider constant that would be handled properly by zero
// extension when represented in the *width* of the given narrower data type
- // (the fact that byte/short normally sign extend does not matter here).
+ // (the fact that byte/short/int normally sign extend does not matter here).
int64_t value = 0;
if (IsInt64AndGet(instruction, /*out*/ &value)) {
switch (type) {
case Primitive::kPrimByte:
- if (std::numeric_limits<uint8_t>::min() <= value &&
- std::numeric_limits<uint8_t>::max() >= value) {
+ if (IsUint<8>(value)) {
*operand = instruction;
return true;
}
return false;
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- if (std::numeric_limits<uint16_t>::min() <= value &&
- std::numeric_limits<uint16_t>::max() <= value) {
+ if (IsUint<16>(value)) {
*operand = instruction;
return true;
}
return false;
+ case Primitive::kPrimInt:
+ if (IsUint<32>(value)) {
+ *operand = instruction;
+ return to64;
+ }
+ return false;
default:
return false;
}
@@ -170,14 +189,21 @@
(IsInt64AndGet(b, /*out*/ &mask) && (IsSignExtensionAndGet(a, type, /*out*/ operand) ||
IsZeroExtensionAndGet(a, type, /*out*/ operand)))) {
switch ((*operand)->GetType()) {
- case Primitive::kPrimByte: return mask == std::numeric_limits<uint8_t>::max();
+ case Primitive::kPrimByte:
+ return mask == std::numeric_limits<uint8_t>::max();
case Primitive::kPrimChar:
- case Primitive::kPrimShort: return mask == std::numeric_limits<uint16_t>::max();
+ case Primitive::kPrimShort:
+ return mask == std::numeric_limits<uint16_t>::max();
+ case Primitive::kPrimInt:
+ return mask == std::numeric_limits<uint32_t>::max() && to64;
default: return false;
}
}
}
- // TODO: perhaps explicit conversions later too?
+ // Explicit type conversion to long.
+ if (instruction->IsTypeConversion() && instruction->GetType() == Primitive::kPrimLong) {
+ return IsZeroExtensionAndGet(instruction->InputAt(0), type, /*out*/ operand, /*to64*/ true);
+ }
return false;
}
@@ -214,6 +240,55 @@
return false;
}
+// Compute relative vector length based on type difference.
+static size_t GetOtherVL(Primitive::Type other_type, Primitive::Type vector_type, size_t vl) {
+ switch (other_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte:
+ switch (vector_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte: return vl;
+ default: break;
+ }
+ return vl;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ switch (vector_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte: return vl >> 1;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort: return vl;
+ default: break;
+ }
+ break;
+ case Primitive::kPrimInt:
+ switch (vector_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte: return vl >> 2;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort: return vl >> 1;
+ case Primitive::kPrimInt: return vl;
+ default: break;
+ }
+ break;
+ case Primitive::kPrimLong:
+ switch (vector_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte: return vl >> 3;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort: return vl >> 2;
+ case Primitive::kPrimInt: return vl >> 1;
+ case Primitive::kPrimLong: return vl;
+ default: break;
+ }
+ break;
+ default:
+ break;
+ }
+ LOG(FATAL) << "Unsupported idiom conversion";
+ UNREACHABLE();
+}
+
// Detect up to two instructions a and b, and an acccumulated constant c.
static bool IsAddConstHelper(HInstruction* instruction,
/*out*/ HInstruction** a,
@@ -260,16 +335,16 @@
}
// Detect reductions of the following forms,
-// under assumption phi has only *one* use:
// x = x_phi + ..
// x = x_phi - ..
// x = max(x_phi, ..)
// x = min(x_phi, ..)
static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) {
if (reduction->IsAdd()) {
- return reduction->InputAt(0) == phi || reduction->InputAt(1) == phi;
+ return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
+ (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
} else if (reduction->IsSub()) {
- return reduction->InputAt(0) == phi;
+ return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi);
} else if (reduction->IsInvokeStaticOrDirect()) {
switch (reduction->AsInvokeStaticOrDirect()->GetIntrinsic()) {
case Intrinsics::kMathMinIntInt:
@@ -280,7 +355,8 @@
case Intrinsics::kMathMaxLongLong:
case Intrinsics::kMathMaxFloatFloat:
case Intrinsics::kMathMaxDoubleDouble:
- return reduction->InputAt(0) == phi || reduction->InputAt(1) == phi;
+ return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
+ (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
default:
return false;
}
@@ -288,9 +364,9 @@
return false;
}
-// Translates operation to reduction kind.
-static HVecReduce::ReductionKind GetReductionKind(HInstruction* reduction) {
- if (reduction->IsVecAdd() || reduction->IsVecSub()) {
+// Translates vector operation to reduction kind.
+static HVecReduce::ReductionKind GetReductionKind(HVecOperation* reduction) {
+ if (reduction->IsVecAdd() || reduction->IsVecSub() || reduction->IsVecSADAccumulate()) {
return HVecReduce::kSum;
} else if (reduction->IsVecMin()) {
return HVecReduce::kMin;
@@ -720,7 +796,6 @@
HBasicBlock* block,
HBasicBlock* exit,
int64_t trip_count) {
- Primitive::Type induc_type = Primitive::kPrimInt;
HBasicBlock* header = node->loop_info->GetHeader();
HBasicBlock* preheader = node->loop_info->GetPreHeader();
@@ -739,6 +814,10 @@
vector_header_ = header;
vector_body_ = block;
+ // Loop induction type.
+ Primitive::Type induc_type = main_phi->GetType();
+ DCHECK(induc_type == Primitive::kPrimInt || induc_type == Primitive::kPrimLong) << induc_type;
+
// Generate dynamic loop peeling trip count, if needed, under the assumption
// that the Android runtime guarantees at least "component size" alignment:
// ptc = (ALIGN - (&a[initial] % ALIGN)) / type-size
@@ -767,10 +846,10 @@
HInstruction* rem = Insert(
preheader, new (global_allocator_) HAnd(induc_type,
diff,
- graph_->GetIntConstant(chunk - 1)));
+ graph_->GetConstant(induc_type, chunk - 1)));
vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem));
}
- vector_index_ = graph_->GetIntConstant(0);
+ vector_index_ = graph_->GetConstant(induc_type, 0);
// Generate runtime disambiguation test:
// vtc = a != b ? vtc : 0;
@@ -779,7 +858,8 @@
preheader,
new (global_allocator_) HNotEqual(vector_runtime_test_a_, vector_runtime_test_b_));
vtc = Insert(preheader,
- new (global_allocator_) HSelect(rt, vtc, graph_->GetIntConstant(0), kNoDexPc));
+ new (global_allocator_)
+ HSelect(rt, vtc, graph_->GetConstant(induc_type, 0), kNoDexPc));
needs_cleanup = true;
}
@@ -793,7 +873,7 @@
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
vector_index_,
ptc,
- graph_->GetIntConstant(1),
+ graph_->GetConstant(induc_type, 1),
kNoUnrollingFactor);
}
@@ -806,7 +886,7 @@
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
vector_index_,
vtc,
- graph_->GetIntConstant(vector_length_), // increment per unroll
+ graph_->GetConstant(induc_type, vector_length_), // increment per unroll
unroll);
HLoopInformation* vloop = vector_header_->GetLoopInformation();
@@ -820,14 +900,20 @@
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
vector_index_,
stc,
- graph_->GetIntConstant(1),
+ graph_->GetConstant(induc_type, 1),
kNoUnrollingFactor);
}
// Link reductions to their final uses.
for (auto i = reductions_->begin(); i != reductions_->end(); ++i) {
if (i->first->IsPhi()) {
- i->first->ReplaceWith(ReduceAndExtractIfNeeded(i->second));
+ HInstruction* phi = i->first;
+ HInstruction* repl = ReduceAndExtractIfNeeded(i->second);
+ // Deal with regular uses.
+ for (const HUseListNode<HInstruction*>& use : phi->GetUses()) {
+ induction_range_.Replace(use.GetUser(), phi, repl); // update induction use
+ }
+ phi->ReplaceWith(repl);
}
}
@@ -853,7 +939,7 @@
HInstruction* step,
uint32_t unroll) {
DCHECK(unroll == 1 || vector_mode_ == kVector);
- Primitive::Type induc_type = Primitive::kPrimInt;
+ Primitive::Type induc_type = lo->GetType();
// Prepare new loop.
vector_preheader_ = new_preheader,
vector_header_ = vector_preheader_->GetSingleSuccessor();
@@ -942,8 +1028,10 @@
auto redit = reductions_->find(instruction);
if (redit != reductions_->end()) {
Primitive::Type type = instruction->GetType();
- if (TrySetVectorType(type, &restrictions) &&
- VectorizeUse(node, instruction, generate_code, type, restrictions)) {
+ // Recognize SAD idiom or direct reduction.
+ if (VectorizeSADIdiom(node, instruction, generate_code, type, restrictions) ||
+ (TrySetVectorType(type, &restrictions) &&
+ VectorizeUse(node, instruction, generate_code, type, restrictions))) {
if (generate_code) {
HInstruction* new_red = vector_map_->Get(instruction);
vector_permanent_map_->Put(new_red, vector_map_->Get(redit->second));
@@ -1029,14 +1117,20 @@
HInstruction* opa = conversion->InputAt(0);
Primitive::Type from = conversion->GetInputType();
Primitive::Type to = conversion->GetResultType();
- if ((to == Primitive::kPrimByte ||
- to == Primitive::kPrimChar ||
- to == Primitive::kPrimShort) && from == Primitive::kPrimInt) {
- // Accept a "narrowing" type conversion from a "wider" computation for
- // (1) conversion into final required type,
- // (2) vectorizable operand,
- // (3) "wider" operations cannot bring in higher order bits.
- if (to == type && VectorizeUse(node, opa, generate_code, type, restrictions | kNoHiBits)) {
+ if (Primitive::IsIntegralType(from) && Primitive::IsIntegralType(to)) {
+ size_t size_vec = Primitive::ComponentSize(type);
+ size_t size_from = Primitive::ComponentSize(from);
+ size_t size_to = Primitive::ComponentSize(to);
+ // Accept an integral conversion
+ // (1a) narrowing into vector type, "wider" operations cannot bring in higher order bits, or
+ // (1b) widening from at least vector type, and
+ // (2) vectorizable operand.
+ if ((size_to < size_from &&
+ size_to == size_vec &&
+ VectorizeUse(node, opa, generate_code, type, restrictions | kNoHiBits)) ||
+ (size_to >= size_from &&
+ size_from >= size_vec &&
+ VectorizeUse(node, opa, generate_code, type, restrictions))) {
if (generate_code) {
if (vector_mode_ == kVector) {
vector_map_->Put(instruction, vector_map_->Get(opa)); // operand pass-through
@@ -1088,7 +1182,7 @@
return true;
}
} else if (instruction->IsShl() || instruction->IsShr() || instruction->IsUShr()) {
- // Recognize vectorization idioms.
+ // Recognize halving add idiom.
if (VectorizeHalvingAddIdiom(node, instruction, generate_code, type, restrictions)) {
return true;
}
@@ -1181,7 +1275,8 @@
return false; // reject, unless all operands are same-extension narrower
}
// Accept MIN/MAX(x, y) for vectorizable operands.
- DCHECK(r != nullptr && s != nullptr);
+ DCHECK(r != nullptr);
+ DCHECK(s != nullptr);
if (generate_code && vector_mode_ != kVector) { // de-idiom
r = opa;
s = opb;
@@ -1232,11 +1327,11 @@
switch (type) {
case Primitive::kPrimBoolean:
case Primitive::kPrimByte:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv;
return TrySetVectorLength(16);
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv;
return TrySetVectorLength(8);
case Primitive::kPrimInt:
*restrictions |= kNoDiv;
@@ -1261,17 +1356,17 @@
case Primitive::kPrimBoolean:
case Primitive::kPrimByte:
*restrictions |=
- kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction;
+ kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
return TrySetVectorLength(16);
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction;
+ *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
return TrySetVectorLength(8);
case Primitive::kPrimInt:
- *restrictions |= kNoDiv;
+ *restrictions |= kNoDiv | kNoSAD;
return TrySetVectorLength(4);
case Primitive::kPrimLong:
- *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax;
+ *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax | kNoSAD;
return TrySetVectorLength(2);
case Primitive::kPrimFloat:
*restrictions |= kNoMinMax | kNoReduction; // minmax: -0.0 vs +0.0
@@ -1289,17 +1384,17 @@
switch (type) {
case Primitive::kPrimBoolean:
case Primitive::kPrimByte:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(16);
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
+ *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoSAD;
return TrySetVectorLength(8);
case Primitive::kPrimInt:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(4);
case Primitive::kPrimLong:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(2);
case Primitive::kPrimFloat:
*restrictions |= kNoMinMax | kNoReduction; // min/max(x, NaN)
@@ -1317,17 +1412,17 @@
switch (type) {
case Primitive::kPrimBoolean:
case Primitive::kPrimByte:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(16);
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
+ *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoSAD;
return TrySetVectorLength(8);
case Primitive::kPrimInt:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(4);
case Primitive::kPrimLong:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(2);
case Primitive::kPrimFloat:
*restrictions |= kNoMinMax | kNoReduction; // min/max(x, NaN)
@@ -1371,8 +1466,16 @@
if (it != vector_permanent_map_->end()) {
vector = it->second; // reuse during unrolling
} else {
- vector = new (global_allocator_) HVecReplicateScalar(
- global_allocator_, org, type, vector_length_);
+ // Generates ReplicateScalar( (optional_type_conv) org ).
+ HInstruction* input = org;
+ Primitive::Type input_type = input->GetType();
+ if (type != input_type && (type == Primitive::kPrimLong ||
+ input_type == Primitive::kPrimLong)) {
+ input = Insert(vector_preheader_,
+ new (global_allocator_) HTypeConversion(type, input, kNoDexPc));
+ }
+ vector = new (global_allocator_)
+ HVecReplicateScalar(global_allocator_, input, type, vector_length_);
vector_permanent_map_->Put(org, Insert(vector_preheader_, vector));
}
vector_map_->Put(org, vector);
@@ -1465,10 +1568,15 @@
// Prepare the new initialization.
if (vector_mode_ == kVector) {
// Generate a [initial, 0, .., 0] vector.
- new_init = Insert(
- vector_preheader_,
- new (global_allocator_) HVecSetScalars(
- global_allocator_, &new_init, phi->GetType(), vector_length_, 1));
+ HVecOperation* red_vector = new_red->AsVecOperation();
+ size_t vector_length = red_vector->GetVectorLength();
+ Primitive::Type type = red_vector->GetPackedType();
+ new_init = Insert(vector_preheader_,
+ new (global_allocator_) HVecSetScalars(global_allocator_,
+ &new_init,
+ type,
+ vector_length,
+ 1));
} else {
new_init = ReduceAndExtractIfNeeded(new_init);
}
@@ -1484,18 +1592,20 @@
if (instruction->IsPhi()) {
HInstruction* input = instruction->InputAt(1);
if (input->IsVecOperation()) {
- Primitive::Type type = input->AsVecOperation()->GetPackedType();
+ HVecOperation* input_vector = input->AsVecOperation();
+ size_t vector_length = input_vector->GetVectorLength();
+ Primitive::Type type = input_vector->GetPackedType();
+ HVecReduce::ReductionKind kind = GetReductionKind(input_vector);
HBasicBlock* exit = instruction->GetBlock()->GetSuccessors()[0];
// Generate a vector reduction and scalar extract
// x = REDUCE( [x_1, .., x_n] )
// y = x_1
// along the exit of the defining loop.
- HVecReduce::ReductionKind kind = GetReductionKind(input);
HInstruction* reduce = new (global_allocator_) HVecReduce(
- global_allocator_, instruction, type, vector_length_, kind);
+ global_allocator_, instruction, type, vector_length, kind);
exit->InsertInstructionBefore(reduce, exit->GetFirstInstruction());
instruction = new (global_allocator_) HVecExtractScalar(
- global_allocator_, reduce, type, vector_length_, 0);
+ global_allocator_, reduce, type, vector_length, 0);
exit->InsertInstructionAfter(instruction, reduce);
}
}
@@ -1516,27 +1626,19 @@
HInstruction* opb,
Primitive::Type type,
bool is_unsigned) {
- if (vector_mode_ == kSequential) {
- // Non-converting scalar code follows implicit integral promotion.
- if (!org->IsTypeConversion() && (type == Primitive::kPrimBoolean ||
- type == Primitive::kPrimByte ||
- type == Primitive::kPrimChar ||
- type == Primitive::kPrimShort)) {
- type = Primitive::kPrimInt;
- }
- }
HInstruction* vector = nullptr;
+ Primitive::Type org_type = org->GetType();
switch (org->GetKind()) {
case HInstruction::kNeg:
DCHECK(opb == nullptr);
GENERATE_VEC(
new (global_allocator_) HVecNeg(global_allocator_, opa, type, vector_length_),
- new (global_allocator_) HNeg(type, opa));
+ new (global_allocator_) HNeg(org_type, opa));
case HInstruction::kNot:
DCHECK(opb == nullptr);
GENERATE_VEC(
new (global_allocator_) HVecNot(global_allocator_, opa, type, vector_length_),
- new (global_allocator_) HNot(type, opa));
+ new (global_allocator_) HNot(org_type, opa));
case HInstruction::kBooleanNot:
DCHECK(opb == nullptr);
GENERATE_VEC(
@@ -1546,47 +1648,47 @@
DCHECK(opb == nullptr);
GENERATE_VEC(
new (global_allocator_) HVecCnv(global_allocator_, opa, type, vector_length_),
- new (global_allocator_) HTypeConversion(type, opa, kNoDexPc));
+ new (global_allocator_) HTypeConversion(org_type, opa, kNoDexPc));
case HInstruction::kAdd:
GENERATE_VEC(
new (global_allocator_) HVecAdd(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HAdd(type, opa, opb));
+ new (global_allocator_) HAdd(org_type, opa, opb));
case HInstruction::kSub:
GENERATE_VEC(
new (global_allocator_) HVecSub(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HSub(type, opa, opb));
+ new (global_allocator_) HSub(org_type, opa, opb));
case HInstruction::kMul:
GENERATE_VEC(
new (global_allocator_) HVecMul(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HMul(type, opa, opb));
+ new (global_allocator_) HMul(org_type, opa, opb));
case HInstruction::kDiv:
GENERATE_VEC(
new (global_allocator_) HVecDiv(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HDiv(type, opa, opb, kNoDexPc));
+ new (global_allocator_) HDiv(org_type, opa, opb, kNoDexPc));
case HInstruction::kAnd:
GENERATE_VEC(
new (global_allocator_) HVecAnd(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HAnd(type, opa, opb));
+ new (global_allocator_) HAnd(org_type, opa, opb));
case HInstruction::kOr:
GENERATE_VEC(
new (global_allocator_) HVecOr(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HOr(type, opa, opb));
+ new (global_allocator_) HOr(org_type, opa, opb));
case HInstruction::kXor:
GENERATE_VEC(
new (global_allocator_) HVecXor(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HXor(type, opa, opb));
+ new (global_allocator_) HXor(org_type, opa, opb));
case HInstruction::kShl:
GENERATE_VEC(
new (global_allocator_) HVecShl(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HShl(type, opa, opb));
+ new (global_allocator_) HShl(org_type, opa, opb));
case HInstruction::kShr:
GENERATE_VEC(
new (global_allocator_) HVecShr(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HShr(type, opa, opb));
+ new (global_allocator_) HShr(org_type, opa, opb));
case HInstruction::kUShr:
GENERATE_VEC(
new (global_allocator_) HVecUShr(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HUShr(type, opa, opb));
+ new (global_allocator_) HUShr(org_type, opa, opb));
case HInstruction::kInvokeStaticOrDirect: {
HInvokeStaticOrDirect* invoke = org->AsInvokeStaticOrDirect();
if (vector_mode_ == kVector) {
@@ -1667,8 +1769,8 @@
//
// Method recognizes the following idioms:
-// rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
-// regular halving add (a + b) >> 1 for unsigned/signed operands a, b
+// rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
+// truncated halving add (a + b) >> 1 for unsigned/signed operands a, b
// Provided that the operands are promoted to a wider form to do the arithmetic and
// then cast back to narrower form, the idioms can be mapped into efficient SIMD
// implementation that operates directly in narrower form (plus one extra bit).
@@ -1712,7 +1814,8 @@
}
// Accept recognized halving add for vectorizable operands. Vectorized code uses the
// shorthand idiomatic operation. Sequential code uses the original scalar expressions.
- DCHECK(r != nullptr && s != nullptr);
+ DCHECK(r != nullptr);
+ DCHECK(s != nullptr);
if (generate_code && vector_mode_ != kVector) { // de-idiom
r = instruction->InputAt(0);
s = instruction->InputAt(1);
@@ -1741,6 +1844,88 @@
return false;
}
+// Method recognizes the following idiom:
+// q += ABS(a - b) for signed operands a, b
+// Provided that the operands have the same type or are promoted to a wider form.
+// Since this may involve a vector length change, the idiom is handled by going directly
+// to a sad-accumulate node (rather than relying combining finer grained nodes later).
+// TODO: unsigned SAD too?
+bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
+ HInstruction* instruction,
+ bool generate_code,
+ Primitive::Type reduction_type,
+ uint64_t restrictions) {
+ // Filter integral "q += ABS(a - b);" reduction, where ABS and SUB
+ // are done in the same precision (either int or long).
+ if (!instruction->IsAdd() ||
+ (reduction_type != Primitive::kPrimInt && reduction_type != Primitive::kPrimLong)) {
+ return false;
+ }
+ HInstruction* q = instruction->InputAt(0);
+ HInstruction* v = instruction->InputAt(1);
+ HInstruction* a = nullptr;
+ HInstruction* b = nullptr;
+ if (v->IsInvokeStaticOrDirect() &&
+ (v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsInt ||
+ v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsLong)) {
+ HInstruction* x = v->InputAt(0);
+ if (x->IsSub() && x->GetType() == reduction_type) {
+ a = x->InputAt(0);
+ b = x->InputAt(1);
+ }
+ }
+ if (a == nullptr || b == nullptr) {
+ return false;
+ }
+ // Accept same-type or consistent sign extension for narrower-type on operands a and b.
+ // The same-type or narrower operands are called r (a or lower) and s (b or lower).
+ HInstruction* r = a;
+ HInstruction* s = b;
+ bool is_unsigned = false;
+ Primitive::Type sub_type = a->GetType();
+ if (a->IsTypeConversion()) {
+ sub_type = a->InputAt(0)->GetType();
+ } else if (b->IsTypeConversion()) {
+ sub_type = b->InputAt(0)->GetType();
+ }
+ if (reduction_type != sub_type &&
+ (!IsNarrowerOperands(a, b, sub_type, &r, &s, &is_unsigned) || is_unsigned)) {
+ return false;
+ }
+ // Try same/narrower type and deal with vector restrictions.
+ if (!TrySetVectorType(sub_type, &restrictions) || HasVectorRestrictions(restrictions, kNoSAD)) {
+ return false;
+ }
+ // Accept SAD idiom for vectorizable operands. Vectorized code uses the shorthand
+ // idiomatic operation. Sequential code uses the original scalar expressions.
+ DCHECK(r != nullptr);
+ DCHECK(s != nullptr);
+ if (generate_code && vector_mode_ != kVector) { // de-idiom
+ r = s = v->InputAt(0);
+ }
+ if (VectorizeUse(node, q, generate_code, sub_type, restrictions) &&
+ VectorizeUse(node, r, generate_code, sub_type, restrictions) &&
+ VectorizeUse(node, s, generate_code, sub_type, restrictions)) {
+ if (generate_code) {
+ if (vector_mode_ == kVector) {
+ vector_map_->Put(instruction, new (global_allocator_) HVecSADAccumulate(
+ global_allocator_,
+ vector_map_->Get(q),
+ vector_map_->Get(r),
+ vector_map_->Get(s),
+ reduction_type,
+ GetOtherVL(reduction_type, sub_type, vector_length_)));
+ MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
+ } else {
+ GenerateVecOp(v, vector_map_->Get(r), nullptr, reduction_type);
+ GenerateVecOp(instruction, vector_map_->Get(q), vector_map_->Get(v), reduction_type);
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
//
// Vectorization heuristics.
//