author | 2017-09-01 13:06:08 -0700
---|---
committer | 2017-09-21 10:20:55 -0700
commit | dbbac8f812a866b1b53f3007721f66038d208549 (patch)
tree | 05cecd927afccd33fc1c14b39ada47e86873f560 /compiler/optimizing
parent | 2406bf17998e15bd40677a907beb3e9c41facce4 (diff)
Implement Sum-of-Abs-Differences idiom recognition.
Rationale:
Currently just on ARM64 (x86 lacks proper support):
using the SAD idiom yields a great speedup on loops
that compute the sum-of-abs-differences operation.
Also includes some refinements around type conversions.
Speedup ExoPlayerAudio (golem run):
1.3x on ARM64
1.1x on x86
Test: test-art-host test-art-target
Bug: 64091002
Change-Id: Ia2b711d2bc23609a2ed50493dfe6719eedfe0130
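For orientation, a minimal sketch of the Java loop shape this recognition targets, assuming byte arrays of equal length (the method and array names are illustrative, not taken from the patch or the benchmark):

    // Matches the "q += ABS(a - b)" reduction form described in loop_optimization.cc.
    static int sad(byte[] a, byte[] b) {
      int q = 0;
      for (int i = 0; i < a.length; i++) {
        // a[i] and b[i] are sign-extended to int, so the subtraction, Math.abs,
        // and the accumulation all execute in int precision over narrower byte data.
        q += Math.abs(a[i] - b[i]);
      }
      return q;
    }

On ARM64 such a loop body maps to a single HVecSADAccumulate node, emitted with the widening Sabal/Sabal2 instructions added below.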
Diffstat (limited to 'compiler/optimizing')
-rw-r--r-- | compiler/optimizing/code_generator_vector_arm64.cc | 229
-rw-r--r-- | compiler/optimizing/code_generator_vector_arm_vixl.cc | 36
-rw-r--r-- | compiler/optimizing/code_generator_vector_mips.cc | 61
-rw-r--r-- | compiler/optimizing/code_generator_vector_mips64.cc | 61
-rw-r--r-- | compiler/optimizing/code_generator_vector_x86.cc | 39
-rw-r--r-- | compiler/optimizing/code_generator_vector_x86_64.cc | 32
-rw-r--r-- | compiler/optimizing/loop_optimization.cc | 375
-rw-r--r-- | compiler/optimizing/loop_optimization.h | 6
-rw-r--r-- | compiler/optimizing/nodes.h | 1
-rw-r--r-- | compiler/optimizing/nodes_vector.h | 59
10 files changed, 704 insertions, 195 deletions
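The accompanying type-conversion refinements also let wider-precision variants vectorize. A hedged sketch with hypothetical names: an explicit widening keeps SUB and ABS in long precision, which the extended IsSignExtensionAndGet (its new to64 path) and the GetOtherVL helper below are meant to handle:

    static long sadWide(int[] x, int[] y) {
      long q = 0;
      for (int i = 0; i < x.length; i++) {
        // The (long) casts make the subtraction and Math.abs execute in long;
        // the recognizer sees sign-extended int operands under a long reduction.
        q += Math.abs((long) x[i] - (long) y[i]);
      }
      return q;
    }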
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 18a55c8b09..3f576c82b3 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -949,20 +949,18 @@ void InstructionCodeGeneratorARM64::VisitVecSetScalars(HVecSetScalars* instructi
   }
 }
 
-void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
-  switch (instr->GetPackedType()) {
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+  LocationSummary* locations = new (arena) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
-      DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(2, Location::RequiresFpuRegister());
       locations->SetOut(Location::SameAsFirstInput());
       break;
     default:
@@ -971,18 +969,25 @@ void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* i
   }
 }
 
+void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
 // Some early revisions of the Cortex-A53 have an erratum (835769) whereby it is possible for a
 // 64-bit scalar multiply-accumulate instruction in AArch64 state to generate an incorrect result.
 // However vector MultiplyAccumulate instruction is not affected.
-void InstructionCodeGeneratorARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = instr->GetLocations();
-  VRegister acc = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
-  VRegister left = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
-  VRegister right = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
-  switch (instr->GetPackedType()) {
+void InstructionCodeGeneratorARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  VRegister acc = VRegisterFrom(locations->InAt(0));
+  VRegister left = VRegisterFrom(locations->InAt(1));
+  VRegister right = VRegisterFrom(locations->InAt(2));
+
+  DCHECK(locations->InAt(0).Equals(locations->Out()));
+
+  switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(16u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ Mla(acc.V16B(), left.V16B(), right.V16B());
       } else {
         __ Mls(acc.V16B(), left.V16B(), right.V16B());
@@ -990,16 +995,16 @@ void InstructionCodeGeneratorARM64::VisitVecMultiplyAccum
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(8u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ Mla(acc.V8H(), left.V8H(), right.V8H());
       } else {
         __ Mls(acc.V8H(), left.V8H(), right.V8H());
       }
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(4u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ Mla(acc.V4S(), left.V4S(), right.V4S());
       } else {
         __ Mls(acc.V4S(), left.V4S(), right.V4S());
@@ -1007,6 +1012,186 @@ void InstructionCodeGeneratorARM64::VisitVecMultiplyAccum
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void LocationsBuilderARM64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+  // Some conversions require temporary registers.
+  LocationSummary* locations = instruction->GetLocations();
+  HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+  HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+  DCHECK_EQ(a->GetPackedType(), b->GetPackedType());
+  switch (a->GetPackedType()) {
+    case Primitive::kPrimByte:
+      switch (instruction->GetPackedType()) {
+        case Primitive::kPrimLong:
+          locations->AddTemp(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          FALLTHROUGH_INTENDED;
+        case Primitive::kPrimInt:
+          locations->AddTemp(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          break;
+        default:
+          break;
+      }
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      if (instruction->GetPackedType() == Primitive::kPrimLong) {
+        locations->AddTemp(Location::RequiresFpuRegister());
+        locations->AddTemp(Location::RequiresFpuRegister());
+      }
+      break;
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+      if (instruction->GetPackedType() == a->GetPackedType()) {
+        locations->AddTemp(Location::RequiresFpuRegister());
+      }
+      break;
+    default:
+      break;
+  }
+}
+
+void InstructionCodeGeneratorARM64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  VRegister acc = VRegisterFrom(locations->InAt(0));
+  VRegister left = VRegisterFrom(locations->InAt(1));
+  VRegister right = VRegisterFrom(locations->InAt(2));
+
+  DCHECK(locations->InAt(0).Equals(locations->Out()));
+
+  // Handle all feasible acc_T += sad(a_S, b_S) type combinations (T x S).
+  HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+  HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+  DCHECK_EQ(a->GetPackedType(), b->GetPackedType());
+  switch (a->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, a->GetVectorLength());
+      switch (instruction->GetPackedType()) {
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort:
+          DCHECK_EQ(8u, instruction->GetVectorLength());
+          __ Sabal(acc.V8H(), left.V8B(), right.V8B());
+          __ Sabal2(acc.V8H(), left.V16B(), right.V16B());
+          break;
+        case Primitive::kPrimInt: {
+          DCHECK_EQ(4u, instruction->GetVectorLength());
+          VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
+          VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
+          __ Sxtl(tmp1.V8H(), left.V8B());
+          __ Sxtl(tmp2.V8H(), right.V8B());
+          __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H());
+          __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H());
+          __ Sxtl2(tmp1.V8H(), left.V16B());
+          __ Sxtl2(tmp2.V8H(), right.V16B());
+          __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H());
+          __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H());
+          break;
+        }
+        case Primitive::kPrimLong: {
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
+          VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
+          VRegister tmp3 = VRegisterFrom(locations->GetTemp(2));
+          VRegister tmp4 = VRegisterFrom(locations->GetTemp(3));
+          __ Sxtl(tmp1.V8H(), left.V8B());
+          __ Sxtl(tmp2.V8H(), right.V8B());
+          __ Sxtl(tmp3.V4S(), tmp1.V4H());
+          __ Sxtl(tmp4.V4S(), tmp2.V4H());
+          __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+          __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+          __ Sxtl2(tmp3.V4S(), tmp1.V8H());
+          __ Sxtl2(tmp4.V4S(), tmp2.V8H());
+          __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+          __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+          __ Sxtl2(tmp1.V8H(), left.V16B());
+          __ Sxtl2(tmp2.V8H(), right.V16B());
+          __ Sxtl(tmp3.V4S(), tmp1.V4H());
+          __ Sxtl(tmp4.V4S(), tmp2.V4H());
+          __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+          __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+          __ Sxtl2(tmp3.V4S(), tmp1.V8H());
+          __ Sxtl2(tmp4.V4S(), tmp2.V8H());
+          __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+          __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, a->GetVectorLength());
+      switch (instruction->GetPackedType()) {
+        case Primitive::kPrimInt:
+          DCHECK_EQ(4u, instruction->GetVectorLength());
+          __ Sabal(acc.V4S(), left.V4H(), right.V4H());
+          __ Sabal2(acc.V4S(), left.V8H(), right.V8H());
+          break;
+        case Primitive::kPrimLong: {
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
+          VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
+          __ Sxtl(tmp1.V4S(), left.V4H());
+          __ Sxtl(tmp2.V4S(), right.V4H());
+          __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S());
+          __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S());
+          __ Sxtl2(tmp1.V4S(), left.V8H());
+          __ Sxtl2(tmp2.V4S(), right.V8H());
+          __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S());
+          __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S());
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, a->GetVectorLength());
+      switch (instruction->GetPackedType()) {
+        case Primitive::kPrimInt: {
+          DCHECK_EQ(4u, instruction->GetVectorLength());
+          VRegister tmp = VRegisterFrom(locations->GetTemp(0));
+          __ Sub(tmp.V4S(), left.V4S(), right.V4S());
+          __ Abs(tmp.V4S(), tmp.V4S());
+          __ Add(acc.V4S(), acc.V4S(), tmp.V4S());
+          break;
+        }
+        case Primitive::kPrimLong:
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          __ Sabal(acc.V2D(), left.V2S(), right.V2S());
+          __ Sabal2(acc.V2D(), left.V4S(), right.V4S());
+          break;
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, a->GetVectorLength());
+      switch (instruction->GetPackedType()) {
+        case Primitive::kPrimLong: {
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          VRegister tmp = VRegisterFrom(locations->GetTemp(0));
+          __ Sub(tmp.V2D(), left.V2D(), right.V2D());
+          __ Abs(tmp.V2D(), tmp.V2D());
+          __ Add(acc.V2D(), acc.V2D(), tmp.V2D());
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
   }
 }
diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc
index 7a11dff41e..069054c2f5 100644
--- a/compiler/optimizing/code_generator_vector_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc
@@ -629,12 +629,40 @@ void InstructionCodeGeneratorARMVIXL::VisitVecSetScalars(HVecSetScalars* instruc
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
-void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LOG(FATAL) << "No SIMD for " << instr->GetId();
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+  LocationSummary* locations = new (arena) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(2, Location::RequiresFpuRegister());
+      locations->SetOut(Location::SameAsFirstInput());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
-void InstructionCodeGeneratorARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LOG(FATAL) << "No SIMD for " << instr->GetId();
+void LocationsBuilderARMVIXL::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
 // Return whether the vector memory access operation is guaranteed to be word-aligned (ARM word
diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc
index c2fbf7f04b..0bedafcc81 100644
--- a/compiler/optimizing/code_generator_vector_mips.cc
+++ b/compiler/optimizing/code_generator_vector_mips.cc
@@ -826,21 +826,18 @@ void InstructionCodeGeneratorMIPS::VisitVecSetScalars(HVecSetScalars* instructio
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
-void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
-  switch (instr->GetPackedType()) {
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+  LocationSummary* locations = new (arena) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
-      DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(2, Location::RequiresFpuRegister());
       locations->SetOut(Location::SameAsFirstInput());
       break;
     default:
@@ -849,18 +846,19 @@ void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* in
   }
 }
 
-void InstructionCodeGeneratorMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = instr->GetLocations();
-  VectorRegister acc =
-      VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
-  VectorRegister left =
-      VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
-  VectorRegister right =
-      VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
-  switch (instr->GetPackedType()) {
+void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister acc = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister left = VectorRegisterFrom(locations->InAt(1));
+  VectorRegister right = VectorRegisterFrom(locations->InAt(2));
+  switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(16u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvB(acc, left, right);
       } else {
         __ MsubvB(acc, left, right);
@@ -868,24 +866,24 @@ void InstructionCodeGeneratorMIPS::VisitVecMultiplyAccumu
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(8u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvH(acc, left, right);
       } else {
         __ MsubvH(acc, left, right);
       }
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(4u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvW(acc, left, right);
       } else {
         __ MsubvW(acc, left, right);
       }
       break;
     case Primitive::kPrimLong:
-      DCHECK_EQ(2u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvD(acc, left, right);
       } else {
         __ MsubvD(acc, left, right);
@@ -897,6 +895,15 @@ void InstructionCodeGeneratorMIPS::VisitVecMultiplyAccumu
   }
 }
 
+void LocationsBuilderMIPS::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  // TODO: implement this, location helper already filled out (shared with MulAcc).
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* arena,
                                   HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc
index 9d3a777c13..db31bdcc92 100644
--- a/compiler/optimizing/code_generator_vector_mips64.cc
+++ b/compiler/optimizing/code_generator_vector_mips64.cc
@@ -830,21 +830,18 @@ void InstructionCodeGeneratorMIPS64::VisitVecSetScalars(HVecSetScalars* instruct
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
-void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
-  switch (instr->GetPackedType()) {
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+  LocationSummary* locations = new (arena) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
-      DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(2, Location::RequiresFpuRegister());
       locations->SetOut(Location::SameAsFirstInput());
       break;
     default:
@@ -853,18 +850,19 @@ void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate*
   }
 }
 
-void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = instr->GetLocations();
-  VectorRegister acc =
-      VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
-  VectorRegister left =
-      VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
-  VectorRegister right =
-      VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
-  switch (instr->GetPackedType()) {
+void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister acc = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister left = VectorRegisterFrom(locations->InAt(1));
+  VectorRegister right = VectorRegisterFrom(locations->InAt(2));
+  switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(16u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvB(acc, left, right);
       } else {
         __ MsubvB(acc, left, right);
@@ -872,24 +870,24 @@ void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccu
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(8u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvH(acc, left, right);
       } else {
         __ MsubvH(acc, left, right);
       }
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(4u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvW(acc, left, right);
       } else {
         __ MsubvW(acc, left, right);
       }
       break;
     case Primitive::kPrimLong:
-      DCHECK_EQ(2u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvD(acc, left, right);
       } else {
         __ MsubvD(acc, left, right);
@@ -901,6 +899,15 @@ void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccu
   }
 }
 
+void LocationsBuilderMIPS64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  // TODO: implement this, location helper already filled out (shared with MulAcc).
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* arena,
                                   HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 37190f8363..5a012e7298 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -51,7 +51,6 @@ void LocationsBuilderX86::VisitVecReplicateScalar(HVecReplicateScalar* instructi
                     : Location::RequiresFpuRegister());
       locations->SetOut(is_zero ? Location::RequiresFpuRegister()
                                 : Location::SameAsFirstInput());
-
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -1033,12 +1032,42 @@ void InstructionCodeGeneratorX86::VisitVecSetScalars(HVecSetScalars* instruction
   }
 }
 
-void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LOG(FATAL) << "No SIMD for " << instr->GetId();
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+  LocationSummary* locations = new (arena) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(2, Location::RequiresFpuRegister());
+      locations->SetOut(Location::SameAsFirstInput());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  // TODO: pmaddwd?
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
 }
 
-void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LOG(FATAL) << "No SIMD for " << instr->GetId();
+void InstructionCodeGeneratorX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  // TODO: psadbw for unsigned?
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
 // Helper to set up locations for vector memory operations.
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index edd0209f10..3698b7fb85 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -1005,11 +1005,41 @@ void InstructionCodeGeneratorX86_64::VisitVecSetScalars(HVecSetScalars* instruct
   }
 }
 
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+  LocationSummary* locations = new (arena) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(2, Location::RequiresFpuRegister());
+      locations->SetOut(Location::SameAsFirstInput());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
 }
 
 void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  // TODO: pmaddwd?
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  // TODO: psadbw for unsigned?
LOG(FATAL) << "No SIMD for " << instruction->GetId(); } diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index baa045390b..6f8743bd53 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -71,10 +71,13 @@ static bool IsEarlyExit(HLoopInformation* loop_info) { return false; } -// Detect a sign extension from the given type. Returns the promoted operand on success. +// Detect a sign extension in instruction from the given type. The to64 parameter +// denotes if result is long, and thus sign extension from int can be included. +// Returns the promoted operand on success. static bool IsSignExtensionAndGet(HInstruction* instruction, Primitive::Type type, - /*out*/ HInstruction** operand) { + /*out*/ HInstruction** operand, + bool to64 = false) { // Accept any already wider constant that would be handled properly by sign // extension when represented in the *width* of the given narrower data type // (the fact that char normally zero extends does not matter here). @@ -82,20 +85,24 @@ static bool IsSignExtensionAndGet(HInstruction* instruction, if (IsInt64AndGet(instruction, /*out*/ &value)) { switch (type) { case Primitive::kPrimByte: - if (std::numeric_limits<int8_t>::min() <= value && - std::numeric_limits<int8_t>::max() >= value) { + if (IsInt<8>(value)) { *operand = instruction; return true; } return false; case Primitive::kPrimChar: case Primitive::kPrimShort: - if (std::numeric_limits<int16_t>::min() <= value && - std::numeric_limits<int16_t>::max() <= value) { + if (IsInt<16>(value)) { *operand = instruction; return true; } return false; + case Primitive::kPrimInt: + if (IsInt<32>(value)) { + *operand = instruction; + return to64; + } + return false; default: return false; } @@ -110,40 +117,52 @@ static bool IsSignExtensionAndGet(HInstruction* instruction, case Primitive::kPrimShort: *operand = instruction; return true; + case Primitive::kPrimInt: + *operand = instruction; + return to64; default: return false; } } - // TODO: perhaps explicit conversions later too? - // (this may return something different from instruction) + // Explicit type conversion to long. + if (instruction->IsTypeConversion() && instruction->GetType() == Primitive::kPrimLong) { + return IsSignExtensionAndGet(instruction->InputAt(0), type, /*out*/ operand, /*to64*/ true); + } return false; } -// Detect a zero extension from the given type. Returns the promoted operand on success. +// Detect a zero extension in instruction from the given type. The to64 parameter +// denotes if result is long, and thus zero extension from int can be included. +// Returns the promoted operand on success. static bool IsZeroExtensionAndGet(HInstruction* instruction, Primitive::Type type, - /*out*/ HInstruction** operand) { + /*out*/ HInstruction** operand, + bool to64 = false) { // Accept any already wider constant that would be handled properly by zero // extension when represented in the *width* of the given narrower data type - // (the fact that byte/short normally sign extend does not matter here). + // (the fact that byte/short/int normally sign extend does not matter here). 
   int64_t value = 0;
   if (IsInt64AndGet(instruction, /*out*/ &value)) {
     switch (type) {
       case Primitive::kPrimByte:
-        if (std::numeric_limits<uint8_t>::min() <= value &&
-            std::numeric_limits<uint8_t>::max() >= value) {
+        if (IsUint<8>(value)) {
           *operand = instruction;
           return true;
         }
         return false;
       case Primitive::kPrimChar:
       case Primitive::kPrimShort:
-        if (std::numeric_limits<uint16_t>::min() <= value &&
-            std::numeric_limits<uint16_t>::max() <= value) {
+        if (IsUint<16>(value)) {
           *operand = instruction;
           return true;
         }
         return false;
+      case Primitive::kPrimInt:
+        if (IsUint<32>(value)) {
+          *operand = instruction;
+          return to64;
+        }
+        return false;
       default:
         return false;
     }
@@ -170,14 +189,21 @@ static bool IsZeroExtensionAndGet(HInstruction* instruction,
       (IsInt64AndGet(b, /*out*/ &mask) && (IsSignExtensionAndGet(a, type, /*out*/ operand) ||
                                            IsZeroExtensionAndGet(a, type, /*out*/ operand)))) {
     switch ((*operand)->GetType()) {
-      case Primitive::kPrimByte:  return mask == std::numeric_limits<uint8_t>::max();
+      case Primitive::kPrimByte:
+        return mask == std::numeric_limits<uint8_t>::max();
       case Primitive::kPrimChar:
-      case Primitive::kPrimShort: return mask == std::numeric_limits<uint16_t>::max();
+      case Primitive::kPrimShort:
+        return mask == std::numeric_limits<uint16_t>::max();
+      case Primitive::kPrimInt:
+        return mask == std::numeric_limits<uint32_t>::max() && to64;
       default: return false;
     }
   }
-  // TODO: perhaps explicit conversions later too?
+  // Explicit type conversion to long.
+  if (instruction->IsTypeConversion() && instruction->GetType() == Primitive::kPrimLong) {
+    return IsZeroExtensionAndGet(instruction->InputAt(0), type, /*out*/ operand, /*to64*/ true);
+  }
   return false;
 }
 
@@ -214,6 +240,55 @@ static bool IsNarrowerOperand(HInstruction* a,
   return false;
 }
 
+// Compute relative vector length based on type difference.
+static size_t GetOtherVL(Primitive::Type other_type, Primitive::Type vector_type, size_t vl) {
+  switch (other_type) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+      switch (vector_type) {
+        case Primitive::kPrimBoolean:
+        case Primitive::kPrimByte: return vl;
+        default: break;
+      }
+      return vl;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      switch (vector_type) {
+        case Primitive::kPrimBoolean:
+        case Primitive::kPrimByte: return vl >> 1;
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort: return vl;
+        default: break;
+      }
+      break;
+    case Primitive::kPrimInt:
+      switch (vector_type) {
+        case Primitive::kPrimBoolean:
+        case Primitive::kPrimByte: return vl >> 2;
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort: return vl >> 1;
+        case Primitive::kPrimInt: return vl;
+        default: break;
+      }
+      break;
+    case Primitive::kPrimLong:
+      switch (vector_type) {
+        case Primitive::kPrimBoolean:
+        case Primitive::kPrimByte: return vl >> 3;
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort: return vl >> 2;
+        case Primitive::kPrimInt: return vl >> 1;
+        case Primitive::kPrimLong: return vl;
+        default: break;
+      }
+      break;
+    default:
+      break;
+  }
+  LOG(FATAL) << "Unsupported idiom conversion";
+  UNREACHABLE();
+}
+
 // Detect up to two instructions a and b, and an acccumulated constant c.
 static bool IsAddConstHelper(HInstruction* instruction,
                              /*out*/ HInstruction** a,
@@ -260,16 +335,16 @@ static bool IsAddConst(HInstruction* instruction,
 }
 
 // Detect reductions of the following forms,
-// under assumption phi has only *one* use:
 // x = x_phi + ..
 // x = x_phi - ..
 // x = max(x_phi, ..)
 // x = min(x_phi, ..)
 static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) {
   if (reduction->IsAdd()) {
-    return reduction->InputAt(0) == phi || reduction->InputAt(1) == phi;
+    return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
+           (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
   } else if (reduction->IsSub()) {
-    return reduction->InputAt(0) == phi;
+    return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi);
   } else if (reduction->IsInvokeStaticOrDirect()) {
     switch (reduction->AsInvokeStaticOrDirect()->GetIntrinsic()) {
       case Intrinsics::kMathMinIntInt:
@@ -280,7 +355,8 @@ static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) {
       case Intrinsics::kMathMaxLongLong:
       case Intrinsics::kMathMaxFloatFloat:
       case Intrinsics::kMathMaxDoubleDouble:
-        return reduction->InputAt(0) == phi || reduction->InputAt(1) == phi;
+        return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
+               (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
       default:
         return false;
     }
@@ -288,9 +364,9 @@ static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) {
   return false;
 }
 
-// Translates operation to reduction kind.
-static HVecReduce::ReductionKind GetReductionKind(HInstruction* reduction) {
-  if (reduction->IsVecAdd() || reduction->IsVecSub()) {
+// Translates vector operation to reduction kind.
+static HVecReduce::ReductionKind GetReductionKind(HVecOperation* reduction) {
+  if (reduction->IsVecAdd() || reduction->IsVecSub() || reduction->IsVecSADAccumulate()) {
     return HVecReduce::kSum;
   } else if (reduction->IsVecMin()) {
     return HVecReduce::kMin;
@@ -720,7 +796,6 @@ void HLoopOptimization::Vectorize(LoopNode* node,
                                   HBasicBlock* block,
                                   HBasicBlock* exit,
                                   int64_t trip_count) {
-  Primitive::Type induc_type = Primitive::kPrimInt;
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
 
@@ -739,6 +814,10 @@ void HLoopOptimization::Vectorize(LoopNode* node,
   vector_header_ = header;
   vector_body_ = block;
 
+  // Loop induction type.
+  Primitive::Type induc_type = main_phi->GetType();
+  DCHECK(induc_type == Primitive::kPrimInt || induc_type == Primitive::kPrimLong) << induc_type;
+
   // Generate dynamic loop peeling trip count, if needed, under the assumption
   // that the Android runtime guarantees at least "component size" alignment:
   //    ptc = (ALIGN - (&a[initial] % ALIGN)) / type-size
@@ -767,10 +846,10 @@ void HLoopOptimization::Vectorize(LoopNode* node,
     HInstruction* rem = Insert(
         preheader, new (global_allocator_) HAnd(induc_type,
                                                 diff,
-                                                graph_->GetIntConstant(chunk - 1)));
+                                                graph_->GetConstant(induc_type, chunk - 1)));
     vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem));
   }
-  vector_index_ = graph_->GetIntConstant(0);
+  vector_index_ = graph_->GetConstant(induc_type, 0);
 
   // Generate runtime disambiguation test:
   // vtc = a != b ? vtc : 0;
@@ -779,7 +858,8 @@ void HLoopOptimization::Vectorize(LoopNode* node,
         preheader,
         new (global_allocator_) HNotEqual(vector_runtime_test_a_, vector_runtime_test_b_));
     vtc = Insert(preheader,
-                 new (global_allocator_) HSelect(rt, vtc, graph_->GetIntConstant(0), kNoDexPc));
+                 new (global_allocator_)
+                     HSelect(rt, vtc, graph_->GetConstant(induc_type, 0), kNoDexPc));
     needs_cleanup = true;
   }
 
@@ -793,7 +873,7 @@ void HLoopOptimization::Vectorize(LoopNode* node,
         graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
         vector_index_,
         ptc,
-        graph_->GetIntConstant(1),
+        graph_->GetConstant(induc_type, 1),
         kNoUnrollingFactor);
   }
 
@@ -806,7 +886,7 @@ void HLoopOptimization::Vectorize(LoopNode* node,
       graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
       vector_index_,
       vtc,
-      graph_->GetIntConstant(vector_length_),  // increment per unroll
+      graph_->GetConstant(induc_type, vector_length_),  // increment per unroll
       unroll);
   HLoopInformation* vloop = vector_header_->GetLoopInformation();
 
@@ -820,14 +900,20 @@ void HLoopOptimization::Vectorize(LoopNode* node,
         graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
         vector_index_,
         stc,
-        graph_->GetIntConstant(1),
+        graph_->GetConstant(induc_type, 1),
         kNoUnrollingFactor);
   }
 
   // Link reductions to their final uses.
   for (auto i = reductions_->begin(); i != reductions_->end(); ++i) {
     if (i->first->IsPhi()) {
-      i->first->ReplaceWith(ReduceAndExtractIfNeeded(i->second));
+      HInstruction* phi = i->first;
+      HInstruction* repl = ReduceAndExtractIfNeeded(i->second);
+      // Deal with regular uses.
+      for (const HUseListNode<HInstruction*>& use : phi->GetUses()) {
+        induction_range_.Replace(use.GetUser(), phi, repl);  // update induction use
+      }
+      phi->ReplaceWith(repl);
     }
   }
 
@@ -853,7 +939,7 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node,
                                         HInstruction* step,
                                         uint32_t unroll) {
   DCHECK(unroll == 1 || vector_mode_ == kVector);
-  Primitive::Type induc_type = Primitive::kPrimInt;
+  Primitive::Type induc_type = lo->GetType();
   // Prepare new loop.
   vector_preheader_ = new_preheader,
   vector_header_ = vector_preheader_->GetSingleSuccessor();
@@ -942,8 +1028,10 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node,
   auto redit = reductions_->find(instruction);
   if (redit != reductions_->end()) {
     Primitive::Type type = instruction->GetType();
-    if (TrySetVectorType(type, &restrictions) &&
-        VectorizeUse(node, instruction, generate_code, type, restrictions)) {
+    // Recognize SAD idiom or direct reduction.
+    if (VectorizeSADIdiom(node, instruction, generate_code, type, restrictions) ||
+        (TrySetVectorType(type, &restrictions) &&
+         VectorizeUse(node, instruction, generate_code, type, restrictions))) {
       if (generate_code) {
         HInstruction* new_red = vector_map_->Get(instruction);
         vector_permanent_map_->Put(new_red, vector_map_->Get(redit->second));
@@ -1029,14 +1117,20 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node,
     HInstruction* opa = conversion->InputAt(0);
     Primitive::Type from = conversion->GetInputType();
     Primitive::Type to = conversion->GetResultType();
-    if ((to == Primitive::kPrimByte ||
-         to == Primitive::kPrimChar ||
-         to == Primitive::kPrimShort) && from == Primitive::kPrimInt) {
-      // Accept a "narrowing" type conversion from a "wider" computation for
-      // (1) conversion into final required type,
-      // (2) vectorizable operand,
-      // (3) "wider" operations cannot bring in higher order bits.
-      if (to == type && VectorizeUse(node, opa, generate_code, type, restrictions | kNoHiBits)) {
+    if (Primitive::IsIntegralType(from) && Primitive::IsIntegralType(to)) {
+      size_t size_vec = Primitive::ComponentSize(type);
+      size_t size_from = Primitive::ComponentSize(from);
+      size_t size_to = Primitive::ComponentSize(to);
+      // Accept an integral conversion
+      // (1a) narrowing into vector type, "wider" operations cannot bring in higher order bits, or
+      // (1b) widening from at least vector type, and
+      // (2) vectorizable operand.
+      if ((size_to < size_from &&
+           size_to == size_vec &&
+           VectorizeUse(node, opa, generate_code, type, restrictions | kNoHiBits)) ||
+          (size_to >= size_from &&
+           size_from >= size_vec &&
+           VectorizeUse(node, opa, generate_code, type, restrictions))) {
         if (generate_code) {
           if (vector_mode_ == kVector) {
             vector_map_->Put(instruction, vector_map_->Get(opa));  // operand pass-through
@@ -1088,7 +1182,7 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node,
       return true;
     }
   } else if (instruction->IsShl() || instruction->IsShr() || instruction->IsUShr()) {
-    // Recognize vectorization idioms.
+    // Recognize halving add idiom.
     if (VectorizeHalvingAddIdiom(node, instruction, generate_code, type, restrictions)) {
       return true;
     }
@@ -1181,7 +1275,8 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node,
         return false;  // reject, unless all operands are same-extension narrower
       }
       // Accept MIN/MAX(x, y) for vectorizable operands.
-      DCHECK(r != nullptr && s != nullptr);
+      DCHECK(r != nullptr);
+      DCHECK(s != nullptr);
       if (generate_code && vector_mode_ != kVector) {  // de-idiom
         r = opa;
         s = opb;
@@ -1232,11 +1327,11 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric
       switch (type) {
        case Primitive::kPrimBoolean:
        case Primitive::kPrimByte:
-          *restrictions |= kNoDiv | kNoReduction;
+          *restrictions |= kNoDiv;
          return TrySetVectorLength(16);
        case Primitive::kPrimChar:
        case Primitive::kPrimShort:
-          *restrictions |= kNoDiv | kNoReduction;
+          *restrictions |= kNoDiv;
          return TrySetVectorLength(8);
        case Primitive::kPrimInt:
          *restrictions |= kNoDiv;
@@ -1261,17 +1356,17 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric
        case Primitive::kPrimBoolean:
        case Primitive::kPrimByte:
          *restrictions |=
-              kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction;
+              kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
          return TrySetVectorLength(16);
        case Primitive::kPrimChar:
        case Primitive::kPrimShort:
-          *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction;
+          *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
          return TrySetVectorLength(8);
        case Primitive::kPrimInt:
-          *restrictions |= kNoDiv;
+          *restrictions |= kNoDiv | kNoSAD;
          return TrySetVectorLength(4);
        case Primitive::kPrimLong:
-          *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax;
+          *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax | kNoSAD;
          return TrySetVectorLength(2);
        case Primitive::kPrimFloat:
          *restrictions |= kNoMinMax | kNoReduction;  // minmax: -0.0 vs +0.0
@@ -1289,17 +1384,17 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric
      switch (type) {
        case Primitive::kPrimBoolean:
        case Primitive::kPrimByte:
-          *restrictions |= kNoDiv | kNoReduction;
+          *restrictions |= kNoDiv | kNoReduction | kNoSAD;
          return TrySetVectorLength(16);
        case Primitive::kPrimChar:
        case Primitive::kPrimShort:
-          *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
+          *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoSAD;
          return TrySetVectorLength(8);
        case Primitive::kPrimInt:
-          *restrictions |= kNoDiv | kNoReduction;
+          *restrictions |= kNoDiv | kNoReduction | kNoSAD;
          return TrySetVectorLength(4);
        case Primitive::kPrimLong:
-          *restrictions |= kNoDiv | kNoReduction;
+          *restrictions |= kNoDiv | kNoReduction | kNoSAD;
          return TrySetVectorLength(2);
        case Primitive::kPrimFloat:
          *restrictions |= kNoMinMax | kNoReduction;  // min/max(x, NaN)
@@ -1317,17 +1412,17 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric
      switch (type) {
        case Primitive::kPrimBoolean:
        case Primitive::kPrimByte:
-          *restrictions |= kNoDiv | kNoReduction;
+          *restrictions |= kNoDiv | kNoReduction | kNoSAD;
          return TrySetVectorLength(16);
        case Primitive::kPrimChar:
        case Primitive::kPrimShort:
-          *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
+          *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoSAD;
          return TrySetVectorLength(8);
        case Primitive::kPrimInt:
-          *restrictions |= kNoDiv | kNoReduction;
+          *restrictions |= kNoDiv | kNoReduction | kNoSAD;
          return TrySetVectorLength(4);
        case Primitive::kPrimLong:
-          *restrictions |= kNoDiv | kNoReduction;
+          *restrictions |= kNoDiv | kNoReduction | kNoSAD;
          return TrySetVectorLength(2);
        case Primitive::kPrimFloat:
          *restrictions |= kNoMinMax | kNoReduction;  // min/max(x, NaN)
@@ -1371,8 +1466,16 @@ void HLoopOptimization::GenerateVecInv(HInstruction* org, Primitive::Type type)
   if (it != vector_permanent_map_->end()) {
     vector = it->second;  // reuse during unrolling
   } else {
-    vector = new (global_allocator_) HVecReplicateScalar(
-        global_allocator_, org, type, vector_length_);
+    // Generates ReplicateScalar( (optional_type_conv) org ).
+    HInstruction* input = org;
+    Primitive::Type input_type = input->GetType();
+    if (type != input_type && (type == Primitive::kPrimLong ||
+                               input_type == Primitive::kPrimLong)) {
+      input = Insert(vector_preheader_,
+                     new (global_allocator_) HTypeConversion(type, input, kNoDexPc));
+    }
+    vector = new (global_allocator_)
+        HVecReplicateScalar(global_allocator_, input, type, vector_length_);
     vector_permanent_map_->Put(org, Insert(vector_preheader_, vector));
   }
   vector_map_->Put(org, vector);
@@ -1465,10 +1568,15 @@ void HLoopOptimization::GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* r
   // Prepare the new initialization.
   if (vector_mode_ == kVector) {
     // Generate a [initial, 0, .., 0] vector.
-    new_init = Insert(
-        vector_preheader_,
-        new (global_allocator_) HVecSetScalars(
-            global_allocator_, &new_init, phi->GetType(), vector_length_, 1));
+    HVecOperation* red_vector = new_red->AsVecOperation();
+    size_t vector_length = red_vector->GetVectorLength();
+    Primitive::Type type = red_vector->GetPackedType();
+    new_init = Insert(vector_preheader_,
+                      new (global_allocator_) HVecSetScalars(global_allocator_,
+                                                             &new_init,
+                                                             type,
+                                                             vector_length,
+                                                             1));
   } else {
     new_init = ReduceAndExtractIfNeeded(new_init);
   }
@@ -1484,18 +1592,20 @@ HInstruction* HLoopOptimization::ReduceAndExtractIfNeeded(HInstruction* instruct
   if (instruction->IsPhi()) {
     HInstruction* input = instruction->InputAt(1);
     if (input->IsVecOperation()) {
-      Primitive::Type type = input->AsVecOperation()->GetPackedType();
+      HVecOperation* input_vector = input->AsVecOperation();
+      size_t vector_length = input_vector->GetVectorLength();
+      Primitive::Type type = input_vector->GetPackedType();
+      HVecReduce::ReductionKind kind = GetReductionKind(input_vector);
       HBasicBlock* exit = instruction->GetBlock()->GetSuccessors()[0];
       // Generate a vector reduction and scalar extract
       //    x = REDUCE( [x_1, .., x_n] )
       //    y = x_1
       // along the exit of the defining loop.
-      HVecReduce::ReductionKind kind = GetReductionKind(input);
       HInstruction* reduce = new (global_allocator_) HVecReduce(
-          global_allocator_, instruction, type, vector_length_, kind);
+          global_allocator_, instruction, type, vector_length, kind);
       exit->InsertInstructionBefore(reduce, exit->GetFirstInstruction());
       instruction = new (global_allocator_) HVecExtractScalar(
-          global_allocator_, reduce, type, vector_length_, 0);
+          global_allocator_, reduce, type, vector_length, 0);
       exit->InsertInstructionAfter(instruction, reduce);
     }
   }
@@ -1516,27 +1626,19 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org,
                                       HInstruction* opb,
                                       Primitive::Type type,
                                       bool is_unsigned) {
-  if (vector_mode_ == kSequential) {
-    // Non-converting scalar code follows implicit integral promotion.
-    if (!org->IsTypeConversion() && (type == Primitive::kPrimBoolean ||
-                                     type == Primitive::kPrimByte ||
-                                     type == Primitive::kPrimChar ||
-                                     type == Primitive::kPrimShort)) {
-      type = Primitive::kPrimInt;
-    }
-  }
   HInstruction* vector = nullptr;
+  Primitive::Type org_type = org->GetType();
   switch (org->GetKind()) {
     case HInstruction::kNeg:
       DCHECK(opb == nullptr);
       GENERATE_VEC(
         new (global_allocator_) HVecNeg(global_allocator_, opa, type, vector_length_),
-        new (global_allocator_) HNeg(type, opa));
+        new (global_allocator_) HNeg(org_type, opa));
     case HInstruction::kNot:
       DCHECK(opb == nullptr);
       GENERATE_VEC(
         new (global_allocator_) HVecNot(global_allocator_, opa, type, vector_length_),
-        new (global_allocator_) HNot(type, opa));
+        new (global_allocator_) HNot(org_type, opa));
     case HInstruction::kBooleanNot:
       DCHECK(opb == nullptr);
       GENERATE_VEC(
@@ -1546,47 +1648,47 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org,
       DCHECK(opb == nullptr);
       GENERATE_VEC(
         new (global_allocator_) HVecCnv(global_allocator_, opa, type, vector_length_),
-        new (global_allocator_) HTypeConversion(type, opa, kNoDexPc));
+        new (global_allocator_) HTypeConversion(org_type, opa, kNoDexPc));
     case HInstruction::kAdd:
       GENERATE_VEC(
         new (global_allocator_) HVecAdd(global_allocator_, opa, opb, type, vector_length_),
-        new (global_allocator_) HAdd(type, opa, opb));
+        new (global_allocator_) HAdd(org_type, opa, opb));
     case HInstruction::kSub:
       GENERATE_VEC(
        new (global_allocator_) HVecSub(global_allocator_, opa, opb, type, vector_length_),
-        new (global_allocator_) HSub(type, opa, opb));
+        new (global_allocator_) HSub(org_type, opa, opb));
     case HInstruction::kMul:
       GENERATE_VEC(
        new (global_allocator_) HVecMul(global_allocator_, opa, opb, type, vector_length_),
-        new (global_allocator_) HMul(type, opa, opb));
+        new (global_allocator_) HMul(org_type, opa, opb));
     case HInstruction::kDiv:
       GENERATE_VEC(
        new (global_allocator_) HVecDiv(global_allocator_, opa, opb, type, vector_length_),
-        new (global_allocator_) HDiv(type, opa, opb, kNoDexPc));
+        new (global_allocator_) HDiv(org_type, opa, opb, kNoDexPc));
     case HInstruction::kAnd:
       GENERATE_VEC(
        new (global_allocator_) HVecAnd(global_allocator_, opa, opb, type, vector_length_),
-        new (global_allocator_) HAnd(type, opa, opb));
+        new (global_allocator_) HAnd(org_type, opa, opb));
     case HInstruction::kOr:
       GENERATE_VEC(
        new (global_allocator_) HVecOr(global_allocator_, opa, opb, type, vector_length_),
-        new (global_allocator_) HOr(type, opa, opb));
+        new (global_allocator_) HOr(org_type, opa, opb));
     case HInstruction::kXor:
       GENERATE_VEC(
        new (global_allocator_) HVecXor(global_allocator_, opa, opb, type, vector_length_),
-        new (global_allocator_) HXor(type, opa, opb));
+        new (global_allocator_) HXor(org_type, opa, opb));
     case HInstruction::kShl:
       GENERATE_VEC(
        new (global_allocator_) HVecShl(global_allocator_, opa, opb, type, vector_length_),
-        new (global_allocator_) HShl(type, opa, opb));
+        new (global_allocator_) HShl(org_type, opa, opb));
     case HInstruction::kShr:
       GENERATE_VEC(
        new (global_allocator_) HVecShr(global_allocator_, opa, opb, type, vector_length_),
-        new (global_allocator_) HShr(type, opa, opb));
+        new (global_allocator_) HShr(org_type, opa, opb));
     case HInstruction::kUShr:
       GENERATE_VEC(
        new (global_allocator_) HVecUShr(global_allocator_, opa, opb, type, vector_length_),
-        new (global_allocator_) HUShr(type, opa, opb));
+        new (global_allocator_) HUShr(org_type, opa, opb));
     case HInstruction::kInvokeStaticOrDirect: {
       HInvokeStaticOrDirect* invoke = org->AsInvokeStaticOrDirect();
       if (vector_mode_ == kVector) {
@@ -1667,8 +1769,8 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org,
 //
 // Method recognizes the following idioms:
-//    rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
-//    regular  halving add (a + b)     >> 1 for unsigned/signed operands a, b
+//    rounding  halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
+//    truncated halving add (a + b)     >> 1 for unsigned/signed operands a, b
 // Provided that the operands are promoted to a wider form to do the arithmetic and
 // then cast back to narrower form, the idioms can be mapped into efficient SIMD
 // implementation that operates directly in narrower form (plus one extra bit).
@@ -1712,7 +1814,8 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
   }
   // Accept recognized halving add for vectorizable operands. Vectorized code uses the
   // shorthand idiomatic operation. Sequential code uses the original scalar expressions.
-  DCHECK(r != nullptr && s != nullptr);
+  DCHECK(r != nullptr);
+  DCHECK(s != nullptr);
   if (generate_code && vector_mode_ != kVector) {  // de-idiom
     r = instruction->InputAt(0);
     s = instruction->InputAt(1);
@@ -1741,6 +1844,88 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
   return false;
 }
 
+// Method recognizes the following idiom:
+//   q += ABS(a - b) for signed operands a, b
+// Provided that the operands have the same type or are promoted to a wider form.
+// Since this may involve a vector length change, the idiom is handled by going directly
+// to a sad-accumulate node (rather than relying combining finer grained nodes later).
+// TODO: unsigned SAD too?
+bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
+                                          HInstruction* instruction,
+                                          bool generate_code,
+                                          Primitive::Type reduction_type,
+                                          uint64_t restrictions) {
+  // Filter integral "q += ABS(a - b);" reduction, where ABS and SUB
+  // are done in the same precision (either int or long).
+  if (!instruction->IsAdd() ||
+      (reduction_type != Primitive::kPrimInt && reduction_type != Primitive::kPrimLong)) {
+    return false;
+  }
+  HInstruction* q = instruction->InputAt(0);
+  HInstruction* v = instruction->InputAt(1);
+  HInstruction* a = nullptr;
+  HInstruction* b = nullptr;
+  if (v->IsInvokeStaticOrDirect() &&
+      (v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsInt ||
+       v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsLong)) {
+    HInstruction* x = v->InputAt(0);
+    if (x->IsSub() && x->GetType() == reduction_type) {
+      a = x->InputAt(0);
+      b = x->InputAt(1);
+    }
+  }
+  if (a == nullptr || b == nullptr) {
+    return false;
+  }
+  // Accept same-type or consistent sign extension for narrower-type on operands a and b.
+  // The same-type or narrower operands are called r (a or lower) and s (b or lower).
+  HInstruction* r = a;
+  HInstruction* s = b;
+  bool is_unsigned = false;
+  Primitive::Type sub_type = a->GetType();
+  if (a->IsTypeConversion()) {
+    sub_type = a->InputAt(0)->GetType();
+  } else if (b->IsTypeConversion()) {
+    sub_type = b->InputAt(0)->GetType();
+  }
+  if (reduction_type != sub_type &&
+      (!IsNarrowerOperands(a, b, sub_type, &r, &s, &is_unsigned) || is_unsigned)) {
+    return false;
+  }
+  // Try same/narrower type and deal with vector restrictions.
+  if (!TrySetVectorType(sub_type, &restrictions) || HasVectorRestrictions(restrictions, kNoSAD)) {
+    return false;
+  }
+  // Accept SAD idiom for vectorizable operands. Vectorized code uses the shorthand
+  // idiomatic operation. Sequential code uses the original scalar expressions.
+  DCHECK(r != nullptr);
+  DCHECK(s != nullptr);
+  if (generate_code && vector_mode_ != kVector) {  // de-idiom
+    r = s = v->InputAt(0);
+  }
+  if (VectorizeUse(node, q, generate_code, sub_type, restrictions) &&
+      VectorizeUse(node, r, generate_code, sub_type, restrictions) &&
+      VectorizeUse(node, s, generate_code, sub_type, restrictions)) {
+    if (generate_code) {
+      if (vector_mode_ == kVector) {
+        vector_map_->Put(instruction, new (global_allocator_) HVecSADAccumulate(
+            global_allocator_,
+            vector_map_->Get(q),
+            vector_map_->Get(r),
+            vector_map_->Get(s),
+            reduction_type,
+            GetOtherVL(reduction_type, sub_type, vector_length_)));
+        MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
+      } else {
+        GenerateVecOp(v, vector_map_->Get(r), nullptr, reduction_type);
+        GenerateVecOp(instruction, vector_map_->Get(q), vector_map_->Get(v), reduction_type);
+      }
+    }
+    return true;
+  }
+  return false;
+}
+
 //
 // Vectorization heuristics.
 //
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index f34751815b..ae2ea76f47 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -75,6 +75,7 @@ class HLoopOptimization : public HOptimization {
     kNoMinMax        = 1 << 8,   // no min/max
     kNoStringCharAt  = 1 << 9,   // no StringCharAt
     kNoReduction     = 1 << 10,  // no reduction
+    kNoSAD           = 1 << 11,  // no sum of absolute differences (SAD)
   };
 
   /*
@@ -172,6 +173,11 @@ class HLoopOptimization : public HOptimization {
                                bool generate_code,
                                Primitive::Type type,
                                uint64_t restrictions);
+  bool VectorizeSADIdiom(LoopNode* node,
+                         HInstruction* instruction,
+                         bool generate_code,
+                         Primitive::Type type,
+                         uint64_t restrictions);
 
   // Vectorization heuristics.
   bool IsVectorizationProfitable(int64_t trip_count);
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index a6d0da1c96..6bc5111de2 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1396,6 +1396,7 @@ class HLoopInformationOutwardIterator : public ValueObject {
   M(VecUShr, VecBinaryOperation)                                        \
   M(VecSetScalars, VecOperation)                                        \
   M(VecMultiplyAccumulate, VecOperation)                                \
+  M(VecSADAccumulate, VecOperation)                                     \
   M(VecLoad, VecMemoryOperation)                                        \
   M(VecStore, VecMemoryOperation)                                       \
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index c5e75a7ca4..1488b7086a 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -461,8 +461,8 @@ class HVecAdd FINAL : public HVecBinaryOperation {
 };
 
 // Performs halving add on every component in the two vectors, viz.
-// rounded  [ x1, .. , xn ] hradd [ y1, .. , yn ] = [ (x1 + y1 + 1) >> 1, .. , (xn + yn + 1) >> 1 ]
-// or       [ x1, .. , xn ] hadd  [ y1, .. , yn ] = [ (x1 + y1)     >> 1, .. , (xn + yn )    >> 1 ]
+// rounded   [ x1, .. , xn ] hradd [ y1, .. , yn ] = [ (x1 + y1 + 1) >> 1, .. , (xn + yn + 1) >> 1 ]
+// truncated [ x1, .. , xn ] hadd  [ y1, .. , yn ] = [ (x1 + y1)     >> 1, .. , (xn + yn )    >> 1 ]
 // for signed operands x, y (sign extension) or unsigned operands x, y (zero extension).
 class HVecHalvingAdd FINAL : public HVecBinaryOperation {
  public:
@@ -810,8 +810,8 @@ class HVecUShr FINAL : public HVecBinaryOperation {
 //
 
 // Assigns the given scalar elements to a vector,
-// viz. set( array(x1, .., xn) ) = [ x1, .. , xn ] if n == m,
-//      set( array(x1, .., xm) ) = [ x1, .. , xm, 0, .., 0 ] if m < n.
+// viz. set( array(x1, .. , xn) ) = [ x1, .. , xn ] if n == m,
+//      set( array(x1, .. , xm) ) = [ x1, .. , xm, 0, .. , 0 ] if m < n.
 class HVecSetScalars FINAL : public HVecOperation {
  public:
   HVecSetScalars(ArenaAllocator* arena,
@@ -842,9 +842,8 @@ class HVecSetScalars FINAL : public HVecOperation {
   DISALLOW_COPY_AND_ASSIGN(HVecSetScalars);
 };
 
-// Multiplies every component in the two vectors, adds the result vector to the accumulator vector.
-// viz. [ acc1, .., accn ] + [ x1, .. , xn ] * [ y1, .. , yn ] =
-//      [ acc1 + x1 * y1, .. , accn + xn * yn ].
+// Multiplies every component in the two vectors, adds the result vector to the accumulator vector,
+// viz. [ a1, .. , an ] + [ x1, .. , xn ] * [ y1, .. , yn ] = [ a1 + x1 * y1, .. , an + xn * yn ].
 class HVecMultiplyAccumulate FINAL : public HVecOperation {
  public:
   HVecMultiplyAccumulate(ArenaAllocator* arena,
@@ -866,15 +865,11 @@ class HVecMultiplyAccumulate FINAL : public HVecOperation {
     DCHECK(HasConsistentPackedTypes(accumulator, packed_type));
     DCHECK(HasConsistentPackedTypes(mul_left, packed_type));
     DCHECK(HasConsistentPackedTypes(mul_right, packed_type));
-    SetRawInputAt(kInputAccumulatorIndex, accumulator);
-    SetRawInputAt(kInputMulLeftIndex, mul_left);
-    SetRawInputAt(kInputMulRightIndex, mul_right);
+    SetRawInputAt(0, accumulator);
+    SetRawInputAt(1, mul_left);
+    SetRawInputAt(2, mul_right);
   }
 
-  static constexpr int kInputAccumulatorIndex = 0;
-  static constexpr int kInputMulLeftIndex = 1;
-  static constexpr int kInputMulRightIndex = 2;
-
   bool CanBeMoved() const OVERRIDE { return true; }
 
   bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
@@ -894,6 +889,42 @@ class HVecMultiplyAccumulate FINAL : public HVecOperation {
 
   DISALLOW_COPY_AND_ASSIGN(HVecMultiplyAccumulate);
 };
 
+// Takes the absolute difference of two vectors, and adds the results to
+// same-precision or wider-precision components in the accumulator,
+// viz. SAD([ a1, .. , am ], [ x1, .. , xn ], [ y1, .. , yn ] =
+//      [ a1 + sum abs(xi-yi), .. , am + sum abs(xj-yj) ],
+// for m <= n and non-overlapping sums.
+class HVecSADAccumulate FINAL : public HVecOperation {
+ public:
+  HVecSADAccumulate(ArenaAllocator* arena,
+                    HInstruction* accumulator,
+                    HInstruction* sad_left,
+                    HInstruction* sad_right,
+                    Primitive::Type packed_type,
+                    size_t vector_length,
+                    uint32_t dex_pc = kNoDexPc)
+      : HVecOperation(arena,
+                      packed_type,
+                      SideEffects::None(),
+                      /* number_of_inputs */ 3,
+                      vector_length,
+                      dex_pc) {
+    DCHECK(HasConsistentPackedTypes(accumulator, packed_type));
+    DCHECK(sad_left->IsVecOperation());
+    DCHECK(sad_right->IsVecOperation());
+    DCHECK_EQ(sad_left->AsVecOperation()->GetPackedType(),
+              sad_right->AsVecOperation()->GetPackedType());
+    SetRawInputAt(0, accumulator);
+    SetRawInputAt(1, sad_left);
+    SetRawInputAt(2, sad_right);
+  }
+
+  DECLARE_INSTRUCTION(VecSADAccumulate);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HVecSADAccumulate);
+};
+
 // Loads a vector from memory, viz. load(mem, 1)
 // yield the vector [ mem(1), .. , mem(n) ].
 class HVecLoad FINAL : public HVecMemoryOperation {
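A last illustration of the tightened HasReductionFormat check in this patch: an update whose loop phi feeds both operands is no longer classified as a reduction. A hedged, hypothetical scalar example:

    static int notAReduction(int n) {
      int s = 1;
      for (int i = 0; i < n; i++) {
        s = s + s;  // the loop phi appears as both inputs: rejected by HasReductionFormat
      }
      return s;
    }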