From aaac0e3cbfe72217cad204d0122f2b73a602d2dd Mon Sep 17 00:00:00 2001
From: Artem Serov <artem.serov@linaro.org>
Date: Tue, 7 Aug 2018 00:52:22 +0100
Subject: ART: ARM64: Support DotProd SIMD idiom.

Implement support for a vectorization idiom that performs a dot
product of two vectors and adds the result to wider-precision
components in the accumulator.

viz. DOT_PRODUCT([ a1, .. , am], [ x1, .. , xn ], [ y1, .. , yn ]) =
                 [ a1 + sum(xi * yi), .. , am + sum(xj * yj) ],
     for m <= n, non-overlapping sums,
     for either both signed or both unsigned operands x, y.

The patch shows up to a 7x performance improvement on a
microbenchmark on Cortex-A57.

Test: 684-checker-simd-dotprod.
Test: test-art-host, test-art-target.

Change-Id: Ibab0d51f537fdecd1d84033197be3ebf5ec4e455
---
 compiler/optimizing/nodes_vector.h | 60 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

(limited to 'compiler/optimizing/nodes_vector.h')

diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index c7539f2846..597e399dd1 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -1021,6 +1021,66 @@ class HVecSADAccumulate final : public HVecOperation {
   DEFAULT_COPY_CONSTRUCTOR(VecSADAccumulate);
 };
 
+// Performs dot product of two vectors and adds the result to wider precision components in
+// the accumulator (input 0); the multiplied vectors "left" and "right" are inputs 1 and 2.
+//
+// viz. DOT_PRODUCT([ a1, .. , am ], [ x1, .. , xn ], [ y1, .. , yn ]) =
+//                  [ a1 + sum(xi * yi), .. , am + sum(xj * yj) ],
+//      for m <= n, non-overlapping sums,
+//      for either both signed or both unsigned operands x, y.
+//
+// Notes:
+//   - packed type reflects the type of sum reduction, not the type of the operands.
+//   - IsZeroExtending() is used to determine the kind of signed/zero extension to be
+//     performed for the operands; the flag is stored in an additional packed bit.
+//
+// TODO: Support types other than kInt32 for packed type.
+class HVecDotProd final : public HVecOperation {
+ public:
+  HVecDotProd(ArenaAllocator* allocator,
+              HInstruction* accumulator,
+              HInstruction* left,
+              HInstruction* right,
+              DataType::Type packed_type,
+              bool is_zero_extending,
+              size_t vector_length,
+              uint32_t dex_pc)
+    : HVecOperation(kVecDotProd,
+                    allocator,
+                    packed_type,
+                    SideEffects::None(),
+                    /* number_of_inputs */ 3,
+                    vector_length,
+                    dex_pc) {
+    DCHECK(HasConsistentPackedTypes(accumulator, packed_type));
+    DCHECK(DataType::IsIntegralType(packed_type));
+    DCHECK(left->IsVecOperation());  // Both multiplicands must be vector operations.
+    DCHECK(right->IsVecOperation());
+    DCHECK_EQ(ToSignedType(left->AsVecOperation()->GetPackedType()),
+              ToSignedType(right->AsVecOperation()->GetPackedType()));
+    SetRawInputAt(0, accumulator);  // Input order: accumulator, left, right.
+    SetRawInputAt(1, left);
+    SetRawInputAt(2, right);
+    SetPackedFlag<kFieldHDotProdIsZeroExtending>(is_zero_extending);
+  }
+
+  bool IsZeroExtending() const { return GetPackedFlag<kFieldHDotProdIsZeroExtending>(); }
+
+  bool CanBeMoved() const override { return true; }
+
+  DECLARE_INSTRUCTION(VecDotProd);
+
+ protected:
+  DEFAULT_COPY_CONSTRUCTOR(VecDotProd);
+
+ private:
+  // Additional packed bits: one flag bit placed right after the HVecOperation packed bits.
+  static constexpr size_t kFieldHDotProdIsZeroExtending =
+      HVecOperation::kNumberOfVectorOpPackedBits;
+  static constexpr size_t kNumberOfHDotProdPackedBits = kFieldHDotProdIsZeroExtending + 1;
+  static_assert(kNumberOfHDotProdPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields.");
+};
+
 // Loads a vector from memory, viz. load(mem, 1)
 // yield the vector [ mem(1), .. , mem(n) ].
 class HVecLoad final : public HVecMemoryOperation {
-- 
cgit v1.2.3-59-g8ed1b