ARM64: Enable SDOT/UDOT instructions emission.

Enables SDOT/UDOT instructions emission for those arm64
targets which support DOTPROD feature. Currently only
vector VecDotProd instruction could emit those.

Test: test-art-target.
Test: test-art-target --instruction-set-features runtime.
Test: 684-checker-simd-dotprod.

Change-Id: I57a16e340a42879ff19a3b2439ea11525dbeaccc
diff --git a/compiler/optimizing/code_generator_vector_arm64_neon.cc b/compiler/optimizing/code_generator_vector_arm64_neon.cc
index 714d984..2a4c785 100644
--- a/compiler/optimizing/code_generator_vector_arm64_neon.cc
+++ b/compiler/optimizing/code_generator_vector_arm64_neon.cc
@@ -40,13 +40,9 @@
 
 #define __ GetVIXLAssembler()->
 
-// Build-time switch for Armv8.4-a dot product instructions.
-// TODO: Enable dot product when there is a device to test it on.
-static constexpr bool kArm64EmitDotProdInstructions = false;
-
 // Returns whether dot product instructions should be emitted.
 static bool ShouldEmitDotProductInstructions(const CodeGeneratorARM64* codegen_) {
-  return kArm64EmitDotProdInstructions && codegen_->GetInstructionSetFeatures().HasDotProd();
+  return codegen_->GetInstructionSetFeatures().HasDotProd();
 }
 
 void LocationsBuilderARM64Neon::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
diff --git a/compiler/optimizing/code_generator_vector_arm64_sve.cc b/compiler/optimizing/code_generator_vector_arm64_sve.cc
index d6fa0f6..1761dfc 100644
--- a/compiler/optimizing/code_generator_vector_arm64_sve.cc
+++ b/compiler/optimizing/code_generator_vector_arm64_sve.cc
@@ -40,13 +40,9 @@
 
 #define __ GetVIXLAssembler()->
 
-// Build-time switch for Armv8.4-a dot product instructions.
-// TODO: Enable dot product when there is a device to test it on.
-static constexpr bool kArm64EmitDotProdInstructions = false;
-
 // Returns whether dot product instructions should be emitted.
 static bool ShouldEmitDotProductInstructions(const CodeGeneratorARM64* codegen_) {
-  return kArm64EmitDotProdInstructions && codegen_->GetInstructionSetFeatures().HasDotProd();
+  return codegen_->GetInstructionSetFeatures().HasDotProd();
 }
 
 void LocationsBuilderARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
diff --git a/test/684-checker-simd-dotprod/src/other/TestByte.java b/test/684-checker-simd-dotprod/src/other/TestByte.java
index 9acfc59..608d021 100644
--- a/test/684-checker-simd-dotprod/src/other/TestByte.java
+++ b/test/684-checker-simd-dotprod/src/other/TestByte.java
@@ -34,7 +34,7 @@
   /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
 
-  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdSimple(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-ARM64: int other.TestByte.testDotProdSimple(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
   /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
   /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
@@ -48,6 +48,16 @@
   //
   /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
   /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+
+
+  /// CHECK-START-ARM64: int other.TestByte.testDotProdSimple(byte[], byte[]) disassembly (after)
+  /// CHECK:        VecDotProd
+  /// CHECK-IF:     hasIsaFeature("dotprod")
+  ///               CHECK-NEXT:   sdot v{{\d+}}.4s, v{{\d+}}.16b, v{{\d+}}.16b
+  /// CHECK-ELSE:
+  ///               CHECK-NOT:    sdot
+  ///               CHECK-NOT:    udot
+  /// CHECK-FI:
   public static final int testDotProdSimple(byte[] a, byte[] b) {
     int s = 1;
     for (int i = 0; i < b.length; i++) {
@@ -72,7 +82,7 @@
   /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
 
-  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdComplex(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-ARM64: int other.TestByte.testDotProdComplex(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
   /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
   /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
@@ -109,7 +119,7 @@
   /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
 
-  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdSimpleUnsigned(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-ARM64: int other.TestByte.testDotProdSimpleUnsigned(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
   /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
   /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
@@ -123,6 +133,15 @@
   //
   /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
   /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+
+  /// CHECK-START-ARM64: int other.TestByte.testDotProdSimpleUnsigned(byte[], byte[]) disassembly (after)
+  /// CHECK:        VecDotProd
+  /// CHECK-IF:     hasIsaFeature("dotprod")
+  ///               CHECK-NEXT:   udot v{{\d+}}.4s, v{{\d+}}.16b, v{{\d+}}.16b
+  /// CHECK-ELSE:
+  ///               CHECK-NOT:    sdot
+  ///               CHECK-NOT:    udot
+  /// CHECK-FI:
   public static final int testDotProdSimpleUnsigned(byte[] a, byte[] b) {
     int s = 1;
     for (int i = 0; i < b.length; i++) {
@@ -147,7 +166,7 @@
   /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
 
-  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdComplexUnsigned(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-ARM64: int other.TestByte.testDotProdComplexUnsigned(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
   /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
   /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
@@ -188,7 +207,7 @@
   /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
 
-  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdComplexUnsignedCastedToSigned(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-ARM64: int other.TestByte.testDotProdComplexUnsignedCastedToSigned(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
   /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
   /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
@@ -229,7 +248,7 @@
   /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
 
-  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdComplexSignedCastedToUnsigned(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-ARM64: int other.TestByte.testDotProdComplexSignedCastedToUnsigned(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
   /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
   /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
@@ -255,7 +274,7 @@
     return s - 1;
   }
 
-  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdSignedWidening(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-ARM64: int other.TestByte.testDotProdSignedWidening(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG:                  VecDotProd type:Int8
   public static final int testDotProdSignedWidening(byte[] a, byte[] b) {
     int s = 1;
@@ -266,7 +285,7 @@
     return s - 1;
   }
 
-  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdParamSigned(int, byte[]) loop_optimization (after)
+  /// CHECK-START-ARM64: int other.TestByte.testDotProdParamSigned(int, byte[]) loop_optimization (after)
   /// CHECK-DAG:                  VecDotProd type:Int8
   public static final int testDotProdParamSigned(int x, byte[] b) {
     int s = 1;
@@ -277,7 +296,7 @@
     return s - 1;
   }
 
-  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdParamUnsigned(int, byte[]) loop_optimization (after)
+  /// CHECK-START-ARM64: int other.TestByte.testDotProdParamUnsigned(int, byte[]) loop_optimization (after)
   /// CHECK-DAG:                  VecDotProd type:Uint8
   public static final int testDotProdParamUnsigned(int x, byte[] b) {
     int s = 1;