ARM64: Encode constants when possible.

Small optimization which improves HVecReplicateScalar by encoding
immediates directly into the NEON instruction when possible, instead of
generating the constant in a GPR and transferring it into a NEON register.

Test: test-art-target, test-art-host.
Change-Id: I2113bbd98c0dc8433d2b7048921b9ed7c35ef1c5
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index a41adca..f422b9f 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -22,6 +22,8 @@
 namespace art {
 namespace arm64 {
 
+using helpers::ARM64EncodableConstantOrRegister;
+using helpers::Arm64CanEncodeConstantAsImmediate;
 using helpers::DRegisterFrom;
 using helpers::VRegisterFrom;
 using helpers::HeapOperand;
@@ -34,6 +36,7 @@
 
 void LocationsBuilderARM64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  HInstruction* input = instruction->InputAt(0);
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
@@ -41,13 +44,19 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
-      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(0, ARM64EncodableConstantOrRegister(input, instruction));
       locations->SetOut(Location::RequiresFpuRegister());
       break;
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble:
-      locations->SetInAt(0, Location::RequiresFpuRegister());
-      locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+      if (input->IsConstant() &&
+          Arm64CanEncodeConstantAsImmediate(input->AsConstant(), instruction)) {
+        locations->SetInAt(0, Location::ConstantLocation(input->AsConstant()));
+        locations->SetOut(Location::RequiresFpuRegister());
+      } else {
+        locations->SetInAt(0, Location::RequiresFpuRegister());
+        locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+      }
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -57,33 +66,58 @@
 
 void InstructionCodeGeneratorARM64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
   LocationSummary* locations = instruction->GetLocations();
+  Location src_loc = locations->InAt(0);
   VRegister dst = VRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
       DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Dup(dst.V16B(), InputRegisterAt(instruction, 0));
+      if (src_loc.IsConstant()) {
+        __ Movi(dst.V16B(), Int64ConstantFrom(src_loc));
+      } else {
+        __ Dup(dst.V16B(), InputRegisterAt(instruction, 0));
+      }
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Dup(dst.V8H(), InputRegisterAt(instruction, 0));
+      if (src_loc.IsConstant()) {
+        __ Movi(dst.V8H(), Int64ConstantFrom(src_loc));
+      } else {
+        __ Dup(dst.V8H(), InputRegisterAt(instruction, 0));
+      }
       break;
     case Primitive::kPrimInt:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Dup(dst.V4S(), InputRegisterAt(instruction, 0));
+      if (src_loc.IsConstant()) {
+        __ Movi(dst.V4S(), Int64ConstantFrom(src_loc));
+      } else {
+        __ Dup(dst.V4S(), InputRegisterAt(instruction, 0));
+      }
       break;
     case Primitive::kPrimLong:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Dup(dst.V2D(), XRegisterFrom(locations->InAt(0)));
+      if (src_loc.IsConstant()) {
+        __ Movi(dst.V2D(), Int64ConstantFrom(src_loc));
+      } else {
+        __ Dup(dst.V2D(), XRegisterFrom(src_loc));
+      }
       break;
     case Primitive::kPrimFloat:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Dup(dst.V4S(), VRegisterFrom(locations->InAt(0)).V4S(), 0);
+      if (src_loc.IsConstant()) {
+        __ Fmov(dst.V4S(), src_loc.GetConstant()->AsFloatConstant()->GetValue());
+      } else {
+        __ Dup(dst.V4S(), VRegisterFrom(src_loc).V4S(), 0);
+      }
       break;
     case Primitive::kPrimDouble:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Dup(dst.V2D(), VRegisterFrom(locations->InAt(0)).V2D(), 0);
+      if (src_loc.IsConstant()) {
+        __ Fmov(dst.V2D(), src_loc.GetConstant()->AsDoubleConstant()->GetValue());
+      } else {
+        __ Dup(dst.V2D(), VRegisterFrom(src_loc).V2D(), 0);
+      }
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
index 721f74e..e73fd7d 100644
--- a/compiler/optimizing/common_arm64.h
+++ b/compiler/optimizing/common_arm64.h
@@ -234,9 +234,20 @@
   }
 }
 
-inline bool CanEncodeConstantAsImmediate(HConstant* constant, HInstruction* instr) {
-  DCHECK(constant->IsIntConstant() || constant->IsLongConstant() || constant->IsNullConstant())
-      << constant->DebugName();
+inline bool Arm64CanEncodeConstantAsImmediate(HConstant* constant, HInstruction* instr) {
+  int64_t value = CodeGenerator::GetInt64ValueOf(constant);
+
+  // TODO: Improve this when IsSIMDConstantEncodable method is implemented in VIXL.
+  if (instr->IsVecReplicateScalar()) {
+    if (constant->IsLongConstant()) {
+      return false;
+    } else if (constant->IsFloatConstant()) {
+      return vixl::aarch64::Assembler::IsImmFP32(constant->AsFloatConstant()->GetValue());
+    } else if (constant->IsDoubleConstant()) {
+      return vixl::aarch64::Assembler::IsImmFP64(constant->AsDoubleConstant()->GetValue());
+    }
+    return IsUint<8>(value);
+  }
 
   // For single uses we let VIXL handle the constant generation since it will
   // use registers that are not managed by the register allocator (wip0, wip1).
@@ -249,8 +260,6 @@
     return true;
   }
 
-  int64_t value = CodeGenerator::GetInt64ValueOf(constant);
-
   if (instr->IsAnd() || instr->IsOr() || instr->IsXor()) {
     // Uses logical operations.
     return vixl::aarch64::Assembler::IsImmLogical(value, vixl::aarch64::kXRegSize);
@@ -276,7 +285,7 @@
 inline Location ARM64EncodableConstantOrRegister(HInstruction* constant,
                                                         HInstruction* instr) {
   if (constant->IsConstant()
-      && CanEncodeConstantAsImmediate(constant->AsConstant(), instr)) {
+      && Arm64CanEncodeConstantAsImmediate(constant->AsConstant(), instr)) {
     return Location::ConstantLocation(constant->AsConstant());
   }
 
diff --git a/test/655-checker-simd-arm-opt/expected.txt b/test/655-checker-simd-arm-opt/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/655-checker-simd-arm-opt/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/655-checker-simd-arm-opt/info.txt b/test/655-checker-simd-arm-opt/info.txt
new file mode 100644
index 0000000..198cc95
--- /dev/null
+++ b/test/655-checker-simd-arm-opt/info.txt
@@ -0,0 +1 @@
+Checker test for arm and arm64 simd optimizations.
diff --git a/test/655-checker-simd-arm-opt/src/Main.java b/test/655-checker-simd-arm-opt/src/Main.java
new file mode 100644
index 0000000..7b61dd7
--- /dev/null
+++ b/test/655-checker-simd-arm-opt/src/Main.java
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Checker test for arm and arm64 simd optimizations.
+ */
+public class Main {
+
+  private static void expectEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.encodableConstants(byte[], short[], char[], int[], long[], float[], double[]) disassembly (after)
+  /// CHECK-DAG: <<C1:i\d+>>   IntConstant 1
+  /// CHECK-DAG: <<C2:i\d+>>   IntConstant 2
+  /// CHECK-DAG: <<C3:i\d+>>   IntConstant 3
+  /// CHECK-DAG: <<C4:i\d+>>   IntConstant 4
+  /// CHECK-DAG: <<L5:j\d+>>   LongConstant 5
+  /// CHECK-DAG: <<F2:f\d+>>   FloatConstant 2
+  /// CHECK-DAG: <<D20:d\d+>>  DoubleConstant 20
+  //
+  /// CHECK-DAG:               VecReplicateScalar [<<C1>>]
+  /// CHECK-DAG:               movi v{{[0-9]+}}.16b, #0x1
+  /// CHECK-DAG:               VecReplicateScalar [<<C2>>]
+  /// CHECK-DAG:               movi v{{[0-9]+}}.8h, #0x2, lsl #0
+  /// CHECK-DAG:               VecReplicateScalar [<<C3>>]
+  /// CHECK-DAG:               movi v{{[0-9]+}}.8h, #0x3, lsl #0
+  /// CHECK-DAG:               VecReplicateScalar [<<C4>>]
+  /// CHECK-DAG:               movi v{{[0-9]+}}.4s, #0x4, lsl #0
+  /// CHECK-DAG:               VecReplicateScalar [<<L5>>]
+  /// CHECK-DAG:               dup v{{[0-9]+}}.2d, x{{[0-9]+}}
+  /// CHECK-DAG:               VecReplicateScalar [<<F2>>]
+  /// CHECK-DAG:               fmov v{{[0-9]+}}.4s, #0x0
+  /// CHECK-DAG:               VecReplicateScalar [<<D20>>]
+  /// CHECK-DAG:               fmov v{{[0-9]+}}.2d, #0x34
+  private static void encodableConstants(byte[] b, short[] s, char[] c, int[] a, long[] l, float[] f, double[] d) {
+    for (int i = 0; i < ARRAY_SIZE; i++) {
+      b[i] += 1;
+    }
+    for (int i = 0; i < ARRAY_SIZE; i++) {
+      s[i] += 2;
+    }
+    for (int i = 0; i < ARRAY_SIZE; i++) {
+      c[i] += 3;
+    }
+    for (int i = 0; i < ARRAY_SIZE; i++) {
+      a[i] += 4;
+    }
+    for (int i = 0; i < ARRAY_SIZE; i++) {
+      l[i] += 5;
+    }
+    for (int i = 0; i < ARRAY_SIZE; i++) {
+      f[i] += 2.0f;
+    }
+    for (int i = 0; i < ARRAY_SIZE; i++) {
+      d[i] += 20.0;
+    }
+  }
+
+  private static int sumArray(byte[] b, short[] s, char[] c, int[] a, long[] l, float[] f, double[] d) {
+    int sum = 0;
+    for (int i = 0; i < ARRAY_SIZE; i++) {
+      sum += b[i] + s[i] + c[i] + a[i] + l[i] + f[i] + d[i];
+    }
+    return sum;
+  }
+
+  public static final int ARRAY_SIZE = 100;
+
+  public static void main(String[] args) {
+    byte[] b = new byte[ARRAY_SIZE];
+    short[] s = new short[ARRAY_SIZE];
+    char[] c = new char[ARRAY_SIZE];
+    int[] a = new int[ARRAY_SIZE];
+    long[] l = new long[ARRAY_SIZE];
+    float[] f = new float[ARRAY_SIZE];
+    double[] d = new double[ARRAY_SIZE];
+
+    encodableConstants(b, s, c, a, l, f, d);
+    expectEquals(3700, sumArray(b, s, c, a, l, f, d));
+
+    System.out.println("passed");
+  }
+}