vixl32: do not use D14 as a temporary.

D14 is a callee-save register, so using it as a temporary would require
saving it in the prologue; but at the point where we use it, the
prologue has already been generated.

bug: 35977033
test: m ART_USE_VIXL_ARM_BACKEND=true test-art-target
Change-Id: Id7340ad9e87a9e527ce0989f45aae0b3a0963206
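
For illustration only (not part of the patch): a minimal sketch of the
scratch-register pattern this change relies on, using the VIXL calls
that appear in the diff below. The aliasing comment captures the key
constraint behind the other hunks.

    // Constructor: hand the caller-save D30/D31 to the VIXL MacroAssembler
    // instead of the callee-save D14/D15.
    GetVIXLAssembler()->GetScratchVRegisterList()->Combine(d31);
    GetVIXLAssembler()->GetScratchVRegisterList()->Combine(d30);

    // Later code acquires temporaries through a scope. Because S0-S31 only
    // alias D0-D15, there is no S-register view of D30/D31, so single-word
    // FP swaps go through a core scratch register and Vmov rather than
    // AcquireS().
    UseScratchRegisterScope temps(GetVIXLAssembler());
    vixl32::Register core_temp = temps.Acquire();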
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index f5ada52..d75779c 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -2000,15 +2000,10 @@
                          graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   // Always save the LR register to mimic Quick.
   AddAllocatedRegister(Location::RegisterLocation(LR));
-  // Give d14 and d15 as scratch registers to VIXL.
-  // They are removed from the register allocator in `SetupBlockedRegisters()`.
-  // TODO(VIXL): We need two scratch D registers for `EmitSwap` when swapping two double stack
-  // slots. If that is sufficiently rare, and we have pressure on FP registers, we could instead
-  // spill in `EmitSwap`. But if we actually are guaranteed to have 32 D registers, we could give
-  // d30 and d31 to VIXL to avoid removing registers from the allocator. If that is the case, we may
-  // also want to investigate giving those 14 other D registers to the allocator.
-  GetVIXLAssembler()->GetScratchVRegisterList()->Combine(d14);
-  GetVIXLAssembler()->GetScratchVRegisterList()->Combine(d15);
+  // Give D30 and D31 to VIXL as scratch registers. The register allocator only works on
+  // S0-S31, which alias D0-D15.
+  GetVIXLAssembler()->GetScratchVRegisterList()->Combine(d31);
+  GetVIXLAssembler()->GetScratchVRegisterList()->Combine(d30);
 }
 
 void JumpTableARMVIXL::EmitTable(CodeGeneratorARMVIXL* codegen) {
@@ -2074,13 +2069,6 @@
   // Reserve temp register.
   blocked_core_registers_[IP] = true;
 
-  // Registers s28-s31 (d14-d15) are left to VIXL for scratch registers.
-  // (They are given to the `MacroAssembler` in `CodeGeneratorARMVIXL::CodeGeneratorARMVIXL`.)
-  blocked_fpu_registers_[28] = true;
-  blocked_fpu_registers_[29] = true;
-  blocked_fpu_registers_[30] = true;
-  blocked_fpu_registers_[31] = true;
-
   if (GetGraph()->IsDebuggable()) {
     // Stubs do not save callee-save floating point registers. If the graph
     // is debuggable, we need to deal with these registers differently. For
@@ -6549,13 +6537,16 @@
 void ParallelMoveResolverARMVIXL::Exchange(int mem1, int mem2) {
   // TODO(VIXL32): Double check the performance of this implementation.
   UseScratchRegisterScope temps(GetAssembler()->GetVIXLAssembler());
-  vixl32::SRegister temp_1 = temps.AcquireS();
-  vixl32::SRegister temp_2 = temps.AcquireS();
+  vixl32::Register temp1 = temps.Acquire();
+  ScratchRegisterScope ensure_scratch(
+      this, temp1.GetCode(), r0.GetCode(), codegen_->GetNumberOfCoreRegisters());
+  vixl32::Register temp2(ensure_scratch.GetRegister());
 
-  __ Vldr(temp_1, MemOperand(sp, mem1));
-  __ Vldr(temp_2, MemOperand(sp, mem2));
-  __ Vstr(temp_1, MemOperand(sp, mem2));
-  __ Vstr(temp_2, MemOperand(sp, mem1));
+  int stack_offset = ensure_scratch.IsSpilled() ? kArmWordSize : 0;
+  GetAssembler()->LoadFromOffset(kLoadWord, temp1, sp, mem1 + stack_offset);
+  GetAssembler()->LoadFromOffset(kLoadWord, temp2, sp, mem2 + stack_offset);
+  GetAssembler()->StoreToOffset(kStoreWord, temp1, sp, mem2 + stack_offset);
+  GetAssembler()->StoreToOffset(kStoreWord, temp2, sp, mem1 + stack_offset);
 }
 
 void ParallelMoveResolverARMVIXL::EmitSwap(size_t index) {
@@ -6578,7 +6569,7 @@
   } else if (source.IsStackSlot() && destination.IsStackSlot()) {
     Exchange(source.GetStackIndex(), destination.GetStackIndex());
   } else if (source.IsFpuRegister() && destination.IsFpuRegister()) {
-    vixl32::SRegister temp = temps.AcquireS();
+    vixl32::Register temp = temps.Acquire();
     __ Vmov(temp, SRegisterFrom(source));
     __ Vmov(SRegisterFrom(source), SRegisterFrom(destination));
     __ Vmov(SRegisterFrom(destination), temp);
@@ -6637,12 +6628,12 @@
   }
 }
 
-void ParallelMoveResolverARMVIXL::SpillScratch(int reg ATTRIBUTE_UNUSED) {
-  TODO_VIXL32(FATAL);
+void ParallelMoveResolverARMVIXL::SpillScratch(int reg) {
+  __ Push(vixl32::Register(reg));
 }
 
-void ParallelMoveResolverARMVIXL::RestoreScratch(int reg ATTRIBUTE_UNUSED) {
-  TODO_VIXL32(FATAL);
+void ParallelMoveResolverARMVIXL::RestoreScratch(int reg) {
+  __ Pop(vixl32::Register(reg));
 }
 
 HLoadClass::LoadKind CodeGeneratorARMVIXL::GetSupportedLoadClassKind(
diff --git a/compiler/utils/arm/assembler_arm_vixl.cc b/compiler/utils/arm/assembler_arm_vixl.cc
index e5eef37..6afc3dd 100644
--- a/compiler/utils/arm/assembler_arm_vixl.cc
+++ b/compiler/utils/arm/assembler_arm_vixl.cc
@@ -230,6 +230,7 @@
   if (!CanHoldStoreOffsetThumb(type, offset)) {
     CHECK_NE(base.GetCode(), kIpCode);
     if ((reg.GetCode() != kIpCode) &&
+        (!vixl_masm_.GetScratchRegisterList()->IsEmpty()) &&
         ((type != kStoreWordPair) || (reg.GetCode() + 1 != kIpCode))) {
       tmp_reg = temps.Acquire();
     } else {
diff --git a/test/642-fp-callees/expected.txt b/test/642-fp-callees/expected.txt
new file mode 100644
index 0000000..77a1486
--- /dev/null
+++ b/test/642-fp-callees/expected.txt
@@ -0,0 +1,2 @@
+JNI_OnLoad called
+Done
diff --git a/test/642-fp-callees/fp_callees.cc b/test/642-fp-callees/fp_callees.cc
new file mode 100644
index 0000000..600f969
--- /dev/null
+++ b/test/642-fp-callees/fp_callees.cc
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "base/casts.h"
+#include "base/logging.h"
+#include "jni.h"
+
+namespace art {
+
+// Make the array volatile, which apparently makes the C compiler
+// use FP registers in the method below.
+volatile double array[] = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 };
+
+extern "C" JNIEXPORT void JNICALL Java_Main_holdFpTemporaries(JNIEnv* env, jclass cls) {
+  jmethodID mid = env->GetStaticMethodID(cls, "caller", "(IIJ)V");
+  CHECK(mid != nullptr);
+  // Load values from the array; they will end up in callee-save FP registers.
+  double a = array[0];
+  double b = array[1];
+  double c = array[2];
+  double d = array[3];
+  double e = array[4];
+  double f = array[5];
+  double g = array[6];
+  double h = array[7];
+  double i = array[8];
+  double j = array[9];
+  double k = array[10];
+  double l = array[11];
+  env->CallStaticVoidMethod(cls, mid, 1, 1, 1L);
+  // Load into a non-volatile temporary so the value can be passed to bit_cast.
+  double temp = array[0];
+  CHECK_EQ(bit_cast<int64_t>(a), bit_cast<int64_t>(temp));
+  temp = array[1];
+  CHECK_EQ(bit_cast<int64_t>(b), bit_cast<int64_t>(temp));
+  temp = array[2];
+  CHECK_EQ(bit_cast<int64_t>(c), bit_cast<int64_t>(temp));
+  temp = array[3];
+  CHECK_EQ(bit_cast<int64_t>(d), bit_cast<int64_t>(temp));
+  temp = array[4];
+  CHECK_EQ(bit_cast<int64_t>(e), bit_cast<int64_t>(temp));
+  temp = array[5];
+  CHECK_EQ(bit_cast<int64_t>(f), bit_cast<int64_t>(temp));
+  temp = array[6];
+  CHECK_EQ(bit_cast<int64_t>(g), bit_cast<int64_t>(temp));
+  temp = array[7];
+  CHECK_EQ(bit_cast<int64_t>(h), bit_cast<int64_t>(temp));
+  temp = array[8];
+  CHECK_EQ(bit_cast<int64_t>(i), bit_cast<int64_t>(temp));
+  temp = array[9];
+  CHECK_EQ(bit_cast<int64_t>(j), bit_cast<int64_t>(temp));
+  temp = array[10];
+  CHECK_EQ(bit_cast<int64_t>(k), bit_cast<int64_t>(temp));
+  temp = array[11];
+  CHECK_EQ(bit_cast<int64_t>(l), bit_cast<int64_t>(temp));
+}
+
+}  // namespace art
diff --git a/test/642-fp-callees/info.txt b/test/642-fp-callees/info.txt
new file mode 100644
index 0000000..d3e4bda
--- /dev/null
+++ b/test/642-fp-callees/info.txt
@@ -0,0 +1,2 @@
+Regression test for the vixl32 backend, which used to incorrectly
+use the callee-save register D14 as a temporary.
diff --git a/test/642-fp-callees/src/Main.java b/test/642-fp-callees/src/Main.java
new file mode 100644
index 0000000..fa57c93
--- /dev/null
+++ b/test/642-fp-callees/src/Main.java
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+  public static void main(String[] args) {
+    System.loadLibrary(args[0]);
+    holdFpTemporaries();
+    System.out.println("Done");
+  }
+
+  public static void caller(int a, int b, long c) {
+    $noinline$callee(a, b, c);
+  }
+
+  // This method is marked "no inline" so that the call site generates
+  // the problematic floating point register use.
+  public static void $noinline$callee(int a, int b, long c) {
+  }
+
+  public native static void holdFpTemporaries();
+}
diff --git a/test/Android.bp b/test/Android.bp
index 594cce2..9911af0 100644
--- a/test/Android.bp
+++ b/test/Android.bp
@@ -336,6 +336,7 @@
         "596-monitor-inflation/monitor_inflation.cc",
         "597-deopt-new-string/deopt.cc",
         "626-const-class-linking/clear_dex_cache_types.cc",
+        "642-fp-callees/fp_callees.cc",
     ],
     shared_libs: [
         "libbacktrace",