AArch64: Clean up CalleeSaveMethod frame and the use of temp registers.

CalleeSaveMethod frame size changes:
SaveAll : 368 -> 176
RefOnly : 176 -> 96
RefsAndArgs : 304 -> 224

JNI register spill size changes:
160 -> 88

In the transition assembly, use registers following these rules:
1. x0-x7 as temp/argument registers.
2. IP0, IP1 as scratch registers.
3. After the correct type of callee-save frame has been set up, all
registers are scratchable (except possibly xSELF and xSUSPEND).
4. When restoring the callee-save frame, IP0 and IP1 must be untouched.
5. From C to managed code, we assume all AAPCS callee-save registers
will be restored by managed code, except x19 (SUSPEND).

In the quick compiler:
1. Use IP0, IP1 as scratch registers.
2. Use IP1 as the hidden argument register (IP0 will be clobbered by
 the trampoline).

Change-Id: I05ed9d418b01b9e87218a7608536f57e7a286e4c
diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h
index d0633af..3a8ea3f 100644
--- a/compiler/dex/quick/arm64/arm64_lir.h
+++ b/compiler/dex/quick/arm64/arm64_lir.h
@@ -142,6 +142,8 @@
   rwsp = rw31,
 
   // Aliases which are not defined in "ARM Architecture Reference, register names".
+  rxIP0 = rx16,
+  rxIP1 = rx17,
   rxSUSPEND = rx19,
   rxSELF = rx18,
   rxLR = rx30,
@@ -150,6 +152,8 @@
    * the 64-bit view. However, for now we'll define a 32-bit view to keep these from being
    * allocated as 32-bit temp registers.
    */
+  rwIP0 = rw16,
+  rwIP1 = rw17,
   rwSUSPEND = rw19,
   rwSELF = rw18,
   rwLR = rw30,
@@ -165,6 +169,10 @@
 
 constexpr RegStorage rs_xzr(RegStorage::kValid | rxzr);
 constexpr RegStorage rs_wzr(RegStorage::kValid | rwzr);
+constexpr RegStorage rs_xIP0(RegStorage::kValid | rxIP0);
+constexpr RegStorage rs_wIP0(RegStorage::kValid | rwIP0);
+constexpr RegStorage rs_xIP1(RegStorage::kValid | rxIP1);
+constexpr RegStorage rs_wIP1(RegStorage::kValid | rwIP1);
 // Reserved registers.
 constexpr RegStorage rs_xSUSPEND(RegStorage::kValid | rxSUSPEND);
 constexpr RegStorage rs_xSELF(RegStorage::kValid | rxSELF);
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index 5e95500..e584548 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -319,8 +319,8 @@
   LockTemp(rs_x5);
   LockTemp(rs_x6);
   LockTemp(rs_x7);
-  LockTemp(rs_x8);
-  LockTemp(rs_x9);
+  LockTemp(rs_xIP0);
+  LockTemp(rs_xIP1);
 
   /*
    * We can safely skip the stack overflow check if we're
@@ -341,7 +341,7 @@
     if (!cu_->compiler_driver->GetCompilerOptions().GetImplicitStackOverflowChecks()) {
       if (!large_frame) {
         // Load stack limit
-        LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_x9);
+        LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP1);
       }
     } else {
       // TODO(Arm64) Implement implicit checks.
@@ -386,10 +386,10 @@
           m2l_->OpRegImm(kOpAdd, rs_sp, sp_displace_);
           m2l_->ClobberCallerSave();
           ThreadOffset<8> func_offset = QUICK_ENTRYPOINT_OFFSET(8, pThrowStackOverflow);
-          m2l_->LockTemp(rs_x8);
-          m2l_->LoadWordDisp(rs_xSELF, func_offset.Int32Value(), rs_x8);
-          m2l_->NewLIR1(kA64Br1x, rs_x8.GetReg());
-          m2l_->FreeTemp(rs_x8);
+          m2l_->LockTemp(rs_xIP0);
+          m2l_->LoadWordDisp(rs_xSELF, func_offset.Int32Value(), rs_xIP0);
+          m2l_->NewLIR1(kA64Br1x, rs_xIP0.GetReg());
+          m2l_->FreeTemp(rs_xIP0);
         }
 
       private:
@@ -399,11 +399,11 @@
       if (large_frame) {
         // Compare Expected SP against bottom of stack.
         // Branch to throw target if there is not enough room.
-        OpRegRegImm(kOpSub, rs_x9, rs_sp, frame_size_without_spills);
-        LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_x8);
-        LIR* branch = OpCmpBranch(kCondUlt, rs_x9, rs_x8, nullptr);
+        OpRegRegImm(kOpSub, rs_xIP1, rs_sp, frame_size_without_spills);
+        LoadWordDisp(rs_xSELF, Thread::StackEndOffset<8>().Int32Value(), rs_xIP0);
+        LIR* branch = OpCmpBranch(kCondUlt, rs_xIP1, rs_xIP0, nullptr);
         AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, spill_size));
-        OpRegCopy(rs_sp, rs_x9);  // Establish stack after checks.
+        OpRegCopy(rs_sp, rs_xIP1);  // Establish stack after checks.
       } else {
         /*
          * If the frame is small enough we are guaranteed to have enough space that remains to
@@ -411,7 +411,7 @@
          * Establishes stack before checks.
          */
         OpRegRegImm(kOpSub, rs_sp, rs_sp, frame_size_without_spills);
-        LIR* branch = OpCmpBranch(kCondUlt, rs_sp, rs_x9, nullptr);
+        LIR* branch = OpCmpBranch(kCondUlt, rs_sp, rs_xIP1, nullptr);
         AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, frame_size_));
       }
     } else {
@@ -431,8 +431,8 @@
   FreeTemp(rs_x5);
   FreeTemp(rs_x6);
   FreeTemp(rs_x7);
-  FreeTemp(rs_x8);
-  FreeTemp(rs_x9);
+  FreeTemp(rs_xIP0);
+  FreeTemp(rs_xIP1);
 }
 
 void Arm64Mir2Lir::GenExitSequence() {
diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc
index dec81cb..9b4546a 100644
--- a/compiler/dex/quick/arm64/target_arm64.cc
+++ b/compiler/dex/quick/arm64/target_arm64.cc
@@ -48,14 +48,12 @@
      rs_d8, rs_d9, rs_d10, rs_d11, rs_d12, rs_d13, rs_d14, rs_d15,
      rs_d16, rs_d17, rs_d18, rs_d19, rs_d20, rs_d21, rs_d22, rs_d23,
      rs_d24, rs_d25, rs_d26, rs_d27, rs_d28, rs_d29, rs_d30, rs_d31};
+// Note: we cannot call C functions directly since rs_xSELF is a special register that must be
+// preserved, but it would be clobbered by native functions following aapcs64.
 static constexpr RegStorage reserved_regs_arr[] =
     {rs_wSUSPEND, rs_wSELF, rs_wsp, rs_wLR, rs_wzr};
 static constexpr RegStorage reserved64_regs_arr[] =
     {rs_xSUSPEND, rs_xSELF, rs_sp, rs_xLR, rs_xzr};
-// TUNING: Are there too many temp registers and too less promote target?
-// This definition need to be matched with runtime.cc, quick entry assembly and JNI compiler
-// Note: we are not able to call to C function directly if it un-match C ABI.
-// Currently, rs_rA64_SELF is not a callee save register which does not match C ABI.
 static constexpr RegStorage core_temps_arr[] =
     {rs_w0, rs_w1, rs_w2, rs_w3, rs_w4, rs_w5, rs_w6, rs_w7,
      rs_w8, rs_w9, rs_w10, rs_w11, rs_w12, rs_w13, rs_w14, rs_w15, rs_w16,
@@ -132,7 +130,7 @@
     case kRet0: res_reg = rs_w0; break;
     case kRet1: res_reg = rs_w1; break;
     case kInvokeTgt: res_reg = rs_wLR; break;
-    case kHiddenArg: res_reg = rs_w12; break;
+    case kHiddenArg: res_reg = rs_wIP1; break;
     case kHiddenFpArg: res_reg = RegStorage::InvalidReg(); break;
     case kCount: res_reg = RegStorage::InvalidReg(); break;
     default: res_reg = RegStorage::InvalidReg();
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index 0a00d7d..b95dad2 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -152,7 +152,8 @@
 Arm64JniCallingConvention::Arm64JniCallingConvention(bool is_static, bool is_synchronized,
                                                      const char* shorty)
     : JniCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {
-  callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X19));
+  // TODO: Ugly hard code...
+  // Should generate these according to the spill mask automatically.
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X20));
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X21));
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X22));
@@ -164,30 +165,28 @@
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X28));
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X29));
   callee_save_regs_.push_back(Arm64ManagedRegister::FromCoreRegister(X30));
-  callee_save_regs_.push_back(Arm64ManagedRegister::FromDRegister(D8));
-  callee_save_regs_.push_back(Arm64ManagedRegister::FromDRegister(D9));
-  callee_save_regs_.push_back(Arm64ManagedRegister::FromDRegister(D10));
-  callee_save_regs_.push_back(Arm64ManagedRegister::FromDRegister(D11));
-  callee_save_regs_.push_back(Arm64ManagedRegister::FromDRegister(D12));
-  callee_save_regs_.push_back(Arm64ManagedRegister::FromDRegister(D13));
-  callee_save_regs_.push_back(Arm64ManagedRegister::FromDRegister(D14));
-  callee_save_regs_.push_back(Arm64ManagedRegister::FromDRegister(D15));
 }
 
 uint32_t Arm64JniCallingConvention::CoreSpillMask() const {
   // Compute spill mask to agree with callee saves initialized in the constructor
-  uint32_t result = 0;
-  result =  1 << X19 | 1 << X20 | 1 << X21 | 1 << X22 | 1 << X23 | 1 << X24 |
-            1 << X25 | 1 << X26 | 1 << X27 | 1 << X28 | 1 << X29 | 1 << LR;
-  return result;
+  // Note: The native JNI function may call into VM runtime functions which may suspend
+  // or trigger GC, and the JNI method frame becomes the top quick frame in those cases.
+  // So, to satisfy the GC, we need to save LR and the callee-save registers, similar to the
+  // CalleeSaveMethod(RefOnly) frame.
+  // The JNI function is the native function that the Java code wants to call.
+  // The JNI method is the method compiled by the JNI compiler.
+  // Call chain: managed code (Java) --> JNI method --> JNI function.
+  // The thread register (X18, scratched by aapcs64) is not saved on the stack; it is saved in ETR (X21).
+  // The suspend register (X19) is preserved by aapcs64 and is not used in the JNI method.
+  return 1 << X20 | 1 << X21 | 1 << X22 | 1 << X23 | 1 << X24 | 1 << X25 |
+         1 << X26 | 1 << X27 | 1 << X28 | 1 << X29 | 1 << LR;
 }
 
 uint32_t Arm64JniCallingConvention::FpSpillMask() const {
   // Compute spill mask to agree with callee saves initialized in the constructor
-  uint32_t result = 0;
-  result = 1 << D8 | 1 << D9 | 1 << D10 | 1 << D11 | 1 << D12 | 1 << D13 |
-           1 << D14 | 1 << D15;
-  return result;
+  // Note: All callee-save FP registers are preserved by aapcs64, and they are not used
+  // in the JNI method.
+  return 0;
 }
 
 ManagedRegister Arm64JniCallingConvention::ReturnScratchRegister() const {
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index 5b97ba0..3f90f21 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -626,7 +626,7 @@
 
   // Move ETR(Callee saved) back to TR(Caller saved) reg. We use ETR on calls
   // to external functions that might trash TR. We do not need the original
-  // X19 saved in BuildFrame().
+  // ETR(X21) saved in BuildFrame().
   ___ Mov(reg_x(TR), reg_x(ETR));
 
   ___ Blr(temp);
@@ -644,20 +644,43 @@
 
   // TODO: *create APCS FP - end of FP chain;
   //       *add support for saving a different set of callee regs.
-  // For now we check that the size of callee regs vector is 20
-  // equivalent to the APCS callee saved regs [X19, x30] [D8, D15].
-  CHECK_EQ(callee_save_regs.size(), kCalleeSavedRegsSize);
-  ___ PushCalleeSavedRegisters();
-
-  // Move TR(Caller saved) to ETR(Callee saved). The original X19 has been
-  // saved by PushCalleeSavedRegisters(). This way we make sure that TR is not
-  // trashed by native code.
-  ___ Mov(reg_x(ETR), reg_x(TR));
-
+  // For now we check that the size of callee regs vector is 11.
+  CHECK_EQ(callee_save_regs.size(), kJniRefSpillRegsSize);
   // Increase frame to required size - must be at least space to push StackReference<Method>.
-  CHECK_GT(frame_size, kCalleeSavedRegsSize * kFramePointerSize);
-  size_t adjust = frame_size - (kCalleeSavedRegsSize * kFramePointerSize);
-  IncreaseFrameSize(adjust);
+  CHECK_GT(frame_size, kJniRefSpillRegsSize * kFramePointerSize);
+  IncreaseFrameSize(frame_size);
+
+  // TODO: Ugly hard code...
+  // Should generate these according to the spill mask automatically.
+  // TUNING: Use stp.
+  // Note: Must match Arm64JniCallingConvention::CoreSpillMask().
+  size_t reg_offset = frame_size;
+  reg_offset -= 8;
+  StoreToOffset(LR, SP, reg_offset);
+  reg_offset -= 8;
+  StoreToOffset(X29, SP, reg_offset);
+  reg_offset -= 8;
+  StoreToOffset(X28, SP, reg_offset);
+  reg_offset -= 8;
+  StoreToOffset(X27, SP, reg_offset);
+  reg_offset -= 8;
+  StoreToOffset(X26, SP, reg_offset);
+  reg_offset -= 8;
+  StoreToOffset(X25, SP, reg_offset);
+  reg_offset -= 8;
+  StoreToOffset(X24, SP, reg_offset);
+  reg_offset -= 8;
+  StoreToOffset(X23, SP, reg_offset);
+  reg_offset -= 8;
+  StoreToOffset(X22, SP, reg_offset);
+  reg_offset -= 8;
+  StoreToOffset(X21, SP, reg_offset);
+  reg_offset -= 8;
+  StoreToOffset(X20, SP, reg_offset);
+
+  // Move TR(Caller saved) to ETR(Callee saved). The original (ETR)X21 has been saved on stack.
+  // This way we make sure that TR is not trashed by native code.
+  ___ Mov(reg_x(ETR), reg_x(TR));
 
   // Write StackReference<Method>.
   DCHECK_EQ(4U, sizeof(StackReference<mirror::ArtMethod>));
@@ -690,22 +713,46 @@
 void Arm64Assembler::RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
 
-  // For now we only check that the size of the frame is greater than the
-  // no of APCS callee saved regs [X19, X30] [D8, D15].
-  CHECK_EQ(callee_save_regs.size(), kCalleeSavedRegsSize);
-  CHECK_GT(frame_size, kCalleeSavedRegsSize * kFramePointerSize);
+  // For now we only check that the size of the frame is greater than the spill size.
+  CHECK_EQ(callee_save_regs.size(), kJniRefSpillRegsSize);
+  CHECK_GT(frame_size, kJniRefSpillRegsSize * kFramePointerSize);
 
-  // Decrease frame size to start of callee saved regs.
-  size_t adjust = frame_size - (kCalleeSavedRegsSize * kFramePointerSize);
-  DecreaseFrameSize(adjust);
-
-  // We move ETR (Callee Saved) back to TR (Caller Saved) which might have
-  // been trashed in the native call. The original X19 (ETR) is restored as
-  // part of PopCalleeSavedRegisters().
+  // We move ETR(aapcs64 callee saved) back to TR(aapcs64 caller saved) which might have
+  // been trashed in the native call. The original ETR(X21) is restored from stack.
   ___ Mov(reg_x(TR), reg_x(ETR));
 
+  // TODO: Ugly hard code...
+  // Should generate these according to the spill mask automatically.
+  // TUNING: Use ldp.
+  // Note: Must match Arm64JniCallingConvention::CoreSpillMask().
+  size_t reg_offset = frame_size;
+  reg_offset -= 8;
+  LoadFromOffset(LR, SP, reg_offset);
+  reg_offset -= 8;
+  LoadFromOffset(X29, SP, reg_offset);
+  reg_offset -= 8;
+  LoadFromOffset(X28, SP, reg_offset);
+  reg_offset -= 8;
+  LoadFromOffset(X27, SP, reg_offset);
+  reg_offset -= 8;
+  LoadFromOffset(X26, SP, reg_offset);
+  reg_offset -= 8;
+  LoadFromOffset(X25, SP, reg_offset);
+  reg_offset -= 8;
+  LoadFromOffset(X24, SP, reg_offset);
+  reg_offset -= 8;
+  LoadFromOffset(X23, SP, reg_offset);
+  reg_offset -= 8;
+  LoadFromOffset(X22, SP, reg_offset);
+  reg_offset -= 8;
+  LoadFromOffset(X21, SP, reg_offset);
+  reg_offset -= 8;
+  LoadFromOffset(X20, SP, reg_offset);
+
+  // Decrease frame size to start of callee saved regs.
+  DecreaseFrameSize(frame_size);
+
   // Pop callee saved and return to LR.
-  ___ PopCalleeSavedRegisters();
   ___ Ret();
 }
 
diff --git a/compiler/utils/arm64/constants_arm64.h b/compiler/utils/arm64/constants_arm64.h
index 2a08c95..0cbbb1e 100644
--- a/compiler/utils/arm64/constants_arm64.h
+++ b/compiler/utils/arm64/constants_arm64.h
@@ -29,12 +29,12 @@
 namespace art {
 namespace arm64 {
 
-constexpr unsigned int kCalleeSavedRegsSize = 20;
+constexpr unsigned int kJniRefSpillRegsSize = 11;
 
 // Vixl buffer size.
 constexpr size_t kBufferSizeArm64 = 4096*2;
 
-}  // arm64
-}  // art
+}  // namespace arm64
+}  // namespace art
 
 #endif  // ART_COMPILER_UTILS_ARM64_CONSTANTS_ARM64_H_