ARM: Use hardfp calling convention between java to java call.

This patch default to use hardfp calling convention. Softfp can be enabled
by setting kArm32QuickCodeUseSoftFloat to true.

We get about -1 ~ +5% performance improvement with different benchmark
tests. Hopefully, we should be able to get more performance by address the left
TODOs, as some part of the code takes the original assumption which is not
optimal.

DONE:
1. Interpreter to quick code
2. Quick code to interpreter
3. Transition assembly and callee-saves
4. Trampoline(generic jni, resolution, invoke with access check and etc.)
5. Pass fp arg reg following aapcs(gpr and stack do not follow aapcs)
6. Quick helper assembly routines to handle ABI differences
7. Quick code method entry
8. Quick code method invocation
9. JNI compiler

TODO:
10. Rework ArgMap, FlushIn, GenDalvikArgs and affected common code.
11. Rework CallRuntimeHelperXXX().

Change-Id: I9965d8a007f4829f2560b63bcbbde271bdcf6ec2
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index af341bb..93c47dc 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -50,15 +50,19 @@
   // | arg1 spill |  |
   // | Method*    | ---
   // | LR         |
-  // | ...        |    callee saves
-  // | R3         |    arg3
-  // | R2         |    arg2
-  // | R1         |    arg1
-  // | R0         |    padding
+  // | ...        |    4x6 bytes callee saves
+  // | R3         |
+  // | R2         |
+  // | R1         |
+  // | S15        |
+  // | :          |
+  // | S0         |
+  // |            |    4x2 bytes padding
   // | Method*    |  <- sp
-  static constexpr bool kQuickSoftFloatAbi = true;  // This is a soft float ABI.
-  static constexpr size_t kNumQuickGprArgs = 3;  // 3 arguments passed in GPRs.
-  static constexpr size_t kNumQuickFprArgs = 0;  // 0 arguments passed in FPRs.
+  static constexpr bool kQuickSoftFloatAbi = kArm32QuickCodeUseSoftFloat;
+  static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = !kArm32QuickCodeUseSoftFloat;
+  static constexpr size_t kNumQuickGprArgs = 3;
+  static constexpr size_t kNumQuickFprArgs = kArm32QuickCodeUseSoftFloat ? 0 : 16;
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset =
       arm::ArmCalleeSaveFpr1Offset(Runtime::kRefsAndArgs);  // Offset of first FPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset =
@@ -90,6 +94,7 @@
   // |            |    padding
   // | Method*    |  <- sp
   static constexpr bool kQuickSoftFloatAbi = false;  // This is a hard float ABI.
+  static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
   static constexpr size_t kNumQuickGprArgs = 7;  // 7 arguments passed in GPRs.
   static constexpr size_t kNumQuickFprArgs = 8;  // 8 arguments passed in FPRs.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset =
@@ -117,6 +122,7 @@
   // | A1         |    arg1
   // | A0/Method* |  <- sp
   static constexpr bool kQuickSoftFloatAbi = true;  // This is a soft float ABI.
+  static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
   static constexpr size_t kNumQuickGprArgs = 3;  // 3 arguments passed in GPRs.
   static constexpr size_t kNumQuickFprArgs = 0;  // 0 arguments passed in FPRs.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0;  // Offset of first FPR arg.
@@ -141,6 +147,7 @@
   // | ECX         |    arg1
   // | EAX/Method* |  <- sp
   static constexpr bool kQuickSoftFloatAbi = true;  // This is a soft float ABI.
+  static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
   static constexpr size_t kNumQuickGprArgs = 3;  // 3 arguments passed in GPRs.
   static constexpr size_t kNumQuickFprArgs = 0;  // 0 arguments passed in FPRs.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0;  // Offset of first FPR arg.
@@ -178,6 +185,7 @@
   // | Padding         |
   // | RDI/Method*     |  <- sp
   static constexpr bool kQuickSoftFloatAbi = false;  // This is a hard float ABI.
+  static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
   static constexpr size_t kNumQuickGprArgs = 5;  // 5 arguments passed in GPRs.
   static constexpr size_t kNumQuickFprArgs = 8;  // 8 arguments passed in FPRs.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 16;  // Offset of first FPR arg.
@@ -222,8 +230,16 @@
           fpr_args_(reinterpret_cast<uint8_t*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset),
           stack_args_(reinterpret_cast<uint8_t*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_FrameSize
                       + StackArgumentStartFromShorty(is_static, shorty, shorty_len)),
-          gpr_index_(0), fpr_index_(0), stack_index_(0), cur_type_(Primitive::kPrimVoid),
-          is_split_long_or_double_(false) {}
+          gpr_index_(0), fpr_index_(0), fpr_double_index_(0), stack_index_(0),
+          cur_type_(Primitive::kPrimVoid), is_split_long_or_double_(false) {
+    COMPILE_ASSERT(kQuickSoftFloatAbi == (kNumQuickFprArgs == 0), knum_of_quick_fpr_arg_unexpected);
+    COMPILE_ASSERT(!(kQuickSoftFloatAbi && kQuickDoubleRegAlignedFloatBackFilled),
+        kdouble_align_unexpected);
+    // For register alignment, we want to assume that counters(fpr_double_index_) are even if the
+    // next register is even.
+    COMPILE_ASSERT(!kQuickDoubleRegAlignedFloatBackFilled || kNumQuickFprArgs % 2 == 0,
+        knum_quick_fpr_args_not_even);
+  }
 
   virtual ~QuickArgumentVisitor() {}
 
@@ -237,7 +253,11 @@
     if (!kQuickSoftFloatAbi) {
       Primitive::Type type = GetParamPrimitiveType();
       if (UNLIKELY((type == Primitive::kPrimDouble) || (type == Primitive::kPrimFloat))) {
-        if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
+        if (type == Primitive::kPrimDouble && kQuickDoubleRegAlignedFloatBackFilled) {
+          if (fpr_double_index_ + 2 < kNumQuickFprArgs + 1) {
+            return fpr_args_ + (fpr_double_index_ * GetBytesPerFprSpillLocation(kRuntimeISA));
+          }
+        } else if (fpr_index_ + 1 < kNumQuickFprArgs + 1) {
           return fpr_args_ + (fpr_index_ * GetBytesPerFprSpillLocation(kRuntimeISA));
         }
         return stack_args_ + (stack_index_ * kBytesStackArgLocation);
@@ -268,28 +288,30 @@
 
   uint64_t ReadSplitLongParam() const {
     DCHECK(IsSplitLongOrDouble());
+    // Read low half from register.
     uint64_t low_half = *reinterpret_cast<uint32_t*>(GetParamAddress());
-    uint64_t high_half = *reinterpret_cast<uint32_t*>(stack_args_);
+    // Read high half from the stack. As current stack_index_ indexes the argument, the high part
+    // index should be (stack_index_ + 1).
+    uint64_t high_half = *reinterpret_cast<uint32_t*>(stack_args_
+        + (stack_index_ + 1) * kBytesStackArgLocation);
     return (low_half & 0xffffffffULL) | (high_half << 32);
   }
 
   void VisitArguments() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    // This implementation doesn't support reg-spill area for hard float
-    // ABI targets such as x86_64 and aarch64. So, for those targets whose
-    // 'kQuickSoftFloatAbi' is 'false':
-    //     (a) 'stack_args_' should point to the first method's argument
-    //     (b) whatever the argument type it is, the 'stack_index_' should
-    //         be moved forward along with every visiting.
+    // (a) 'stack_args_' should point to the first method's argument
+    // (b) whatever the argument type it is, the 'stack_index_' should
+    //     be moved forward along with every visiting.
     gpr_index_ = 0;
     fpr_index_ = 0;
+    if (kQuickDoubleRegAlignedFloatBackFilled) {
+      fpr_double_index_ = 0;
+    }
     stack_index_ = 0;
     if (!is_static_) {  // Handle this.
       cur_type_ = Primitive::kPrimNot;
       is_split_long_or_double_ = false;
       Visit();
-      if (!kQuickSoftFloatAbi || kNumQuickGprArgs == 0) {
-        stack_index_++;
-      }
+      stack_index_++;
       if (kNumQuickGprArgs > 0) {
         gpr_index_++;
       }
@@ -305,9 +327,7 @@
         case Primitive::kPrimInt:
           is_split_long_or_double_ = false;
           Visit();
-          if (!kQuickSoftFloatAbi || kNumQuickGprArgs == gpr_index_) {
-            stack_index_++;
-          }
+          stack_index_++;
           if (gpr_index_ < kNumQuickGprArgs) {
             gpr_index_++;
           }
@@ -315,17 +335,24 @@
         case Primitive::kPrimFloat:
           is_split_long_or_double_ = false;
           Visit();
+          stack_index_++;
           if (kQuickSoftFloatAbi) {
             if (gpr_index_ < kNumQuickGprArgs) {
               gpr_index_++;
-            } else {
-              stack_index_++;
             }
           } else {
-            if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
+            if (fpr_index_ + 1 < kNumQuickFprArgs + 1) {
               fpr_index_++;
+              if (kQuickDoubleRegAlignedFloatBackFilled) {
+                // Double should not overlap with float.
+                // For example, if fpr_index_ = 3, fpr_double_index_ should be at least 4.
+                fpr_double_index_ = std::max(fpr_double_index_, RoundUp(fpr_index_, 2));
+                // Float should not overlap with double.
+                if (fpr_index_ % 2 == 0) {
+                  fpr_index_ = std::max(fpr_double_index_, fpr_index_);
+                }
+              }
             }
-            stack_index_++;
           }
           break;
         case Primitive::kPrimDouble:
@@ -334,42 +361,46 @@
             is_split_long_or_double_ = (GetBytesPerGprSpillLocation(kRuntimeISA) == 4) &&
                 ((gpr_index_ + 1) == kNumQuickGprArgs);
             Visit();
-            if (!kQuickSoftFloatAbi || kNumQuickGprArgs == gpr_index_) {
-              if (kBytesStackArgLocation == 4) {
-                stack_index_+= 2;
-              } else {
-                CHECK_EQ(kBytesStackArgLocation, 8U);
-                stack_index_++;
-              }
+            if (kBytesStackArgLocation == 4) {
+              stack_index_+= 2;
+            } else {
+              CHECK_EQ(kBytesStackArgLocation, 8U);
+              stack_index_++;
             }
             if (gpr_index_ < kNumQuickGprArgs) {
               gpr_index_++;
               if (GetBytesPerGprSpillLocation(kRuntimeISA) == 4) {
                 if (gpr_index_ < kNumQuickGprArgs) {
                   gpr_index_++;
-                } else if (kQuickSoftFloatAbi) {
-                  stack_index_++;
                 }
               }
             }
           } else {
             is_split_long_or_double_ = (GetBytesPerFprSpillLocation(kRuntimeISA) == 4) &&
-                ((fpr_index_ + 1) == kNumQuickFprArgs);
+                ((fpr_index_ + 1) == kNumQuickFprArgs) && !kQuickDoubleRegAlignedFloatBackFilled;
             Visit();
-            if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
-              fpr_index_++;
-              if (GetBytesPerFprSpillLocation(kRuntimeISA) == 4) {
-                if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
-                  fpr_index_++;
-                }
-              }
-            }
             if (kBytesStackArgLocation == 4) {
               stack_index_+= 2;
             } else {
               CHECK_EQ(kBytesStackArgLocation, 8U);
               stack_index_++;
             }
+            if (kQuickDoubleRegAlignedFloatBackFilled) {
+              if (fpr_double_index_ + 2 < kNumQuickFprArgs + 1) {
+                fpr_double_index_ += 2;
+                // Float should not overlap with double.
+                if (fpr_index_ % 2 == 0) {
+                  fpr_index_ = std::max(fpr_double_index_, fpr_index_);
+                }
+              }
+            } else if (fpr_index_ + 1 < kNumQuickFprArgs + 1) {
+              fpr_index_++;
+              if (GetBytesPerFprSpillLocation(kRuntimeISA) == 4) {
+                if (fpr_index_ + 1 < kNumQuickFprArgs + 1) {
+                  fpr_index_++;
+                }
+              }
+            }
           }
           break;
         default:
@@ -381,16 +412,8 @@
  private:
   static size_t StackArgumentStartFromShorty(bool is_static, const char* shorty,
                                              uint32_t shorty_len) {
-    if (kQuickSoftFloatAbi) {
-      CHECK_EQ(kNumQuickFprArgs, 0U);
-      return (kNumQuickGprArgs * GetBytesPerGprSpillLocation(kRuntimeISA))
-          + sizeof(StackReference<mirror::ArtMethod>) /* StackReference<ArtMethod> */;
-    } else {
-      // For now, there is no reg-spill area for the targets with
-      // hard float ABI. So, the offset pointing to the first method's
-      // parameter ('this' for non-static methods) should be returned.
-      return sizeof(StackReference<mirror::ArtMethod>);  // Skip StackReference<ArtMethod>.
-    }
+    // 'stack_args_' points to the first method's argument
+    return sizeof(StackReference<mirror::ArtMethod>);  // Skip StackReference<ArtMethod>.
   }
 
  protected:
@@ -403,7 +426,14 @@
   uint8_t* const fpr_args_;  // Address of FPR arguments in callee save frame.
   uint8_t* const stack_args_;  // Address of stack arguments in caller's frame.
   uint32_t gpr_index_;  // Index into spilled GPRs.
-  uint32_t fpr_index_;  // Index into spilled FPRs.
+  // Index into spilled FPRs.
+  // In case kQuickDoubleRegAlignedFloatBackFilled, it may index a hole while fpr_double_index_
+  // holds a higher register number.
+  uint32_t fpr_index_;
+  // Index into spilled FPRs for aligned double.
+  // Only used when kQuickDoubleRegAlignedFloatBackFilled. Next available double register indexed in
+  // terms of singles, may be behind fpr_index.
+  uint32_t fpr_double_index_;
   uint32_t stack_index_;  // Index into arguments on the stack.
   // The current type of argument during VisitArguments.
   Primitive::Type cur_type_;
@@ -943,8 +973,8 @@
         delegate_(delegate) {
     // For register alignment, we want to assume that counters (gpr_index_, fpr_index_) are even iff
     // the next register is even; counting down is just to make the compiler happy...
-    CHECK_EQ(kNumNativeGprArgs % 2, 0U);
-    CHECK_EQ(kNumNativeFprArgs % 2, 0U);
+    COMPILE_ASSERT(kNumNativeGprArgs % 2 == 0U, knum_native_gpr_args_not_even);
+    COMPILE_ASSERT(kNumNativeFprArgs % 2 == 0U, knum_native_fpr_args_not_even);
   }
 
   virtual ~BuildNativeCallFrameStateMachine() {}