JNI: Improve argument passing for normal native.

Spill outgoing stack arguments directly to their stack slots
(except for `this` on x86) and convert such references to
`jobject` while spilling. Use the `MoveArguments()` call for
both spilling arguments and loading register arguments to
let the assembler use multi-register stores.
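
A rough sketch of the new spill shape in jni_compiler.cc (details and
per-argument iteration elided; the filling of the arrays is only
summarized in comments here):

    ArenaVector<ArgumentLocation> src_args(allocator.Adapter());
    ArenaVector<ArgumentLocation> dest_args(allocator.Adapter());
    ArenaVector<FrameOffset> refs(allocator.Adapter());
    // For each argument: source (register or stack slot), destination
    // stack slot, and the reference offset to convert to `jobject`
    // (or kInvalidReferenceOffset for non-references and raw spills).
    __ MoveArguments(ArrayRef<ArgumentLocation>(dest_args),
                     ArrayRef<ArgumentLocation>(src_args),
                     ArrayRef<FrameOffset>(refs));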

Improve the arm64 JNI assembler to use LDP/STP in the relevant
situations when spilling and loading registers.
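
For example (illustrative arm64 code only; actual registers and
offsets depend on the signature), spilling two adjacent 32-bit
register arguments used to take two stores,

    str w1, [sp, #off]
    str w2, [sp, #off + 4]

and can now be emitted as a single

    stp w1, w2, [sp, #off]

with the analogous LDP merge when loading a pair of argument
registers from their stack slots.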

Fix the arm JNI assembler, which called `CreateJObject()` with
a bogus input register in one case.

Golem results for art-opt-cc (higher is better):
linux-ia32                       before after
NativeDowncallStaticNormal6      25.074 25.578 (+2.011%)
NativeDowncallStaticNormalRefs6  25.248 25.248 (0%)
NativeDowncallVirtualNormal6     24.913 25.248 (+1.344%)
NativeDowncallVirtualNormalRefs6 25.074 25.086 (+0.0482%)
linux-x64                        before after
NativeDowncallStaticNormal6      27.000 26.987 (-0.0500%)
NativeDowncallStaticNormalRefs6  25.411 25.411 (0%)
NativeDowncallVirtualNormal6     25.248 25.086 (-0.6395%)
NativeDowncallVirtualNormalRefs6 25.086 25.074 (-0.0492%)
linux-armv7                      before after
NativeDowncallStaticNormal6      5.9259 6.0663 (+2.368%)
NativeDowncallStaticNormalRefs6  5.6232 5.7061 (+1.474%)
NativeDowncallVirtualNormal6     5.3659 5.4536 (+1.636%)
NativeDowncallVirtualNormalRefs6 5.0879 5.1595 (+1.407%)
linux-armv8                      before after
NativeDowncallStaticNormal6      6.0663 6.2651 (+3.277%)
NativeDowncallStaticNormalRefs6  5.7279 5.8824 (+2.696%)
NativeDowncallVirtualNormal6     5.9494 6.0663 (+1.964%)
NativeDowncallVirtualNormalRefs6 5.5581 5.6630 (+1.888%)
(The x86 and x86-64 differences seem to be lost in noise.)

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing --jit
Test: run-gtests.sh
Test: testrunner.py --target --optimizing --jit
Bug: 172332525
Change-Id: Iaba8244c44d410bb1a4e31f90e4387ee5cc51bec
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index e08037c..4c1b2f7 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -64,10 +64,6 @@
                                    ManagedRegister temp_reg);
 
 template <PointerSize kPointerSize>
-static void CopyParameter(JNIMacroAssembler<kPointerSize>* jni_asm,
-                          ManagedRuntimeCallingConvention* mr_conv,
-                          JniCallingConvention* jni_conv);
-template <PointerSize kPointerSize>
 static void SetNativeParameter(JNIMacroAssembler<kPointerSize>* jni_asm,
                                JniCallingConvention* jni_conv,
                                ManagedRegister in_reg);
@@ -261,23 +257,10 @@
     __ StoreStackPointerToThread(Thread::TopOfManagedStackOffset<kPointerSize>());
   }
 
-  // 2. Call into appropriate JniMethodStart to transition out of Runnable for normal native.
+  // 2. Call into appropriate `JniMethodStart*()` to transition out of Runnable for normal native.
 
-  // 2.1. Spill all register arguments to preserve them across the JniMethodStart call.
-  if (LIKELY(!is_critical_native && !is_fast_native)) {
-    // TODO: Pass these in a single call to let the assembler use multi-register stores.
-    // TODO: Spill native stack args straight to their stack locations (adjust SP earlier).
-    mr_conv->ResetIterator(FrameOffset(current_frame_size));
-    for (; mr_conv->HasNext(); mr_conv->Next()) {
-      if (mr_conv->IsCurrentParamInRegister()) {
-        size_t size = mr_conv->IsCurrentParamALongOrDouble() ? 8u : 4u;
-        __ Store(mr_conv->CurrentParamStackOffset(), mr_conv->CurrentParamRegister(), size);
-      }
-    }
-  }  // if (!is_critical_native)
-
-  // 2.2. Move frame down to allow space for out going args.
-  //      This prepares for both the JniMethodStart call as well as the main native call.
+  // 2.1. Move frame down to allow space for outgoing args.
+  //      This prepares for both the `JniMethodStart*()` call as well as the main native call.
   size_t current_out_arg_size = main_out_arg_size;
   if (UNLIKELY(is_critical_native)) {
     DCHECK_EQ(main_out_arg_size, current_frame_size);
@@ -286,9 +269,67 @@
     current_frame_size += main_out_arg_size;
   }
 
+  // 2.2. Spill all register arguments to preserve them across the `JniMethodStart*()` call.
+  //      Native stack arguments are spilled directly to their argument stack slots and
+  //      references are converted to `jobject`. Native register arguments are spilled to
+  //      the reserved slots in the caller frame; these references are not converted to `jobject`.
+  constexpr FrameOffset kInvalidReferenceOffset =
+      JNIMacroAssembler<kPointerSize>::kInvalidReferenceOffset;
+  ArenaVector<ArgumentLocation> src_args(allocator.Adapter());
+  ArenaVector<ArgumentLocation> dest_args(allocator.Adapter());
+  ArenaVector<FrameOffset> refs(allocator.Adapter());
+  if (LIKELY(!is_critical_native && !is_fast_native)) {
+    mr_conv->ResetIterator(FrameOffset(current_frame_size));
+    main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+    main_jni_conv->Next();    // Skip JNIEnv*.
+    if (is_static) {
+      main_jni_conv->Next();    // Skip `jclass`.
+      // Add a no-op move for the `jclass` argument to avoid the next
+      // argument being treated as non-null if it's a reference.
+      src_args.emplace_back(method_register, kRawPointerSize);
+      dest_args.emplace_back(method_register, kRawPointerSize);
+      refs.push_back(kInvalidReferenceOffset);
+    } else {
+      // Spill `this` as raw reference without conversion to `jobject` even if the `jobject`
+      // argument is passed on stack. Since `this` cannot be null, the argument move before
+      // the native call does not need to reload the reference, and that argument move also
+      // needs to see the `this` argument to avoid treating another reference as non-null.
+      // This also leaves enough space on stack for `JniMethodStartSynchronized()`
+      // for architectures that pass the second argument on the stack (x86).
+      DCHECK(mr_conv->HasNext());
+      DCHECK(main_jni_conv->HasNext());
+      DCHECK(mr_conv->IsCurrentParamAReference());
+      src_args.push_back(mr_conv->IsCurrentParamInRegister()
+          ? ArgumentLocation(mr_conv->CurrentParamRegister(), kObjectReferenceSize)
+          : ArgumentLocation(mr_conv->CurrentParamStackOffset(), kObjectReferenceSize));
+      dest_args.emplace_back(mr_conv->CurrentParamStackOffset(), kObjectReferenceSize);
+      refs.push_back(kInvalidReferenceOffset);
+      mr_conv->Next();
+      main_jni_conv->Next();
+    }
+    for (; mr_conv->HasNext(); mr_conv->Next(), main_jni_conv->Next()) {
+      DCHECK(main_jni_conv->HasNext());
+      static_assert(kObjectReferenceSize == 4u);
+      bool is_reference = mr_conv->IsCurrentParamAReference();
+      bool spill_jobject = is_reference && !main_jni_conv->IsCurrentParamInRegister();
+      size_t src_size = (!is_reference && mr_conv->IsCurrentParamALongOrDouble()) ? 8u : 4u;
+      size_t dest_size = spill_jobject ? kRawPointerSize : src_size;
+      src_args.push_back(mr_conv->IsCurrentParamInRegister()
+          ? ArgumentLocation(mr_conv->CurrentParamRegister(), src_size)
+          : ArgumentLocation(mr_conv->CurrentParamStackOffset(), src_size));
+      dest_args.push_back(main_jni_conv->IsCurrentParamInRegister()
+          ? ArgumentLocation(mr_conv->CurrentParamStackOffset(), dest_size)
+          : ArgumentLocation(main_jni_conv->CurrentParamStackOffset(), dest_size));
+      refs.push_back(spill_jobject ? mr_conv->CurrentParamStackOffset() : kInvalidReferenceOffset);
+    }
+    __ MoveArguments(ArrayRef<ArgumentLocation>(dest_args),
+                     ArrayRef<ArgumentLocation>(src_args),
+                     ArrayRef<FrameOffset>(refs));
+  }  // if (!is_critical_native)
+
   // 2.3. Call into appropriate JniMethodStart passing Thread* so that transition out of Runnable
   //      can occur. We abuse the JNI calling convention here, that is guaranteed to support
-  //      passing two pointer arguments.
+  //      passing two pointer arguments, `JNIEnv*` and `jclass`/`jobject`.
   std::unique_ptr<JNIMacroLabel> monitor_enter_exception_slow_path =
       UNLIKELY(is_synchronized) ? __ CreateLabel() : nullptr;
   if (LIKELY(!is_critical_native && !is_fast_native)) {
@@ -360,140 +401,100 @@
 
   // 4. Make the main native call.
 
-  // 4.1. Fill arguments.
-  if (UNLIKELY(is_critical_native || is_fast_native)) {
-    ArenaVector<ArgumentLocation> src_args(allocator.Adapter());
-    ArenaVector<ArgumentLocation> dest_args(allocator.Adapter());
-    ArenaVector<FrameOffset> refs(allocator.Adapter());
-    mr_conv->ResetIterator(FrameOffset(current_frame_size));
-    main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-    constexpr FrameOffset kInvalidReferenceOffset =
-        JNIMacroAssembler<kPointerSize>::kInvalidReferenceOffset;
-    if (is_critical_native) {
-      // Move the method pointer to the hidden argument register.
-      // TODO: Pass this as the last argument, not first. Change ARM assembler
-      // not to expect all register destinations at the beginning.
-      src_args.emplace_back(mr_conv->MethodRegister(), kRawPointerSize);
-      dest_args.emplace_back(main_jni_conv->HiddenArgumentRegister(), kRawPointerSize);
-      refs.push_back(kInvalidReferenceOffset);
-    } else {
-      DCHECK(is_fast_native);
-      DCHECK(!method_register.IsNoRegister());
-      main_jni_conv->Next();    // Skip JNIEnv*.
-      if (!is_static || !main_jni_conv->IsCurrentParamInRegister()) {
-        // The old method register can be clobbered by argument moves.
-        // Preserve the method in `callee_save_temp`.
-        ManagedRegister new_method_reg = __ CoreRegisterWithSize(callee_save_temp, kRawPointerSize);
-        __ Move(new_method_reg, method_register, kRawPointerSize);
-        method_register = new_method_reg;
-      }
-      if (is_static) {
-        // For static methods, move the method to the `jclass` argument.
-        DCHECK_EQ(ArtMethod::DeclaringClassOffset().SizeValue(), 0u);
-        src_args.emplace_back(method_register, kRawPointerSize);
-        if (main_jni_conv->IsCurrentParamInRegister()) {
-          // The `jclass` argument becomes the new method register needed for the call.
-          method_register = main_jni_conv->CurrentParamRegister();
-          dest_args.emplace_back(method_register, kRawPointerSize);
-        } else {
-          dest_args.emplace_back(main_jni_conv->CurrentParamStackOffset(), kRawPointerSize);
-        }
-        refs.push_back(kInvalidReferenceOffset);
-        main_jni_conv->Next();
-      }
-    }
-    // Move normal arguments to their locations.
-    // Note that the `this` argument for @FastNative instance methods is passed first,
-    // so that `MoveArguments()` treats it as non-null.
-    for (; mr_conv->HasNext(); mr_conv->Next(), main_jni_conv->Next()) {
-      DCHECK(main_jni_conv->HasNext());
-      static_assert(kObjectReferenceSize == 4u);
-      bool is_reference = mr_conv->IsCurrentParamAReference();
-      size_t src_size = (!is_reference && mr_conv->IsCurrentParamALongOrDouble()) ? 8u : 4u;
-      size_t dest_size = is_reference ? kRawPointerSize : src_size;
-      src_args.push_back(mr_conv->IsCurrentParamInRegister()
-          ? ArgumentLocation(mr_conv->CurrentParamRegister(), src_size)
-          : ArgumentLocation(mr_conv->CurrentParamStackOffset(), src_size));
-      dest_args.push_back(main_jni_conv->IsCurrentParamInRegister()
-          ? ArgumentLocation(main_jni_conv->CurrentParamRegister(), dest_size)
-          : ArgumentLocation(main_jni_conv->CurrentParamStackOffset(), dest_size));
-      refs.emplace_back(
-          is_reference ? mr_conv->CurrentParamStackOffset() : kInvalidReferenceOffset);
-    }
-    DCHECK(!main_jni_conv->HasNext());
-    __ MoveArguments(ArrayRef<ArgumentLocation>(dest_args),
-                     ArrayRef<ArgumentLocation>(src_args),
-                     ArrayRef<FrameOffset>(refs));
+  // 4.1. Fill arguments except the `JNIEnv*`.
+  src_args.clear();
+  dest_args.clear();
+  refs.clear();
+  mr_conv->ResetIterator(FrameOffset(current_frame_size));
+  main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+  if (UNLIKELY(is_critical_native)) {
+    // Move the method pointer to the hidden argument register.
+    // TODO: Pass this as the last argument, not first. Change ARM assembler
+    // not to expect all register destinations at the beginning.
+    src_args.emplace_back(mr_conv->MethodRegister(), kRawPointerSize);
+    dest_args.emplace_back(main_jni_conv->HiddenArgumentRegister(), kRawPointerSize);
+    refs.push_back(kInvalidReferenceOffset);
   } else {
-    // Iterate over arguments placing values from managed calling convention in
-    // to the convention required for a native call (shuffling). For references
-    // place an index/pointer to the reference after checking whether it is
-    // null (which must be encoded as null).
-    // Note: we do this prior to materializing the JNIEnv* and static's jclass to
-    // give as many free registers for the shuffle as possible.
-    mr_conv->ResetIterator(FrameOffset(current_frame_size));
-    uint32_t args_count = 0;
-    while (mr_conv->HasNext()) {
-      args_count++;
-      mr_conv->Next();
-    }
-
-    // Do a backward pass over arguments, so that the generated code will be "mov
-    // R2, R3; mov R1, R2" instead of "mov R1, R2; mov R2, R3."
-    // TODO: A reverse iterator to improve readability.
-    // TODO: This is currently useless as all archs spill args when building the frame.
-    //       To avoid the full spilling, we would have to do one pass before the BuildFrame()
-    //       to determine which arg registers are clobbered before they are needed.
-    for (uint32_t i = 0; i < args_count; ++i) {
-      mr_conv->ResetIterator(FrameOffset(current_frame_size));
-      main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-
-      // Skip the extra JNI parameters for now.
-      main_jni_conv->Next();    // Skip JNIEnv*.
-      if (is_static) {
-        main_jni_conv->Next();  // Skip Class for now.
-      }
-      // Skip to the argument we're interested in.
-      for (uint32_t j = 0; j < args_count - i - 1; ++j) {
-        mr_conv->Next();
-        main_jni_conv->Next();
-      }
-      CopyParameter(jni_asm.get(), mr_conv.get(), main_jni_conv.get());
-    }
-
-    // 4.2. For static method, create jclass argument as a pointer to the method's declaring class.
-    //      Make sure the method is in a register even for non-static methods.
-    DCHECK_EQ(ArtMethod::DeclaringClassOffset().SizeValue(), 0u);
-    FrameOffset method_offset =
-        FrameOffset(current_out_arg_size + mr_conv->MethodStackOffset().SizeValue());
-    DCHECK(method_register.IsNoRegister());
-    if (is_static) {
-      main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-      main_jni_conv->Next();  // Skip JNIEnv*
-      // Load reference to the method's declaring class. The method register has been
-      // clobbered by the above call, so we need to load the method from the stack.
-      // Use the `callee_save_temp` if the parameter goes on the stack.
-      method_register = main_jni_conv->IsCurrentParamOnStack()
-          ? __ CoreRegisterWithSize(callee_save_temp, kRawPointerSize)
-          : main_jni_conv->CurrentParamRegister();
-      __ Load(method_register, method_offset, kRawPointerSize);
-      if (main_jni_conv->IsCurrentParamOnStack()) {
-        // Store the method argument.
-        FrameOffset out_off = main_jni_conv->CurrentParamStackOffset();
-        __ Store(out_off, method_register, kRawPointerSize);
+    main_jni_conv->Next();    // Skip JNIEnv*.
+    FrameOffset method_offset(current_out_arg_size + mr_conv->MethodStackOffset().SizeValue());
+    if (!is_static || main_jni_conv->IsCurrentParamOnStack()) {
+      // The method shall not be available in the `jclass` argument register.
+      // Make sure it is available in `callee_save_temp` for the call below.
+      // (For @FastNative, the old method register can be clobbered by argument moves.
+      // For normal native, it was already clobbered by the `JniMethodStart*()` call.)
+      ManagedRegister new_method_reg = __ CoreRegisterWithSize(callee_save_temp, kRawPointerSize);
+      if (UNLIKELY(is_fast_native)) {
+        DCHECK(!method_register.IsNoRegister());
+        __ Move(new_method_reg, method_register, kRawPointerSize);
       } else {
-        ManagedRegister out_reg = main_jni_conv->CurrentParamRegister();
-        __ Move(out_reg, method_register, kRawPointerSize);  // No-op if equal.
-        method_register = out_reg;
+        DCHECK(method_register.IsNoRegister());
+        __ Load(new_method_reg, method_offset, kRawPointerSize);
       }
+      method_register = new_method_reg;
+    }
+    if (is_static) {
+      // For static methods, move/load the method to the `jclass` argument.
+      DCHECK_EQ(ArtMethod::DeclaringClassOffset().SizeValue(), 0u);
+      if (method_register.IsNoRegister()) {
+        DCHECK(main_jni_conv->IsCurrentParamInRegister());
+        src_args.emplace_back(method_offset, kRawPointerSize);
+      } else {
+        src_args.emplace_back(method_register, kRawPointerSize);
+      }
+      if (main_jni_conv->IsCurrentParamInRegister()) {
+        // The `jclass` argument becomes the new method register needed for the call.
+        method_register = main_jni_conv->CurrentParamRegister();
+        dest_args.emplace_back(method_register, kRawPointerSize);
+      } else {
+        dest_args.emplace_back(main_jni_conv->CurrentParamStackOffset(), kRawPointerSize);
+      }
+      refs.push_back(kInvalidReferenceOffset);
+      main_jni_conv->Next();
     } else {
-      // Load the method for non-static methods to `callee_save_temp` as we need it for the call.
-      method_register = __ CoreRegisterWithSize(callee_save_temp, kRawPointerSize);
-      __ Load(method_register, method_offset, kRawPointerSize);
+      // The `this` argument for instance methods is passed first, so that `MoveArguments()`
+      // treats it as non-null. It has not been converted to `jobject` yet, not even for normal
+      // native methods on architectures where this argument is passed on the stack (x86).
+      DCHECK(mr_conv->HasNext());
+      DCHECK(main_jni_conv->HasNext());
+      DCHECK(mr_conv->IsCurrentParamAReference());
+      src_args.push_back(UNLIKELY(is_fast_native) && mr_conv->IsCurrentParamInRegister()
+          ? ArgumentLocation(mr_conv->CurrentParamRegister(), kObjectReferenceSize)
+          : ArgumentLocation(mr_conv->CurrentParamStackOffset(), kObjectReferenceSize));
+      dest_args.push_back(main_jni_conv->IsCurrentParamInRegister()
+          ? ArgumentLocation(main_jni_conv->CurrentParamRegister(), kRawPointerSize)
+          : ArgumentLocation(main_jni_conv->CurrentParamStackOffset(), kRawPointerSize));
+      refs.push_back(mr_conv->CurrentParamStackOffset());
+      mr_conv->Next();
+      main_jni_conv->Next();
     }
   }
+  // Move normal arguments to their locations.
+  for (; mr_conv->HasNext(); mr_conv->Next(), main_jni_conv->Next()) {
+    DCHECK(main_jni_conv->HasNext());
+    bool dest_in_reg = main_jni_conv->IsCurrentParamInRegister();
+    if (LIKELY(!is_critical_native && !is_fast_native) && !dest_in_reg) {
+      // Stack arguments for normal native have already been filled.
+      continue;
+    }
+    static_assert(kObjectReferenceSize == 4u);
+    bool is_reference = mr_conv->IsCurrentParamAReference();
+    size_t src_size = (!is_reference && mr_conv->IsCurrentParamALongOrDouble()) ? 8u : 4u;
+    size_t dest_size = is_reference ? kRawPointerSize : src_size;
+    src_args.push_back(
+        UNLIKELY(is_critical_native || is_fast_native) && mr_conv->IsCurrentParamInRegister()
+            ? ArgumentLocation(mr_conv->CurrentParamRegister(), src_size)
+            : ArgumentLocation(mr_conv->CurrentParamStackOffset(), src_size));
+    dest_args.push_back(dest_in_reg
+        ? ArgumentLocation(main_jni_conv->CurrentParamRegister(), dest_size)
+        : ArgumentLocation(main_jni_conv->CurrentParamStackOffset(), dest_size));
+    refs.push_back(is_reference ? mr_conv->CurrentParamStackOffset() : kInvalidReferenceOffset);
+  }
+  DCHECK(!main_jni_conv->HasNext());
+  __ MoveArguments(ArrayRef<ArgumentLocation>(dest_args),
+                   ArrayRef<ArgumentLocation>(src_args),
+                   ArrayRef<FrameOffset>(refs));
 
-  // 4.3. Create 1st argument, the JNI environment ptr.
+  // 4.2. Create 1st argument, the JNI environment ptr.
   if (LIKELY(!is_critical_native)) {
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
     if (main_jni_conv->IsCurrentParamInRegister()) {
@@ -505,7 +506,7 @@
     }
   }
 
-  // 4.4. Plant call to native code associated with method.
+  // 4.3. Plant call to native code associated with method.
   MemberOffset jni_entrypoint_offset =
       ArtMethod::EntryPointFromJniOffset(InstructionSetPointerSize(instruction_set));
   if (UNLIKELY(is_critical_native)) {
@@ -522,7 +523,7 @@
     method_register = ManagedRegister::NoRegister();
   }
 
-  // 4.5. Fix differences in result widths.
+  // 4.4. Fix differences in result widths.
   if (main_jni_conv->RequiresSmallResultTypeExtension()) {
     DCHECK(main_jni_conv->HasSmallReturnType());
     CHECK(!is_critical_native || !main_jni_conv->UseTailCall());
@@ -838,96 +839,6 @@
   __ Store(jni_env_reg, jni_env_cookie_offset, saved_cookie_reg, kIRTCookieSize);
 }
 
-// Copy a single parameter from the managed to the JNI calling convention.
-template <PointerSize kPointerSize>
-static void CopyParameter(JNIMacroAssembler<kPointerSize>* jni_asm,
-                          ManagedRuntimeCallingConvention* mr_conv,
-                          JniCallingConvention* jni_conv) {
-  // We spilled all registers, so use stack locations.
-  bool input_in_reg = false;  // mr_conv->IsCurrentParamInRegister();
-  bool output_in_reg = jni_conv->IsCurrentParamInRegister();
-  FrameOffset spilled_reference_offset(0);
-  bool null_allowed = false;
-  bool ref_param = jni_conv->IsCurrentParamAReference();
-  CHECK(!ref_param || mr_conv->IsCurrentParamAReference());
-  if (output_in_reg) {  // output shouldn't straddle registers and stack
-    CHECK(!jni_conv->IsCurrentParamOnStack());
-  } else {
-    CHECK(jni_conv->IsCurrentParamOnStack());
-  }
-  // References are spilled to caller's reserved out vreg area.
-  if (ref_param) {
-    null_allowed = mr_conv->IsCurrentArgPossiblyNull();
-    // Compute spilled reference offset. Note that null is spilled but the jobject
-    // passed to the native code must be null (not a pointer into the spilled value
-    // as with regular references).
-    spilled_reference_offset = mr_conv->CurrentParamStackOffset();
-    // Check that spilled reference offset is in the spill area in the caller's frame.
-    CHECK_GT(spilled_reference_offset.Uint32Value(), mr_conv->GetDisplacement().Uint32Value());
-  }
-  if (input_in_reg && output_in_reg) {
-    ManagedRegister in_reg = mr_conv->CurrentParamRegister();
-    ManagedRegister out_reg = jni_conv->CurrentParamRegister();
-    if (ref_param) {
-      __ CreateJObject(out_reg, spilled_reference_offset, in_reg, null_allowed);
-    } else {
-      if (!mr_conv->IsCurrentParamOnStack()) {
-        // regular non-straddling move
-        __ Move(out_reg, in_reg, mr_conv->CurrentParamSize());
-      } else {
-        UNIMPLEMENTED(FATAL);  // we currently don't expect to see this case
-      }
-    }
-  } else if (!input_in_reg && !output_in_reg) {
-    FrameOffset out_off = jni_conv->CurrentParamStackOffset();
-    if (ref_param) {
-      __ CreateJObject(out_off, spilled_reference_offset, null_allowed);
-    } else {
-      FrameOffset in_off = mr_conv->CurrentParamStackOffset();
-      size_t param_size = mr_conv->CurrentParamSize();
-      CHECK_EQ(param_size, jni_conv->CurrentParamSize());
-      __ Copy(out_off, in_off, param_size);
-    }
-  } else if (!input_in_reg && output_in_reg) {
-    FrameOffset in_off = mr_conv->CurrentParamStackOffset();
-    ManagedRegister out_reg = jni_conv->CurrentParamRegister();
-    // Check that incoming stack arguments are above the current stack frame.
-    CHECK_GT(in_off.Uint32Value(), mr_conv->GetDisplacement().Uint32Value());
-    if (ref_param) {
-      __ CreateJObject(out_reg,
-                       spilled_reference_offset,
-                       ManagedRegister::NoRegister(),
-                       null_allowed);
-    } else {
-      size_t param_size = mr_conv->CurrentParamSize();
-      CHECK_EQ(param_size, jni_conv->CurrentParamSize());
-      __ Load(out_reg, in_off, param_size);
-    }
-  } else {
-    CHECK(input_in_reg && !output_in_reg);
-    ManagedRegister in_reg = mr_conv->CurrentParamRegister();
-    FrameOffset out_off = jni_conv->CurrentParamStackOffset();
-    // Check outgoing argument is within frame part dedicated to out args.
-    CHECK_LT(out_off.Uint32Value(), jni_conv->GetDisplacement().Uint32Value());
-    if (ref_param) {
-      // TODO: recycle value in in_reg rather than reload from spill slot.
-      __ CreateJObject(out_off, spilled_reference_offset, null_allowed);
-    } else {
-      size_t param_size = mr_conv->CurrentParamSize();
-      CHECK_EQ(param_size, jni_conv->CurrentParamSize());
-      if (!mr_conv->IsCurrentParamOnStack()) {
-        // regular non-straddling store
-        __ Store(out_off, in_reg, param_size);
-      } else {
-        // store where input straddles registers and stack
-        CHECK_EQ(param_size, 8u);
-        FrameOffset in_off = mr_conv->CurrentParamStackOffset();
-        __ StoreSpanning(out_off, in_reg, in_off);
-      }
-    }
-  }
-}
-
 template <PointerSize kPointerSize>
 static void SetNativeParameter(JNIMacroAssembler<kPointerSize>* jni_asm,
                                JniCallingConvention* jni_conv,
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index ac263c1..9ea6f04 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -451,7 +451,6 @@
   DCHECK(!loc2.IsRegister());
   uint32_t loc1_offset = loc1.GetFrameOffset().Uint32Value();
   uint32_t loc2_offset = loc2.GetFrameOffset().Uint32Value();
-  DCHECK_LT(loc1_offset, loc2_offset);
   return loc1_offset + loc1.GetSize() == loc2_offset;
 }
 
@@ -574,6 +573,7 @@
   }
 
   // Convert reference registers to `jobject` values.
+  // TODO: Delay this for references that are copied to another register.
   for (size_t i = 0; i != arg_count; ++i) {
     if (refs[i] != kInvalidReferenceOffset && srcs[i].IsRegister()) {
       // Note: We can clobber `srcs[i]` here as the register cannot hold more than one argument.
@@ -583,17 +583,11 @@
   }
 
   // Native ABI is soft-float, so all destinations should be core registers or stack offsets.
-  // And register locations should be first, followed by stack locations with increasing offset.
+  // And register locations should be first, followed by stack locations.
   auto is_register = [](const ArgumentLocation& loc) { return loc.IsRegister(); };
   DCHECK(std::is_partitioned(dests.begin(), dests.end(), is_register));
   size_t num_reg_dests =
       std::distance(dests.begin(), std::partition_point(dests.begin(), dests.end(), is_register));
-  DCHECK(std::is_sorted(
-      dests.begin() + num_reg_dests,
-      dests.end(),
-      [](const ArgumentLocation& lhs, const ArgumentLocation& rhs) {
-        return lhs.GetFrameOffset().Uint32Value() < rhs.GetFrameOffset().Uint32Value();
-      }));
 
   // Collect registers to move. No need to record FP regs as destinations are only core regs.
   uint32_t src_regs = 0u;
@@ -783,12 +777,14 @@
   // opportunities to use LDRD/VMOV to fill 2 registers with one instruction.
   for (size_t i = 0, j; i != num_reg_dests; i = j) {
     j = i + 1u;
-    DCHECK(dests[i].IsRegister() && IsCoreRegisterOrPair(dests[i].GetRegister().AsArm()));
+    DCHECK(dests[i].IsRegister());
+    ArmManagedRegister dest_reg = dests[i].GetRegister().AsArm();
+    DCHECK(IsCoreRegisterOrPair(dest_reg));
     if (srcs[i].IsRegister() && IsCoreRegisterOrPair(srcs[i].GetRegister().AsArm())) {
       DCHECK_EQ(GetCoreRegisterMask(dests[i].GetRegister().AsArm()) & dest_regs, 0u);
       continue;  // Equals destination or moved above.
     }
-    DCHECK_NE(GetCoreRegisterMask(dests[i].GetRegister().AsArm()) & dest_regs, 0u);
+    DCHECK_NE(GetCoreRegisterMask(dest_reg) & dest_regs, 0u);
     if (dests[i].GetSize() == 4u) {
       // Find next register to load.
       while (j != num_reg_dests &&
@@ -803,8 +799,7 @@
                    MemOperand(sp, srcs[i].GetFrameOffset().Uint32Value()));
           if (refs[i] != kInvalidReferenceOffset) {
             DCHECK_EQ(refs[i], srcs[i].GetFrameOffset());
-            ManagedRegister dest_i_reg = dests[i].GetRegister();
-            CreateJObject(dest_i_reg, refs[i], dest_i_reg, /*null_allowed=*/ i != 0u);
+            CreateJObject(dest_reg, refs[i], dest_reg, /*null_allowed=*/ i != 0u);
           }
           if (refs[j] != kInvalidReferenceOffset) {
             DCHECK_EQ(refs[j], srcs[j].GetFrameOffset());
@@ -818,7 +813,7 @@
           uint32_t first_sreg = GetSRegisterNumber(srcs[i].GetRegister().AsArm());
           if (IsAligned<2u>(first_sreg) &&
               first_sreg + 1u == GetSRegisterNumber(srcs[j].GetRegister().AsArm())) {
-            ___ Vmov(AsVIXLRegister(dests[i].GetRegister().AsArm()),
+            ___ Vmov(AsVIXLRegister(dest_reg),
                      AsVIXLRegister(dests[j].GetRegister().AsArm()),
                      vixl32::DRegister(first_sreg / 2u));
             ++j;
@@ -830,10 +825,9 @@
     if (srcs[i].IsRegister()) {
       Move(dests[i].GetRegister(), srcs[i].GetRegister(), dests[i].GetSize());
     } else if (refs[i] != kInvalidReferenceOffset) {
-      ManagedRegister dest_i_reg = dests[i].GetRegister();
-      CreateJObject(dest_i_reg, refs[i], dest_i_reg, /*null_allowed=*/ i != 0u);
+      CreateJObject(dest_reg, refs[i], ManagedRegister::NoRegister(), /*null_allowed=*/ i != 0u);
     } else {
-      Load(dests[i].GetRegister(), srcs[i].GetFrameOffset(), dests[i].GetSize());
+      Load(dest_reg, srcs[i].GetFrameOffset(), dests[i].GetSize());
     }
   }
 }
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index 073c2f0..0f1203e 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -45,6 +45,15 @@
 // STP signed offset for W-register can encode any 4-byte aligned offset smaller than this cutoff.
 static constexpr size_t kStpWOffsetCutoff = 256u;
 
+// STP signed offset for X-register can encode any 8-byte aligned offset smaller than this cutoff.
+static constexpr size_t kStpXOffsetCutoff = 512u;
+
+// STP signed offset for S-register can encode any 4-byte aligned offset smaller than this cutoff.
+static constexpr size_t kStpSOffsetCutoff = 256u;
+
+// STP signed offset for D-register can encode any 8-byte aligned offset smaller than this cutoff.
+static constexpr size_t kStpDOffsetCutoff = 512u;
+
 Arm64JNIMacroAssembler::~Arm64JNIMacroAssembler() {
 }
 
@@ -419,12 +428,16 @@
     }
   };
 
+  // More than 8 core or FP reg args are very rare, so we do not optimize for
+  // that case by using LDP/STP, except for situations that arise for normal
+  // native even with a low number of arguments. We use STP for the non-reference
+  // spilling which also covers the initial spill for native reference register
+  // args as they are spilled as raw 32-bit values. We also optimize loading
+  // args to registers with LDP, whether references or not, except for the
+  // initial non-null reference which we do not need to load at all.
+
   // Collect registers to move while storing/copying args to stack slots.
   // Convert processed references to `jobject`.
-  // More than 8 core or FP reg args are very rare, so we do not optimize
-  // for that case by using LDP/STP.
-  // TODO: LDP/STP will be useful for normal native methods where we need
-  // to spill even the leading arguments.
   uint64_t src_regs = 0u;
   uint64_t dest_regs = 0u;
   for (size_t i = 0; i != arg_count; ++i) {
@@ -465,7 +478,47 @@
       }
     } else {
       if (src.IsRegister()) {
-        Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+        static_assert(kStpWOffsetCutoff == kStpSOffsetCutoff);
+        static_assert(kStpXOffsetCutoff == kStpDOffsetCutoff);
+        if (i + 1u != arg_count &&
+            srcs[i + 1u].IsRegister() &&
+            srcs[i + 1u].GetSize() == dest.GetSize() &&
+            src.GetRegister().AsArm64().IsGPRegister() ==
+                srcs[i + 1u].GetRegister().AsArm64().IsGPRegister() &&
+            refs[i + 1u] == kInvalidReferenceOffset &&
+            !dests[i + 1u].IsRegister() &&
+            dests[i + 1u].GetFrameOffset().SizeValue() ==
+                dest.GetFrameOffset().SizeValue() + dest.GetSize() &&
+            IsAlignedParam(dest.GetFrameOffset().SizeValue(), dest.GetSize()) &&
+            dest.GetFrameOffset().SizeValue() <
+                (dest.GetSize() == 8u ? kStpXOffsetCutoff : kStpWOffsetCutoff)) {
+          DCHECK_EQ(dests[i + 1u].GetSize(), dest.GetSize());
+          Arm64ManagedRegister src_reg = src.GetRegister().AsArm64();
+          Arm64ManagedRegister src2_reg = srcs[i + 1u].GetRegister().AsArm64();
+          DCHECK_EQ(dest.GetSize() == 8u, src_reg.IsXRegister() || src_reg.IsDRegister());
+          DCHECK_EQ(dest.GetSize() == 8u, src2_reg.IsXRegister() || src2_reg.IsDRegister());
+          if (src_reg.IsWRegister()) {
+            ___ Stp(reg_w(src_reg.AsWRegister()),
+                    reg_w(src2_reg.AsWRegister()),
+                    MEM_OP(sp, dest.GetFrameOffset().SizeValue()));
+          } else if (src_reg.IsXRegister()) {
+            ___ Stp(reg_x(src_reg.AsXRegister()),
+                    reg_x(src2_reg.AsXRegister()),
+                    MEM_OP(sp, dest.GetFrameOffset().SizeValue()));
+          } else if (src_reg.IsSRegister()) {
+            ___ Stp(reg_s(src_reg.AsSRegister()),
+                    reg_s(src2_reg.AsSRegister()),
+                    MEM_OP(sp, dest.GetFrameOffset().SizeValue()));
+          } else {
+            DCHECK(src_reg.IsDRegister());
+            ___ Stp(reg_d(src_reg.AsDRegister()),
+                    reg_d(src2_reg.AsDRegister()),
+                    MEM_OP(sp, dest.GetFrameOffset().SizeValue()));
+          }
+          ++i;
+        } else {
+          Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+        }
       } else {
         Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
       }
@@ -482,12 +535,18 @@
       if (!dest.IsRegister()) {
         continue;  // Stored in first loop above.
       }
-      uint64_t dest_reg_mask = get_mask(dest.GetRegister());
-      if ((dest_reg_mask & dest_regs) == 0u) {
-        continue;  // Equals source, or already filled in one of previous iterations.
-      }
-      if ((dest_reg_mask & src_regs) != 0u) {
-        continue;  // Cannot clobber this register yet.
+      auto can_process = [&](ManagedRegister dest_reg) {
+        uint64_t dest_reg_mask = get_mask(dest_reg);
+        if ((dest_reg_mask & dest_regs) == 0u) {
+          return false;  // Equals source, or already filled in one of previous iterations.
+        }
+        if ((dest_reg_mask & src_regs) != 0u) {
+          return false;  // Cannot clobber this register yet.
+        }
+        return true;
+      };
+      if (!can_process(dest.GetRegister())) {
+        continue;
       }
       if (src.IsRegister()) {
         if (ref != kInvalidReferenceOffset) {
@@ -496,6 +555,58 @@
           Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
         }
         src_regs &= ~get_mask(src.GetRegister());  // Allow clobbering source register.
+      } else if (i + 1u != arg_count &&
+                 (i != 0u || ref == kInvalidReferenceOffset) &&  // Not for non-null reference.
+                 dests[i + 1u].IsRegister() &&
+                 dest.GetRegister().AsArm64().IsGPRegister() ==
+                     dests[i + 1u].GetRegister().AsArm64().IsGPRegister() &&
+                 !srcs[i + 1u].IsRegister() &&
+                 srcs[i + 1u].GetSize() == src.GetSize() &&
+                 srcs[i + 1u].GetFrameOffset().SizeValue() ==
+                     src.GetFrameOffset().SizeValue() + src.GetSize() &&
+                 IsAlignedParam(src.GetFrameOffset().SizeValue(), src.GetSize()) &&
+                 can_process(dests[i + 1u].GetRegister())) {
+        Arm64ManagedRegister dest_reg = dest.GetRegister().AsArm64();
+        Arm64ManagedRegister dest2_reg = dests[i + 1u].GetRegister().AsArm64();
+        DCHECK(ref == kInvalidReferenceOffset || dest_reg.IsXRegister());
+        DCHECK(refs[i + 1u] == kInvalidReferenceOffset || dest2_reg.IsXRegister());
+        if (dest_reg.IsDRegister()) {
+          DCHECK_EQ(dest.GetSize(), 8u);
+          DCHECK_EQ(dests[i + 1u].GetSize(), 8u);
+          ___ Ldp(reg_d(dest_reg.AsDRegister()),
+                  reg_d(dest2_reg.AsDRegister()),
+                  MEM_OP(sp, src.GetFrameOffset().SizeValue()));
+        } else if (dest_reg.IsSRegister()) {
+          DCHECK_EQ(dest.GetSize(), 4u);
+          DCHECK_EQ(dests[i + 1u].GetSize(), 4u);
+          ___ Ldp(reg_s(dest_reg.AsSRegister()),
+                  reg_s(dest2_reg.AsSRegister()),
+                  MEM_OP(sp, src.GetFrameOffset().SizeValue()));
+        } else if (src.GetSize() == 8u) {
+          DCHECK_EQ(dest.GetSize(), 8u);
+          DCHECK_EQ(dests[i + 1u].GetSize(), 8u);
+          ___ Ldp(reg_x(dest_reg.AsXRegister()),
+                  reg_x(dest2_reg.AsXRegister()),
+                  MEM_OP(sp, src.GetFrameOffset().SizeValue()));
+        } else {
+          DCHECK_EQ(dest.GetSize(), ref != kInvalidReferenceOffset ? 8u : 4u);
+          DCHECK_EQ(dests[i + 1u].GetSize(), refs[i + 1u] != kInvalidReferenceOffset ? 8u : 4u);
+          auto to_w = [](Arm64ManagedRegister reg) {
+            return reg_w(reg.IsXRegister() ? reg.AsOverlappingWRegister() : reg.AsWRegister());
+          };
+          ___ Ldp(to_w(dest_reg), to_w(dest2_reg), MEM_OP(sp, src.GetFrameOffset().SizeValue()));
+          auto to_mr_w = [](Arm64ManagedRegister reg) {
+            return Arm64ManagedRegister::FromWRegister(reg.AsOverlappingWRegister());
+          };
+          if (ref != kInvalidReferenceOffset) {
+            CreateJObject(dest_reg, ref, to_mr_w(dest_reg), /*null_allowed=*/ true);
+          }
+          if (refs[i + 1u] != kInvalidReferenceOffset) {
+            CreateJObject(dest2_reg, refs[i + 1u], to_mr_w(dest2_reg), /*null_allowed=*/ true);
+          }
+        }
+        dest_regs &= ~get_mask(dest2_reg);  // Destination register was filled.
+        ++i;  // Proceed to mark the other destination register as filled.
       } else {
         if (ref != kInvalidReferenceOffset) {
           CreateJObject(