Move @CriticalNative arguments in registers.

Also spill stack arguments directly to their target locations for the
native call instead of to the reserved space in the caller's frame.
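
For illustration, the new MoveArguments() implementations spill
register-to-stack arguments first and then fill destination registers,
writing a register only once it is no longer needed as a source. A
standalone sketch of that fill loop (simplified; the Move struct and the
plain register numbering below are invented for the example and are not
the ART interfaces):

  #include <cassert>
  #include <cstdint>
  #include <vector>

  // One register-to-register transfer of the parallel move.
  struct Move { int dest_reg; int src_reg; };  // Register numbers 0..31.

  // Assumes a cycle-free mapping where each source register feeds at
  // most one destination (as for the managed -> native shuffle).
  void EmitRegisterMoves(const std::vector<Move>& moves) {
    uint32_t src_regs = 0u;
    uint32_t dest_regs = 0u;
    for (const Move& m : moves) {
      if (m.dest_reg == m.src_reg) continue;  // Already in place.
      src_regs |= 1u << m.src_reg;
      dest_regs |= 1u << m.dest_reg;
    }
    while (dest_regs != 0u) {
      uint32_t old_dest_regs = dest_regs;
      for (const Move& m : moves) {
        uint32_t dest_mask = 1u << m.dest_reg;
        if ((dest_mask & dest_regs) == 0u) continue;  // No-op or already filled.
        if ((dest_mask & src_regs) != 0u) continue;   // Still needed as a source.
        // The real code emits "mov <dest_reg>, <src_reg>" here.
        src_regs &= ~(1u << m.src_reg);  // Source may now be clobbered.
        dest_regs &= ~dest_mask;         // Destination is filled.
      }
      assert(old_dest_regs != dest_regs && "a cycle would require a temp");
    }
  }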

Preliminary Golem results for art-opt-cc:
                            x86 x86-64    arm  arm64
NativeDowncallCritical6:    n/a +14.3% +17.2% +26.1%
(x86 currently seems to produce results worse than the interpreter,
so something is not working.)

Test: Additional tests in 178-app-image-native-method.
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: aosp_taimen-userdebug boots.
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 112189621
Change-Id: I709c52ab2585a8f5f441f53ad2bf4a01d2b25dca
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index ac060cc..036cdbb 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -24,6 +24,7 @@
 
 #include "art_method.h"
 #include "base/arena_allocator.h"
+#include "base/arena_containers.h"
 #include "base/enums.h"
 #include "base/logging.h"  // For VLOG.
 #include "base/macros.h"
@@ -227,13 +228,10 @@
   __ BuildFrame(current_frame_size, method_register, callee_save_regs);
   DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(current_frame_size));
 
-  {
+  if (LIKELY(!is_critical_native)) {
     // Spill all register arguments.
     // TODO: Spill reference args directly to the HandleScope.
     // TODO: Spill native stack args straight to their stack locations (adjust SP earlier).
-    // TODO: Move args in registers without spilling for @CriticalNative.
-    // TODO: Provide assembler API for batched moves to allow moving multiple arguments
-    // with single instruction (arm: LDRD/STRD/LDMIA/STMIA, arm64: LDP/STP).
     mr_conv->ResetIterator(FrameOffset(current_frame_size));
     for (; mr_conv->HasNext(); mr_conv->Next()) {
       if (mr_conv->IsCurrentParamInRegister()) {
@@ -241,9 +239,7 @@
         __ Store(mr_conv->CurrentParamStackOffset(), mr_conv->CurrentParamRegister(), size);
       }
     }
-  }
 
-  if (LIKELY(!is_critical_native)) {
     // NOTE: @CriticalNative methods don't have a HandleScope
     //       because they can't have any reference parameters or return values.
 
@@ -320,10 +316,6 @@
   size_t current_out_arg_size = main_out_arg_size;
   if (UNLIKELY(is_critical_native)) {
     DCHECK_EQ(main_out_arg_size, current_frame_size);
-    // Move the method pointer to the hidden argument register.
-    __ Move(main_jni_conv->HiddenArgumentRegister(),
-            mr_conv->MethodRegister(),
-            static_cast<size_t>(main_jni_conv->GetFramePointerSize()));
   } else {
     __ IncreaseFrameSize(main_out_arg_size);
     current_frame_size += main_out_arg_size;
@@ -434,65 +426,86 @@
     __ Store(saved_cookie_offset, main_jni_conv->IntReturnRegister(), 4 /* sizeof cookie */);
   }
 
-  // 7. Iterate over arguments placing values from managed calling convention in
-  //    to the convention required for a native call (shuffling). For references
-  //    place an index/pointer to the reference after checking whether it is
-  //    null (which must be encoded as null).
-  //    Note: we do this prior to materializing the JNIEnv* and static's jclass to
-  //    give as many free registers for the shuffle as possible.
-  mr_conv->ResetIterator(FrameOffset(current_frame_size));
-  uint32_t args_count = 0;
-  while (mr_conv->HasNext()) {
-    args_count++;
-    mr_conv->Next();
-  }
-
-  // Do a backward pass over arguments, so that the generated code will be "mov
-  // R2, R3; mov R1, R2" instead of "mov R1, R2; mov R2, R3."
-  // TODO: A reverse iterator to improve readability.
-  // TODO: This is currently useless as all archs spill args when building the frame.
-  //       To avoid the full spilling, we would have to do one pass before the BuildFrame()
-  //       to determine which arg registers are clobbered before they are needed.
-  // TODO: For @CriticalNative, do a forward pass because there are no JNIEnv* and jclass* args.
-  for (uint32_t i = 0; i < args_count; ++i) {
+  // 7. Fill arguments.
+  if (UNLIKELY(is_critical_native)) {
+    ArenaVector<ArgumentLocation> src_args(allocator.Adapter());
+    ArenaVector<ArgumentLocation> dest_args(allocator.Adapter());
+    // Move the method pointer to the hidden argument register.
+    size_t pointer_size = static_cast<size_t>(kPointerSize);
+    dest_args.push_back(ArgumentLocation(main_jni_conv->HiddenArgumentRegister(), pointer_size));
+    src_args.push_back(ArgumentLocation(mr_conv->MethodRegister(), pointer_size));
+    // Move normal arguments to their locations.
     mr_conv->ResetIterator(FrameOffset(current_frame_size));
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+    for (; mr_conv->HasNext(); mr_conv->Next(), main_jni_conv->Next()) {
+      DCHECK(main_jni_conv->HasNext());
+      size_t size = mr_conv->IsCurrentParamALongOrDouble() ? 8u : 4u;
+      src_args.push_back(mr_conv->IsCurrentParamInRegister()
+          ? ArgumentLocation(mr_conv->CurrentParamRegister(), size)
+          : ArgumentLocation(mr_conv->CurrentParamStackOffset(), size));
+      dest_args.push_back(main_jni_conv->IsCurrentParamInRegister()
+          ? ArgumentLocation(main_jni_conv->CurrentParamRegister(), size)
+          : ArgumentLocation(main_jni_conv->CurrentParamStackOffset(), size));
+    }
+    DCHECK(!main_jni_conv->HasNext());
+    __ MoveArguments(ArrayRef<ArgumentLocation>(dest_args), ArrayRef<ArgumentLocation>(src_args));
+  } else {
+    // Iterate over arguments placing values from managed calling convention in
+    // to the convention required for a native call (shuffling). For references
+    // place an index/pointer to the reference after checking whether it is
+    // null (which must be encoded as null).
+    // Note: we do this prior to materializing the JNIEnv* and static's jclass to
+    // give as many free registers for the shuffle as possible.
+    mr_conv->ResetIterator(FrameOffset(current_frame_size));
+    uint32_t args_count = 0;
+    while (mr_conv->HasNext()) {
+      args_count++;
+      mr_conv->Next();
+    }
 
-    // Skip the extra JNI parameters for now.
-    if (LIKELY(!is_critical_native)) {
+    // Do a backward pass over arguments, so that the generated code will be "mov
+    // R2, R3; mov R1, R2" instead of "mov R1, R2; mov R2, R3."
+    // TODO: A reverse iterator to improve readability.
+    // TODO: This is currently useless as all archs spill args when building the frame.
+    //       To avoid the full spilling, we would have to do one pass before the BuildFrame()
+    //       to determine which arg registers are clobbered before they are needed.
+    for (uint32_t i = 0; i < args_count; ++i) {
+      mr_conv->ResetIterator(FrameOffset(current_frame_size));
+      main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+
+      // Skip the extra JNI parameters for now.
       main_jni_conv->Next();    // Skip JNIEnv*.
       if (is_static) {
         main_jni_conv->Next();  // Skip Class for now.
       }
+      // Skip to the argument we're interested in.
+      for (uint32_t j = 0; j < args_count - i - 1; ++j) {
+        mr_conv->Next();
+        main_jni_conv->Next();
+      }
+      CopyParameter(jni_asm.get(), mr_conv.get(), main_jni_conv.get());
     }
-    // Skip to the argument we're interested in.
-    for (uint32_t j = 0; j < args_count - i - 1; ++j) {
-      mr_conv->Next();
-      main_jni_conv->Next();
-    }
-    CopyParameter(jni_asm.get(), mr_conv.get(), main_jni_conv.get());
-  }
-  if (is_static && !is_critical_native) {
-    // Create argument for Class
-    mr_conv->ResetIterator(FrameOffset(current_frame_size));
-    main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-    main_jni_conv->Next();  // Skip JNIEnv*
-    FrameOffset handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
-    if (main_jni_conv->IsCurrentParamOnStack()) {
-      FrameOffset out_off = main_jni_conv->CurrentParamStackOffset();
-      __ CreateHandleScopeEntry(out_off, handle_scope_offset, /*null_allowed=*/ false);
-    } else {
-      ManagedRegister out_reg = main_jni_conv->CurrentParamRegister();
-      __ CreateHandleScopeEntry(out_reg,
-                                handle_scope_offset,
-                                ManagedRegister::NoRegister(),
+    if (is_static) {
+      // Create argument for Class
+      mr_conv->ResetIterator(FrameOffset(current_frame_size));
+      main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+      main_jni_conv->Next();  // Skip JNIEnv*
+      FrameOffset handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
+      if (main_jni_conv->IsCurrentParamOnStack()) {
+        FrameOffset out_off = main_jni_conv->CurrentParamStackOffset();
+        __ CreateHandleScopeEntry(out_off, handle_scope_offset, /*null_allowed=*/ false);
+      } else {
+        ManagedRegister out_reg = main_jni_conv->CurrentParamRegister();
+        __ CreateHandleScopeEntry(out_reg,
+                                  handle_scope_offset,
+                                  ManagedRegister::NoRegister(),
                                 /*null_allowed=*/ false);
+      }
     }
-  }
 
-  // Set the iterator back to the incoming Method*.
-  main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-  if (LIKELY(!is_critical_native)) {
+    // Set the iterator back to the incoming Method*.
+    main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+
     // 8. Create 1st argument, the JNI environment ptr.
     // Register that will hold local indirect reference table
     if (main_jni_conv->IsCurrentParamInRegister()) {
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index 5a355be..85b253c 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -41,6 +41,9 @@
 static constexpr size_t kAapcsStackAlignment = 8u;
 static_assert(kAapcsStackAlignment < kStackAlignment);
 
+// STRD immediate can encode any 4-byte aligned offset smaller than this cutoff.
+static constexpr size_t kStrdOffsetCutoff = 1024u;
+
 vixl::aarch32::Register AsVIXLRegister(ArmManagedRegister reg) {
   CHECK(reg.IsCoreRegister());
   return vixl::aarch32::Register(reg.RegId());
@@ -223,8 +226,9 @@
     asm_.StoreToOffset(kStoreWord, AsVIXLRegister(src), sp, dest.Int32Value());
   } else if (src.IsRegisterPair()) {
     CHECK_EQ(8u, size);
-    asm_.StoreToOffset(kStoreWord, AsVIXLRegisterPairLow(src),  sp, dest.Int32Value());
-    asm_.StoreToOffset(kStoreWord, AsVIXLRegisterPairHigh(src), sp, dest.Int32Value() + 4);
+    ___ Strd(AsVIXLRegisterPairLow(src),
+             AsVIXLRegisterPairHigh(src),
+             MemOperand(sp, dest.Int32Value()));
   } else if (src.IsSRegister()) {
     CHECK_EQ(4u, size);
     asm_.StoreSToOffset(AsVIXLSRegister(src), sp, dest.Int32Value());
@@ -365,6 +369,310 @@
   UNIMPLEMENTED(FATAL) << "no zero extension necessary for arm";
 }
 
+static inline bool IsCoreRegisterOrPair(ArmManagedRegister reg) {
+  return reg.IsCoreRegister() || reg.IsRegisterPair();
+}
+
+static inline bool NoSpillGap(const ArgumentLocation& loc1, const ArgumentLocation& loc2) {
+  DCHECK(!loc1.IsRegister());
+  DCHECK(!loc2.IsRegister());
+  uint32_t loc1_offset = loc1.GetFrameOffset().Uint32Value();
+  uint32_t loc2_offset = loc2.GetFrameOffset().Uint32Value();
+  DCHECK_LT(loc1_offset, loc2_offset);
+  return loc1_offset + loc1.GetSize() == loc2_offset;
+}
+
+static inline uint32_t GetSRegisterNumber(ArmManagedRegister reg) {
+  if (reg.IsSRegister()) {
+    return static_cast<uint32_t>(reg.AsSRegister());
+  } else {
+    DCHECK(reg.IsDRegister());
+    return 2u * static_cast<uint32_t>(reg.AsDRegister());
+  }
+}
+
+// Get the number of locations to spill together.
+static inline size_t GetSpillChunkSize(ArrayRef<ArgumentLocation> dests,
+                                       ArrayRef<ArgumentLocation> srcs,
+                                       size_t start,
+                                       bool have_extra_temp) {
+  DCHECK_LT(start, dests.size());
+  DCHECK_ALIGNED(dests[start].GetFrameOffset().Uint32Value(), 4u);
+  const ArgumentLocation& first_src = srcs[start];
+  if (!first_src.IsRegister()) {
+    DCHECK_ALIGNED(first_src.GetFrameOffset().Uint32Value(), 4u);
+    // If we have an extra temporary, look for opportunities to move 2 words
+    // at a time with LDRD/STRD when the source types are word-sized.
+    if (have_extra_temp &&
+        start + 1u != dests.size() &&
+        !srcs[start + 1u].IsRegister() &&
+        first_src.GetSize() == 4u &&
+        srcs[start + 1u].GetSize() == 4u &&
+        NoSpillGap(first_src, srcs[start + 1u]) &&
+        NoSpillGap(dests[start], dests[start + 1u]) &&
+        dests[start].GetFrameOffset().Uint32Value() < kStrdOffsetCutoff) {
+      // Note: The source and destination may not be 8B aligned (but they are 4B aligned).
+      return 2u;
+    }
+    return 1u;
+  }
+  ArmManagedRegister first_src_reg = first_src.GetRegister().AsArm();
+  size_t end = start + 1u;
+  if (IsCoreRegisterOrPair(first_src_reg)) {
+    while (end != dests.size() &&
+           NoSpillGap(dests[end - 1u], dests[end]) &&
+           srcs[end].IsRegister() &&
+           IsCoreRegisterOrPair(srcs[end].GetRegister().AsArm())) {
+      ++end;
+    }
+  } else {
+    DCHECK(first_src_reg.IsSRegister() || first_src_reg.IsDRegister());
+    uint32_t next_sreg = GetSRegisterNumber(first_src_reg) + first_src.GetSize() / kSRegSizeInBytes;
+    while (end != dests.size() &&
+           NoSpillGap(dests[end - 1u], dests[end]) &&
+           srcs[end].IsRegister() &&
+           !IsCoreRegisterOrPair(srcs[end].GetRegister().AsArm()) &&
+           GetSRegisterNumber(srcs[end].GetRegister().AsArm()) == next_sreg) {
+      next_sreg += srcs[end].GetSize() / kSRegSizeInBytes;
+      ++end;
+    }
+  }
+  return end - start;
+}
+
+static inline uint32_t GetCoreRegisterMask(ArmManagedRegister reg) {
+  if (reg.IsCoreRegister()) {
+    return 1u << static_cast<size_t>(reg.AsCoreRegister());
+  } else {
+    DCHECK(reg.IsRegisterPair());
+    DCHECK_LT(reg.AsRegisterPairLow(), reg.AsRegisterPairHigh());
+    return (1u << static_cast<size_t>(reg.AsRegisterPairLow())) |
+           (1u << static_cast<size_t>(reg.AsRegisterPairHigh()));
+  }
+}
+
+static inline uint32_t GetCoreRegisterMask(ArrayRef<ArgumentLocation> srcs) {
+  uint32_t mask = 0u;
+  for (const ArgumentLocation& loc : srcs) {
+    DCHECK(loc.IsRegister());
+    mask |= GetCoreRegisterMask(loc.GetRegister().AsArm());
+  }
+  return mask;
+}
+
+static inline bool UseStrdForChunk(ArrayRef<ArgumentLocation> srcs, size_t start, size_t length) {
+  DCHECK_GE(length, 2u);
+  DCHECK(srcs[start].IsRegister());
+  DCHECK(srcs[start + 1u].IsRegister());
+  // The destination may not be 8B aligned (but it is 4B aligned).
+  // Allow an arbitrary destination offset; the macro assembler will use a temp if needed.
+  // Note: T32 allows unrelated registers in STRD. (A32 does not.)
+  return length == 2u &&
+         srcs[start].GetRegister().AsArm().IsCoreRegister() &&
+         srcs[start + 1u].GetRegister().AsArm().IsCoreRegister();
+}
+
+static inline bool UseVstrForChunk(ArrayRef<ArgumentLocation> srcs, size_t start, size_t length) {
+  DCHECK_GE(length, 2u);
+  DCHECK(srcs[start].IsRegister());
+  DCHECK(srcs[start + 1u].IsRegister());
+  // The destination may not be 8B aligned (but it is 4B aligned).
+  // Allow an arbitrary destination offset; the macro assembler will use a temp if needed.
+  return length == 2u &&
+         srcs[start].GetRegister().AsArm().IsSRegister() &&
+         srcs[start + 1u].GetRegister().AsArm().IsSRegister() &&
+         IsAligned<2u>(static_cast<size_t>(srcs[start].GetRegister().AsArm().AsSRegister()));
+}
+
+void ArmVIXLJNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
+                                             ArrayRef<ArgumentLocation> srcs) {
+  DCHECK_EQ(dests.size(), srcs.size());
+
+  // Native ABI is soft-float, so all destinations should be core registers or stack offsets.
+  // And register locations should be first, followed by stack locations with increasing offset.
+  auto is_register = [](const ArgumentLocation& loc) { return loc.IsRegister(); };
+  DCHECK(std::is_partitioned(dests.begin(), dests.end(), is_register));
+  size_t num_reg_dests =
+      std::distance(dests.begin(), std::partition_point(dests.begin(), dests.end(), is_register));
+  DCHECK(std::is_sorted(
+      dests.begin() + num_reg_dests,
+      dests.end(),
+      [](const ArgumentLocation& lhs, const ArgumentLocation& rhs) {
+        return lhs.GetFrameOffset().Uint32Value() < rhs.GetFrameOffset().Uint32Value();
+      }));
+
+  // Collect registers to move. No need to record FP regs as destinations are only core regs.
+  uint32_t src_regs = 0u;
+  uint32_t dest_regs = 0u;
+  for (size_t i = 0; i != num_reg_dests; ++i) {
+    const ArgumentLocation& src = srcs[i];
+    const ArgumentLocation& dest = dests[i];
+    DCHECK(dest.IsRegister() && IsCoreRegisterOrPair(dest.GetRegister().AsArm()));
+    if (src.IsRegister() && IsCoreRegisterOrPair(src.GetRegister().AsArm())) {
+      if (src.GetRegister().Equals(dest.GetRegister())) {
+        continue;
+      }
+      src_regs |= GetCoreRegisterMask(src.GetRegister().AsArm());
+    }
+    dest_regs |= GetCoreRegisterMask(dest.GetRegister().AsArm());
+  }
+
+  // Spill args first. Look for opportunities to spill multiple arguments at once.
+  {
+    UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+    vixl32::Register xtemp;  // Extra temp register.
+    if ((dest_regs & ~src_regs) != 0u) {
+      xtemp = vixl32::Register(CTZ(dest_regs & ~src_regs));
+      DCHECK(!temps.IsAvailable(xtemp));
+    }
+    auto move_two_words = [&](FrameOffset dest_offset, FrameOffset src_offset) {
+      DCHECK(xtemp.IsValid());
+      DCHECK_LT(dest_offset.Uint32Value(), kStrdOffsetCutoff);
+      // VIXL macro assembler can use destination registers for loads from large offsets.
+      UseScratchRegisterScope temps2(asm_.GetVIXLAssembler());
+      vixl32::Register temp2 = temps2.Acquire();
+      ___ Ldrd(xtemp, temp2, MemOperand(sp, src_offset.Uint32Value()));
+      ___ Strd(xtemp, temp2, MemOperand(sp, dest_offset.Uint32Value()));
+    };
+    for (size_t i = num_reg_dests, arg_count = dests.size(); i != arg_count; ) {
+      const ArgumentLocation& src = srcs[i];
+      const ArgumentLocation& dest = dests[i];
+      DCHECK_EQ(src.GetSize(), dest.GetSize());
+      DCHECK(!dest.IsRegister());
+      uint32_t frame_offset = dest.GetFrameOffset().Uint32Value();
+      size_t chunk_size = GetSpillChunkSize(dests, srcs, i, xtemp.IsValid());
+      DCHECK_NE(chunk_size, 0u);
+      if (chunk_size == 1u) {
+        if (src.IsRegister()) {
+          Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+        } else if (dest.GetSize() == 8u && xtemp.IsValid() && frame_offset < kStrdOffsetCutoff) {
+          move_two_words(dest.GetFrameOffset(), src.GetFrameOffset());
+        } else {
+          Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
+        }
+      } else if (!src.IsRegister()) {
+        DCHECK_EQ(chunk_size, 2u);
+        DCHECK_EQ(dest.GetSize(), 4u);
+        DCHECK_EQ(dests[i + 1u].GetSize(), 4u);
+        move_two_words(dest.GetFrameOffset(), src.GetFrameOffset());
+      } else if (UseStrdForChunk(srcs, i, chunk_size)) {
+        ___ Strd(AsVIXLRegister(srcs[i].GetRegister().AsArm()),
+                 AsVIXLRegister(srcs[i + 1u].GetRegister().AsArm()),
+                 MemOperand(sp, frame_offset));
+      } else if (UseVstrForChunk(srcs, i, chunk_size)) {
+        size_t sreg = GetSRegisterNumber(src.GetRegister().AsArm());
+        DCHECK_ALIGNED(sreg, 2u);
+        ___ Vstr(vixl32::DRegister(sreg / 2u), MemOperand(sp, frame_offset));
+      } else {
+        UseScratchRegisterScope temps2(asm_.GetVIXLAssembler());
+        vixl32::Register base_reg;
+        if (frame_offset == 0u) {
+          base_reg = sp;
+        } else {
+          base_reg = temps2.Acquire();
+          ___ Add(base_reg, sp, frame_offset);
+        }
+
+        ArmManagedRegister src_reg = src.GetRegister().AsArm();
+        if (IsCoreRegisterOrPair(src_reg)) {
+          uint32_t core_reg_mask = GetCoreRegisterMask(srcs.SubArray(i, chunk_size));
+          ___ Stm(base_reg, NO_WRITE_BACK, RegisterList(core_reg_mask));
+        } else {
+          uint32_t start_sreg = GetSRegisterNumber(src_reg);
+          const ArgumentLocation& last_dest = dests[i + chunk_size - 1u];
+          uint32_t total_size =
+              last_dest.GetFrameOffset().Uint32Value() + last_dest.GetSize() - frame_offset;
+          if (IsAligned<2u>(start_sreg) &&
+              IsAligned<kDRegSizeInBytes>(frame_offset) &&
+              IsAligned<kDRegSizeInBytes>(total_size)) {
+            uint32_t dreg_count = total_size / kDRegSizeInBytes;
+            DRegisterList dreg_list(vixl32::DRegister(start_sreg / 2u), dreg_count);
+            ___ Vstm(F64, base_reg, NO_WRITE_BACK, dreg_list);
+          } else {
+            uint32_t sreg_count = total_size / kSRegSizeInBytes;
+            SRegisterList sreg_list(vixl32::SRegister(start_sreg), sreg_count);
+            ___ Vstm(F32, base_reg, NO_WRITE_BACK, sreg_list);
+          }
+        }
+      }
+      i += chunk_size;
+    }
+  }
+
+  // Fill destination registers from source core registers.
+  // There should be no cycles, so this algorithm should make progress.
+  while (src_regs != 0u) {
+    uint32_t old_src_regs = src_regs;
+    for (size_t i = 0; i != num_reg_dests; ++i) {
+      DCHECK(dests[i].IsRegister() && IsCoreRegisterOrPair(dests[i].GetRegister().AsArm()));
+      if (!srcs[i].IsRegister() || !IsCoreRegisterOrPair(srcs[i].GetRegister().AsArm())) {
+        continue;
+      }
+      uint32_t dest_reg_mask = GetCoreRegisterMask(dests[i].GetRegister().AsArm());
+      if ((dest_reg_mask & dest_regs) == 0u) {
+        continue;  // Equals source, or already filled in one of previous iterations.
+      }
+      // There are no partial overlaps of 8-byte arguments; otherwise we would have to
+      // tweak this check; Move() can deal with partial overlap for historical reasons.
+      if ((dest_reg_mask & src_regs) != 0u) {
+        continue;  // Cannot clobber this register yet.
+      }
+      Move(dests[i].GetRegister(), srcs[i].GetRegister(), dests[i].GetSize());
+      uint32_t src_reg_mask = GetCoreRegisterMask(srcs[i].GetRegister().AsArm());
+      DCHECK_EQ(src_regs & src_reg_mask, src_reg_mask);
+      src_regs &= ~src_reg_mask;  // Allow clobbering the source register or pair.
+      dest_regs &= ~dest_reg_mask;  // Destination register or pair was filled.
+    }
+    CHECK_NE(old_src_regs, src_regs);
+    DCHECK_EQ(0u, src_regs & ~old_src_regs);
+  }
+
+  // Now fill destination registers from FP registers or stack slots, looking for
+  // opportunities to use LDRD/VMOV to fill 2 registers with one instruction.
+  for (size_t i = 0, j; i != num_reg_dests; i = j) {
+    j = i + 1u;
+    DCHECK(dests[i].IsRegister() && IsCoreRegisterOrPair(dests[i].GetRegister().AsArm()));
+    if (srcs[i].IsRegister() && IsCoreRegisterOrPair(srcs[i].GetRegister().AsArm())) {
+      DCHECK_EQ(GetCoreRegisterMask(dests[i].GetRegister().AsArm()) & dest_regs, 0u);
+      continue;  // Equals destination or moved above.
+    }
+    DCHECK_NE(GetCoreRegisterMask(dests[i].GetRegister().AsArm()) & dest_regs, 0u);
+    if (dests[i].GetSize() == 4u) {
+      // Find next register to load.
+      while (j != num_reg_dests &&
+             (srcs[j].IsRegister() && IsCoreRegisterOrPair(srcs[j].GetRegister().AsArm()))) {
+        DCHECK_EQ(GetCoreRegisterMask(dests[j].GetRegister().AsArm()) & dest_regs, 0u);
+        ++j;  // Equals destination or moved above.
+      }
+      if (j != num_reg_dests && dests[j].GetSize() == 4u) {
+        if (!srcs[i].IsRegister() && !srcs[j].IsRegister() && NoSpillGap(srcs[i], srcs[j])) {
+          ___ Ldrd(AsVIXLRegister(dests[i].GetRegister().AsArm()),
+                   AsVIXLRegister(dests[j].GetRegister().AsArm()),
+                   MemOperand(sp, srcs[i].GetFrameOffset().Uint32Value()));
+          ++j;
+          continue;
+        }
+        if (srcs[i].IsRegister() && srcs[j].IsRegister()) {
+          uint32_t first_sreg = GetSRegisterNumber(srcs[i].GetRegister().AsArm());
+          if (IsAligned<2u>(first_sreg) &&
+              first_sreg + 1u == GetSRegisterNumber(srcs[j].GetRegister().AsArm())) {
+            ___ Vmov(AsVIXLRegister(dests[i].GetRegister().AsArm()),
+                     AsVIXLRegister(dests[j].GetRegister().AsArm()),
+                     vixl32::DRegister(first_sreg / 2u));
+            ++j;
+            continue;
+          }
+        }
+      }
+    }
+    if (srcs[i].IsRegister()) {
+      Move(dests[i].GetRegister(), srcs[i].GetRegister(), dests[i].GetSize());
+    } else {
+      Load(dests[i].GetRegister(), srcs[i].GetFrameOffset(), dests[i].GetSize());
+    }
+  }
+}
+
 void ArmVIXLJNIMacroAssembler::Move(ManagedRegister mdst,
                                     ManagedRegister msrc,
                                     size_t size  ATTRIBUTE_UNUSED) {
@@ -387,8 +695,12 @@
   ArmManagedRegister src = msrc.AsArm();
   if (!dst.Equals(src)) {
     if (dst.IsCoreRegister()) {
-      CHECK(src.IsCoreRegister()) << src;
-      ___ Mov(AsVIXLRegister(dst), AsVIXLRegister(src));
+      if (src.IsCoreRegister()) {
+        ___ Mov(AsVIXLRegister(dst), AsVIXLRegister(src));
+      } else {
+        CHECK(src.IsSRegister()) << src;
+        ___ Vmov(AsVIXLRegister(dst), AsVIXLSRegister(src));
+      }
     } else if (dst.IsDRegister()) {
       if (src.IsDRegister()) {
         ___ Vmov(F64, AsVIXLDRegister(dst), AsVIXLDRegister(src));
@@ -407,14 +719,18 @@
       }
     } else {
       CHECK(dst.IsRegisterPair()) << dst;
-      CHECK(src.IsRegisterPair()) << src;
-      // Ensure that the first move doesn't clobber the input of the second.
-      if (src.AsRegisterPairHigh() != dst.AsRegisterPairLow()) {
-        ___ Mov(AsVIXLRegisterPairLow(dst),  AsVIXLRegisterPairLow(src));
-        ___ Mov(AsVIXLRegisterPairHigh(dst), AsVIXLRegisterPairHigh(src));
+      if (src.IsRegisterPair()) {
+        // Ensure that the first move doesn't clobber the input of the second.
+        if (src.AsRegisterPairHigh() != dst.AsRegisterPairLow()) {
+          ___ Mov(AsVIXLRegisterPairLow(dst),  AsVIXLRegisterPairLow(src));
+          ___ Mov(AsVIXLRegisterPairHigh(dst), AsVIXLRegisterPairHigh(src));
+        } else {
+          ___ Mov(AsVIXLRegisterPairHigh(dst), AsVIXLRegisterPairHigh(src));
+          ___ Mov(AsVIXLRegisterPairLow(dst),  AsVIXLRegisterPairLow(src));
+        }
       } else {
-        ___ Mov(AsVIXLRegisterPairHigh(dst), AsVIXLRegisterPairHigh(src));
-        ___ Mov(AsVIXLRegisterPairLow(dst),  AsVIXLRegisterPairLow(src));
+        CHECK(src.IsDRegister()) << src;
+        ___ Vmov(AsVIXLRegisterPairLow(dst), AsVIXLRegisterPairHigh(dst), AsVIXLDRegister(src));
       }
     }
   }
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
index 2bd571e..2f6813a 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
@@ -93,6 +93,8 @@
   void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset32 offs) override;
 
   // Copying routines.
+  void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
+
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
 
   void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset32 thr_offs) override;
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index a31ed93..bb93a96 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -326,6 +326,88 @@
 }
 
 // Copying routines.
+void Arm64JNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
+                                           ArrayRef<ArgumentLocation> srcs) {
+  DCHECK_EQ(dests.size(), srcs.size());
+  auto get_mask = [](ManagedRegister reg) -> uint64_t {
+    Arm64ManagedRegister arm64_reg = reg.AsArm64();
+    if (arm64_reg.IsXRegister()) {
+      size_t core_reg_number = static_cast<size_t>(arm64_reg.AsXRegister());
+      DCHECK_LT(core_reg_number, 31u);  // xSP, xZR not allowed.
+      return UINT64_C(1) << core_reg_number;
+    } else if (arm64_reg.IsWRegister()) {
+      size_t core_reg_number = static_cast<size_t>(arm64_reg.AsWRegister());
+      DCHECK_LT(core_reg_number, 31u);  // wSP, wZR not allowed.
+      return UINT64_C(1) << core_reg_number;
+    } else if (arm64_reg.IsDRegister()) {
+      size_t fp_reg_number = static_cast<size_t>(arm64_reg.AsDRegister());
+      DCHECK_LT(fp_reg_number, 32u);
+      return (UINT64_C(1) << 32u) << fp_reg_number;
+    } else {
+      DCHECK(arm64_reg.IsSRegister());
+      size_t fp_reg_number = static_cast<size_t>(arm64_reg.AsSRegister());
+      DCHECK_LT(fp_reg_number, 32u);
+      return (UINT64_C(1) << 32u) << fp_reg_number;
+    }
+  };
+  // Collect registers to move while storing/copying args to stack slots.
+  // More than 8 core or FP reg args are very rare, so we do not optimize
+  // for that case by using LDP/STP.
+  // TODO: LDP/STP will be useful for normal and @FastNative where we need
+  // to spill even the leading arguments.
+  uint64_t src_regs = 0u;
+  uint64_t dest_regs = 0u;
+  for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+    const ArgumentLocation& src = srcs[i];
+    const ArgumentLocation& dest = dests[i];
+    DCHECK_EQ(src.GetSize(), dest.GetSize());
+    if (dest.IsRegister()) {
+      if (src.IsRegister() && src.GetRegister().Equals(dest.GetRegister())) {
+        // Nothing to do.
+      } else {
+        if (src.IsRegister()) {
+          src_regs |= get_mask(src.GetRegister());
+        }
+        dest_regs |= get_mask(dest.GetRegister());
+      }
+    } else {
+      if (src.IsRegister()) {
+        Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+      } else {
+        Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
+      }
+    }
+  }
+  // Fill destination registers.
+  // There should be no cycles, so this simple algorithm should make progress.
+  while (dest_regs != 0u) {
+    uint64_t old_dest_regs = dest_regs;
+    for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+      const ArgumentLocation& src = srcs[i];
+      const ArgumentLocation& dest = dests[i];
+      if (!dest.IsRegister()) {
+        continue;  // Stored in first loop above.
+      }
+      uint64_t dest_reg_mask = get_mask(dest.GetRegister());
+      if ((dest_reg_mask & dest_regs) == 0u) {
+        continue;  // Equals source, or already filled in one of previous iterations.
+      }
+      if ((dest_reg_mask & src_regs) != 0u) {
+        continue;  // Cannot clobber this register yet.
+      }
+      if (src.IsRegister()) {
+        Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
+        src_regs &= ~get_mask(src.GetRegister());  // Allow clobbering source register.
+      } else {
+        Load(dest.GetRegister(), src.GetFrameOffset(), dest.GetSize());
+      }
+      dest_regs &= ~get_mask(dest.GetRegister());  // Destination register was filled.
+    }
+    CHECK_NE(old_dest_regs, dest_regs);
+    DCHECK_EQ(0u, dest_regs & ~old_dest_regs);
+  }
+}
+
 void Arm64JNIMacroAssembler::Move(ManagedRegister m_dst, ManagedRegister m_src, size_t size) {
   Arm64ManagedRegister dst = m_dst.AsArm64();
   if (kIsDebugBuild) {
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.h b/compiler/utils/arm64/jni_macro_assembler_arm64.h
index 64b5595..9f3eea2 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.h
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.h
@@ -85,6 +85,7 @@
   void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset64 offs) override;
 
   // Copying routines.
+  void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
   void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset64 thr_offs) override;
   void CopyRawPtrToThread(ThreadOffset64 thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
diff --git a/compiler/utils/jni_macro_assembler.h b/compiler/utils/jni_macro_assembler.h
index 48b3f01..3490959 100644
--- a/compiler/utils/jni_macro_assembler.h
+++ b/compiler/utils/jni_macro_assembler.h
@@ -43,6 +43,40 @@
   kNotZero
 };
 
+class ArgumentLocation {
+ public:
+  ArgumentLocation(ManagedRegister reg, size_t size)
+      : reg_(reg), frame_offset_(0u), size_(size) {
+    DCHECK(reg.IsRegister());
+  }
+
+  ArgumentLocation(FrameOffset frame_offset, size_t size)
+      : reg_(ManagedRegister::NoRegister()), frame_offset_(frame_offset), size_(size) {}
+
+  bool IsRegister() const {
+    return reg_.IsRegister();
+  }
+
+  ManagedRegister GetRegister() const {
+    DCHECK(IsRegister());
+    return reg_;
+  }
+
+  FrameOffset GetFrameOffset() const {
+    DCHECK(!IsRegister());
+    return frame_offset_;
+  }
+
+  size_t GetSize() const {
+    return size_;
+  }
+
+ private:
+  ManagedRegister reg_;
+  FrameOffset frame_offset_;
+  size_t size_;
+};
+
 template <PointerSize kPointerSize>
 class JNIMacroAssembler : public DeletableArenaObject<kArenaAllocAssembler> {
  public:
@@ -112,6 +146,8 @@
   virtual void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset<kPointerSize> offs) = 0;
 
   // Copying routines
+  virtual void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) = 0;
+
   virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size) = 0;
 
   virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset<kPointerSize> thr_offs) = 0;
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 1adcc20..67ec93d 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -300,6 +300,30 @@
   }
 }
 
+void X86JNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
+                                         ArrayRef<ArgumentLocation> srcs) {
+  DCHECK_EQ(dests.size(), srcs.size());
+  bool found_hidden_arg = false;
+  for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+    const ArgumentLocation& src = srcs[i];
+    const ArgumentLocation& dest = dests[i];
+    DCHECK_EQ(src.GetSize(), dest.GetSize());
+    if (UNLIKELY(dest.IsRegister())) {
+      // Native ABI has only stack arguments, but we may pass one "hidden arg" in a register.
+      CHECK(!found_hidden_arg);
+      found_hidden_arg = true;
+      CHECK(src.IsRegister());
+      Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
+    } else {
+      if (src.IsRegister()) {
+        Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+      } else {
+        Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
+      }
+    }
+  }
+}
+
 void X86JNIMacroAssembler::Move(ManagedRegister mdest, ManagedRegister msrc, size_t size) {
   DCHECK(!mdest.Equals(X86ManagedRegister::FromCpuRegister(GetScratchRegister())));
   X86ManagedRegister dest = mdest.AsX86();
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.h b/compiler/utils/x86/jni_macro_assembler_x86.h
index 1223471..0239ff7 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.h
+++ b/compiler/utils/x86/jni_macro_assembler_x86.h
@@ -82,6 +82,8 @@
   void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset32 offs) override;
 
   // Copying routines
+  void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
+
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
 
   void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset32 thr_offs) override;
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index d57ea41..2649084 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -338,6 +338,76 @@
   }
 }
 
+void X86_64JNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
+                                            ArrayRef<ArgumentLocation> srcs) {
+  DCHECK_EQ(dests.size(), srcs.size());
+  auto get_mask = [](ManagedRegister reg) -> uint32_t {
+    X86_64ManagedRegister x86_64_reg = reg.AsX86_64();
+    if (x86_64_reg.IsCpuRegister()) {
+      size_t cpu_reg_number = static_cast<size_t>(x86_64_reg.AsCpuRegister().AsRegister());
+      DCHECK_LT(cpu_reg_number, 16u);
+      return 1u << cpu_reg_number;
+    } else {
+      DCHECK(x86_64_reg.IsXmmRegister());
+      size_t xmm_reg_number = static_cast<size_t>(x86_64_reg.AsXmmRegister().AsFloatRegister());
+      DCHECK_LT(xmm_reg_number, 16u);
+      return (1u << 16u) << xmm_reg_number;
+    }
+  };
+  // Collect registers to move while storing/copying args to stack slots.
+  uint32_t src_regs = 0u;
+  uint32_t dest_regs = 0u;
+  for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+    const ArgumentLocation& src = srcs[i];
+    const ArgumentLocation& dest = dests[i];
+    DCHECK_EQ(src.GetSize(), dest.GetSize());
+    if (dest.IsRegister()) {
+      if (src.IsRegister() && src.GetRegister().Equals(dest.GetRegister())) {
+        // Nothing to do.
+      } else {
+        if (src.IsRegister()) {
+          src_regs |= get_mask(src.GetRegister());
+        }
+        dest_regs |= get_mask(dest.GetRegister());
+      }
+    } else {
+      if (src.IsRegister()) {
+        Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+      } else {
+        Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
+      }
+    }
+  }
+  // Fill destination registers.
+  // There should be no cycles, so this simple algorithm should make progress.
+  while (dest_regs != 0u) {
+    uint32_t old_dest_regs = dest_regs;
+    for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+      const ArgumentLocation& src = srcs[i];
+      const ArgumentLocation& dest = dests[i];
+      if (!dest.IsRegister()) {
+        continue;  // Stored in first loop above.
+      }
+      uint32_t dest_reg_mask = get_mask(dest.GetRegister());
+      if ((dest_reg_mask & dest_regs) == 0u) {
+        continue;  // Equals source, or already filled in one of previous iterations.
+      }
+      if ((dest_reg_mask & src_regs) != 0u) {
+        continue;  // Cannot clobber this register yet.
+      }
+      if (src.IsRegister()) {
+        Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
+        src_regs &= ~get_mask(src.GetRegister());  // Allow clobbering source register.
+      } else {
+        Load(dest.GetRegister(), src.GetFrameOffset(), dest.GetSize());
+      }
+      dest_regs &= ~get_mask(dest.GetRegister());  // Destination register was filled.
+    }
+    CHECK_NE(old_dest_regs, dest_regs);
+    DCHECK_EQ(0u, dest_regs & ~old_dest_regs);
+  }
+}
+
 void X86_64JNIMacroAssembler::Move(ManagedRegister mdest, ManagedRegister msrc, size_t size) {
   DCHECK(!mdest.Equals(X86_64ManagedRegister::FromCpuRegister(GetScratchRegister().AsRegister())));
   X86_64ManagedRegister dest = mdest.AsX86_64();
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
index 4592eba..6589544 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
@@ -85,6 +85,8 @@
   void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset64 offs) override;
 
   // Copying routines
+  void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
+
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
 
   void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset64 thr_offs) override;