Move @CriticalNative arguments in registers.

Also, spill stack arguments directly to their final stack locations
for the native call instead of to the reserved space in the caller's
frame.

Preliminary Golem results for art-opt-cc:
                            x86 x86-64    arm  arm64
NativeDowncallCritical6:    n/a +14.3% +17.2% +26.1%
(x86 currently seems to report results worse than the interpreter,
so something is not working there.)
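
For illustration only (not part of this change): a minimal standalone
sketch of the cycle-free fill loop that the new MoveArguments()
implementations use to shuffle register arguments without clobbering
pending sources. Plain integers stand in for registers and the names
(ResolveMoves, Move) are made up for the sketch.

  #include <cassert>
  #include <cstdint>
  #include <vector>

  struct Move { int dest; int src; };  // Copy the value of register `src` into `dest`.

  // Emit moves whose destination is no longer needed as a source; repeat until done.
  // Mirrors the mask-based loop in the arm64/x86-64 MoveArguments() below, but with
  // a plain array as the "register file". Assumes the moves contain no cycles.
  void ResolveMoves(const std::vector<Move>& moves, std::vector<int64_t>& regs) {
    uint32_t src_mask = 0u;   // Registers still pending as sources.
    uint32_t dest_mask = 0u;  // Registers still waiting to be filled.
    for (const Move& m : moves) {
      if (m.dest == m.src) continue;  // Nothing to do.
      src_mask |= 1u << m.src;
      dest_mask |= 1u << m.dest;
    }
    while (dest_mask != 0u) {
      uint32_t old_dest_mask = dest_mask;
      for (const Move& m : moves) {
        uint32_t dest_bit = 1u << m.dest;
        if ((dest_bit & dest_mask) == 0u) continue;  // Already filled, or dest == src.
        if ((dest_bit & src_mask) != 0u) continue;   // Would clobber a pending source.
        regs[m.dest] = regs[m.src];                  // "mov dest, src"
        src_mask &= ~(1u << m.src);                  // Source may be clobbered now.
        dest_mask &= ~dest_bit;                      // Destination is filled.
      }
      assert(dest_mask != old_dest_mask);  // No cycles, so every pass makes progress.
      (void)old_dest_mask;
    }
  }

  int main() {
    std::vector<int64_t> regs = {10, 11, 12, 13};
    // r1 <- r2 and r2 <- r3: the loop emits "mov r1, r2" before "mov r2, r3".
    ResolveMoves({{/*dest=*/ 1, /*src=*/ 2}, {/*dest=*/ 2, /*src=*/ 3}}, regs);
    assert(regs[1] == 12 && regs[2] == 13);
    return 0;
  }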

Test: Additional tests in the 178-app-image-native-method test.
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: aosp_taimen-userdebug boots.
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 112189621
Change-Id: I709c52ab2585a8f5f441f53ad2bf4a01d2b25dca
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index ac060cc..036cdbb 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -24,6 +24,7 @@
 
 #include "art_method.h"
 #include "base/arena_allocator.h"
+#include "base/arena_containers.h"
 #include "base/enums.h"
 #include "base/logging.h"  // For VLOG.
 #include "base/macros.h"
@@ -227,13 +228,10 @@
   __ BuildFrame(current_frame_size, method_register, callee_save_regs);
   DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(current_frame_size));
 
-  {
+  if (LIKELY(!is_critical_native)) {
     // Spill all register arguments.
     // TODO: Spill reference args directly to the HandleScope.
     // TODO: Spill native stack args straight to their stack locations (adjust SP earlier).
-    // TODO: Move args in registers without spilling for @CriticalNative.
-    // TODO: Provide assembler API for batched moves to allow moving multiple arguments
-    // with single instruction (arm: LDRD/STRD/LDMIA/STMIA, arm64: LDP/STP).
     mr_conv->ResetIterator(FrameOffset(current_frame_size));
     for (; mr_conv->HasNext(); mr_conv->Next()) {
       if (mr_conv->IsCurrentParamInRegister()) {
@@ -241,9 +239,7 @@
         __ Store(mr_conv->CurrentParamStackOffset(), mr_conv->CurrentParamRegister(), size);
       }
     }
-  }
 
-  if (LIKELY(!is_critical_native)) {
     // NOTE: @CriticalNative methods don't have a HandleScope
     //       because they can't have any reference parameters or return values.
 
@@ -320,10 +316,6 @@
   size_t current_out_arg_size = main_out_arg_size;
   if (UNLIKELY(is_critical_native)) {
     DCHECK_EQ(main_out_arg_size, current_frame_size);
-    // Move the method pointer to the hidden argument register.
-    __ Move(main_jni_conv->HiddenArgumentRegister(),
-            mr_conv->MethodRegister(),
-            static_cast<size_t>(main_jni_conv->GetFramePointerSize()));
   } else {
     __ IncreaseFrameSize(main_out_arg_size);
     current_frame_size += main_out_arg_size;
@@ -434,65 +426,86 @@
     __ Store(saved_cookie_offset, main_jni_conv->IntReturnRegister(), 4 /* sizeof cookie */);
   }
 
-  // 7. Iterate over arguments placing values from managed calling convention in
-  //    to the convention required for a native call (shuffling). For references
-  //    place an index/pointer to the reference after checking whether it is
-  //    null (which must be encoded as null).
-  //    Note: we do this prior to materializing the JNIEnv* and static's jclass to
-  //    give as many free registers for the shuffle as possible.
-  mr_conv->ResetIterator(FrameOffset(current_frame_size));
-  uint32_t args_count = 0;
-  while (mr_conv->HasNext()) {
-    args_count++;
-    mr_conv->Next();
-  }
-
-  // Do a backward pass over arguments, so that the generated code will be "mov
-  // R2, R3; mov R1, R2" instead of "mov R1, R2; mov R2, R3."
-  // TODO: A reverse iterator to improve readability.
-  // TODO: This is currently useless as all archs spill args when building the frame.
-  //       To avoid the full spilling, we would have to do one pass before the BuildFrame()
-  //       to determine which arg registers are clobbered before they are needed.
-  // TODO: For @CriticalNative, do a forward pass because there are no JNIEnv* and jclass* args.
-  for (uint32_t i = 0; i < args_count; ++i) {
+  // 7. Fill arguments.
+  if (UNLIKELY(is_critical_native)) {
+    ArenaVector<ArgumentLocation> src_args(allocator.Adapter());
+    ArenaVector<ArgumentLocation> dest_args(allocator.Adapter());
+    // Move the method pointer to the hidden argument register.
+    size_t pointer_size = static_cast<size_t>(kPointerSize);
+    dest_args.push_back(ArgumentLocation(main_jni_conv->HiddenArgumentRegister(), pointer_size));
+    src_args.push_back(ArgumentLocation(mr_conv->MethodRegister(), pointer_size));
+    // Move normal arguments to their locations.
     mr_conv->ResetIterator(FrameOffset(current_frame_size));
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+    for (; mr_conv->HasNext(); mr_conv->Next(), main_jni_conv->Next()) {
+      DCHECK(main_jni_conv->HasNext());
+      size_t size = mr_conv->IsCurrentParamALongOrDouble() ? 8u : 4u;
+      src_args.push_back(mr_conv->IsCurrentParamInRegister()
+          ? ArgumentLocation(mr_conv->CurrentParamRegister(), size)
+          : ArgumentLocation(mr_conv->CurrentParamStackOffset(), size));
+      dest_args.push_back(main_jni_conv->IsCurrentParamInRegister()
+          ? ArgumentLocation(main_jni_conv->CurrentParamRegister(), size)
+          : ArgumentLocation(main_jni_conv->CurrentParamStackOffset(), size));
+    }
+    DCHECK(!main_jni_conv->HasNext());
+    __ MoveArguments(ArrayRef<ArgumentLocation>(dest_args), ArrayRef<ArgumentLocation>(src_args));
+  } else {
+    // Iterate over arguments placing values from managed calling convention in
+    // to the convention required for a native call (shuffling). For references
+    // place an index/pointer to the reference after checking whether it is
+    // null (which must be encoded as null).
+    // Note: we do this prior to materializing the JNIEnv* and static's jclass to
+    // give as many free registers for the shuffle as possible.
+    mr_conv->ResetIterator(FrameOffset(current_frame_size));
+    uint32_t args_count = 0;
+    while (mr_conv->HasNext()) {
+      args_count++;
+      mr_conv->Next();
+    }
 
-    // Skip the extra JNI parameters for now.
-    if (LIKELY(!is_critical_native)) {
+    // Do a backward pass over arguments, so that the generated code will be "mov
+    // R2, R3; mov R1, R2" instead of "mov R1, R2; mov R2, R3."
+    // TODO: A reverse iterator to improve readability.
+    // TODO: This is currently useless as all archs spill args when building the frame.
+    //       To avoid the full spilling, we would have to do one pass before the BuildFrame()
+    //       to determine which arg registers are clobbered before they are needed.
+    for (uint32_t i = 0; i < args_count; ++i) {
+      mr_conv->ResetIterator(FrameOffset(current_frame_size));
+      main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+
+      // Skip the extra JNI parameters for now.
       main_jni_conv->Next();    // Skip JNIEnv*.
       if (is_static) {
         main_jni_conv->Next();  // Skip Class for now.
       }
+      // Skip to the argument we're interested in.
+      for (uint32_t j = 0; j < args_count - i - 1; ++j) {
+        mr_conv->Next();
+        main_jni_conv->Next();
+      }
+      CopyParameter(jni_asm.get(), mr_conv.get(), main_jni_conv.get());
     }
-    // Skip to the argument we're interested in.
-    for (uint32_t j = 0; j < args_count - i - 1; ++j) {
-      mr_conv->Next();
-      main_jni_conv->Next();
-    }
-    CopyParameter(jni_asm.get(), mr_conv.get(), main_jni_conv.get());
-  }
-  if (is_static && !is_critical_native) {
-    // Create argument for Class
-    mr_conv->ResetIterator(FrameOffset(current_frame_size));
-    main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-    main_jni_conv->Next();  // Skip JNIEnv*
-    FrameOffset handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
-    if (main_jni_conv->IsCurrentParamOnStack()) {
-      FrameOffset out_off = main_jni_conv->CurrentParamStackOffset();
-      __ CreateHandleScopeEntry(out_off, handle_scope_offset, /*null_allowed=*/ false);
-    } else {
-      ManagedRegister out_reg = main_jni_conv->CurrentParamRegister();
-      __ CreateHandleScopeEntry(out_reg,
-                                handle_scope_offset,
-                                ManagedRegister::NoRegister(),
+    if (is_static) {
+      // Create argument for Class
+      mr_conv->ResetIterator(FrameOffset(current_frame_size));
+      main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+      main_jni_conv->Next();  // Skip JNIEnv*
+      FrameOffset handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
+      if (main_jni_conv->IsCurrentParamOnStack()) {
+        FrameOffset out_off = main_jni_conv->CurrentParamStackOffset();
+        __ CreateHandleScopeEntry(out_off, handle_scope_offset, /*null_allowed=*/ false);
+      } else {
+        ManagedRegister out_reg = main_jni_conv->CurrentParamRegister();
+        __ CreateHandleScopeEntry(out_reg,
+                                  handle_scope_offset,
+                                  ManagedRegister::NoRegister(),
                                 /*null_allowed=*/ false);
+      }
     }
-  }
 
-  // Set the iterator back to the incoming Method*.
-  main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-  if (LIKELY(!is_critical_native)) {
+    // Set the iterator back to the incoming Method*.
+    main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+
     // 8. Create 1st argument, the JNI environment ptr.
     // Register that will hold local indirect reference table
     if (main_jni_conv->IsCurrentParamInRegister()) {
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index 5a355be..85b253c 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -41,6 +41,9 @@
 static constexpr size_t kAapcsStackAlignment = 8u;
 static_assert(kAapcsStackAlignment < kStackAlignment);
 
+// STRD immediate can encode any 4-byte aligned offset smaller than this cutoff.
+static constexpr size_t kStrdOffsetCutoff = 1024u;
+
 vixl::aarch32::Register AsVIXLRegister(ArmManagedRegister reg) {
   CHECK(reg.IsCoreRegister());
   return vixl::aarch32::Register(reg.RegId());
@@ -223,8 +226,9 @@
     asm_.StoreToOffset(kStoreWord, AsVIXLRegister(src), sp, dest.Int32Value());
   } else if (src.IsRegisterPair()) {
     CHECK_EQ(8u, size);
-    asm_.StoreToOffset(kStoreWord, AsVIXLRegisterPairLow(src),  sp, dest.Int32Value());
-    asm_.StoreToOffset(kStoreWord, AsVIXLRegisterPairHigh(src), sp, dest.Int32Value() + 4);
+    ___ Strd(AsVIXLRegisterPairLow(src),
+             AsVIXLRegisterPairHigh(src),
+             MemOperand(sp, dest.Int32Value()));
   } else if (src.IsSRegister()) {
     CHECK_EQ(4u, size);
     asm_.StoreSToOffset(AsVIXLSRegister(src), sp, dest.Int32Value());
@@ -365,6 +369,310 @@
   UNIMPLEMENTED(FATAL) << "no zero extension necessary for arm";
 }
 
+static inline bool IsCoreRegisterOrPair(ArmManagedRegister reg) {
+  return reg.IsCoreRegister() || reg.IsRegisterPair();
+}
+
+static inline bool NoSpillGap(const ArgumentLocation& loc1, const ArgumentLocation& loc2) {
+  DCHECK(!loc1.IsRegister());
+  DCHECK(!loc2.IsRegister());
+  uint32_t loc1_offset = loc1.GetFrameOffset().Uint32Value();
+  uint32_t loc2_offset = loc2.GetFrameOffset().Uint32Value();
+  DCHECK_LT(loc1_offset, loc2_offset);
+  return loc1_offset + loc1.GetSize() == loc2_offset;
+}
+
+static inline uint32_t GetSRegisterNumber(ArmManagedRegister reg) {
+  if (reg.IsSRegister()) {
+    return static_cast<uint32_t>(reg.AsSRegister());
+  } else {
+    DCHECK(reg.IsDRegister());
+    return 2u * static_cast<uint32_t>(reg.AsDRegister());
+  }
+}
+
+// Get the number of locations to spill together.
+static inline size_t GetSpillChunkSize(ArrayRef<ArgumentLocation> dests,
+                                       ArrayRef<ArgumentLocation> srcs,
+                                       size_t start,
+                                       bool have_extra_temp) {
+  DCHECK_LT(start, dests.size());
+  DCHECK_ALIGNED(dests[start].GetFrameOffset().Uint32Value(), 4u);
+  const ArgumentLocation& first_src = srcs[start];
+  if (!first_src.IsRegister()) {
+    DCHECK_ALIGNED(first_src.GetFrameOffset().Uint32Value(), 4u);
+    // If we have an extra temporary, look for opportunities to move 2 words
+    // at a time with LDRD/STRD when the source types are word-sized.
+    if (have_extra_temp &&
+        start + 1u != dests.size() &&
+        !srcs[start + 1u].IsRegister() &&
+        first_src.GetSize() == 4u &&
+        srcs[start + 1u].GetSize() == 4u &&
+        NoSpillGap(first_src, srcs[start + 1u]) &&
+        NoSpillGap(dests[start], dests[start + 1u]) &&
+        dests[start].GetFrameOffset().Uint32Value() < kStrdOffsetCutoff) {
+      // Note: The source and destination may not be 8B aligned (but they are 4B aligned).
+      return 2u;
+    }
+    return 1u;
+  }
+  ArmManagedRegister first_src_reg = first_src.GetRegister().AsArm();
+  size_t end = start + 1u;
+  if (IsCoreRegisterOrPair(first_src_reg)) {
+    while (end != dests.size() &&
+           NoSpillGap(dests[end - 1u], dests[end]) &&
+           srcs[end].IsRegister() &&
+           IsCoreRegisterOrPair(srcs[end].GetRegister().AsArm())) {
+      ++end;
+    }
+  } else {
+    DCHECK(first_src_reg.IsSRegister() || first_src_reg.IsDRegister());
+    uint32_t next_sreg = GetSRegisterNumber(first_src_reg) + first_src.GetSize() / kSRegSizeInBytes;
+    while (end != dests.size() &&
+           NoSpillGap(dests[end - 1u], dests[end]) &&
+           srcs[end].IsRegister() &&
+           !IsCoreRegisterOrPair(srcs[end].GetRegister().AsArm()) &&
+           GetSRegisterNumber(srcs[end].GetRegister().AsArm()) == next_sreg) {
+      next_sreg += srcs[end].GetSize() / kSRegSizeInBytes;
+      ++end;
+    }
+  }
+  return end - start;
+}
+
+static inline uint32_t GetCoreRegisterMask(ArmManagedRegister reg) {
+  if (reg.IsCoreRegister()) {
+    return 1u << static_cast<size_t>(reg.AsCoreRegister());
+  } else {
+    DCHECK(reg.IsRegisterPair());
+    DCHECK_LT(reg.AsRegisterPairLow(), reg.AsRegisterPairHigh());
+    return (1u << static_cast<size_t>(reg.AsRegisterPairLow())) |
+           (1u << static_cast<size_t>(reg.AsRegisterPairHigh()));
+  }
+}
+
+static inline uint32_t GetCoreRegisterMask(ArrayRef<ArgumentLocation> srcs) {
+  uint32_t mask = 0u;
+  for (const ArgumentLocation& loc : srcs) {
+    DCHECK(loc.IsRegister());
+    mask |= GetCoreRegisterMask(loc.GetRegister().AsArm());
+  }
+  return mask;
+}
+
+static inline bool UseStrdForChunk(ArrayRef<ArgumentLocation> srcs, size_t start, size_t length) {
+  DCHECK_GE(length, 2u);
+  DCHECK(srcs[start].IsRegister());
+  DCHECK(srcs[start + 1u].IsRegister());
+  // The destination may not be 8B aligned (but it is 4B aligned).
+  // Allow arbitrary destination offset, macro assembler will use a temp if needed.
+  // Note: T32 allows unrelated registers in STRD. (A32 does not.)
+  return length == 2u &&
+         srcs[start].GetRegister().AsArm().IsCoreRegister() &&
+         srcs[start + 1u].GetRegister().AsArm().IsCoreRegister();
+}
+
+static inline bool UseVstrForChunk(ArrayRef<ArgumentLocation> srcs, size_t start, size_t length) {
+  DCHECK_GE(length, 2u);
+  DCHECK(srcs[start].IsRegister());
+  DCHECK(srcs[start + 1u].IsRegister());
+  // The destination may not be 8B aligned (but it is 4B aligned).
+  // Allow arbitrary destination offset, macro assembler will use a temp if needed.
+  return length == 2u &&
+         srcs[start].GetRegister().AsArm().IsSRegister() &&
+         srcs[start + 1u].GetRegister().AsArm().IsSRegister() &&
+         IsAligned<2u>(static_cast<size_t>(srcs[start].GetRegister().AsArm().AsSRegister()));
+}
+
+void ArmVIXLJNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
+                                             ArrayRef<ArgumentLocation> srcs) {
+  DCHECK_EQ(dests.size(), srcs.size());
+
+  // Native ABI is soft-float, so all destinations should be core registers or stack offsets.
+  // And register locations should be first, followed by stack locations with increasing offset.
+  auto is_register = [](const ArgumentLocation& loc) { return loc.IsRegister(); };
+  DCHECK(std::is_partitioned(dests.begin(), dests.end(), is_register));
+  size_t num_reg_dests =
+      std::distance(dests.begin(), std::partition_point(dests.begin(), dests.end(), is_register));
+  DCHECK(std::is_sorted(
+      dests.begin() + num_reg_dests,
+      dests.end(),
+      [](const ArgumentLocation& lhs, const ArgumentLocation& rhs) {
+        return lhs.GetFrameOffset().Uint32Value() < rhs.GetFrameOffset().Uint32Value();
+      }));
+
+  // Collect registers to move. No need to record FP regs as destinations are only core regs.
+  uint32_t src_regs = 0u;
+  uint32_t dest_regs = 0u;
+  for (size_t i = 0; i != num_reg_dests; ++i) {
+    const ArgumentLocation& src = srcs[i];
+    const ArgumentLocation& dest = dests[i];
+    DCHECK(dest.IsRegister() && IsCoreRegisterOrPair(dest.GetRegister().AsArm()));
+    if (src.IsRegister() && IsCoreRegisterOrPair(src.GetRegister().AsArm())) {
+      if (src.GetRegister().Equals(dest.GetRegister())) {
+        continue;
+      }
+      src_regs |= GetCoreRegisterMask(src.GetRegister().AsArm());
+    }
+    dest_regs |= GetCoreRegisterMask(dest.GetRegister().AsArm());
+  }
+
+  // Spill args first. Look for opportunities to spill multiple arguments at once.
+  {
+    UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+    vixl32::Register xtemp;  // Extra temp register.
+    if ((dest_regs & ~src_regs) != 0u) {
+      xtemp = vixl32::Register(CTZ(dest_regs & ~src_regs));
+      DCHECK(!temps.IsAvailable(xtemp));
+    }
+    auto move_two_words = [&](FrameOffset dest_offset, FrameOffset src_offset) {
+      DCHECK(xtemp.IsValid());
+      DCHECK_LT(dest_offset.Uint32Value(), kStrdOffsetCutoff);
+      // VIXL macro assembler can use destination registers for loads from large offsets.
+      UseScratchRegisterScope temps2(asm_.GetVIXLAssembler());
+      vixl32::Register temp2 = temps2.Acquire();
+      ___ Ldrd(xtemp, temp2, MemOperand(sp, src_offset.Uint32Value()));
+      ___ Strd(xtemp, temp2, MemOperand(sp, dest_offset.Uint32Value()));
+    };
+    for (size_t i = num_reg_dests, arg_count = dests.size(); i != arg_count; ) {
+      const ArgumentLocation& src = srcs[i];
+      const ArgumentLocation& dest = dests[i];
+      DCHECK_EQ(src.GetSize(), dest.GetSize());
+      DCHECK(!dest.IsRegister());
+      uint32_t frame_offset = dest.GetFrameOffset().Uint32Value();
+      size_t chunk_size = GetSpillChunkSize(dests, srcs, i, xtemp.IsValid());
+      DCHECK_NE(chunk_size, 0u);
+      if (chunk_size == 1u) {
+        if (src.IsRegister()) {
+          Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+        } else if (dest.GetSize() == 8u && xtemp.IsValid() && frame_offset < kStrdOffsetCutoff) {
+          move_two_words(dest.GetFrameOffset(), src.GetFrameOffset());
+        } else {
+          Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
+        }
+      } else if (!src.IsRegister()) {
+        DCHECK_EQ(chunk_size, 2u);
+        DCHECK_EQ(dest.GetSize(), 4u);
+        DCHECK_EQ(dests[i + 1u].GetSize(), 4u);
+        move_two_words(dest.GetFrameOffset(), src.GetFrameOffset());
+      } else if (UseStrdForChunk(srcs, i, chunk_size)) {
+        ___ Strd(AsVIXLRegister(srcs[i].GetRegister().AsArm()),
+                 AsVIXLRegister(srcs[i + 1u].GetRegister().AsArm()),
+                 MemOperand(sp, frame_offset));
+      } else if (UseVstrForChunk(srcs, i, chunk_size)) {
+        size_t sreg = GetSRegisterNumber(src.GetRegister().AsArm());
+        DCHECK_ALIGNED(sreg, 2u);
+        ___ Vstr(vixl32::DRegister(sreg / 2u), MemOperand(sp, frame_offset));
+      } else {
+        UseScratchRegisterScope temps2(asm_.GetVIXLAssembler());
+        vixl32::Register base_reg;
+        if (frame_offset == 0u) {
+          base_reg = sp;
+        } else {
+          base_reg = temps2.Acquire();
+          ___ Add(base_reg, sp, frame_offset);
+        }
+
+        ArmManagedRegister src_reg = src.GetRegister().AsArm();
+        if (IsCoreRegisterOrPair(src_reg)) {
+          uint32_t core_reg_mask = GetCoreRegisterMask(srcs.SubArray(i, chunk_size));
+          ___ Stm(base_reg, NO_WRITE_BACK, RegisterList(core_reg_mask));
+        } else {
+          uint32_t start_sreg = GetSRegisterNumber(src_reg);
+          const ArgumentLocation& last_dest = dests[i + chunk_size - 1u];
+          uint32_t total_size =
+              last_dest.GetFrameOffset().Uint32Value() + last_dest.GetSize() - frame_offset;
+          if (IsAligned<2u>(start_sreg) &&
+              IsAligned<kDRegSizeInBytes>(frame_offset) &&
+              IsAligned<kDRegSizeInBytes>(total_size)) {
+            uint32_t dreg_count = total_size / kDRegSizeInBytes;
+            DRegisterList dreg_list(vixl32::DRegister(start_sreg / 2u), dreg_count);
+            ___ Vstm(F64, base_reg, NO_WRITE_BACK, dreg_list);
+          } else {
+            uint32_t sreg_count = total_size / kSRegSizeInBytes;
+            SRegisterList sreg_list(vixl32::SRegister(start_sreg), sreg_count);
+            ___ Vstm(F32, base_reg, NO_WRITE_BACK, sreg_list);
+          }
+        }
+      }
+      i += chunk_size;
+    }
+  }
+
+  // Fill destination registers from source core registers.
+  // There should be no cycles, so this algorithm should make progress.
+  while (src_regs != 0u) {
+    uint32_t old_src_regs = src_regs;
+    for (size_t i = 0; i != num_reg_dests; ++i) {
+      DCHECK(dests[i].IsRegister() && IsCoreRegisterOrPair(dests[i].GetRegister().AsArm()));
+      if (!srcs[i].IsRegister() || !IsCoreRegisterOrPair(srcs[i].GetRegister().AsArm())) {
+        continue;
+      }
+      uint32_t dest_reg_mask = GetCoreRegisterMask(dests[i].GetRegister().AsArm());
+      if ((dest_reg_mask & dest_regs) == 0u) {
+        continue;  // Equals source, or already filled in one of previous iterations.
+      }
+      // There are no partial overlaps of 8-byte arguments, otherwise we would have to
+      // tweak this check; Move() can deal with partial overlap for historical reasons.
+      if ((dest_reg_mask & src_regs) != 0u) {
+        continue;  // Cannot clobber this register yet.
+      }
+      Move(dests[i].GetRegister(), srcs[i].GetRegister(), dests[i].GetSize());
+      uint32_t src_reg_mask = GetCoreRegisterMask(srcs[i].GetRegister().AsArm());
+      DCHECK_EQ(src_regs & src_reg_mask, src_reg_mask);
+      src_regs &= ~src_reg_mask;  // Allow clobbering the source register or pair.
+      dest_regs &= ~dest_reg_mask;  // Destination register or pair was filled.
+    }
+    CHECK_NE(old_src_regs, src_regs);
+    DCHECK_EQ(0u, src_regs & ~old_src_regs);
+  }
+
+  // Now fill destination registers from FP registers or stack slots, looking for
+  // opportunities to use LDRD/VMOV to fill 2 registers with one instruction.
+  for (size_t i = 0, j; i != num_reg_dests; i = j) {
+    j = i + 1u;
+    DCHECK(dests[i].IsRegister() && IsCoreRegisterOrPair(dests[i].GetRegister().AsArm()));
+    if (srcs[i].IsRegister() && IsCoreRegisterOrPair(srcs[i].GetRegister().AsArm())) {
+      DCHECK_EQ(GetCoreRegisterMask(dests[i].GetRegister().AsArm()) & dest_regs, 0u);
+      continue;  // Equals destination or moved above.
+    }
+    DCHECK_NE(GetCoreRegisterMask(dests[i].GetRegister().AsArm()) & dest_regs, 0u);
+    if (dests[i].GetSize() == 4u) {
+      // Find next register to load.
+      while (j != num_reg_dests &&
+             (srcs[j].IsRegister() && IsCoreRegisterOrPair(srcs[j].GetRegister().AsArm()))) {
+        DCHECK_EQ(GetCoreRegisterMask(dests[j].GetRegister().AsArm()) & dest_regs, 0u);
+        ++j;  // Equals destination or moved above.
+      }
+      if (j != num_reg_dests && dests[j].GetSize() == 4u) {
+        if (!srcs[i].IsRegister() && !srcs[j].IsRegister() && NoSpillGap(srcs[i], srcs[j])) {
+          ___ Ldrd(AsVIXLRegister(dests[i].GetRegister().AsArm()),
+                   AsVIXLRegister(dests[j].GetRegister().AsArm()),
+                   MemOperand(sp, srcs[i].GetFrameOffset().Uint32Value()));
+          ++j;
+          continue;
+        }
+        if (srcs[i].IsRegister() && srcs[j].IsRegister()) {
+          uint32_t first_sreg = GetSRegisterNumber(srcs[i].GetRegister().AsArm());
+          if (IsAligned<2u>(first_sreg) &&
+              first_sreg + 1u == GetSRegisterNumber(srcs[j].GetRegister().AsArm())) {
+            ___ Vmov(AsVIXLRegister(dests[i].GetRegister().AsArm()),
+                     AsVIXLRegister(dests[j].GetRegister().AsArm()),
+                     vixl32::DRegister(first_sreg / 2u));
+            ++j;
+            continue;
+          }
+        }
+      }
+    }
+    if (srcs[i].IsRegister()) {
+      Move(dests[i].GetRegister(), srcs[i].GetRegister(), dests[i].GetSize());
+    } else {
+      Load(dests[i].GetRegister(), srcs[i].GetFrameOffset(), dests[i].GetSize());
+    }
+  }
+}
+
 void ArmVIXLJNIMacroAssembler::Move(ManagedRegister mdst,
                                     ManagedRegister msrc,
                                     size_t size  ATTRIBUTE_UNUSED) {
@@ -387,8 +695,12 @@
   ArmManagedRegister src = msrc.AsArm();
   if (!dst.Equals(src)) {
     if (dst.IsCoreRegister()) {
-      CHECK(src.IsCoreRegister()) << src;
-      ___ Mov(AsVIXLRegister(dst), AsVIXLRegister(src));
+      if (src.IsCoreRegister()) {
+        ___ Mov(AsVIXLRegister(dst), AsVIXLRegister(src));
+      } else {
+        CHECK(src.IsSRegister()) << src;
+        ___ Vmov(AsVIXLRegister(dst), AsVIXLSRegister(src));
+      }
     } else if (dst.IsDRegister()) {
       if (src.IsDRegister()) {
         ___ Vmov(F64, AsVIXLDRegister(dst), AsVIXLDRegister(src));
@@ -407,14 +719,18 @@
       }
     } else {
       CHECK(dst.IsRegisterPair()) << dst;
-      CHECK(src.IsRegisterPair()) << src;
-      // Ensure that the first move doesn't clobber the input of the second.
-      if (src.AsRegisterPairHigh() != dst.AsRegisterPairLow()) {
-        ___ Mov(AsVIXLRegisterPairLow(dst),  AsVIXLRegisterPairLow(src));
-        ___ Mov(AsVIXLRegisterPairHigh(dst), AsVIXLRegisterPairHigh(src));
+      if (src.IsRegisterPair()) {
+        // Ensure that the first move doesn't clobber the input of the second.
+        if (src.AsRegisterPairHigh() != dst.AsRegisterPairLow()) {
+          ___ Mov(AsVIXLRegisterPairLow(dst),  AsVIXLRegisterPairLow(src));
+          ___ Mov(AsVIXLRegisterPairHigh(dst), AsVIXLRegisterPairHigh(src));
+        } else {
+          ___ Mov(AsVIXLRegisterPairHigh(dst), AsVIXLRegisterPairHigh(src));
+          ___ Mov(AsVIXLRegisterPairLow(dst),  AsVIXLRegisterPairLow(src));
+        }
       } else {
-        ___ Mov(AsVIXLRegisterPairHigh(dst), AsVIXLRegisterPairHigh(src));
-        ___ Mov(AsVIXLRegisterPairLow(dst),  AsVIXLRegisterPairLow(src));
+        CHECK(src.IsDRegister()) << src;
+        ___ Vmov(AsVIXLRegisterPairLow(dst), AsVIXLRegisterPairHigh(dst), AsVIXLDRegister(src));
       }
     }
   }
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
index 2bd571e..2f6813a 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
@@ -93,6 +93,8 @@
   void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset32 offs) override;
 
   // Copying routines.
+  void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
+
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
 
   void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset32 thr_offs) override;
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index a31ed93..bb93a96 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -326,6 +326,88 @@
 }
 
 // Copying routines.
+void Arm64JNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
+                                           ArrayRef<ArgumentLocation> srcs) {
+  DCHECK_EQ(dests.size(), srcs.size());
+  auto get_mask = [](ManagedRegister reg) -> uint64_t {
+    Arm64ManagedRegister arm64_reg = reg.AsArm64();
+    if (arm64_reg.IsXRegister()) {
+      size_t core_reg_number = static_cast<size_t>(arm64_reg.AsXRegister());
+      DCHECK_LT(core_reg_number, 31u);  // xSP, xZR not allowed.
+      return UINT64_C(1) << core_reg_number;
+    } else if (arm64_reg.IsWRegister()) {
+      size_t core_reg_number = static_cast<size_t>(arm64_reg.AsWRegister());
+      DCHECK_LT(core_reg_number, 31u);  // wSP, wZR not allowed.
+      return UINT64_C(1) << core_reg_number;
+    } else if (arm64_reg.IsDRegister()) {
+      size_t fp_reg_number = static_cast<size_t>(arm64_reg.AsDRegister());
+      DCHECK_LT(fp_reg_number, 32u);
+      return (UINT64_C(1) << 32u) << fp_reg_number;
+    } else {
+      DCHECK(arm64_reg.IsSRegister());
+      size_t fp_reg_number = static_cast<size_t>(arm64_reg.AsSRegister());
+      DCHECK_LT(fp_reg_number, 32u);
+      return (UINT64_C(1) << 32u) << fp_reg_number;
+    }
+  };
+  // Collect registers to move while storing/copying args to stack slots.
+  // More than 8 core or FP reg args are very rare, so we do not optimize
+  // for that case by using LDP/STP.
+  // TODO: LDP/STP will be useful for normal and @FastNative where we need
+  // to spill even the leading arguments.
+  uint64_t src_regs = 0u;
+  uint64_t dest_regs = 0u;
+  for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+    const ArgumentLocation& src = srcs[i];
+    const ArgumentLocation& dest = dests[i];
+    DCHECK_EQ(src.GetSize(), dest.GetSize());
+    if (dest.IsRegister()) {
+      if (src.IsRegister() && src.GetRegister().Equals(dest.GetRegister())) {
+        // Nothing to do.
+      } else {
+        if (src.IsRegister()) {
+          src_regs |= get_mask(src.GetRegister());
+        }
+        dest_regs |= get_mask(dest.GetRegister());
+      }
+    } else {
+      if (src.IsRegister()) {
+        Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+      } else {
+        Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
+      }
+    }
+  }
+  // Fill destination registers.
+  // There should be no cycles, so this simple algorithm should make progress.
+  while (dest_regs != 0u) {
+    uint64_t old_dest_regs = dest_regs;
+    for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+      const ArgumentLocation& src = srcs[i];
+      const ArgumentLocation& dest = dests[i];
+      if (!dest.IsRegister()) {
+        continue;  // Stored in first loop above.
+      }
+      uint64_t dest_reg_mask = get_mask(dest.GetRegister());
+      if ((dest_reg_mask & dest_regs) == 0u) {
+        continue;  // Equals source, or already filled in one of previous iterations.
+      }
+      if ((dest_reg_mask & src_regs) != 0u) {
+        continue;  // Cannot clobber this register yet.
+      }
+      if (src.IsRegister()) {
+        Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
+        src_regs &= ~get_mask(src.GetRegister());  // Allow clobbering source register.
+      } else {
+        Load(dest.GetRegister(), src.GetFrameOffset(), dest.GetSize());
+      }
+      dest_regs &= ~get_mask(dest.GetRegister());  // Destination register was filled.
+    }
+    CHECK_NE(old_dest_regs, dest_regs);
+    DCHECK_EQ(0u, dest_regs & ~old_dest_regs);
+  }
+}
+
 void Arm64JNIMacroAssembler::Move(ManagedRegister m_dst, ManagedRegister m_src, size_t size) {
   Arm64ManagedRegister dst = m_dst.AsArm64();
   if (kIsDebugBuild) {
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.h b/compiler/utils/arm64/jni_macro_assembler_arm64.h
index 64b5595..9f3eea2 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.h
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.h
@@ -85,6 +85,7 @@
   void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset64 offs) override;
 
   // Copying routines.
+  void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
   void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset64 thr_offs) override;
   void CopyRawPtrToThread(ThreadOffset64 thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
diff --git a/compiler/utils/jni_macro_assembler.h b/compiler/utils/jni_macro_assembler.h
index 48b3f01..3490959 100644
--- a/compiler/utils/jni_macro_assembler.h
+++ b/compiler/utils/jni_macro_assembler.h
@@ -43,6 +43,40 @@
   kNotZero
 };
 
+class ArgumentLocation {
+ public:
+  ArgumentLocation(ManagedRegister reg, size_t size)
+      : reg_(reg), frame_offset_(0u), size_(size) {
+    DCHECK(reg.IsRegister());
+  }
+
+  ArgumentLocation(FrameOffset frame_offset, size_t size)
+      : reg_(ManagedRegister::NoRegister()), frame_offset_(frame_offset), size_(size) {}
+
+  bool IsRegister() const {
+    return reg_.IsRegister();
+  }
+
+  ManagedRegister GetRegister() const {
+    DCHECK(IsRegister());
+    return reg_;
+  }
+
+  FrameOffset GetFrameOffset() const {
+    DCHECK(!IsRegister());
+    return frame_offset_;
+  }
+
+  size_t GetSize() const {
+    return size_;
+  }
+
+ private:
+  ManagedRegister reg_;
+  FrameOffset frame_offset_;
+  size_t size_;
+};
+
 template <PointerSize kPointerSize>
 class JNIMacroAssembler : public DeletableArenaObject<kArenaAllocAssembler> {
  public:
@@ -112,6 +146,8 @@
   virtual void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset<kPointerSize> offs) = 0;
 
   // Copying routines
+  virtual void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) = 0;
+
   virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size) = 0;
 
   virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset<kPointerSize> thr_offs) = 0;
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 1adcc20..67ec93d 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -300,6 +300,30 @@
   }
 }
 
+void X86JNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
+                                         ArrayRef<ArgumentLocation> srcs) {
+  DCHECK_EQ(dests.size(), srcs.size());
+  bool found_hidden_arg = false;
+  for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+    const ArgumentLocation& src = srcs[i];
+    const ArgumentLocation& dest = dests[i];
+    DCHECK_EQ(src.GetSize(), dest.GetSize());
+    if (UNLIKELY(dest.IsRegister())) {
+      // Native ABI has only stack arguments but we may pass one "hidden arg" in register.
+      CHECK(!found_hidden_arg);
+      found_hidden_arg = true;
+      CHECK(src.IsRegister());
+      Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
+    } else {
+      if (src.IsRegister()) {
+        Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+      } else {
+        Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
+      }
+    }
+  }
+}
+
 void X86JNIMacroAssembler::Move(ManagedRegister mdest, ManagedRegister msrc, size_t size) {
   DCHECK(!mdest.Equals(X86ManagedRegister::FromCpuRegister(GetScratchRegister())));
   X86ManagedRegister dest = mdest.AsX86();
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.h b/compiler/utils/x86/jni_macro_assembler_x86.h
index 1223471..0239ff7 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.h
+++ b/compiler/utils/x86/jni_macro_assembler_x86.h
@@ -82,6 +82,8 @@
   void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset32 offs) override;
 
   // Copying routines
+  void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
+
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
 
   void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset32 thr_offs) override;
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index d57ea41..2649084 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -338,6 +338,76 @@
   }
 }
 
+void X86_64JNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
+                                            ArrayRef<ArgumentLocation> srcs) {
+  DCHECK_EQ(dests.size(), srcs.size());
+  auto get_mask = [](ManagedRegister reg) -> uint32_t {
+    X86_64ManagedRegister x86_64_reg = reg.AsX86_64();
+    if (x86_64_reg.IsCpuRegister()) {
+      size_t cpu_reg_number = static_cast<size_t>(x86_64_reg.AsCpuRegister().AsRegister());
+      DCHECK_LT(cpu_reg_number, 16u);
+      return 1u << cpu_reg_number;
+    } else {
+      DCHECK(x86_64_reg.IsXmmRegister());
+      size_t xmm_reg_number = static_cast<size_t>(x86_64_reg.AsXmmRegister().AsFloatRegister());
+      DCHECK_LT(xmm_reg_number, 16u);
+      return (1u << 16u) << xmm_reg_number;
+    }
+  };
+  // Collect registers to move while storing/copying args to stack slots.
+  uint32_t src_regs = 0u;
+  uint32_t dest_regs = 0u;
+  for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+    const ArgumentLocation& src = srcs[i];
+    const ArgumentLocation& dest = dests[i];
+    DCHECK_EQ(src.GetSize(), dest.GetSize());
+    if (dest.IsRegister()) {
+      if (src.IsRegister() && src.GetRegister().Equals(dest.GetRegister())) {
+        // Nothing to do.
+      } else {
+        if (src.IsRegister()) {
+          src_regs |= get_mask(src.GetRegister());
+        }
+        dest_regs |= get_mask(dest.GetRegister());
+      }
+    } else {
+      if (src.IsRegister()) {
+        Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+      } else {
+        Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
+      }
+    }
+  }
+  // Fill destination registers.
+  // There should be no cycles, so this simple algorithm should make progress.
+  while (dest_regs != 0u) {
+    uint32_t old_dest_regs = dest_regs;
+    for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+      const ArgumentLocation& src = srcs[i];
+      const ArgumentLocation& dest = dests[i];
+      if (!dest.IsRegister()) {
+        continue;  // Stored in first loop above.
+      }
+      uint32_t dest_reg_mask = get_mask(dest.GetRegister());
+      if ((dest_reg_mask & dest_regs) == 0u) {
+        continue;  // Equals source, or already filled in one of previous iterations.
+      }
+      if ((dest_reg_mask & src_regs) != 0u) {
+        continue;  // Cannot clobber this register yet.
+      }
+      if (src.IsRegister()) {
+        Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
+        src_regs &= ~get_mask(src.GetRegister());  // Allow clobbering source register.
+      } else {
+        Load(dest.GetRegister(), src.GetFrameOffset(), dest.GetSize());
+      }
+      dest_regs &= ~get_mask(dest.GetRegister());  // Destination register was filled.
+    }
+    CHECK_NE(old_dest_regs, dest_regs);
+    DCHECK_EQ(0u, dest_regs & ~old_dest_regs);
+  }
+}
+
 void X86_64JNIMacroAssembler::Move(ManagedRegister mdest, ManagedRegister msrc, size_t size) {
   DCHECK(!mdest.Equals(X86_64ManagedRegister::FromCpuRegister(GetScratchRegister().AsRegister())));
   X86_64ManagedRegister dest = mdest.AsX86_64();
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
index 4592eba..6589544 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
@@ -85,6 +85,8 @@
   void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset64 offs) override;
 
   // Copying routines
+  void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
+
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
 
   void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset64 thr_offs) override;
diff --git a/test/178-app-image-native-method/expected.txt b/test/178-app-image-native-method/expected.txt
index 30cc336..b4e5ece 100644
--- a/test/178-app-image-native-method/expected.txt
+++ b/test/178-app-image-native-method/expected.txt
@@ -5,6 +5,7 @@
 testMissing
 testMissingFast
 testMissingCritical
+testCriticalSignatures
 JNI_OnLoad called
 test
 testFast
@@ -12,3 +13,4 @@
 testMissing
 testMissingFast
 testMissingCritical
+testCriticalSignatures
diff --git a/test/178-app-image-native-method/native_methods.cc b/test/178-app-image-native-method/native_methods.cc
index 794a78a..becda81 100644
--- a/test/178-app-image-native-method/native_methods.cc
+++ b/test/178-app-image-native-method/native_methods.cc
@@ -127,4 +127,486 @@
   return ok ? 42 : -1;
 }
 
+extern "C" JNIEXPORT jint JNICALL Java_CriticalSignatures_nativeILFFFFD(
+    jint i,
+    jlong l,
+    jfloat f1,
+    jfloat f2,
+    jfloat f3,
+    jfloat f4,
+    jdouble d) {
+  if (i != 1) return -1;
+  if (l != INT64_C(0xf00000002)) return -2;
+  if (f1 != 3.0f) return -3;
+  if (f2 != 4.0f) return -4;
+  if (f3 != 5.0f) return -5;
+  if (f4 != 6.0f) return -6;
+  if (d != 7.0) return -7;
+  return 42;
+}
+
+extern "C" JNIEXPORT jint JNICALL Java_CriticalSignatures_nativeLIFFFFD(
+    jlong l,
+    jint i,
+    jfloat f1,
+    jfloat f2,
+    jfloat f3,
+    jfloat f4,
+    jdouble d) {
+  if (l != INT64_C(0xf00000007)) return -1;
+  if (i != 6) return -2;
+  if (f1 != 5.0f) return -3;
+  if (f2 != 4.0f) return -4;
+  if (f3 != 3.0f) return -5;
+  if (f4 != 2.0f) return -6;
+  if (d != 1.0) return -7;
+  return 42;
+}
+
+extern "C" JNIEXPORT jint JNICALL Java_CriticalSignatures_nativeFLIFFFD(
+    jfloat f1,
+    jlong l,
+    jint i,
+    jfloat f2,
+    jfloat f3,
+    jfloat f4,
+    jdouble d) {
+  if (f1 != 1.0f) return -3;
+  if (l != INT64_C(0xf00000002)) return -1;
+  if (i != 3) return -2;
+  if (f2 != 4.0f) return -4;
+  if (f3 != 5.0f) return -5;
+  if (f4 != 6.0f) return -6;
+  if (d != 7.0) return -7;
+  return 42;
+}
+
+extern "C" JNIEXPORT jint JNICALL Java_CriticalSignatures_nativeDDIIIIII(
+    jdouble d1,
+    jdouble d2,
+    jint i1,
+    jint i2,
+    jint i3,
+    jint i4,
+    jint i5,
+    jint i6) {
+  if (d1 != 8.0) return -1;
+  if (d2 != 7.0) return -2;
+  if (i1 != 6) return -3;
+  if (i2 != 5) return -4;
+  if (i3 != 4) return -5;
+  if (i4 != 3) return -6;
+  if (i5 != 2) return -7;
+  if (i6 != 1) return -8;
+  return 42;
+}
+
+extern "C" JNIEXPORT jint JNICALL Java_CriticalSignatures_nativeDFFILIII(
+    jdouble d,
+    jfloat f1,
+    jfloat f2,
+    jint i1,
+    jlong l,
+    jint i2,
+    jint i3,
+    jint i4) {
+  if (d != 1.0) return -1;
+  if (f1 != 2.0f) return -2;
+  if (f2 != 3.0f) return -3;
+  if (i1 != 4) return -4;
+  if (l != INT64_C(0xf00000005)) return -5;
+  if (i2 != 6) return -6;
+  if (i3 != 7) return -7;
+  if (i4 != 8) return -8;
+  return 42;
+}
+
+extern "C" JNIEXPORT jint JNICALL Java_CriticalSignatures_nativeDDFILIII(
+    jdouble d1,
+    jdouble d2,
+    jfloat f,
+    jint i1,
+    jlong l,
+    jint i2,
+    jint i3,
+    jint i4) {
+  if (d1 != 8.0) return -1;
+  if (d2 != 7.0) return -2;
+  if (f != 6.0f) return -3;
+  if (i1 != 5) return -4;
+  if (l != INT64_C(0xf00000004)) return -5;
+  if (i2 != 3) return -6;
+  if (i3 != 2) return -7;
+  if (i4 != 1) return -8;
+  return 42;
+}
+
+extern "C" JNIEXPORT jint JNICALL Java_CriticalSignatures_nativeDDIFII(
+    jdouble d1,
+    jdouble d2,
+    jint i1,
+    jfloat f,
+    jint i2,
+    jint i3) {
+  if (d1 != 1.0) return -1;
+  if (d2 != 2.0) return -2;
+  if (i1 != 3) return -3;
+  if (f != 4.0f) return -4;
+  if (i2 != 5) return -5;
+  if (i3 != 6) return -6;
+  return 42;
+}
+
+extern "C" JNIEXPORT jint JNICALL Java_CriticalSignatures_nativeFullArgs(
+    // Generated by script (then modified to close argument list):
+    //   for i in {0..84}; do echo "    jlong l$((i*3)),"; echo "    jint i$((i*3+2)),"; done
+    jlong l0,
+    jint i2,
+    jlong l3,
+    jint i5,
+    jlong l6,
+    jint i8,
+    jlong l9,
+    jint i11,
+    jlong l12,
+    jint i14,
+    jlong l15,
+    jint i17,
+    jlong l18,
+    jint i20,
+    jlong l21,
+    jint i23,
+    jlong l24,
+    jint i26,
+    jlong l27,
+    jint i29,
+    jlong l30,
+    jint i32,
+    jlong l33,
+    jint i35,
+    jlong l36,
+    jint i38,
+    jlong l39,
+    jint i41,
+    jlong l42,
+    jint i44,
+    jlong l45,
+    jint i47,
+    jlong l48,
+    jint i50,
+    jlong l51,
+    jint i53,
+    jlong l54,
+    jint i56,
+    jlong l57,
+    jint i59,
+    jlong l60,
+    jint i62,
+    jlong l63,
+    jint i65,
+    jlong l66,
+    jint i68,
+    jlong l69,
+    jint i71,
+    jlong l72,
+    jint i74,
+    jlong l75,
+    jint i77,
+    jlong l78,
+    jint i80,
+    jlong l81,
+    jint i83,
+    jlong l84,
+    jint i86,
+    jlong l87,
+    jint i89,
+    jlong l90,
+    jint i92,
+    jlong l93,
+    jint i95,
+    jlong l96,
+    jint i98,
+    jlong l99,
+    jint i101,
+    jlong l102,
+    jint i104,
+    jlong l105,
+    jint i107,
+    jlong l108,
+    jint i110,
+    jlong l111,
+    jint i113,
+    jlong l114,
+    jint i116,
+    jlong l117,
+    jint i119,
+    jlong l120,
+    jint i122,
+    jlong l123,
+    jint i125,
+    jlong l126,
+    jint i128,
+    jlong l129,
+    jint i131,
+    jlong l132,
+    jint i134,
+    jlong l135,
+    jint i137,
+    jlong l138,
+    jint i140,
+    jlong l141,
+    jint i143,
+    jlong l144,
+    jint i146,
+    jlong l147,
+    jint i149,
+    jlong l150,
+    jint i152,
+    jlong l153,
+    jint i155,
+    jlong l156,
+    jint i158,
+    jlong l159,
+    jint i161,
+    jlong l162,
+    jint i164,
+    jlong l165,
+    jint i167,
+    jlong l168,
+    jint i170,
+    jlong l171,
+    jint i173,
+    jlong l174,
+    jint i176,
+    jlong l177,
+    jint i179,
+    jlong l180,
+    jint i182,
+    jlong l183,
+    jint i185,
+    jlong l186,
+    jint i188,
+    jlong l189,
+    jint i191,
+    jlong l192,
+    jint i194,
+    jlong l195,
+    jint i197,
+    jlong l198,
+    jint i200,
+    jlong l201,
+    jint i203,
+    jlong l204,
+    jint i206,
+    jlong l207,
+    jint i209,
+    jlong l210,
+    jint i212,
+    jlong l213,
+    jint i215,
+    jlong l216,
+    jint i218,
+    jlong l219,
+    jint i221,
+    jlong l222,
+    jint i224,
+    jlong l225,
+    jint i227,
+    jlong l228,
+    jint i230,
+    jlong l231,
+    jint i233,
+    jlong l234,
+    jint i236,
+    jlong l237,
+    jint i239,
+    jlong l240,
+    jint i242,
+    jlong l243,
+    jint i245,
+    jlong l246,
+    jint i248,
+    jlong l249,
+    jint i251,
+    jlong l252,
+    jint i254) {
+  jlong l = INT64_C(0xf00000000);
+  // Generated by script (then modified to close argument list):
+  //   for i in {0..84}; do \
+  //     echo "  if (l$((i*3)) != l + $(($i*3))) return -$(($i*3));"; \
+  //     echo "  if (i$(($i*3+2)) != $(($i*3+2))) return -$(($i*3+2));"; \
+  //  done
+  if (l0 != l + 0) return -0;
+  if (i2 != 2) return -2;
+  if (l3 != l + 3) return -3;
+  if (i5 != 5) return -5;
+  if (l6 != l + 6) return -6;
+  if (i8 != 8) return -8;
+  if (l9 != l + 9) return -9;
+  if (i11 != 11) return -11;
+  if (l12 != l + 12) return -12;
+  if (i14 != 14) return -14;
+  if (l15 != l + 15) return -15;
+  if (i17 != 17) return -17;
+  if (l18 != l + 18) return -18;
+  if (i20 != 20) return -20;
+  if (l21 != l + 21) return -21;
+  if (i23 != 23) return -23;
+  if (l24 != l + 24) return -24;
+  if (i26 != 26) return -26;
+  if (l27 != l + 27) return -27;
+  if (i29 != 29) return -29;
+  if (l30 != l + 30) return -30;
+  if (i32 != 32) return -32;
+  if (l33 != l + 33) return -33;
+  if (i35 != 35) return -35;
+  if (l36 != l + 36) return -36;
+  if (i38 != 38) return -38;
+  if (l39 != l + 39) return -39;
+  if (i41 != 41) return -41;
+  if (l42 != l + 42) return -42;
+  if (i44 != 44) return -44;
+  if (l45 != l + 45) return -45;
+  if (i47 != 47) return -47;
+  if (l48 != l + 48) return -48;
+  if (i50 != 50) return -50;
+  if (l51 != l + 51) return -51;
+  if (i53 != 53) return -53;
+  if (l54 != l + 54) return -54;
+  if (i56 != 56) return -56;
+  if (l57 != l + 57) return -57;
+  if (i59 != 59) return -59;
+  if (l60 != l + 60) return -60;
+  if (i62 != 62) return -62;
+  if (l63 != l + 63) return -63;
+  if (i65 != 65) return -65;
+  if (l66 != l + 66) return -66;
+  if (i68 != 68) return -68;
+  if (l69 != l + 69) return -69;
+  if (i71 != 71) return -71;
+  if (l72 != l + 72) return -72;
+  if (i74 != 74) return -74;
+  if (l75 != l + 75) return -75;
+  if (i77 != 77) return -77;
+  if (l78 != l + 78) return -78;
+  if (i80 != 80) return -80;
+  if (l81 != l + 81) return -81;
+  if (i83 != 83) return -83;
+  if (l84 != l + 84) return -84;
+  if (i86 != 86) return -86;
+  if (l87 != l + 87) return -87;
+  if (i89 != 89) return -89;
+  if (l90 != l + 90) return -90;
+  if (i92 != 92) return -92;
+  if (l93 != l + 93) return -93;
+  if (i95 != 95) return -95;
+  if (l96 != l + 96) return -96;
+  if (i98 != 98) return -98;
+  if (l99 != l + 99) return -99;
+  if (i101 != 101) return -101;
+  if (l102 != l + 102) return -102;
+  if (i104 != 104) return -104;
+  if (l105 != l + 105) return -105;
+  if (i107 != 107) return -107;
+  if (l108 != l + 108) return -108;
+  if (i110 != 110) return -110;
+  if (l111 != l + 111) return -111;
+  if (i113 != 113) return -113;
+  if (l114 != l + 114) return -114;
+  if (i116 != 116) return -116;
+  if (l117 != l + 117) return -117;
+  if (i119 != 119) return -119;
+  if (l120 != l + 120) return -120;
+  if (i122 != 122) return -122;
+  if (l123 != l + 123) return -123;
+  if (i125 != 125) return -125;
+  if (l126 != l + 126) return -126;
+  if (i128 != 128) return -128;
+  if (l129 != l + 129) return -129;
+  if (i131 != 131) return -131;
+  if (l132 != l + 132) return -132;
+  if (i134 != 134) return -134;
+  if (l135 != l + 135) return -135;
+  if (i137 != 137) return -137;
+  if (l138 != l + 138) return -138;
+  if (i140 != 140) return -140;
+  if (l141 != l + 141) return -141;
+  if (i143 != 143) return -143;
+  if (l144 != l + 144) return -144;
+  if (i146 != 146) return -146;
+  if (l147 != l + 147) return -147;
+  if (i149 != 149) return -149;
+  if (l150 != l + 150) return -150;
+  if (i152 != 152) return -152;
+  if (l153 != l + 153) return -153;
+  if (i155 != 155) return -155;
+  if (l156 != l + 156) return -156;
+  if (i158 != 158) return -158;
+  if (l159 != l + 159) return -159;
+  if (i161 != 161) return -161;
+  if (l162 != l + 162) return -162;
+  if (i164 != 164) return -164;
+  if (l165 != l + 165) return -165;
+  if (i167 != 167) return -167;
+  if (l168 != l + 168) return -168;
+  if (i170 != 170) return -170;
+  if (l171 != l + 171) return -171;
+  if (i173 != 173) return -173;
+  if (l174 != l + 174) return -174;
+  if (i176 != 176) return -176;
+  if (l177 != l + 177) return -177;
+  if (i179 != 179) return -179;
+  if (l180 != l + 180) return -180;
+  if (i182 != 182) return -182;
+  if (l183 != l + 183) return -183;
+  if (i185 != 185) return -185;
+  if (l186 != l + 186) return -186;
+  if (i188 != 188) return -188;
+  if (l189 != l + 189) return -189;
+  if (i191 != 191) return -191;
+  if (l192 != l + 192) return -192;
+  if (i194 != 194) return -194;
+  if (l195 != l + 195) return -195;
+  if (i197 != 197) return -197;
+  if (l198 != l + 198) return -198;
+  if (i200 != 200) return -200;
+  if (l201 != l + 201) return -201;
+  if (i203 != 203) return -203;
+  if (l204 != l + 204) return -204;
+  if (i206 != 206) return -206;
+  if (l207 != l + 207) return -207;
+  if (i209 != 209) return -209;
+  if (l210 != l + 210) return -210;
+  if (i212 != 212) return -212;
+  if (l213 != l + 213) return -213;
+  if (i215 != 215) return -215;
+  if (l216 != l + 216) return -216;
+  if (i218 != 218) return -218;
+  if (l219 != l + 219) return -219;
+  if (i221 != 221) return -221;
+  if (l222 != l + 222) return -222;
+  if (i224 != 224) return -224;
+  if (l225 != l + 225) return -225;
+  if (i227 != 227) return -227;
+  if (l228 != l + 228) return -228;
+  if (i230 != 230) return -230;
+  if (l231 != l + 231) return -231;
+  if (i233 != 233) return -233;
+  if (l234 != l + 234) return -234;
+  if (i236 != 236) return -236;
+  if (l237 != l + 237) return -237;
+  if (i239 != 239) return -239;
+  if (l240 != l + 240) return -240;
+  if (i242 != 242) return -242;
+  if (l243 != l + 243) return -243;
+  if (i245 != 245) return -245;
+  if (l246 != l + 246) return -246;
+  if (i248 != 248) return -248;
+  if (l249 != l + 249) return -249;
+  if (i251 != 251) return -251;
+  if (l252 != l + 252) return -252;
+  if (i254 != 254) return -254;
+  return 42;
+}
+
 }  // namespace art
diff --git a/test/178-app-image-native-method/src/Main.java b/test/178-app-image-native-method/src/Main.java
index 07990cb..9043081 100644
--- a/test/178-app-image-native-method/src/Main.java
+++ b/test/178-app-image-native-method/src/Main.java
@@ -29,6 +29,7 @@
     new TestMissing();
     new TestMissingFast();
     new TestMissingCritical();
+    new CriticalSignatures();
     makeVisiblyInitialized();  // Make sure they are visibly initialized.
 
     test();
@@ -37,6 +38,8 @@
     testMissing();
     testMissingFast();
     testMissingCritical();
+
+    testCriticalSignatures();
   }
 
   static void test() {
@@ -165,6 +168,194 @@
     } catch (LinkageError expected) {}
   }
 
+  static void testCriticalSignatures() {
+    System.out.println("testCriticalSignatures");
+    long l = 0xf00000000L;
+    assertEquals(42, CriticalSignatures.nativeILFFFFD(1, l + 2L, 3.0f, 4.0f, 5.0f, 6.0f, 7.0));
+    assertEquals(42, CriticalSignatures.nativeLIFFFFD(l + 7L, 6, 5.0f, 4.0f, 3.0f, 2.0f, 1.0));
+    assertEquals(42, CriticalSignatures.nativeFLIFFFD(1.0f, l + 2L, 3, 4.0f, 5.0f, 6.0f, 7.0));
+    assertEquals(42, CriticalSignatures.nativeDDIIIIII(8.0, 7.0, 6, 5, 4, 3, 2, 1));
+    assertEquals(42, CriticalSignatures.nativeDFFILIII(1.0, 2.0f, 3.0f, 4, l + 5L, 6, 7, 8));
+    assertEquals(42, CriticalSignatures.nativeDDFILIII(8.0, 7.0, 6.0f, 5, l + 4L, 3, 2, 1));
+    assertEquals(42, CriticalSignatures.nativeDDIFII(1.0, 2.0, 3, 4.0f, 5, 6));
+    assertEquals(42, CriticalSignatures.nativeFullArgs(
+        // Generated by script (then modified to close argument list):
+        //   for i in {0..84}; \
+        //     do echo "        0xf00000000L + $((i*3))L,"; \
+        //     echo "        $((i*3+2)),"; \
+        //  done
+        0xf00000000L + 0L,
+        2,
+        0xf00000000L + 3L,
+        5,
+        0xf00000000L + 6L,
+        8,
+        0xf00000000L + 9L,
+        11,
+        0xf00000000L + 12L,
+        14,
+        0xf00000000L + 15L,
+        17,
+        0xf00000000L + 18L,
+        20,
+        0xf00000000L + 21L,
+        23,
+        0xf00000000L + 24L,
+        26,
+        0xf00000000L + 27L,
+        29,
+        0xf00000000L + 30L,
+        32,
+        0xf00000000L + 33L,
+        35,
+        0xf00000000L + 36L,
+        38,
+        0xf00000000L + 39L,
+        41,
+        0xf00000000L + 42L,
+        44,
+        0xf00000000L + 45L,
+        47,
+        0xf00000000L + 48L,
+        50,
+        0xf00000000L + 51L,
+        53,
+        0xf00000000L + 54L,
+        56,
+        0xf00000000L + 57L,
+        59,
+        0xf00000000L + 60L,
+        62,
+        0xf00000000L + 63L,
+        65,
+        0xf00000000L + 66L,
+        68,
+        0xf00000000L + 69L,
+        71,
+        0xf00000000L + 72L,
+        74,
+        0xf00000000L + 75L,
+        77,
+        0xf00000000L + 78L,
+        80,
+        0xf00000000L + 81L,
+        83,
+        0xf00000000L + 84L,
+        86,
+        0xf00000000L + 87L,
+        89,
+        0xf00000000L + 90L,
+        92,
+        0xf00000000L + 93L,
+        95,
+        0xf00000000L + 96L,
+        98,
+        0xf00000000L + 99L,
+        101,
+        0xf00000000L + 102L,
+        104,
+        0xf00000000L + 105L,
+        107,
+        0xf00000000L + 108L,
+        110,
+        0xf00000000L + 111L,
+        113,
+        0xf00000000L + 114L,
+        116,
+        0xf00000000L + 117L,
+        119,
+        0xf00000000L + 120L,
+        122,
+        0xf00000000L + 123L,
+        125,
+        0xf00000000L + 126L,
+        128,
+        0xf00000000L + 129L,
+        131,
+        0xf00000000L + 132L,
+        134,
+        0xf00000000L + 135L,
+        137,
+        0xf00000000L + 138L,
+        140,
+        0xf00000000L + 141L,
+        143,
+        0xf00000000L + 144L,
+        146,
+        0xf00000000L + 147L,
+        149,
+        0xf00000000L + 150L,
+        152,
+        0xf00000000L + 153L,
+        155,
+        0xf00000000L + 156L,
+        158,
+        0xf00000000L + 159L,
+        161,
+        0xf00000000L + 162L,
+        164,
+        0xf00000000L + 165L,
+        167,
+        0xf00000000L + 168L,
+        170,
+        0xf00000000L + 171L,
+        173,
+        0xf00000000L + 174L,
+        176,
+        0xf00000000L + 177L,
+        179,
+        0xf00000000L + 180L,
+        182,
+        0xf00000000L + 183L,
+        185,
+        0xf00000000L + 186L,
+        188,
+        0xf00000000L + 189L,
+        191,
+        0xf00000000L + 192L,
+        194,
+        0xf00000000L + 195L,
+        197,
+        0xf00000000L + 198L,
+        200,
+        0xf00000000L + 201L,
+        203,
+        0xf00000000L + 204L,
+        206,
+        0xf00000000L + 207L,
+        209,
+        0xf00000000L + 210L,
+        212,
+        0xf00000000L + 213L,
+        215,
+        0xf00000000L + 216L,
+        218,
+        0xf00000000L + 219L,
+        221,
+        0xf00000000L + 222L,
+        224,
+        0xf00000000L + 225L,
+        227,
+        0xf00000000L + 228L,
+        230,
+        0xf00000000L + 231L,
+        233,
+        0xf00000000L + 234L,
+        236,
+        0xf00000000L + 237L,
+        239,
+        0xf00000000L + 240L,
+        242,
+        0xf00000000L + 243L,
+        245,
+        0xf00000000L + 246L,
+        248,
+        0xf00000000L + 249L,
+        251,
+        0xf00000000L + 252L,
+        254));
+  }
+
   static void assertEquals(int expected, int actual) {
     if (expected != actual) {
       throw new AssertionError("Expected " + expected + " got " + actual);
@@ -281,3 +472,280 @@
       int i7, long l7, float f7, double d7,
       int i8, long l8, float f8, double d8);
 }
+
+class CriticalSignatures {
+  // The following signatures exercise argument moving on ARM and serve
+  // as examples of the optimizations performed by the assembler.
+  // Moving arguments is much simpler on other architectures.
+
+  // The JNI compiler does not emit a CFG, so we cannot CHECK the "disassembly (after)".
+
+  // vstm sp, {d0-d2}         # f1, f2, f3, f4, d -- store floats as D regs together with double
+  // mov r4, r0               # hidden arg
+  // mov r0, r1               # i
+  //                          # l stays in r2-r3
+  @CriticalNative
+  public static native int nativeILFFFFD(
+      int i, long l, float f1, float f2, float f3, float f4, double d);
+
+  // vstm sp, {s1-s3}         # f2, f3, f4 -- store floats up to alignment gap
+  // vstr d2, [sp, #16]       # d
+  // mov r4, r0               # hidden arg
+  // mov r0, r2               # low(l)
+  // mov r1, r3               # high(l)
+  // ldr r2, [sp, #...]       # i
+  // vmov r3, s0              # f1
+  @CriticalNative
+  public static native int nativeLIFFFFD(
+      long l, int i, float f1, float f2, float f3, float f4, double d);
+
+  // ldr  ip, [sp, #...]      # i
+  // str  ip, [sp]            # i
+  // add  ip, sp, #4          # Spilling multiple floats at an offset from SP
+  // vstm ip, {s1-s5}         # f2, f3, f4, d
+  // mov r4, r0               # hidden arg
+  // vmov r0, s0              # f1
+  //                          # l stays in r2-r3
+  @CriticalNative
+  public static native int nativeFLIFFFD(
+      float f1, long l, int i, float f2, float f3, float f4, double d);
+
+  // stm sp, {r1,r2,r3}       # i1, i2, i3 -- store ints together
+  // ldrd r1, ip, [sp, #...]  # i4, i5
+  // strd r1, ip, [sp, #12]   # i4, i5
+  // ldr ip, [sp, #72]        # i6
+  // str ip, [sp, #20]        # i6
+  // mov r4, r0               # hidden arg
+  // vmov r0, r1, d0          # d1
+  // vmov r2, r3, d1          # d2
+  @CriticalNative
+  public static native int nativeDDIIIIII(
+      double d1, double d2, int i1, int i2, int i3, int i4, int i5, int i6);
+
+  // str r1, [sp]             # i1 -- cannot store with l due to alignment gap
+  // strd r2, r3, [sp, #8]    # l
+  // ldrd r1, ip, [sp, #...]  # i2, i3
+  // strd r1, ip, [sp, #16]   # i2, i3
+  // ldr ip, [sp, #...]       # i4
+  // str ip, [sp, #24]        # i4
+  // mov r4, r0               # hidden arg
+  // vmov r0, r1, d0          # d
+  // vmov r2, r3, d1          # f1, f2 -- move both floats together as double
+  @CriticalNative
+  public static native int nativeDFFILIII(
+      double d, float f1, float f2, int i1, long l, int i2, int i3, int i4);
+
+  // vstr s4, [sp]            # f
+  // add ip, sp, #4           # Spilling multiple core registers at an offset from SP
+  // stm ip, {r1,r2,r3}       # i1, l -- store int together with long
+  // ldrd r1, ip, [sp, #...]  # i2, i3
+  // strd r1, ip, [sp, #16]   # i2, i3
+  // ldr ip, [sp, #...]       # i4
+  // str ip, [sp, #24]        # i4
+  // mov r4, r0               # hidden arg
+  // vmov r0, r1, d0          # d1
+  // vmov r2, r3, d1          # d2
+  @CriticalNative
+  public static native int nativeDDFILIII(
+      double d1, double d2, float f, int i1, long l, int i2, int i3, int i4);
+
+  // str r1, [sp]             # i1
+  // vstr s4, [sp, #4]        # f
+  // strd r2, r3, [sp, #8]    # i2, i3 -- store ints together with STRD
+  // mov r4, r0               # hidden arg
+  // vmov r0, r1, d0          # d1
+  // vmov r2, r3, d1          # d2
+  @CriticalNative
+  public static native int nativeDDIFII(
+      double d1, double d2, int i1, float f, int i2, int i3);
+
+  // ...
+  // ldr ip, [sp, #2112]      # int
+  // str ip, [sp, #1000]      # int
+  // add r1, sp, #2048        # Prepare to use LDRD for loading long from a large offset
+  // ldrd r1, ip, [r1, #68]   # long
+  // strd r1, ip, [sp, #1008] # long
+  // ldr ip, [sp, #2124]      # int
+  // str ip, [sp, #1016]      # int
+  // ldr ip, [sp, #2128]      # low(long) -- copy the next long as two words because the offset
+  // str ip, [sp, #1024]      # low(long) -- is too large for STRD and we only use 2 temps (r1, ip)
+  // ldr ip, [sp, #2132]      # high(long)
+  // str ip, [sp, #1028]      # high(long)
+  // ...
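+  // (In T32, the LDRD/STRD immediate offset is limited to 1020, which is why
+  //  larger offsets need either a base adjustment or a word-by-word LDR/STR
+  //  copy as above.)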
+  @CriticalNative
+  public static native int nativeFullArgs(
+      // Note: Numbered by dalvik registers, 0-254 (max 255 regs for invoke-*-range)
+      //
+      // Generated by script (then modified to close the argument list):
+      //   for i in {0..84}; do echo "      long l$((i*3)),"; echo "      int i$(($i*3+2)),"; done
+      long l0,
+      int i2,
+      long l3,
+      int i5,
+      long l6,
+      int i8,
+      long l9,
+      int i11,
+      long l12,
+      int i14,
+      long l15,
+      int i17,
+      long l18,
+      int i20,
+      long l21,
+      int i23,
+      long l24,
+      int i26,
+      long l27,
+      int i29,
+      long l30,
+      int i32,
+      long l33,
+      int i35,
+      long l36,
+      int i38,
+      long l39,
+      int i41,
+      long l42,
+      int i44,
+      long l45,
+      int i47,
+      long l48,
+      int i50,
+      long l51,
+      int i53,
+      long l54,
+      int i56,
+      long l57,
+      int i59,
+      long l60,
+      int i62,
+      long l63,
+      int i65,
+      long l66,
+      int i68,
+      long l69,
+      int i71,
+      long l72,
+      int i74,
+      long l75,
+      int i77,
+      long l78,
+      int i80,
+      long l81,
+      int i83,
+      long l84,
+      int i86,
+      long l87,
+      int i89,
+      long l90,
+      int i92,
+      long l93,
+      int i95,
+      long l96,
+      int i98,
+      long l99,
+      int i101,
+      long l102,
+      int i104,
+      long l105,
+      int i107,
+      long l108,
+      int i110,
+      long l111,
+      int i113,
+      long l114,
+      int i116,
+      long l117,
+      int i119,
+      long l120,
+      int i122,
+      long l123,
+      int i125,
+      long l126,
+      int i128,
+      long l129,
+      int i131,
+      long l132,
+      int i134,
+      long l135,
+      int i137,
+      long l138,
+      int i140,
+      long l141,
+      int i143,
+      long l144,
+      int i146,
+      long l147,
+      int i149,
+      long l150,
+      int i152,
+      long l153,
+      int i155,
+      long l156,
+      int i158,
+      long l159,
+      int i161,
+      long l162,
+      int i164,
+      long l165,
+      int i167,
+      long l168,
+      int i170,
+      long l171,
+      int i173,
+      long l174,
+      int i176,
+      long l177,
+      int i179,
+      long l180,
+      int i182,
+      long l183,
+      int i185,
+      long l186,
+      int i188,
+      long l189,
+      int i191,
+      long l192,
+      int i194,
+      long l195,
+      int i197,
+      long l198,
+      int i200,
+      long l201,
+      int i203,
+      long l204,
+      int i206,
+      long l207,
+      int i209,
+      long l210,
+      int i212,
+      long l213,
+      int i215,
+      long l216,
+      int i218,
+      long l219,
+      int i221,
+      long l222,
+      int i224,
+      long l225,
+      int i227,
+      long l228,
+      int i230,
+      long l231,
+      int i233,
+      long l234,
+      int i236,
+      long l237,
+      int i239,
+      long l240,
+      int i242,
+      long l243,
+      int i245,
+      long l246,
+      int i248,
+      long l249,
+      int i251,
+      long l252,
+      int i254);
+}
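
For reference, below is a minimal sketch of what the native side of one of these
@CriticalNative methods could look like. This is only an illustration, not part of
this change; the actual implementations live in the test's native library and may
differ. With @CriticalNative, the native function receives no JNIEnv* and no jclass,
only the primitive arguments themselves -- which is exactly the calling convention
that the register and stack moves annotated above have to set up. The symbol name
assumes standard JNI name mangling for a class in the default package.

  #include <jni.h>

  // Hypothetical implementation of CriticalSignatures.nativeILFFFFD().
  // Note the absence of JNIEnv* and jclass parameters.
  extern "C" JNIEXPORT jint JNICALL Java_CriticalSignatures_nativeILFFFFD(
      jint i, jlong l, jfloat f1, jfloat f2, jfloat f3, jfloat f4, jdouble d) {
    // Return 42 only if every argument arrived with the value passed from Java
    // (see testCriticalSignatures() above); otherwise signal a mismatch.
    if (i == 1 && l == 0xf00000002LL &&
        f1 == 3.0f && f2 == 4.0f && f3 == 5.0f && f4 == 6.0f && d == 7.0) {
      return 42;
    }
    return -1;
  }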