JNI: Move args in registers for @FastNative.

Golem results for art-opt-cc (higher is better):
linux-ia32                     before after
NativeDowncallStaticFast       222.00 222.17 (+0.0751%)
NativeDowncallStaticFast6      139.86 161.00 (+15.11%)
NativeDowncallStaticFastRefs6  131.00 137.86 (+5.238%)
NativeDowncallVirtualFast      211.79 217.17 (+2.543%)
NativeDowncallVirtualFast6     137.36 150.55 (+9.599%)
NativeDowncallVirtualFastRefs6 131.50 132.60 (+0.8382%)
linux-x64                      before after
NativeDowncallStaticFast       173.15 173.24 (+0.0499%)
NativeDowncallStaticFast6      135.50 157.61 (+16.31%)
NativeDowncallStaticFastRefs6  127.06 134.87 (+6.147%)
NativeDowncallVirtualFast      163.67 165.83 (+1.321%)
NativeDowncallVirtualFast6     128.18 147.35 (+14.96%)
NativeDowncallVirtualFastRefs6 123.44 130.74 (+5.914%)
linux-armv7                    before after
NativeDowncallStaticFast       21.622 21.622 (0%)
NativeDowncallStaticFast6      17.250 18.719 (+8.518%)
NativeDowncallStaticFastRefs6  14.757 15.663 (+6.145%)
NativeDowncallVirtualFast      21.027 21.319 (+1.388%)
NativeDowncallVirtualFast6     17.439 18.953 (+8.680%)
NativeDowncallVirtualFastRefs6 14.764 15.992 (+8.319%)
linux-armv8                    before after
NativeDowncallStaticFast       23.244 23.610 (+1.575%)
NativeDowncallStaticFast6      18.719 21.622 (+15.50%)
NativeDowncallStaticFastRefs6  14.757 18.491 (+20.89%)
NativeDowncallVirtualFast      20.197 21.319 (+5.554%)
NativeDowncallVirtualFast6     18.272 21.027 (+15.08%)
NativeDowncallVirtualFastRefs6 13.951 16.865 (+20.89%)
(The arm64 NativeDowncallVirtualFast baseline value is unusually
low, resulting in the unexpectedly high +5.554% improvement. As
previous results seem to jump between 20.197 and 20.741, the
actual improvement is probably only around 2.5%.)
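
For context, a minimal sketch (assumed names, not the actual Golem
benchmark sources) of the kind of declarations these benchmarks
exercise. @FastNative methods are annotated with
dalvik.annotation.optimization.FastNative; with this change their
register arguments are moved directly into the native ABI locations,
and reference arguments are spilled and converted to `jobject` on
the way:

  import dalvik.annotation.optimization.FastNative;

  class NativeDowncallExample {  // Hypothetical class, for illustration only.
    // Word-sized arguments (as in the *Fast6 benchmarks) can now stay in
    // registers on the way to their native ABI locations.
    @FastNative
    static native int staticFast6(int a, int b, int c, int d, int e, int f);

    // Reference arguments (as in the *FastRefs6 benchmarks) are spilled to
    // stack slots and passed as `jobject` handles pointing to those slots.
    @FastNative
    static native int staticFastRefs6(Object a, Object b, Object c,
                                      Object d, Object e, Object f);
  }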

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 172332525
Change-Id: I2b596414458b48a758826eafc223529e9f2fe059
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index 2857ff4..ac263c1 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -45,6 +45,9 @@
 // STRD immediate can encode any 4-byte aligned offset smaller than this cutoff.
 static constexpr size_t kStrdOffsetCutoff = 1024u;
 
+// The 16-bit "ADD Rd, SP, #imm" can encode any 4-byte aligned immediate smaller than this cutoff.
+static constexpr size_t kAddSpImmCutoff = 1024u;
+
 vixl::aarch32::Register AsVIXLRegister(ArmManagedRegister reg) {
   CHECK(reg.IsCoreRegister());
   return vixl::aarch32::Register(reg.RegId());
@@ -464,28 +467,11 @@
 // Get the number of locations to spill together.
 static inline size_t GetSpillChunkSize(ArrayRef<ArgumentLocation> dests,
                                        ArrayRef<ArgumentLocation> srcs,
-                                       size_t start,
-                                       bool have_extra_temp) {
+                                       size_t start) {
   DCHECK_LT(start, dests.size());
   DCHECK_ALIGNED(dests[start].GetFrameOffset().Uint32Value(), 4u);
   const ArgumentLocation& first_src = srcs[start];
-  if (!first_src.IsRegister()) {
-    DCHECK_ALIGNED(first_src.GetFrameOffset().Uint32Value(), 4u);
-    // If we have an extra temporary, look for opportunities to move 2 words
-    // at a time with LDRD/STRD when the source types are word-sized.
-    if (have_extra_temp &&
-        start + 1u != dests.size() &&
-        !srcs[start + 1u].IsRegister() &&
-        first_src.GetSize() == 4u &&
-        srcs[start + 1u].GetSize() == 4u &&
-        NoSpillGap(first_src, srcs[start + 1u]) &&
-        NoSpillGap(dests[start], dests[start + 1u]) &&
-        dests[start].GetFrameOffset().Uint32Value() < kStrdOffsetCutoff) {
-      // Note: The source and destination may not be 8B aligned (but they are 4B aligned).
-      return 2u;
-    }
-    return 1u;
-  }
+  DCHECK(first_src.IsRegister());
   ArmManagedRegister first_src_reg = first_src.GetRegister().AsArm();
   size_t end = start + 1u;
   if (IsCoreRegisterOrPair(first_src_reg)) {
@@ -555,8 +541,46 @@
 }
 
 void ArmVIXLJNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
-                                             ArrayRef<ArgumentLocation> srcs) {
-  DCHECK_EQ(dests.size(), srcs.size());
+                                             ArrayRef<ArgumentLocation> srcs,
+                                             ArrayRef<FrameOffset> refs) {
+  size_t arg_count = dests.size();
+  DCHECK_EQ(arg_count, srcs.size());
+  DCHECK_EQ(arg_count, refs.size());
+
+  // Spill reference registers. Spill two references together with STRD where possible.
+  for (size_t i = 0; i != arg_count; ++i) {
+    if (refs[i] != kInvalidReferenceOffset) {
+      DCHECK_EQ(srcs[i].GetSize(), kObjectReferenceSize);
+      if (srcs[i].IsRegister()) {
+        // Use STRD if we're storing 2 consecutive references within the available STRD range.
+        if (i + 1u != arg_count &&
+            refs[i + 1u] != kInvalidReferenceOffset &&
+            srcs[i + 1u].IsRegister() &&
+            refs[i].SizeValue() < kStrdOffsetCutoff) {
+          DCHECK_EQ(srcs[i + 1u].GetSize(), kObjectReferenceSize);
+          DCHECK_EQ(refs[i + 1u].SizeValue(), refs[i].SizeValue() + kObjectReferenceSize);
+          ___ Strd(AsVIXLRegister(srcs[i].GetRegister().AsArm()),
+                   AsVIXLRegister(srcs[i + 1u].GetRegister().AsArm()),
+                   MemOperand(sp, refs[i].SizeValue()));
+          ++i;
+        } else {
+          Store(refs[i], srcs[i].GetRegister(), kObjectReferenceSize);
+        }
+      } else {
+        DCHECK_EQ(srcs[i].GetFrameOffset(), refs[i]);
+      }
+    }
+  }
+
+  // Convert reference registers to `jobject` values.
+  for (size_t i = 0; i != arg_count; ++i) {
+    if (refs[i] != kInvalidReferenceOffset && srcs[i].IsRegister()) {
+      // Note: We can clobber `srcs[i]` here as the register cannot hold more than one argument.
+      ManagedRegister src_i_reg = srcs[i].GetRegister();
+      CreateJObject(src_i_reg, refs[i], src_i_reg, /*null_allowed=*/ i != 0u);
+    }
+  }
 
   // Native ABI is soft-float, so all destinations should be core registers or stack offsets.
   // And register locations should be first, followed by stack locations with increasing offset.
@@ -574,12 +598,14 @@
   // Collect registers to move. No need to record FP regs as destinations are only core regs.
   uint32_t src_regs = 0u;
   uint32_t dest_regs = 0u;
+  uint32_t same_regs = 0u;
   for (size_t i = 0; i != num_reg_dests; ++i) {
     const ArgumentLocation& src = srcs[i];
     const ArgumentLocation& dest = dests[i];
     DCHECK(dest.IsRegister() && IsCoreRegisterOrPair(dest.GetRegister().AsArm()));
     if (src.IsRegister() && IsCoreRegisterOrPair(src.GetRegister().AsArm())) {
       if (src.GetRegister().Equals(dest.GetRegister())) {
+        same_regs |= GetCoreRegisterMask(src.GetRegister().AsArm());
         continue;
       }
       src_regs |= GetCoreRegisterMask(src.GetRegister().AsArm());
@@ -587,85 +613,141 @@
     dest_regs |= GetCoreRegisterMask(dest.GetRegister().AsArm());
   }
 
-  // Spill args first. Look for opportunities to spill multiple arguments at once.
-  {
-    UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
-    vixl32::Register xtemp;  // Extra temp register;
-    if ((dest_regs & ~src_regs) != 0u) {
-      xtemp = vixl32::Register(CTZ(dest_regs & ~src_regs));
-      DCHECK(!temps.IsAvailable(xtemp));
+  // Spill register arguments to stack slots.
+  for (size_t i = num_reg_dests; i != arg_count; ) {
+    const ArgumentLocation& src = srcs[i];
+    if (!src.IsRegister()) {
+      ++i;
+      continue;
     }
-    auto move_two_words = [&](FrameOffset dest_offset, FrameOffset src_offset) {
-      DCHECK(xtemp.IsValid());
-      DCHECK_LT(dest_offset.Uint32Value(), kStrdOffsetCutoff);
-      // VIXL macro assembler can use destination registers for loads from large offsets.
+    const ArgumentLocation& dest = dests[i];
+    DCHECK_EQ(src.GetSize(), dest.GetSize());  // Even for references.
+    DCHECK(!dest.IsRegister());
+    uint32_t frame_offset = dest.GetFrameOffset().Uint32Value();
+    size_t chunk_size = GetSpillChunkSize(dests, srcs, i);
+    DCHECK_NE(chunk_size, 0u);
+    if (chunk_size == 1u) {
+      Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+    } else if (UseStrdForChunk(srcs, i, chunk_size)) {
+      ___ Strd(AsVIXLRegister(srcs[i].GetRegister().AsArm()),
+               AsVIXLRegister(srcs[i + 1u].GetRegister().AsArm()),
+               MemOperand(sp, frame_offset));
+    } else if (UseVstrForChunk(srcs, i, chunk_size)) {
+      size_t sreg = GetSRegisterNumber(src.GetRegister().AsArm());
+      DCHECK_ALIGNED(sreg, 2u);
+      ___ Vstr(vixl32::DRegister(sreg / 2u), MemOperand(sp, frame_offset));
+    } else {
       UseScratchRegisterScope temps2(asm_.GetVIXLAssembler());
-      vixl32::Register temp2 = temps2.Acquire();
-      ___ Ldrd(xtemp, temp2, MemOperand(sp, src_offset.Uint32Value()));
-      ___ Strd(xtemp, temp2, MemOperand(sp, dest_offset.Uint32Value()));
-    };
-    for (size_t i = num_reg_dests, arg_count = dests.size(); i != arg_count; ) {
-      const ArgumentLocation& src = srcs[i];
-      const ArgumentLocation& dest = dests[i];
-      DCHECK_EQ(src.GetSize(), dest.GetSize());
-      DCHECK(!dest.IsRegister());
-      uint32_t frame_offset = dest.GetFrameOffset().Uint32Value();
-      size_t chunk_size = GetSpillChunkSize(dests, srcs, i, xtemp.IsValid());
-      DCHECK_NE(chunk_size, 0u);
-      if (chunk_size == 1u) {
-        if (src.IsRegister()) {
-          Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
-        } else if (dest.GetSize() == 8u && xtemp.IsValid() && frame_offset < kStrdOffsetCutoff) {
-          move_two_words(dest.GetFrameOffset(), src.GetFrameOffset());
-        } else {
-          Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
-        }
-      } else if (!src.IsRegister()) {
-        DCHECK_EQ(chunk_size, 2u);
-        DCHECK_EQ(dest.GetSize(), 4u);
-        DCHECK_EQ(dests[i + 1u].GetSize(), 4u);
-        move_two_words(dest.GetFrameOffset(), src.GetFrameOffset());
-      } else if (UseStrdForChunk(srcs, i, chunk_size)) {
-        ___ Strd(AsVIXLRegister(srcs[i].GetRegister().AsArm()),
-                 AsVIXLRegister(srcs[i + 1u].GetRegister().AsArm()),
-                 MemOperand(sp, frame_offset));
-      } else if (UseVstrForChunk(srcs, i, chunk_size)) {
-        size_t sreg = GetSRegisterNumber(src.GetRegister().AsArm());
-        DCHECK_ALIGNED(sreg, 2u);
-        ___ Vstr(vixl32::DRegister(sreg / 2u), MemOperand(sp, frame_offset));
+      vixl32::Register base_reg;
+      if (frame_offset == 0u) {
+        base_reg = sp;
       } else {
-        UseScratchRegisterScope temps2(asm_.GetVIXLAssembler());
-        vixl32::Register base_reg;
-        if (frame_offset == 0u) {
-          base_reg = sp;
-        } else {
-          base_reg = temps2.Acquire();
-          ___ Add(base_reg, sp, frame_offset);
-        }
+        base_reg = temps2.Acquire();
+        ___ Add(base_reg, sp, frame_offset);
+      }
 
-        ArmManagedRegister src_reg = src.GetRegister().AsArm();
-        if (IsCoreRegisterOrPair(src_reg)) {
-          uint32_t core_reg_mask = GetCoreRegisterMask(srcs.SubArray(i, chunk_size));
-          ___ Stm(base_reg, NO_WRITE_BACK, RegisterList(core_reg_mask));
+      ArmManagedRegister src_reg = src.GetRegister().AsArm();
+      if (IsCoreRegisterOrPair(src_reg)) {
+        uint32_t core_reg_mask = GetCoreRegisterMask(srcs.SubArray(i, chunk_size));
+        ___ Stm(base_reg, NO_WRITE_BACK, RegisterList(core_reg_mask));
+      } else {
+        uint32_t start_sreg = GetSRegisterNumber(src_reg);
+        const ArgumentLocation& last_dest = dests[i + chunk_size - 1u];
+        uint32_t total_size =
+            last_dest.GetFrameOffset().Uint32Value() + last_dest.GetSize() - frame_offset;
+        if (IsAligned<2u>(start_sreg) &&
+            IsAligned<kDRegSizeInBytes>(frame_offset) &&
+            IsAligned<kDRegSizeInBytes>(total_size)) {
+          uint32_t dreg_count = total_size / kDRegSizeInBytes;
+          DRegisterList dreg_list(vixl32::DRegister(start_sreg / 2u), dreg_count);
+          ___ Vstm(F64, base_reg, NO_WRITE_BACK, dreg_list);
         } else {
-          uint32_t start_sreg = GetSRegisterNumber(src_reg);
-          const ArgumentLocation& last_dest = dests[i + chunk_size - 1u];
-          uint32_t total_size =
-              last_dest.GetFrameOffset().Uint32Value() + last_dest.GetSize() - frame_offset;
-          if (IsAligned<2u>(start_sreg) &&
-              IsAligned<kDRegSizeInBytes>(frame_offset) &&
-              IsAligned<kDRegSizeInBytes>(total_size)) {
-            uint32_t dreg_count = total_size / kDRegSizeInBytes;
-            DRegisterList dreg_list(vixl32::DRegister(start_sreg / 2u), dreg_count);
-            ___ Vstm(F64, base_reg, NO_WRITE_BACK, dreg_list);
-          } else {
-            uint32_t sreg_count = total_size / kSRegSizeInBytes;
-            SRegisterList sreg_list(vixl32::SRegister(start_sreg), sreg_count);
-            ___ Vstm(F32, base_reg, NO_WRITE_BACK, sreg_list);
-          }
+          uint32_t sreg_count = total_size / kSRegSizeInBytes;
+          SRegisterList sreg_list(vixl32::SRegister(start_sreg), sreg_count);
+          ___ Vstm(F32, base_reg, NO_WRITE_BACK, sreg_list);
         }
       }
-      i += chunk_size;
+    }
+    i += chunk_size;
+  }
+
+  // Copy incoming stack arguments to outgoing stack arguments.
+  // Registers r0-r3 are argument registers in both the managed and native ABIs, and r4
+  // is a scratch register in the managed ABI but also the hidden argument register for
+  // @CriticalNative calls. We can use these registers as temporaries for copying
+  // stack arguments as long as they do not currently hold live values.
+  // TODO: Use the callee-save scratch registers instead to avoid using calling
+  // convention knowledge in the assembler. This would require reordering the
+  // argument move with pushing the IRT frame where those registers are used.
+  uint32_t copy_temp_regs = ((1u << 5) - 1u) & ~(same_regs | src_regs);
+  if ((dest_regs & (1u << R4)) != 0) {
+    // For @CriticalNative, R4 shall hold the hidden argument but it is available
+    // for use as a temporary at this point. However, it may be the only available
+    // register, so we shall use IP as the second temporary if needed.
+    // We do not need to worry about `CreateJObject` for @CriticalNative.
+    DCHECK_NE(copy_temp_regs, 0u);
+    DCHECK(std::all_of(refs.begin(),
+                       refs.end(),
+                       [](FrameOffset r) { return r == kInvalidReferenceOffset; }));
+  } else {
+    // For normal native and @FastNative, R4 and at least one of R0-R3 should be
+    // available because there are only 3 destination registers R1-R3 to which the
+    // source registers can be moved. R0 shall be filled with the `JNIEnv*`
+    // argument later. We need to keep IP available for `CreateJObject()`.
+    DCHECK_GE(POPCOUNT(copy_temp_regs), 2);
+  }
+  vixl32::Register copy_temp1 = vixl32::Register(LeastSignificantBit(copy_temp_regs));
+  copy_temp_regs ^= 1u << copy_temp1.GetCode();
+  vixl32::Register copy_xtemp = (copy_temp_regs != 0u)
+      ? vixl32::Register(LeastSignificantBit(copy_temp_regs))
+      : vixl32::Register();
+  for (size_t i = num_reg_dests; i != arg_count; ++i) {
+    if (srcs[i].IsRegister()) {
+      continue;
+    }
+    FrameOffset src_offset = srcs[i].GetFrameOffset();
+    DCHECK_ALIGNED(src_offset.Uint32Value(), 4u);
+    FrameOffset dest_offset = dests[i].GetFrameOffset();
+    DCHECK_ALIGNED(dest_offset.Uint32Value(), 4u);
+    // Look for opportunities to move 2 words at a time with LDRD/STRD
+    // when the source types are word-sized.
+    if (srcs[i].GetSize() == 4u &&
+        i + 1u != arg_count &&
+        !srcs[i + 1u].IsRegister() &&
+        srcs[i + 1u].GetSize() == 4u &&
+        NoSpillGap(srcs[i], srcs[i + 1u]) &&
+        NoSpillGap(dests[i], dests[i + 1u]) &&
+        dest_offset.Uint32Value() < kStrdOffsetCutoff) {
+      UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+      vixl32::Register copy_temp2 = copy_xtemp.IsValid() ? copy_xtemp : temps.Acquire();
+      ___ Ldrd(copy_temp1, copy_temp2, MemOperand(sp, src_offset.Uint32Value()));
+      if (refs[i] != kInvalidReferenceOffset) {
+        ArmManagedRegister m_copy_temp1 = ArmManagedRegister::FromCoreRegister(
+            enum_cast<Register>(copy_temp1.GetCode()));
+        CreateJObject(m_copy_temp1, refs[i], m_copy_temp1, /*null_allowed=*/ i != 0u);
+      }
+      if (refs[i + 1u] != kInvalidReferenceOffset) {
+        ArmManagedRegister m_copy_temp2 = ArmManagedRegister::FromCoreRegister(
+            enum_cast<Register>(copy_temp2.GetCode()));
+        CreateJObject(m_copy_temp2, refs[i + 1u], m_copy_temp2, /*null_allowed=*/ true);
+      }
+      ___ Strd(copy_temp1, copy_temp2, MemOperand(sp, dest_offset.Uint32Value()));
+      ++i;
+    } else if (dests[i].GetSize() == 8u && dest_offset.Uint32Value() < kStrdOffsetCutoff) {
+      UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+      vixl32::Register copy_temp2 = copy_xtemp.IsValid() ? copy_xtemp : temps.Acquire();
+      ___ Ldrd(copy_temp1, copy_temp2, MemOperand(sp, src_offset.Uint32Value()));
+      ___ Strd(copy_temp1, copy_temp2, MemOperand(sp, dest_offset.Uint32Value()));
+    } else if (refs[i] != kInvalidReferenceOffset) {
+      // Do not use the `CreateJObject()` overload for a stack target as it generates
+      // worse code than explicitly using a low register temporary.
+      ___ Ldr(copy_temp1, MemOperand(sp, src_offset.Uint32Value()));
+      ArmManagedRegister m_copy_temp1 = ArmManagedRegister::FromCoreRegister(
+          enum_cast<Register>(copy_temp1.GetCode()));
+      CreateJObject(m_copy_temp1, refs[i], m_copy_temp1, /*null_allowed=*/ i != 0u);
+      ___ Str(copy_temp1, MemOperand(sp, dest_offset.Uint32Value()));
+    } else {
+      Copy(dest_offset, src_offset, dests[i].GetSize());
     }
   }
 
@@ -719,6 +801,16 @@
           ___ Ldrd(AsVIXLRegister(dests[i].GetRegister().AsArm()),
                    AsVIXLRegister(dests[j].GetRegister().AsArm()),
                    MemOperand(sp, srcs[i].GetFrameOffset().Uint32Value()));
+          if (refs[i] != kInvalidReferenceOffset) {
+            DCHECK_EQ(refs[i], srcs[i].GetFrameOffset());
+            ManagedRegister dest_i_reg = dests[i].GetRegister();
+            CreateJObject(dest_i_reg, refs[i], dest_i_reg, /*null_allowed=*/ i != 0u);
+          }
+          if (refs[j] != kInvalidReferenceOffset) {
+            DCHECK_EQ(refs[j], srcs[j].GetFrameOffset());
+            ManagedRegister dest_j_reg = dests[j].GetRegister();
+            CreateJObject(dest_j_reg, refs[j], dest_j_reg, /*null_allowed=*/ true);
+          }
           ++j;
           continue;
         }
@@ -737,6 +829,9 @@
     }
     if (srcs[i].IsRegister()) {
       Move(dests[i].GetRegister(), srcs[i].GetRegister(), dests[i].GetSize());
+    } else if (refs[i] != kInvalidReferenceOffset) {
+      ManagedRegister dest_i_reg = dests[i].GetRegister();
+      CreateJObject(dest_i_reg, refs[i], dest_i_reg, /*null_allowed=*/ i != 0u);
     } else {
       Load(dests[i].GetRegister(), srcs[i].GetFrameOffset(), dests[i].GetSize());
     }
@@ -881,27 +976,27 @@
       in_reg = out_reg;
     }
 
-    temps.Exclude(in_reg);
-    ___ Cmp(in_reg, 0);
-
-    if (asm_.ShifterOperandCanHold(ADD, spilled_reference_offset.Int32Value())) {
-      if (!out_reg.Is(in_reg)) {
-        ExactAssemblyScope guard(asm_.GetVIXLAssembler(),
-                                 3 * vixl32::kMaxInstructionSizeInBytes,
-                                 CodeBufferCheckScope::kMaximumSize);
-        ___ it(eq, 0xc);
-        ___ mov(eq, out_reg, 0);
-        asm_.AddConstantInIt(out_reg, sp, spilled_reference_offset.Int32Value(), ne);
+    if (out_reg.IsLow() && spilled_reference_offset.Uint32Value() < kAddSpImmCutoff) {
+      // There is a 16-bit "ADD Rd, SP, <imm>" instruction we can use in IT-block.
+      if (out_reg.Is(in_reg)) {
+        ___ Cmp(in_reg, 0);
       } else {
-        ExactAssemblyScope guard(asm_.GetVIXLAssembler(),
-                                 2 * vixl32::kMaxInstructionSizeInBytes,
-                                 CodeBufferCheckScope::kMaximumSize);
-        ___ it(ne, 0x8);
-        asm_.AddConstantInIt(out_reg, sp, spilled_reference_offset.Int32Value(), ne);
+        ___ Movs(out_reg, in_reg);
       }
+      ExactAssemblyScope guard(asm_.GetVIXLAssembler(),
+                               2 * vixl32::k16BitT32InstructionSizeInBytes);
+      ___ it(ne);
+      ___ add(ne, Narrow, out_reg, sp, spilled_reference_offset.Int32Value());
     } else {
-      // TODO: Implement this (old arm assembler would have crashed here).
-      UNIMPLEMENTED(FATAL);
+      vixl32::Register addr_reg = out_reg.Is(in_reg) ? temps.Acquire() : out_reg;
+      vixl32::Register cond_mov_src_reg = out_reg.Is(in_reg) ? addr_reg : in_reg;
+      vixl32::Condition cond = out_reg.Is(in_reg) ? ne : eq;
+      ___ Add(addr_reg, sp, spilled_reference_offset.Int32Value());
+      ___ Cmp(in_reg, 0);
+      ExactAssemblyScope guard(asm_.GetVIXLAssembler(),
+                               2 * vixl32::k16BitT32InstructionSizeInBytes);
+      ___ it(cond);
+      ___ mov(cond, Narrow, out_reg, cond_mov_src_reg);
     }
   } else {
     asm_.AddConstant(out_reg, sp, spilled_reference_offset.Int32Value());
@@ -920,6 +1015,7 @@
     // e.g. scratch = (scratch == 0) ? 0 : (SP+spilled_reference_offset)
     ___ Cmp(scratch, 0);
 
+    // FIXME: Using a 32-bit T32 instruction in an IT-block is deprecated.
     if (asm_.ShifterOperandCanHold(ADD, spilled_reference_offset.Int32Value())) {
       ExactAssemblyScope guard(asm_.GetVIXLAssembler(),
                                2 * vixl32::kMaxInstructionSizeInBytes,
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
index f4dc1d1..49f5e7c 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
@@ -94,7 +94,9 @@
   void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset32 offs) override;
 
   // Copying routines.
-  void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
+  void MoveArguments(ArrayRef<ArgumentLocation> dests,
+                     ArrayRef<ArgumentLocation> srcs,
+                     ArrayRef<FrameOffset> refs) override;
 
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
 
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index 57e0823..073c2f0 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -42,6 +42,9 @@
 static constexpr size_t kAapcs64StackAlignment = 16u;
 static_assert(kAapcs64StackAlignment == kStackAlignment);
 
+// STP signed offset for W-register can encode any 4-byte aligned offset smaller than this cutoff.
+static constexpr size_t kStpWOffsetCutoff = 256u;
+
 Arm64JNIMacroAssembler::~Arm64JNIMacroAssembler() {
 }
 
@@ -364,8 +367,36 @@
 
 // Copying routines.
 void Arm64JNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
-                                           ArrayRef<ArgumentLocation> srcs) {
-  DCHECK_EQ(dests.size(), srcs.size());
+                                           ArrayRef<ArgumentLocation> srcs,
+                                           ArrayRef<FrameOffset> refs) {
+  size_t arg_count = dests.size();
+  DCHECK_EQ(arg_count, srcs.size());
+  DCHECK_EQ(arg_count, refs.size());
+
+  // Spill reference registers. Spill two references together with STP where possible.
+  for (size_t i = 0; i != arg_count; ++i) {
+    if (refs[i] != kInvalidReferenceOffset) {
+      DCHECK_EQ(srcs[i].GetSize(), kObjectReferenceSize);
+      if (srcs[i].IsRegister()) {
+        // Use STP if we're storing 2 consecutive references within the available STP range.
+        if (i + 1u != arg_count &&
+            refs[i + 1u].SizeValue() == refs[i].SizeValue() + kObjectReferenceSize &&
+            srcs[i + 1u].IsRegister() &&
+            refs[i].SizeValue() < kStpWOffsetCutoff) {
+          DCHECK_EQ(srcs[i + 1u].GetSize(), kObjectReferenceSize);
+          ___ Stp(reg_w(srcs[i].GetRegister().AsArm64().AsWRegister()),
+                  reg_w(srcs[i + 1u].GetRegister().AsArm64().AsWRegister()),
+                  MEM_OP(sp, refs[i].SizeValue()));
+          ++i;
+        } else {
+          Store(refs[i], srcs[i].GetRegister(), kObjectReferenceSize);
+        }
+      } else {
+        DCHECK_EQ(srcs[i].GetFrameOffset(), refs[i]);
+      }
+    }
+  }
+
   auto get_mask = [](ManagedRegister reg) -> uint64_t {
     Arm64ManagedRegister arm64_reg = reg.AsArm64();
     if (arm64_reg.IsXRegister()) {
@@ -387,19 +418,34 @@
       return (UINT64_C(1) << 32u) << fp_reg_number;
     }
   };
+
   // Collect registers to move while storing/copying args to stack slots.
+  // Convert processed references to `jobject`.
   // More than 8 core or FP reg args are very rare, so we do not optimize
   // for that case by using LDP/STP.
-  // TODO: LDP/STP will be useful for normal and @FastNative where we need
+  // TODO: LDP/STP will be useful for normal native methods where we need
   // to spill even the leading arguments.
   uint64_t src_regs = 0u;
   uint64_t dest_regs = 0u;
-  for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+  for (size_t i = 0; i != arg_count; ++i) {
     const ArgumentLocation& src = srcs[i];
     const ArgumentLocation& dest = dests[i];
-    DCHECK_EQ(src.GetSize(), dest.GetSize());
+    const FrameOffset ref = refs[i];
+    if (ref != kInvalidReferenceOffset) {
+      DCHECK_EQ(src.GetSize(), kObjectReferenceSize);
+      DCHECK_EQ(dest.GetSize(), static_cast<size_t>(kArm64PointerSize));
+    } else {
+      DCHECK_EQ(src.GetSize(), dest.GetSize());
+    }
     if (dest.IsRegister()) {
-      if (src.IsRegister() && src.GetRegister().Equals(dest.GetRegister())) {
+      // Note: For references, `Equals()` returns `false` for overlapping W and X registers.
+      if (ref != kInvalidReferenceOffset &&
+          src.IsRegister() &&
+          src.GetRegister().AsArm64().AsOverlappingXRegister() ==
+              dest.GetRegister().AsArm64().AsXRegister()) {
+        // Just convert to `jobject`. No further processing is needed.
+        CreateJObject(dest.GetRegister(), ref, src.GetRegister(), /*null_allowed=*/ i != 0u);
+      } else if (src.IsRegister() && src.GetRegister().Equals(dest.GetRegister())) {
         // Nothing to do.
       } else {
         if (src.IsRegister()) {
@@ -407,6 +453,16 @@
         }
         dest_regs |= get_mask(dest.GetRegister());
       }
+    } else if (ref != kInvalidReferenceOffset) {
+      if (src.IsRegister()) {
+        // Note: We can clobber `src` here as the register cannot hold more than one argument.
+        ManagedRegister src_x =
+            CoreRegisterWithSize(src.GetRegister(), static_cast<size_t>(kArm64PointerSize));
+        CreateJObject(src_x, ref, src.GetRegister(), /*null_allowed=*/ i != 0u);
+        Store(dest.GetFrameOffset(), src_x, dest.GetSize());
+      } else {
+        CreateJObject(dest.GetFrameOffset(), ref, /*null_allowed=*/ i != 0u);
+      }
     } else {
       if (src.IsRegister()) {
         Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
@@ -419,9 +475,10 @@
   // There should be no cycles, so this simple algorithm should make progress.
   while (dest_regs != 0u) {
     uint64_t old_dest_regs = dest_regs;
-    for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+    for (size_t i = 0; i != arg_count; ++i) {
       const ArgumentLocation& src = srcs[i];
       const ArgumentLocation& dest = dests[i];
+      const FrameOffset ref = refs[i];
       if (!dest.IsRegister()) {
         continue;  // Stored in first loop above.
       }
@@ -433,10 +490,19 @@
         continue;  // Cannot clobber this register yet.
       }
       if (src.IsRegister()) {
-        Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
+        if (ref != kInvalidReferenceOffset) {
+          CreateJObject(dest.GetRegister(), ref, src.GetRegister(), /*null_allowed=*/ i != 0u);
+        } else {
+          Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
+        }
         src_regs &= ~get_mask(src.GetRegister());  // Allow clobbering source register.
       } else {
-        Load(dest.GetRegister(), src.GetFrameOffset(), dest.GetSize());
+        if (ref != kInvalidReferenceOffset) {
+          CreateJObject(
+              dest.GetRegister(), ref, ManagedRegister::NoRegister(), /*null_allowed=*/ i != 0u);
+        } else {
+          Load(dest.GetRegister(), src.GetFrameOffset(), dest.GetSize());
+        }
       }
       dest_regs &= ~get_mask(dest.GetRegister());  // Destination register was filled.
     }
@@ -695,23 +761,22 @@
                                            bool null_allowed) {
   Arm64ManagedRegister out_reg = m_out_reg.AsArm64();
   Arm64ManagedRegister in_reg = m_in_reg.AsArm64();
-  // For now we only hold stale handle scope entries in x registers.
-  CHECK(in_reg.IsNoRegister() || in_reg.IsXRegister()) << in_reg;
+  CHECK(in_reg.IsNoRegister() || in_reg.IsWRegister()) << in_reg;
   CHECK(out_reg.IsXRegister()) << out_reg;
   if (null_allowed) {
+    UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+    Register scratch = temps.AcquireX();
+
     // Null values get a jobject value null. Otherwise, the jobject is
     // the address of the spilled reference.
     // e.g. out_reg = (in == 0) ? 0 : (SP+spilled_reference_offset)
     if (in_reg.IsNoRegister()) {
-      LoadWFromOffset(kLoadWord, out_reg.AsOverlappingWRegister(), SP,
-                      spilled_reference_offset.Int32Value());
-      in_reg = out_reg;
+      in_reg = Arm64ManagedRegister::FromWRegister(out_reg.AsOverlappingWRegister());
+      LoadWFromOffset(kLoadWord, in_reg.AsWRegister(), SP, spilled_reference_offset.Int32Value());
     }
-    ___ Cmp(reg_w(in_reg.AsOverlappingWRegister()), 0);
-    if (!out_reg.Equals(in_reg)) {
-      LoadImmediate(out_reg.AsXRegister(), 0, eq);
-    }
-    AddConstant(out_reg.AsXRegister(), SP, spilled_reference_offset.Int32Value(), ne);
+    ___ Add(scratch, reg_x(SP), spilled_reference_offset.Int32Value());
+    ___ Cmp(reg_w(in_reg.AsWRegister()), 0);
+    ___ Csel(reg_x(out_reg.AsXRegister()), scratch, xzr, ne);
   } else {
     AddConstant(out_reg.AsXRegister(), SP, spilled_reference_offset.Int32Value(), al);
   }
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.h b/compiler/utils/arm64/jni_macro_assembler_arm64.h
index 32f1ea9..b6e31c2 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.h
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.h
@@ -88,7 +88,9 @@
   void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset64 offs) override;
 
   // Copying routines.
-  void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
+  void MoveArguments(ArrayRef<ArgumentLocation> dests,
+                     ArrayRef<ArgumentLocation> srcs,
+                     ArrayRef<FrameOffset> refs) override;
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
   void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset64 thr_offs) override;
   void CopyRawPtrToThread(ThreadOffset64 thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
diff --git a/compiler/utils/assembler_thumb_test.cc b/compiler/utils/assembler_thumb_test.cc
index 79ab025..b2d4dcd 100644
--- a/compiler/utils/assembler_thumb_test.cc
+++ b/compiler/utils/assembler_thumb_test.cc
@@ -171,13 +171,15 @@
   __ Move(hidden_arg_register, method_register, 4);
   __ VerifyObject(scratch_register, false);
 
-  __ CreateJObject(scratch_register, FrameOffset(48), scratch_register, true);
-  __ CreateJObject(scratch_register, FrameOffset(48), scratch_register, false);
-  __ CreateJObject(method_register, FrameOffset(48), scratch_register, true);
+  // Note: `CreateJObject()` may need the scratch register IP. Test with another high register.
+  const ManagedRegister high_register = ArmManagedRegister::FromCoreRegister(R11);
+  __ CreateJObject(high_register, FrameOffset(48), high_register, true);
+  __ CreateJObject(high_register, FrameOffset(48), high_register, false);
+  __ CreateJObject(method_register, FrameOffset(48), high_register, true);
   __ CreateJObject(FrameOffset(48), FrameOffset(64), true);
-  __ CreateJObject(method_register, FrameOffset(0), scratch_register, true);
-  __ CreateJObject(method_register, FrameOffset(1025), scratch_register, true);
-  __ CreateJObject(scratch_register, FrameOffset(1025), scratch_register, true);
+  __ CreateJObject(method_register, FrameOffset(0), high_register, true);
+  __ CreateJObject(method_register, FrameOffset(1028), high_register, true);
+  __ CreateJObject(high_register, FrameOffset(1028), high_register, true);
 
   std::unique_ptr<JNIMacroLabel> exception_slow_path = __ CreateLabel();
   __ ExceptionPoll(exception_slow_path.get());
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index 9b5b6e2..541458b 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -52,30 +52,30 @@
   "      a8: 48 46         mov r0, r9\n"
   "      aa: cd f8 30 90   str.w r9, [sp, #48]\n"
   "      ae: 04 46         mov r4, r0\n"
-  "      b0: bc f1 00 0f   cmp.w r12, #0\n"
-  "      b4: 18 bf         it ne\n"
-  "      b6: 0d f1 30 0c   addne.w r12, sp, #48\n"
-  "      ba: 0d f1 30 0c   add.w r12, sp, #48\n"
-  "      be: bc f1 00 0f   cmp.w r12, #0\n"
-  "      c2: 0c bf         ite eq\n"
-  "      c4: 00 20         moveq r0, #0\n"
+  "      b0: 0d f1 30 0c   add.w r12, sp, #48\n"
+  "      b4: bb f1 00 0f   cmp.w r11, #0\n"
+  "      b8: 18 bf         it ne\n"
+  "      ba: e3 46         movne r11, r12\n"
+  "      bc: 0d f1 30 0b   add.w r11, sp, #48\n"
+  "      c0: 5f ea 0b 00   movs.w r0, r11\n"
+  "      c4: 18 bf         it ne\n"
   "      c6: 0c a8         addne r0, sp, #48\n"
   "      c8: dd f8 40 c0   ldr.w r12, [sp, #64]\n"
   "      cc: bc f1 00 0f   cmp.w r12, #0\n"
   "      d0: 18 bf         it ne\n"
   "      d2: 0d f1 40 0c   addne.w r12, sp, #64\n"
   "      d6: cd f8 30 c0   str.w r12, [sp, #48]\n"
-  "      da: bc f1 00 0f   cmp.w r12, #0\n"
-  "      de: 0c bf         ite eq\n"
-  "      e0: 00 20         moveq r0, #0\n"
-  "      e2: 68 46         movne r0, sp\n"
-  "      e4: bc f1 00 0f   cmp.w r12, #0\n"
-  "      e8: 0c bf         ite eq\n"
-  "      ea: 00 20         moveq r0, #0\n"
-  "      ec: 0d f2 01 40   addwne r0, sp, #1025\n"
-  "      f0: bc f1 00 0f   cmp.w r12, #0\n"
-  "      f4: 18 bf         it ne\n"
-  "      f6: 0d f2 01 4c   addwne r12, sp, #1025\n"
+  "      da: 5f ea 0b 00   movs.w r0, r11\n"
+  "      de: 18 bf         it ne\n"
+  "      e0: 00 a8         addne r0, sp, #0\n"
+  "      e2: 0d f2 04 40   addw r0, sp, #1028\n"
+  "      e6: bb f1 00 0f   cmp.w r11, #0\n"
+  "      ea: 08 bf         it eq\n"
+  "      ec: 58 46         moveq r0, r11\n"
+  "      ee: 0d f2 04 4c   addw r12, sp, #1028\n"
+  "      f2: bb f1 00 0f   cmp.w r11, #0\n"
+  "      f6: 18 bf         it ne\n"
+  "      f8: e3 46         movne r11, r12\n"
   "      fa: d9 f8 8c c0   ldr.w r12, [r9, #140]\n"
   "      fe: bc f1 00 0f   cmp.w r12, #0\n"
   "     102: 71 d1         bne 0x1e8     @ imm = #226\n"
diff --git a/compiler/utils/jni_macro_assembler.h b/compiler/utils/jni_macro_assembler.h
index 7f5dc2f..abb53b7 100644
--- a/compiler/utils/jni_macro_assembler.h
+++ b/compiler/utils/jni_macro_assembler.h
@@ -152,7 +152,16 @@
   virtual void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset<kPointerSize> offs) = 0;
 
   // Copying routines
-  virtual void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) = 0;
+
+  // Move arguments from `srcs` locations to `dests` locations.
+  //
+  // References shall be spilled to the `refs` frame offsets (kInvalidReferenceOffset
+  // indicates a non-reference type) if they are in registers, and the corresponding
+  // `dests` shall be filled with `jobject` replacements. If the first argument is a
+  // reference, it is assumed to be `this`, which cannot be null; all other reference
+  // arguments can be null.
+  virtual void MoveArguments(ArrayRef<ArgumentLocation> dests,
+                             ArrayRef<ArgumentLocation> srcs,
+                             ArrayRef<FrameOffset> refs) = 0;
 
   virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size) = 0;
 
@@ -276,6 +285,8 @@
     emit_run_time_checks_in_debug_mode_ = value;
   }
 
+  static constexpr FrameOffset kInvalidReferenceOffset = FrameOffset(0);
+
  protected:
   JNIMacroAssembler() {}
 
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 2e7f23d..d0afa72 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -317,35 +317,57 @@
 }
 
 void X86JNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
-                                         ArrayRef<ArgumentLocation> srcs) {
-  DCHECK_EQ(dests.size(), srcs.size());
+                                         ArrayRef<ArgumentLocation> srcs,
+                                         ArrayRef<FrameOffset> refs) {
+  size_t arg_count = dests.size();
+  DCHECK_EQ(arg_count, srcs.size());
+  DCHECK_EQ(arg_count, refs.size());
+
+  // Store register args to stack slots. Convert processed references to `jobject`.
   bool found_hidden_arg = false;
-  for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+  for (size_t i = 0; i != arg_count; ++i) {
     const ArgumentLocation& src = srcs[i];
     const ArgumentLocation& dest = dests[i];
-    DCHECK_EQ(src.GetSize(), dest.GetSize());
+    const FrameOffset ref = refs[i];
+    DCHECK_EQ(src.GetSize(), dest.GetSize());  // Even for references.
     if (src.IsRegister()) {
       if (UNLIKELY(dest.IsRegister())) {
         // Native ABI has only stack arguments but we may pass one "hidden arg" in register.
         CHECK(!found_hidden_arg);
         found_hidden_arg = true;
+        DCHECK_EQ(ref, kInvalidReferenceOffset);
         DCHECK(
             !dest.GetRegister().Equals(X86ManagedRegister::FromCpuRegister(GetScratchRegister())));
         Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
       } else {
+        if (ref != kInvalidReferenceOffset) {
+          Store(ref, srcs[i].GetRegister(), kObjectReferenceSize);
+          // Note: We can clobber `src` here as the register cannot hold more than one argument.
+          //       This overload of `CreateJObject()` currently does not use the scratch
+          //       register ECX, so this shall not clobber another argument.
+          CreateJObject(src.GetRegister(), ref, src.GetRegister(), /*null_allowed=*/ i != 0u);
+        }
         Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
       }
     } else {
       // Delay copying until we have spilled all registers, including the scratch register ECX.
     }
   }
-  for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+
+  // Copy incoming stack args. Convert processed references to `jobject`.
+  for (size_t i = 0; i != arg_count; ++i) {
     const ArgumentLocation& src = srcs[i];
     const ArgumentLocation& dest = dests[i];
-    DCHECK_EQ(src.GetSize(), dest.GetSize());
+    const FrameOffset ref = refs[i];
+    DCHECK_EQ(src.GetSize(), dest.GetSize());  // Even for references.
     if (!src.IsRegister()) {
       DCHECK(!dest.IsRegister());
-      Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
+      if (ref != kInvalidReferenceOffset) {
+        DCHECK_EQ(srcs[i].GetFrameOffset(), refs[i]);
+        CreateJObject(dest.GetFrameOffset(), ref, /*null_allowed=*/ i != 0u);
+      } else {
+        Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
+      }
     }
   }
 }
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.h b/compiler/utils/x86/jni_macro_assembler_x86.h
index 68822f8..058e040 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.h
+++ b/compiler/utils/x86/jni_macro_assembler_x86.h
@@ -86,7 +86,9 @@
   void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset32 offs) override;
 
   // Copying routines
-  void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
+  void MoveArguments(ArrayRef<ArgumentLocation> dests,
+                     ArrayRef<ArgumentLocation> srcs,
+                     ArrayRef<FrameOffset> refs) override;
 
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
 
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index afed413..1425a4c 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -353,8 +353,12 @@
 }
 
 void X86_64JNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
-                                            ArrayRef<ArgumentLocation> srcs) {
-  DCHECK_EQ(dests.size(), srcs.size());
+                                            ArrayRef<ArgumentLocation> srcs,
+                                            ArrayRef<FrameOffset> refs) {
+  size_t arg_count = dests.size();
+  DCHECK_EQ(arg_count, srcs.size());
+  DCHECK_EQ(arg_count, refs.size());
+
   auto get_mask = [](ManagedRegister reg) -> uint32_t {
     X86_64ManagedRegister x86_64_reg = reg.AsX86_64();
     if (x86_64_reg.IsCpuRegister()) {
@@ -368,14 +372,32 @@
       return (1u << 16u) << xmm_reg_number;
     }
   };
+
   // Collect registers to move while storing/copying args to stack slots.
+  // Convert all register references and copied stack references to `jobject`.
   uint32_t src_regs = 0u;
   uint32_t dest_regs = 0u;
-  for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+  for (size_t i = 0; i != arg_count; ++i) {
     const ArgumentLocation& src = srcs[i];
     const ArgumentLocation& dest = dests[i];
-    DCHECK_EQ(src.GetSize(), dest.GetSize());
+    const FrameOffset ref = refs[i];
+    if (ref != kInvalidReferenceOffset) {
+      DCHECK_EQ(src.GetSize(), kObjectReferenceSize);
+      DCHECK_EQ(dest.GetSize(), static_cast<size_t>(kX86_64PointerSize));
+    } else {
+      DCHECK_EQ(src.GetSize(), dest.GetSize());
+    }
+    if (src.IsRegister() && ref != kInvalidReferenceOffset) {
+      Store(ref, src.GetRegister(), kObjectReferenceSize);
+      // Note: We can clobber `src` here as the register cannot hold more than one argument.
+      //       This overload of `CreateJObject()` is currently implemented as "test and branch";
+      //       if it were using a conditional move, it would be better to do this at move time.
+      CreateJObject(src.GetRegister(), ref, src.GetRegister(), /*null_allowed=*/ i != 0u);
+    }
     if (dest.IsRegister()) {
+      // Note: X86_64ManagedRegister makes no distinction between 32-bit and 64-bit core
+      // registers, so the following `Equals()` can return `true` for references; the
+      // reference has already been converted to `jobject` above.
       if (src.IsRegister() && src.GetRegister().Equals(dest.GetRegister())) {
         // Nothing to do.
       } else {
@@ -387,18 +409,22 @@
     } else {
       if (src.IsRegister()) {
         Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+      } else if (ref != kInvalidReferenceOffset) {
+        CreateJObject(dest.GetFrameOffset(), ref, /*null_allowed=*/ i != 0u);
       } else {
         Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
       }
     }
   }
-  // Fill destination registers.
+
+  // Fill destination registers. Convert loaded references to `jobject`.
   // There should be no cycles, so this simple algorithm should make progress.
   while (dest_regs != 0u) {
     uint32_t old_dest_regs = dest_regs;
-    for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+    for (size_t i = 0; i != arg_count; ++i) {
       const ArgumentLocation& src = srcs[i];
       const ArgumentLocation& dest = dests[i];
+      const FrameOffset ref = refs[i];
       if (!dest.IsRegister()) {
         continue;  // Stored in first loop above.
       }
@@ -412,6 +438,9 @@
       if (src.IsRegister()) {
         Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
         src_regs &= ~get_mask(src.GetRegister());  // Allow clobbering source register.
+      } else if (ref != kInvalidReferenceOffset) {
+        CreateJObject(
+            dest.GetRegister(), ref, ManagedRegister::NoRegister(), /*null_allowed=*/ i != 0u);
       } else {
         Load(dest.GetRegister(), src.GetFrameOffset(), dest.GetSize());
       }
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
index 71c3035..3e5dfb7 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
@@ -89,7 +89,9 @@
   void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset64 offs) override;
 
   // Copying routines
-  void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
+  void MoveArguments(ArrayRef<ArgumentLocation> dests,
+                     ArrayRef<ArgumentLocation> srcs,
+                     ArrayRef<FrameOffset> refs) override;
 
   void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;