Move @CriticalNative arguments in registers.

Spill stack arguments directly to their final locations instead of
to the reserved space in the caller's frame.
Preliminary Golem results for art-opt-cc:
                          x86    x86-64   arm      arm64
NativeDowncallCritical6:  n/a    +14.3%   +17.2%   +26.1%
(The x86 configuration currently seems to produce results worse than
the interpreter, so something there is not working yet.)
Test: Additional tests in 178-app-image-native-method test.
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: aosp_taimen-userdebug boots.
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 112189621
Change-Id: I709c52ab2585a8f5f441f53ad2bf4a01d2b25dca
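
For reference, a minimal self-contained model of the move-resolution strategy
used by the MoveArguments() implementations in this change. This is toy code,
not ART: Location and ResolveArgumentMoves are stand-ins, plain ints stand in
for register and stack contents, and the "no move cycles" property that the
real code relies on is asserted. Stack destinations are written first (no
register has been clobbered yet), then destination registers are filled in
rounds, deferring any destination that is still pending as a source.

  #include <cassert>
  #include <cstddef>
  #include <cstdint>
  #include <optional>
  #include <vector>

  struct Location {
    std::optional<size_t> reg;  // Set if the location is a register.
    size_t stack_slot;          // Used when `reg` is not set.
  };

  void ResolveArgumentMoves(const std::vector<Location>& dests,
                            const std::vector<Location>& srcs,
                            std::vector<int>& regs,
                            std::vector<int>& stack) {
    assert(dests.size() == srcs.size());
    auto read = [&](const Location& loc) {
      return loc.reg ? regs[*loc.reg] : stack[loc.stack_slot];
    };
    // 1. Store arguments with stack destinations immediately and collect the
    //    pending register destinations and their register sources.
    uint64_t src_regs = 0u;
    uint64_t dest_regs = 0u;
    for (size_t i = 0; i != dests.size(); ++i) {
      if (!dests[i].reg) {
        stack[dests[i].stack_slot] = read(srcs[i]);
      } else if (!srcs[i].reg || *srcs[i].reg != *dests[i].reg) {
        if (srcs[i].reg) {
          src_regs |= uint64_t{1} << *srcs[i].reg;
        }
        dest_regs |= uint64_t{1} << *dests[i].reg;
      }  // Otherwise the argument is already in place.
    }
    // 2. Fill destination registers in rounds, deferring any destination that
    //    is still pending as a source. Without cycles, every round progresses.
    while (dest_regs != 0u) {
      uint64_t old_dest_regs = dest_regs;
      for (size_t i = 0; i != dests.size(); ++i) {
        if (!dests[i].reg) {
          continue;  // Stored in the first pass.
        }
        uint64_t dest_mask = uint64_t{1} << *dests[i].reg;
        if ((dest_mask & dest_regs) == 0u) {
          continue;  // Already filled, or a no-op move.
        }
        if ((dest_mask & src_regs) != 0u) {
          continue;  // Cannot clobber this register yet.
        }
        regs[*dests[i].reg] = read(srcs[i]);
        if (srcs[i].reg) {
          src_regs &= ~(uint64_t{1} << *srcs[i].reg);  // Source may be clobbered now.
        }
        dest_regs &= ~dest_mask;  // Destination register was filled.
      }
      assert(dest_regs != old_dest_regs && "unexpected move cycle");
      (void)old_dest_regs;
    }
  }

  int main() {
    // Hypothetical shuffle: r0 <- r1, r1 <- r2, stack[0] <- r3.
    std::vector<int> regs = {10, 11, 12, 13};
    std::vector<int> stack(1, 0);
    std::vector<Location> dests = {{size_t{0}, 0u}, {size_t{1}, 0u}, {std::nullopt, 0u}};
    std::vector<Location> srcs = {{size_t{1}, 0u}, {size_t{2}, 0u}, {size_t{3}, 0u}};
    ResolveArgumentMoves(dests, srcs, regs, stack);
    assert(regs[0] == 11 && regs[1] == 12 && stack[0] == 13);
    return 0;
  }
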
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index ac060cc..036cdbb 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -24,6 +24,7 @@
#include "art_method.h"
#include "base/arena_allocator.h"
+#include "base/arena_containers.h"
#include "base/enums.h"
#include "base/logging.h" // For VLOG.
#include "base/macros.h"
@@ -227,13 +228,10 @@
__ BuildFrame(current_frame_size, method_register, callee_save_regs);
DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(current_frame_size));
- {
+ if (LIKELY(!is_critical_native)) {
// Spill all register arguments.
// TODO: Spill reference args directly to the HandleScope.
// TODO: Spill native stack args straight to their stack locations (adjust SP earlier).
- // TODO: Move args in registers without spilling for @CriticalNative.
- // TODO: Provide assembler API for batched moves to allow moving multiple arguments
- // with single instruction (arm: LDRD/STRD/LDMIA/STMIA, arm64: LDP/STP).
mr_conv->ResetIterator(FrameOffset(current_frame_size));
for (; mr_conv->HasNext(); mr_conv->Next()) {
if (mr_conv->IsCurrentParamInRegister()) {
@@ -241,9 +239,7 @@
__ Store(mr_conv->CurrentParamStackOffset(), mr_conv->CurrentParamRegister(), size);
}
}
- }
- if (LIKELY(!is_critical_native)) {
// NOTE: @CriticalNative methods don't have a HandleScope
// because they can't have any reference parameters or return values.
@@ -320,10 +316,6 @@
size_t current_out_arg_size = main_out_arg_size;
if (UNLIKELY(is_critical_native)) {
DCHECK_EQ(main_out_arg_size, current_frame_size);
- // Move the method pointer to the hidden argument register.
- __ Move(main_jni_conv->HiddenArgumentRegister(),
- mr_conv->MethodRegister(),
- static_cast<size_t>(main_jni_conv->GetFramePointerSize()));
} else {
__ IncreaseFrameSize(main_out_arg_size);
current_frame_size += main_out_arg_size;
@@ -434,65 +426,86 @@
__ Store(saved_cookie_offset, main_jni_conv->IntReturnRegister(), 4 /* sizeof cookie */);
}
- // 7. Iterate over arguments placing values from managed calling convention in
- // to the convention required for a native call (shuffling). For references
- // place an index/pointer to the reference after checking whether it is
- // null (which must be encoded as null).
- // Note: we do this prior to materializing the JNIEnv* and static's jclass to
- // give as many free registers for the shuffle as possible.
- mr_conv->ResetIterator(FrameOffset(current_frame_size));
- uint32_t args_count = 0;
- while (mr_conv->HasNext()) {
- args_count++;
- mr_conv->Next();
- }
-
- // Do a backward pass over arguments, so that the generated code will be "mov
- // R2, R3; mov R1, R2" instead of "mov R1, R2; mov R2, R3."
- // TODO: A reverse iterator to improve readability.
- // TODO: This is currently useless as all archs spill args when building the frame.
- // To avoid the full spilling, we would have to do one pass before the BuildFrame()
- // to determine which arg registers are clobbered before they are needed.
- // TODO: For @CriticalNative, do a forward pass because there are no JNIEnv* and jclass* args.
- for (uint32_t i = 0; i < args_count; ++i) {
+ // 7. Fill arguments.
+ if (UNLIKELY(is_critical_native)) {
+ ArenaVector<ArgumentLocation> src_args(allocator.Adapter());
+ ArenaVector<ArgumentLocation> dest_args(allocator.Adapter());
+ // Move the method pointer to the hidden argument register.
+ size_t pointer_size = static_cast<size_t>(kPointerSize);
+ dest_args.push_back(ArgumentLocation(main_jni_conv->HiddenArgumentRegister(), pointer_size));
+ src_args.push_back(ArgumentLocation(mr_conv->MethodRegister(), pointer_size));
+ // Move normal arguments to their locations.
mr_conv->ResetIterator(FrameOffset(current_frame_size));
main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+ for (; mr_conv->HasNext(); mr_conv->Next(), main_jni_conv->Next()) {
+ DCHECK(main_jni_conv->HasNext());
+ size_t size = mr_conv->IsCurrentParamALongOrDouble() ? 8u : 4u;
+ src_args.push_back(mr_conv->IsCurrentParamInRegister()
+ ? ArgumentLocation(mr_conv->CurrentParamRegister(), size)
+ : ArgumentLocation(mr_conv->CurrentParamStackOffset(), size));
+ dest_args.push_back(main_jni_conv->IsCurrentParamInRegister()
+ ? ArgumentLocation(main_jni_conv->CurrentParamRegister(), size)
+ : ArgumentLocation(main_jni_conv->CurrentParamStackOffset(), size));
+ }
+ DCHECK(!main_jni_conv->HasNext());
+ __ MoveArguments(ArrayRef<ArgumentLocation>(dest_args), ArrayRef<ArgumentLocation>(src_args));
+ } else {
+ // Iterate over arguments placing values from managed calling convention in
+ // to the convention required for a native call (shuffling). For references
+ // place an index/pointer to the reference after checking whether it is
+ // null (which must be encoded as null).
+ // Note: we do this prior to materializing the JNIEnv* and static's jclass to
+ // give as many free registers for the shuffle as possible.
+ mr_conv->ResetIterator(FrameOffset(current_frame_size));
+ uint32_t args_count = 0;
+ while (mr_conv->HasNext()) {
+ args_count++;
+ mr_conv->Next();
+ }
- // Skip the extra JNI parameters for now.
- if (LIKELY(!is_critical_native)) {
+ // Do a backward pass over arguments, so that the generated code will be "mov
+ // R2, R3; mov R1, R2" instead of "mov R1, R2; mov R2, R3."
+ // TODO: A reverse iterator to improve readability.
+ // TODO: This is currently useless as all archs spill args when building the frame.
+ // To avoid the full spilling, we would have to do one pass before the BuildFrame()
+ // to determine which arg registers are clobbered before they are needed.
+ for (uint32_t i = 0; i < args_count; ++i) {
+ mr_conv->ResetIterator(FrameOffset(current_frame_size));
+ main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+
+ // Skip the extra JNI parameters for now.
main_jni_conv->Next(); // Skip JNIEnv*.
if (is_static) {
main_jni_conv->Next(); // Skip Class for now.
}
+ // Skip to the argument we're interested in.
+ for (uint32_t j = 0; j < args_count - i - 1; ++j) {
+ mr_conv->Next();
+ main_jni_conv->Next();
+ }
+ CopyParameter(jni_asm.get(), mr_conv.get(), main_jni_conv.get());
}
- // Skip to the argument we're interested in.
- for (uint32_t j = 0; j < args_count - i - 1; ++j) {
- mr_conv->Next();
- main_jni_conv->Next();
- }
- CopyParameter(jni_asm.get(), mr_conv.get(), main_jni_conv.get());
- }
- if (is_static && !is_critical_native) {
- // Create argument for Class
- mr_conv->ResetIterator(FrameOffset(current_frame_size));
- main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
- main_jni_conv->Next(); // Skip JNIEnv*
- FrameOffset handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
- if (main_jni_conv->IsCurrentParamOnStack()) {
- FrameOffset out_off = main_jni_conv->CurrentParamStackOffset();
- __ CreateHandleScopeEntry(out_off, handle_scope_offset, /*null_allowed=*/ false);
- } else {
- ManagedRegister out_reg = main_jni_conv->CurrentParamRegister();
- __ CreateHandleScopeEntry(out_reg,
- handle_scope_offset,
- ManagedRegister::NoRegister(),
+ if (is_static) {
+ // Create argument for Class
+ mr_conv->ResetIterator(FrameOffset(current_frame_size));
+ main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+ main_jni_conv->Next(); // Skip JNIEnv*
+ FrameOffset handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
+ if (main_jni_conv->IsCurrentParamOnStack()) {
+ FrameOffset out_off = main_jni_conv->CurrentParamStackOffset();
+ __ CreateHandleScopeEntry(out_off, handle_scope_offset, /*null_allowed=*/ false);
+ } else {
+ ManagedRegister out_reg = main_jni_conv->CurrentParamRegister();
+ __ CreateHandleScopeEntry(out_reg,
+ handle_scope_offset,
+ ManagedRegister::NoRegister(),
/*null_allowed=*/ false);
+ }
}
- }
- // Set the iterator back to the incoming Method*.
- main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
- if (LIKELY(!is_critical_native)) {
+ // Set the iterator back to the incoming Method*.
+ main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+
// 8. Create 1st argument, the JNI environment ptr.
// Register that will hold local indirect reference table
if (main_jni_conv->IsCurrentParamInRegister()) {
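
To make the @CriticalNative branch above concrete: for a hypothetical static
method taking (long, int) whose int ends up on the native stack, the stub
builds parallel source/destination lists roughly like the sketch below and
resolves them with one MoveArguments() call. The names managed_long_reg,
native_long_reg, incoming_off and out_off are placeholders; the real locations
come from the managed and JNI calling-convention iterators.

  ArenaVector<ArgumentLocation> src_args(allocator.Adapter());
  ArenaVector<ArgumentLocation> dest_args(allocator.Adapter());
  size_t ptr_size = static_cast<size_t>(kPointerSize);
  // Hidden ArtMethod* first.
  src_args.push_back(ArgumentLocation(mr_conv->MethodRegister(), ptr_size));
  dest_args.push_back(ArgumentLocation(main_jni_conv->HiddenArgumentRegister(), ptr_size));
  // long argument: managed register (pair) -> native register.
  src_args.push_back(ArgumentLocation(managed_long_reg, 8u));
  dest_args.push_back(ArgumentLocation(native_long_reg, 8u));
  // int argument: incoming stack slot -> outgoing stack slot.
  src_args.push_back(ArgumentLocation(FrameOffset(incoming_off), 4u));
  dest_args.push_back(ArgumentLocation(FrameOffset(out_off), 4u));
  __ MoveArguments(ArrayRef<ArgumentLocation>(dest_args), ArrayRef<ArgumentLocation>(src_args));
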
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index 5a355be..85b253c 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -41,6 +41,9 @@
static constexpr size_t kAapcsStackAlignment = 8u;
static_assert(kAapcsStackAlignment < kStackAlignment);
+// STRD immediate can encode any 4-byte aligned offset smaller than this cutoff.
+static constexpr size_t kStrdOffsetCutoff = 1024u;
+
vixl::aarch32::Register AsVIXLRegister(ArmManagedRegister reg) {
CHECK(reg.IsCoreRegister());
return vixl::aarch32::Register(reg.RegId());
@@ -223,8 +226,9 @@
asm_.StoreToOffset(kStoreWord, AsVIXLRegister(src), sp, dest.Int32Value());
} else if (src.IsRegisterPair()) {
CHECK_EQ(8u, size);
- asm_.StoreToOffset(kStoreWord, AsVIXLRegisterPairLow(src), sp, dest.Int32Value());
- asm_.StoreToOffset(kStoreWord, AsVIXLRegisterPairHigh(src), sp, dest.Int32Value() + 4);
+ ___ Strd(AsVIXLRegisterPairLow(src),
+ AsVIXLRegisterPairHigh(src),
+ MemOperand(sp, dest.Int32Value()));
} else if (src.IsSRegister()) {
CHECK_EQ(4u, size);
asm_.StoreSToOffset(AsVIXLSRegister(src), sp, dest.Int32Value());
@@ -365,6 +369,310 @@
UNIMPLEMENTED(FATAL) << "no zero extension necessary for arm";
}
+static inline bool IsCoreRegisterOrPair(ArmManagedRegister reg) {
+ return reg.IsCoreRegister() || reg.IsRegisterPair();
+}
+
+static inline bool NoSpillGap(const ArgumentLocation& loc1, const ArgumentLocation& loc2) {
+ DCHECK(!loc1.IsRegister());
+ DCHECK(!loc2.IsRegister());
+ uint32_t loc1_offset = loc1.GetFrameOffset().Uint32Value();
+ uint32_t loc2_offset = loc2.GetFrameOffset().Uint32Value();
+ DCHECK_LT(loc1_offset, loc2_offset);
+ return loc1_offset + loc1.GetSize() == loc2_offset;
+}
+
+static inline uint32_t GetSRegisterNumber(ArmManagedRegister reg) {
+ if (reg.IsSRegister()) {
+ return static_cast<uint32_t>(reg.AsSRegister());
+ } else {
+ DCHECK(reg.IsDRegister());
+ return 2u * static_cast<uint32_t>(reg.AsDRegister());
+ }
+}
+
+// Get the number of locations to spill together.
+static inline size_t GetSpillChunkSize(ArrayRef<ArgumentLocation> dests,
+ ArrayRef<ArgumentLocation> srcs,
+ size_t start,
+ bool have_extra_temp) {
+ DCHECK_LT(start, dests.size());
+ DCHECK_ALIGNED(dests[start].GetFrameOffset().Uint32Value(), 4u);
+ const ArgumentLocation& first_src = srcs[start];
+ if (!first_src.IsRegister()) {
+ DCHECK_ALIGNED(first_src.GetFrameOffset().Uint32Value(), 4u);
+ // If we have an extra temporary, look for opportunities to move 2 words
+ // at a time with LDRD/STRD when the source types are word-sized.
+ if (have_extra_temp &&
+ start + 1u != dests.size() &&
+ !srcs[start + 1u].IsRegister() &&
+ first_src.GetSize() == 4u &&
+ srcs[start + 1u].GetSize() == 4u &&
+ NoSpillGap(first_src, srcs[start + 1u]) &&
+ NoSpillGap(dests[start], dests[start + 1u]) &&
+ dests[start].GetFrameOffset().Uint32Value() < kStrdOffsetCutoff) {
+ // Note: The source and destination may not be 8B aligned (but they are 4B aligned).
+ return 2u;
+ }
+ return 1u;
+ }
+ ArmManagedRegister first_src_reg = first_src.GetRegister().AsArm();
+ size_t end = start + 1u;
+ if (IsCoreRegisterOrPair(first_src_reg)) {
+ while (end != dests.size() &&
+ NoSpillGap(dests[end - 1u], dests[end]) &&
+ srcs[end].IsRegister() &&
+ IsCoreRegisterOrPair(srcs[end].GetRegister().AsArm())) {
+ ++end;
+ }
+ } else {
+ DCHECK(first_src_reg.IsSRegister() || first_src_reg.IsDRegister());
+ uint32_t next_sreg = GetSRegisterNumber(first_src_reg) + first_src.GetSize() / kSRegSizeInBytes;
+ while (end != dests.size() &&
+ NoSpillGap(dests[end - 1u], dests[end]) &&
+ srcs[end].IsRegister() &&
+ !IsCoreRegisterOrPair(srcs[end].GetRegister().AsArm()) &&
+ GetSRegisterNumber(srcs[end].GetRegister().AsArm()) == next_sreg) {
+ next_sreg += srcs[end].GetSize() / kSRegSizeInBytes;
+ ++end;
+ }
+ }
+ return end - start;
+}
+
+static inline uint32_t GetCoreRegisterMask(ArmManagedRegister reg) {
+ if (reg.IsCoreRegister()) {
+ return 1u << static_cast<size_t>(reg.AsCoreRegister());
+ } else {
+ DCHECK(reg.IsRegisterPair());
+ DCHECK_LT(reg.AsRegisterPairLow(), reg.AsRegisterPairHigh());
+ return (1u << static_cast<size_t>(reg.AsRegisterPairLow())) |
+ (1u << static_cast<size_t>(reg.AsRegisterPairHigh()));
+ }
+}
+
+static inline uint32_t GetCoreRegisterMask(ArrayRef<ArgumentLocation> srcs) {
+ uint32_t mask = 0u;
+ for (const ArgumentLocation& loc : srcs) {
+ DCHECK(loc.IsRegister());
+ mask |= GetCoreRegisterMask(loc.GetRegister().AsArm());
+ }
+ return mask;
+}
+
+static inline bool UseStrdForChunk(ArrayRef<ArgumentLocation> srcs, size_t start, size_t length) {
+ DCHECK_GE(length, 2u);
+ DCHECK(srcs[start].IsRegister());
+ DCHECK(srcs[start + 1u].IsRegister());
+ // The destination may not be 8B aligned (but it is 4B aligned).
+ // Allow arbitrary destination offset, macro assembler will use a temp if needed.
+ // Note: T32 allows unrelated registers in STRD. (A32 does not.)
+ return length == 2u &&
+ srcs[start].GetRegister().AsArm().IsCoreRegister() &&
+ srcs[start + 1u].GetRegister().AsArm().IsCoreRegister();
+}
+
+static inline bool UseVstrForChunk(ArrayRef<ArgumentLocation> srcs, size_t start, size_t length) {
+ DCHECK_GE(length, 2u);
+ DCHECK(srcs[start].IsRegister());
+ DCHECK(srcs[start + 1u].IsRegister());
+ // The destination may not be 8B aligned (but it is 4B aligned).
+ // Allow arbitrary destination offset, macro assembler will use a temp if needed.
+ return length == 2u &&
+ srcs[start].GetRegister().AsArm().IsSRegister() &&
+ srcs[start + 1u].GetRegister().AsArm().IsSRegister() &&
+ IsAligned<2u>(static_cast<size_t>(srcs[start].GetRegister().AsArm().AsSRegister()));
+}
+
+void ArmVIXLJNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
+ ArrayRef<ArgumentLocation> srcs) {
+ DCHECK_EQ(dests.size(), srcs.size());
+
+ // Native ABI is soft-float, so all destinations should be core registers or stack offsets.
+ // And register locations should be first, followed by stack locations with increasing offset.
+ auto is_register = [](const ArgumentLocation& loc) { return loc.IsRegister(); };
+ DCHECK(std::is_partitioned(dests.begin(), dests.end(), is_register));
+ size_t num_reg_dests =
+ std::distance(dests.begin(), std::partition_point(dests.begin(), dests.end(), is_register));
+ DCHECK(std::is_sorted(
+ dests.begin() + num_reg_dests,
+ dests.end(),
+ [](const ArgumentLocation& lhs, const ArgumentLocation& rhs) {
+ return lhs.GetFrameOffset().Uint32Value() < rhs.GetFrameOffset().Uint32Value();
+ }));
+
+ // Collect registers to move. No need to record FP regs as destinations are only core regs.
+ uint32_t src_regs = 0u;
+ uint32_t dest_regs = 0u;
+ for (size_t i = 0; i != num_reg_dests; ++i) {
+ const ArgumentLocation& src = srcs[i];
+ const ArgumentLocation& dest = dests[i];
+ DCHECK(dest.IsRegister() && IsCoreRegisterOrPair(dest.GetRegister().AsArm()));
+ if (src.IsRegister() && IsCoreRegisterOrPair(src.GetRegister().AsArm())) {
+ if (src.GetRegister().Equals(dest.GetRegister())) {
+ continue;
+ }
+ src_regs |= GetCoreRegisterMask(src.GetRegister().AsArm());
+ }
+ dest_regs |= GetCoreRegisterMask(dest.GetRegister().AsArm());
+ }
+
+ // Spill args first. Look for opportunities to spill multiple arguments at once.
+ {
+ UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+ vixl32::Register xtemp; // Extra temp register;
+ if ((dest_regs & ~src_regs) != 0u) {
+ xtemp = vixl32::Register(CTZ(dest_regs & ~src_regs));
+ DCHECK(!temps.IsAvailable(xtemp));
+ }
+ auto move_two_words = [&](FrameOffset dest_offset, FrameOffset src_offset) {
+ DCHECK(xtemp.IsValid());
+ DCHECK_LT(dest_offset.Uint32Value(), kStrdOffsetCutoff);
+ // VIXL macro assembler can use destination registers for loads from large offsets.
+ UseScratchRegisterScope temps2(asm_.GetVIXLAssembler());
+ vixl32::Register temp2 = temps2.Acquire();
+ ___ Ldrd(xtemp, temp2, MemOperand(sp, src_offset.Uint32Value()));
+ ___ Strd(xtemp, temp2, MemOperand(sp, dest_offset.Uint32Value()));
+ };
+ for (size_t i = num_reg_dests, arg_count = dests.size(); i != arg_count; ) {
+ const ArgumentLocation& src = srcs[i];
+ const ArgumentLocation& dest = dests[i];
+ DCHECK_EQ(src.GetSize(), dest.GetSize());
+ DCHECK(!dest.IsRegister());
+ uint32_t frame_offset = dest.GetFrameOffset().Uint32Value();
+ size_t chunk_size = GetSpillChunkSize(dests, srcs, i, xtemp.IsValid());
+ DCHECK_NE(chunk_size, 0u);
+ if (chunk_size == 1u) {
+ if (src.IsRegister()) {
+ Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+ } else if (dest.GetSize() == 8u && xtemp.IsValid() && frame_offset < kStrdOffsetCutoff) {
+ move_two_words(dest.GetFrameOffset(), src.GetFrameOffset());
+ } else {
+ Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
+ }
+ } else if (!src.IsRegister()) {
+ DCHECK_EQ(chunk_size, 2u);
+ DCHECK_EQ(dest.GetSize(), 4u);
+ DCHECK_EQ(dests[i + 1u].GetSize(), 4u);
+ move_two_words(dest.GetFrameOffset(), src.GetFrameOffset());
+ } else if (UseStrdForChunk(srcs, i, chunk_size)) {
+ ___ Strd(AsVIXLRegister(srcs[i].GetRegister().AsArm()),
+ AsVIXLRegister(srcs[i + 1u].GetRegister().AsArm()),
+ MemOperand(sp, frame_offset));
+ } else if (UseVstrForChunk(srcs, i, chunk_size)) {
+ size_t sreg = GetSRegisterNumber(src.GetRegister().AsArm());
+ DCHECK_ALIGNED(sreg, 2u);
+ ___ Vstr(vixl32::DRegister(sreg / 2u), MemOperand(sp, frame_offset));
+ } else {
+ UseScratchRegisterScope temps2(asm_.GetVIXLAssembler());
+ vixl32::Register base_reg;
+ if (frame_offset == 0u) {
+ base_reg = sp;
+ } else {
+ base_reg = temps2.Acquire();
+ ___ Add(base_reg, sp, frame_offset);
+ }
+
+ ArmManagedRegister src_reg = src.GetRegister().AsArm();
+ if (IsCoreRegisterOrPair(src_reg)) {
+ uint32_t core_reg_mask = GetCoreRegisterMask(srcs.SubArray(i, chunk_size));
+ ___ Stm(base_reg, NO_WRITE_BACK, RegisterList(core_reg_mask));
+ } else {
+ uint32_t start_sreg = GetSRegisterNumber(src_reg);
+ const ArgumentLocation& last_dest = dests[i + chunk_size - 1u];
+ uint32_t total_size =
+ last_dest.GetFrameOffset().Uint32Value() + last_dest.GetSize() - frame_offset;
+ if (IsAligned<2u>(start_sreg) &&
+ IsAligned<kDRegSizeInBytes>(frame_offset) &&
+ IsAligned<kDRegSizeInBytes>(total_size)) {
+ uint32_t dreg_count = total_size / kDRegSizeInBytes;
+ DRegisterList dreg_list(vixl32::DRegister(start_sreg / 2u), dreg_count);
+ ___ Vstm(F64, base_reg, NO_WRITE_BACK, dreg_list);
+ } else {
+ uint32_t sreg_count = total_size / kSRegSizeInBytes;
+ SRegisterList sreg_list(vixl32::SRegister(start_sreg), sreg_count);
+ ___ Vstm(F32, base_reg, NO_WRITE_BACK, sreg_list);
+ }
+ }
+ }
+ i += chunk_size;
+ }
+ }
+
+ // Fill destination registers from source core registers.
+ // There should be no cycles, so this algorithm should make progress.
+ while (src_regs != 0u) {
+ uint32_t old_src_regs = src_regs;
+ for (size_t i = 0; i != num_reg_dests; ++i) {
+ DCHECK(dests[i].IsRegister() && IsCoreRegisterOrPair(dests[i].GetRegister().AsArm()));
+ if (!srcs[i].IsRegister() || !IsCoreRegisterOrPair(srcs[i].GetRegister().AsArm())) {
+ continue;
+ }
+ uint32_t dest_reg_mask = GetCoreRegisterMask(dests[i].GetRegister().AsArm());
+ if ((dest_reg_mask & dest_regs) == 0u) {
+ continue; // Equals source, or already filled in one of previous iterations.
+ }
+ // There are no partial overlaps of 8-byte arguments, otherwise we would have to
+ // tweak this check; Move() can deal with partial overlap for historical reasons.
+ if ((dest_reg_mask & src_regs) != 0u) {
+ continue; // Cannot clobber this register yet.
+ }
+ Move(dests[i].GetRegister(), srcs[i].GetRegister(), dests[i].GetSize());
+ uint32_t src_reg_mask = GetCoreRegisterMask(srcs[i].GetRegister().AsArm());
+ DCHECK_EQ(src_regs & src_reg_mask, src_reg_mask);
+ src_regs &= ~src_reg_mask; // Allow clobbering the source register or pair.
+ dest_regs &= ~dest_reg_mask; // Destination register or pair was filled.
+ }
+ CHECK_NE(old_src_regs, src_regs);
+ DCHECK_EQ(0u, src_regs & ~old_src_regs);
+ }
+
+ // Now fill destination registers from FP registers or stack slots, looking for
+ // opportunities to use LDRD/VMOV to fill 2 registers with one instruction.
+ for (size_t i = 0, j; i != num_reg_dests; i = j) {
+ j = i + 1u;
+ DCHECK(dests[i].IsRegister() && IsCoreRegisterOrPair(dests[i].GetRegister().AsArm()));
+ if (srcs[i].IsRegister() && IsCoreRegisterOrPair(srcs[i].GetRegister().AsArm())) {
+ DCHECK_EQ(GetCoreRegisterMask(dests[i].GetRegister().AsArm()) & dest_regs, 0u);
+ continue; // Equals destination or moved above.
+ }
+ DCHECK_NE(GetCoreRegisterMask(dests[i].GetRegister().AsArm()) & dest_regs, 0u);
+ if (dests[i].GetSize() == 4u) {
+ // Find next register to load.
+ while (j != num_reg_dests &&
+ (srcs[j].IsRegister() && IsCoreRegisterOrPair(srcs[j].GetRegister().AsArm()))) {
+ DCHECK_EQ(GetCoreRegisterMask(dests[j].GetRegister().AsArm()) & dest_regs, 0u);
+ ++j; // Equals destination or moved above.
+ }
+ if (j != num_reg_dests && dests[j].GetSize() == 4u) {
+ if (!srcs[i].IsRegister() && !srcs[j].IsRegister() && NoSpillGap(srcs[i], srcs[j])) {
+ ___ Ldrd(AsVIXLRegister(dests[i].GetRegister().AsArm()),
+ AsVIXLRegister(dests[j].GetRegister().AsArm()),
+ MemOperand(sp, srcs[i].GetFrameOffset().Uint32Value()));
+ ++j;
+ continue;
+ }
+ if (srcs[i].IsRegister() && srcs[j].IsRegister()) {
+ uint32_t first_sreg = GetSRegisterNumber(srcs[i].GetRegister().AsArm());
+ if (IsAligned<2u>(first_sreg) &&
+ first_sreg + 1u == GetSRegisterNumber(srcs[j].GetRegister().AsArm())) {
+ ___ Vmov(AsVIXLRegister(dests[i].GetRegister().AsArm()),
+ AsVIXLRegister(dests[j].GetRegister().AsArm()),
+ vixl32::DRegister(first_sreg / 2u));
+ ++j;
+ continue;
+ }
+ }
+ }
+ }
+ if (srcs[i].IsRegister()) {
+ Move(dests[i].GetRegister(), srcs[i].GetRegister(), dests[i].GetSize());
+ } else {
+ Load(dests[i].GetRegister(), srcs[i].GetFrameOffset(), dests[i].GetSize());
+ }
+ }
+}
+
void ArmVIXLJNIMacroAssembler::Move(ManagedRegister mdst,
ManagedRegister msrc,
size_t size ATTRIBUTE_UNUSED) {
@@ -387,8 +695,12 @@
ArmManagedRegister src = msrc.AsArm();
if (!dst.Equals(src)) {
if (dst.IsCoreRegister()) {
- CHECK(src.IsCoreRegister()) << src;
- ___ Mov(AsVIXLRegister(dst), AsVIXLRegister(src));
+ if (src.IsCoreRegister()) {
+ ___ Mov(AsVIXLRegister(dst), AsVIXLRegister(src));
+ } else {
+ CHECK(src.IsSRegister()) << src;
+ ___ Vmov(AsVIXLRegister(dst), AsVIXLSRegister(src));
+ }
} else if (dst.IsDRegister()) {
if (src.IsDRegister()) {
___ Vmov(F64, AsVIXLDRegister(dst), AsVIXLDRegister(src));
@@ -407,14 +719,18 @@
}
} else {
CHECK(dst.IsRegisterPair()) << dst;
- CHECK(src.IsRegisterPair()) << src;
- // Ensure that the first move doesn't clobber the input of the second.
- if (src.AsRegisterPairHigh() != dst.AsRegisterPairLow()) {
- ___ Mov(AsVIXLRegisterPairLow(dst), AsVIXLRegisterPairLow(src));
- ___ Mov(AsVIXLRegisterPairHigh(dst), AsVIXLRegisterPairHigh(src));
+ if (src.IsRegisterPair()) {
+ // Ensure that the first move doesn't clobber the input of the second.
+ if (src.AsRegisterPairHigh() != dst.AsRegisterPairLow()) {
+ ___ Mov(AsVIXLRegisterPairLow(dst), AsVIXLRegisterPairLow(src));
+ ___ Mov(AsVIXLRegisterPairHigh(dst), AsVIXLRegisterPairHigh(src));
+ } else {
+ ___ Mov(AsVIXLRegisterPairHigh(dst), AsVIXLRegisterPairHigh(src));
+ ___ Mov(AsVIXLRegisterPairLow(dst), AsVIXLRegisterPairLow(src));
+ }
} else {
- ___ Mov(AsVIXLRegisterPairHigh(dst), AsVIXLRegisterPairHigh(src));
- ___ Mov(AsVIXLRegisterPairLow(dst), AsVIXLRegisterPairLow(src));
+ CHECK(src.IsDRegister()) << src;
+ ___ Vmov(AsVIXLRegisterPairLow(dst), AsVIXLRegisterPairHigh(dst), AsVIXLDRegister(src));
}
}
}
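
A toy model of the chunking decision that GetSpillChunkSize() above makes for
the core-register case, assuming word-sized arguments (not ART code; a
negative `reg` stands in for a stack source): a maximal run of register
sources whose destination slots are contiguous can be stored with a single
STRD/STM instead of separate word stores.

  #include <cassert>
  #include <cstddef>
  #include <vector>

  struct Loc {
    int reg;          // >= 0: core register number; < 0: stack source.
    unsigned offset;  // Frame offset (meaningful for stack locations).
    unsigned size;    // 4 or 8 bytes.
  };

  // Number of consecutive locations starting at `start` that can be spilled
  // together: register sources whose destinations have no gap between them.
  size_t GetCoreSpillChunkSize(const std::vector<Loc>& dests,
                               const std::vector<Loc>& srcs,
                               size_t start) {
    if (srcs[start].reg < 0) {
      return 1u;  // Stack-to-stack copy, handled separately.
    }
    size_t end = start + 1u;
    while (end != dests.size() &&
           srcs[end].reg >= 0 &&
           dests[end - 1u].offset + dests[end - 1u].size == dests[end].offset) {
      ++end;
    }
    return end - start;
  }

  int main() {
    // Hypothetical: r1, r2, r3 spill to sp+4, sp+8, sp+12 (one STM of r1-r3),
    // followed by a stack-to-stack copy.
    std::vector<Loc> dests = {{-1, 4, 4}, {-1, 8, 4}, {-1, 12, 4}, {-1, 16, 4}};
    std::vector<Loc> srcs = {{1, 0, 4}, {2, 0, 4}, {3, 0, 4}, {-1, 64, 4}};
    assert(GetCoreSpillChunkSize(dests, srcs, 0u) == 3u);
    assert(GetCoreSpillChunkSize(dests, srcs, 3u) == 1u);
    return 0;
  }
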
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
index 2bd571e..2f6813a 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
@@ -93,6 +93,8 @@
void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset32 offs) override;
// Copying routines.
+ void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
+
void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset32 thr_offs) override;
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index a31ed93..bb93a96 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -326,6 +326,88 @@
}
// Copying routines.
+void Arm64JNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
+ ArrayRef<ArgumentLocation> srcs) {
+ DCHECK_EQ(dests.size(), srcs.size());
+ auto get_mask = [](ManagedRegister reg) -> uint64_t {
+ Arm64ManagedRegister arm64_reg = reg.AsArm64();
+ if (arm64_reg.IsXRegister()) {
+ size_t core_reg_number = static_cast<size_t>(arm64_reg.AsXRegister());
+ DCHECK_LT(core_reg_number, 31u); // xSP, xZR not allowed.
+ return UINT64_C(1) << core_reg_number;
+ } else if (arm64_reg.IsWRegister()) {
+ size_t core_reg_number = static_cast<size_t>(arm64_reg.AsWRegister());
+ DCHECK_LT(core_reg_number, 31u); // wSP, wZR not allowed.
+ return UINT64_C(1) << core_reg_number;
+ } else if (arm64_reg.IsDRegister()) {
+ size_t fp_reg_number = static_cast<size_t>(arm64_reg.AsDRegister());
+ DCHECK_LT(fp_reg_number, 32u);
+ return (UINT64_C(1) << 32u) << fp_reg_number;
+ } else {
+ DCHECK(arm64_reg.IsSRegister());
+ size_t fp_reg_number = static_cast<size_t>(arm64_reg.AsSRegister());
+ DCHECK_LT(fp_reg_number, 32u);
+ return (UINT64_C(1) << 32u) << fp_reg_number;
+ }
+ };
+ // Collect registers to move while storing/copying args to stack slots.
+ // More than 8 core or FP reg args are very rare, so we do not optimize
+ // for that case by using LDP/STP.
+ // TODO: LDP/STP will be useful for normal and @FastNative where we need
+ // to spill even the leading arguments.
+ uint64_t src_regs = 0u;
+ uint64_t dest_regs = 0u;
+ for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+ const ArgumentLocation& src = srcs[i];
+ const ArgumentLocation& dest = dests[i];
+ DCHECK_EQ(src.GetSize(), dest.GetSize());
+ if (dest.IsRegister()) {
+ if (src.IsRegister() && src.GetRegister().Equals(dest.GetRegister())) {
+ // Nothing to do.
+ } else {
+ if (src.IsRegister()) {
+ src_regs |= get_mask(src.GetRegister());
+ }
+ dest_regs |= get_mask(dest.GetRegister());
+ }
+ } else {
+ if (src.IsRegister()) {
+ Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+ } else {
+ Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
+ }
+ }
+ }
+ // Fill destination registers.
+ // There should be no cycles, so this simple algorithm should make progress.
+ while (dest_regs != 0u) {
+ uint64_t old_dest_regs = dest_regs;
+ for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+ const ArgumentLocation& src = srcs[i];
+ const ArgumentLocation& dest = dests[i];
+ if (!dest.IsRegister()) {
+ continue; // Stored in first loop above.
+ }
+ uint64_t dest_reg_mask = get_mask(dest.GetRegister());
+ if ((dest_reg_mask & dest_regs) == 0u) {
+ continue; // Equals source, or already filled in one of previous iterations.
+ }
+ if ((dest_reg_mask & src_regs) != 0u) {
+ continue; // Cannot clobber this register yet.
+ }
+ if (src.IsRegister()) {
+ Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
+ src_regs &= ~get_mask(src.GetRegister()); // Allow clobbering source register.
+ } else {
+ Load(dest.GetRegister(), src.GetFrameOffset(), dest.GetSize());
+ }
+ dest_regs &= ~get_mask(dest.GetRegister()); // Destination register was filled.
+ }
+ CHECK_NE(old_dest_regs, dest_regs);
+ DCHECK_EQ(0u, dest_regs & ~old_dest_regs);
+ }
+}
+
void Arm64JNIMacroAssembler::Move(ManagedRegister m_dst, ManagedRegister m_src, size_t size) {
Arm64ManagedRegister dst = m_dst.AsArm64();
if (kIsDebugBuild) {
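
The arm64 resolver above (and the x86-64 one later in this change) tracks
pending moves with a single bit mask. A toy illustration of the get_mask()
encoding, with core registers in bits [0, 32) and FP registers in bits
[32, 64); the helper names are hypothetical:

  #include <cassert>
  #include <cstdint>

  constexpr uint64_t CoreRegMask(unsigned xreg) { return UINT64_C(1) << xreg; }
  constexpr uint64_t FpRegMask(unsigned vreg) { return (UINT64_C(1) << 32) << vreg; }

  int main() {
    uint64_t dest_regs = CoreRegMask(1) | FpRegMask(0);  // x1/w1 and d0/s0 pending.
    assert((dest_regs & CoreRegMask(1)) != 0u);
    assert((dest_regs & FpRegMask(1)) == 0u);            // d1 is not pending.
    dest_regs &= ~CoreRegMask(1);                        // x1 has been filled.
    assert(dest_regs == FpRegMask(0));
    return 0;
  }
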
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.h b/compiler/utils/arm64/jni_macro_assembler_arm64.h
index 64b5595..9f3eea2 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.h
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.h
@@ -85,6 +85,7 @@
void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset64 offs) override;
// Copying routines.
+ void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset64 thr_offs) override;
void CopyRawPtrToThread(ThreadOffset64 thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
diff --git a/compiler/utils/jni_macro_assembler.h b/compiler/utils/jni_macro_assembler.h
index 48b3f01..3490959 100644
--- a/compiler/utils/jni_macro_assembler.h
+++ b/compiler/utils/jni_macro_assembler.h
@@ -43,6 +43,40 @@
kNotZero
};
+class ArgumentLocation {
+ public:
+ ArgumentLocation(ManagedRegister reg, size_t size)
+ : reg_(reg), frame_offset_(0u), size_(size) {
+ DCHECK(reg.IsRegister());
+ }
+
+ ArgumentLocation(FrameOffset frame_offset, size_t size)
+ : reg_(ManagedRegister::NoRegister()), frame_offset_(frame_offset), size_(size) {}
+
+ bool IsRegister() const {
+ return reg_.IsRegister();
+ }
+
+ ManagedRegister GetRegister() const {
+ DCHECK(IsRegister());
+ return reg_;
+ }
+
+ FrameOffset GetFrameOffset() const {
+ DCHECK(!IsRegister());
+ return frame_offset_;
+ }
+
+ size_t GetSize() const {
+ return size_;
+ }
+
+ private:
+ ManagedRegister reg_;
+ FrameOffset frame_offset_;
+ size_t size_;
+};
+
template <PointerSize kPointerSize>
class JNIMacroAssembler : public DeletableArenaObject<kArenaAllocAssembler> {
public:
@@ -112,6 +146,8 @@
virtual void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset<kPointerSize> offs) = 0;
// Copying routines
+ virtual void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) = 0;
+
virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size) = 0;
virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset<kPointerSize> thr_offs) = 0;
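
Each backend's MoveArguments() ultimately reduces every (destination, source)
pair to the primitives declared in this interface. A schematic of that
per-pair dispatch (MoveOneArgument is a hypothetical helper, not part of this
change, and register-to-register moves additionally need the mask-based
ordering used by the per-architecture implementations):

  // Schematic: Assembler is any backend providing Move/Load/Store/Copy.
  template <typename Assembler>
  void MoveOneArgument(Assembler* assembler,
                       const ArgumentLocation& dest,
                       const ArgumentLocation& src) {
    DCHECK_EQ(src.GetSize(), dest.GetSize());
    if (dest.IsRegister()) {
      if (!src.IsRegister()) {
        assembler->Load(dest.GetRegister(), src.GetFrameOffset(), dest.GetSize());
      } else if (!src.GetRegister().Equals(dest.GetRegister())) {
        assembler->Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
      }  // Otherwise the argument is already in place.
    } else if (src.IsRegister()) {
      assembler->Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
    } else {
      assembler->Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
    }
  }
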
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 1adcc20..67ec93d 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -300,6 +300,30 @@
}
}
+void X86JNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
+ ArrayRef<ArgumentLocation> srcs) {
+ DCHECK_EQ(dests.size(), srcs.size());
+ bool found_hidden_arg = false;
+ for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+ const ArgumentLocation& src = srcs[i];
+ const ArgumentLocation& dest = dests[i];
+ DCHECK_EQ(src.GetSize(), dest.GetSize());
+ if (UNLIKELY(dest.IsRegister())) {
+ // Native ABI has only stack arguments but we may pass one "hidden arg" in register.
+ CHECK(!found_hidden_arg);
+ found_hidden_arg = true;
+ CHECK(src.IsRegister());
+ Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
+ } else {
+ if (src.IsRegister()) {
+ Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+ } else {
+ Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
+ }
+ }
+ }
+}
+
void X86JNIMacroAssembler::Move(ManagedRegister mdest, ManagedRegister msrc, size_t size) {
DCHECK(!mdest.Equals(X86ManagedRegister::FromCpuRegister(GetScratchRegister())));
X86ManagedRegister dest = mdest.AsX86();
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.h b/compiler/utils/x86/jni_macro_assembler_x86.h
index 1223471..0239ff7 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.h
+++ b/compiler/utils/x86/jni_macro_assembler_x86.h
@@ -82,6 +82,8 @@
void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset32 offs) override;
// Copying routines
+ void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
+
void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset32 thr_offs) override;
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index d57ea41..2649084 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -338,6 +338,76 @@
}
}
+void X86_64JNIMacroAssembler::MoveArguments(ArrayRef<ArgumentLocation> dests,
+ ArrayRef<ArgumentLocation> srcs) {
+ DCHECK_EQ(dests.size(), srcs.size());
+ auto get_mask = [](ManagedRegister reg) -> uint32_t {
+ X86_64ManagedRegister x86_64_reg = reg.AsX86_64();
+ if (x86_64_reg.IsCpuRegister()) {
+ size_t cpu_reg_number = static_cast<size_t>(x86_64_reg.AsCpuRegister().AsRegister());
+ DCHECK_LT(cpu_reg_number, 16u);
+ return 1u << cpu_reg_number;
+ } else {
+ DCHECK(x86_64_reg.IsXmmRegister());
+ size_t xmm_reg_number = static_cast<size_t>(x86_64_reg.AsXmmRegister().AsFloatRegister());
+ DCHECK_LT(xmm_reg_number, 16u);
+ return (1u << 16u) << xmm_reg_number;
+ }
+ };
+ // Collect registers to move while storing/copying args to stack slots.
+ uint32_t src_regs = 0u;
+ uint32_t dest_regs = 0u;
+ for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+ const ArgumentLocation& src = srcs[i];
+ const ArgumentLocation& dest = dests[i];
+ DCHECK_EQ(src.GetSize(), dest.GetSize());
+ if (dest.IsRegister()) {
+ if (src.IsRegister() && src.GetRegister().Equals(dest.GetRegister())) {
+ // Nothing to do.
+ } else {
+ if (src.IsRegister()) {
+ src_regs |= get_mask(src.GetRegister());
+ }
+ dest_regs |= get_mask(dest.GetRegister());
+ }
+ } else {
+ if (src.IsRegister()) {
+ Store(dest.GetFrameOffset(), src.GetRegister(), dest.GetSize());
+ } else {
+ Copy(dest.GetFrameOffset(), src.GetFrameOffset(), dest.GetSize());
+ }
+ }
+ }
+ // Fill destination registers.
+ // There should be no cycles, so this simple algorithm should make progress.
+ while (dest_regs != 0u) {
+ uint32_t old_dest_regs = dest_regs;
+ for (size_t i = 0, arg_count = srcs.size(); i != arg_count; ++i) {
+ const ArgumentLocation& src = srcs[i];
+ const ArgumentLocation& dest = dests[i];
+ if (!dest.IsRegister()) {
+ continue; // Stored in first loop above.
+ }
+ uint32_t dest_reg_mask = get_mask(dest.GetRegister());
+ if ((dest_reg_mask & dest_regs) == 0u) {
+ continue; // Equals source, or already filled in one of previous iterations.
+ }
+ if ((dest_reg_mask & src_regs) != 0u) {
+ continue; // Cannot clobber this register yet.
+ }
+ if (src.IsRegister()) {
+ Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
+ src_regs &= ~get_mask(src.GetRegister()); // Allow clobbering source register.
+ } else {
+ Load(dest.GetRegister(), src.GetFrameOffset(), dest.GetSize());
+ }
+ dest_regs &= ~get_mask(dest.GetRegister()); // Destination register was filled.
+ }
+ CHECK_NE(old_dest_regs, dest_regs);
+ DCHECK_EQ(0u, dest_regs & ~old_dest_regs);
+ }
+}
+
void X86_64JNIMacroAssembler::Move(ManagedRegister mdest, ManagedRegister msrc, size_t size) {
DCHECK(!mdest.Equals(X86_64ManagedRegister::FromCpuRegister(GetScratchRegister().AsRegister())));
X86_64ManagedRegister dest = mdest.AsX86_64();
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
index 4592eba..6589544 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
@@ -85,6 +85,8 @@
void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset64 offs) override;
// Copying routines
+ void MoveArguments(ArrayRef<ArgumentLocation> dests, ArrayRef<ArgumentLocation> srcs) override;
+
void Move(ManagedRegister dest, ManagedRegister src, size_t size) override;
void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset64 thr_offs) override;