ARM/ARM64: Use trampolines for slow-path entrypoint calls.
This reduces the size of the generated code. We do this only
for AOT compilation, where we get the most benefit.
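
At each slow-path call site this replaces the usual LDR-from-Thread
plus indirect call with a single BL to a thunk that is shared across
the whole oat file; the thunk loads the entrypoint from the Thread
object and branches to it. The BL is emitted as a placeholder and
resolved at link time through the new kCallEntrypoint patch type.
A minimal sketch of that link-time fixup (hypothetical helper and
parameter names; the relative patcher itself is not part of this
change):

  // Hypothetical sketch: resolving a kCallEntrypoint patch rewrites the
  // placeholder BL so it branches to the shared thunk for that entrypoint.
  // patch_offset/thunk_offset are the offsets of the BL instruction and of
  // the thunk within the same text section.
  #include <cstdint>

  uint32_t EncodeCallEntrypointBl(uint32_t patch_offset, uint32_t thunk_offset) {
    // A64 BL is 0x94000000 | imm26, a signed displacement in 4-byte words.
    int32_t displacement = static_cast<int32_t>(thunk_offset - patch_offset);
    uint32_t imm26 = (static_cast<uint32_t>(displacement) >> 2) & 0x03ffffffu;
    return 0x94000000u | imm26;  // BL <thunk>
  }
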
Sizes of aosp_taimen-userdebug prebuilts:
- before:
arm/boot*.oat: 19624804
arm64/boot*.oat: 23265752
oat/arm64/services.odex: 22417968
- after:
arm/boot*.oat: 19460500 (-160KiB)
arm64/boot*.oat: 22957928 (-301KiB)
oat/arm64/services.odex: 21957864 (-449KiB)
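
As a back-of-the-envelope model of where the arm64 saving comes from
(illustrative sketch with made-up counts, not numbers measured from
this build): each slow-path call site shrinks from an 8-byte LDR+BLR
pair to a 4-byte BL, at the cost of one 8-byte LDR+BR thunk per
distinct entrypoint used in the oat file.

  // Illustrative only, not ART code. The per-site and per-thunk sizes follow
  // from fixed 4-byte A64 instructions; the counts below are made up.
  #include <cstdint>
  #include <iostream>

  int main() {
    constexpr uint64_t kA64InsnSize = 4u;
    uint64_t call_sites = 100000u;        // Hypothetical slow-path call sites.
    uint64_t distinct_entrypoints = 64u;  // Hypothetical entrypoints used.
    uint64_t before = call_sites * 2u * kA64InsnSize;             // LDR + BLR per site.
    uint64_t after = call_sites * kA64InsnSize                    // BL per site.
                     + distinct_entrypoints * 2u * kA64InsnSize;  // LDR + BR per thunk.
    std::cout << "before: " << before << " bytes, after: " << after
              << " bytes, saved: " << (before - after) << " bytes\n";
    return 0;
  }
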
Test: m test-art-host-gtest
Test: aosp_taimen-userdebug boots.
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 12607709
Change-Id: Ie9dbd1ba256173e4e439e8bbb8832a791965cbe6
diff --git a/compiler/driver/compiled_method_storage.cc b/compiler/driver/compiled_method_storage.cc
index 31062fb..03c906b 100644
--- a/compiler/driver/compiled_method_storage.cc
+++ b/compiler/driver/compiled_method_storage.cc
@@ -216,6 +216,9 @@
uint32_t custom_value1 = 0u;
uint32_t custom_value2 = 0u;
switch (linker_patch.GetType()) {
+ case linker::LinkerPatch::Type::kCallEntrypoint:
+ custom_value1 = linker_patch.EntrypointOffset();
+ break;
case linker::LinkerPatch::Type::kBakerReadBarrierBranch:
custom_value1 = linker_patch.GetBakerCustomValue1();
custom_value2 = linker_patch.GetBakerCustomValue2();
diff --git a/compiler/linker/linker_patch.h b/compiler/linker/linker_patch.h
index f9e3930..1c523de 100644
--- a/compiler/linker/linker_patch.h
+++ b/compiler/linker/linker_patch.h
@@ -52,6 +52,7 @@
kTypeBssEntry,
kStringRelative,
kStringBssEntry,
+ kCallEntrypoint,
kBakerReadBarrierBranch,
};
@@ -141,6 +142,15 @@
return patch;
}
+ static LinkerPatch CallEntrypointPatch(size_t literal_offset,
+ uint32_t entrypoint_offset) {
+ LinkerPatch patch(literal_offset,
+ Type::kCallEntrypoint,
+ /* target_dex_file= */ nullptr);
+ patch.entrypoint_offset_ = entrypoint_offset;
+ return patch;
+ }
+
static LinkerPatch BakerReadBarrierBranchPatch(size_t literal_offset,
uint32_t custom_value1 = 0u,
uint32_t custom_value2 = 0u) {
@@ -216,6 +226,11 @@
return pc_insn_offset_;
}
+ uint32_t EntrypointOffset() const {
+ DCHECK(patch_type_ == Type::kCallEntrypoint);
+ return entrypoint_offset_;
+ }
+
uint32_t GetBakerCustomValue1() const {
DCHECK(patch_type_ == Type::kBakerReadBarrierBranch);
return baker_custom_value1_;
@@ -249,6 +264,7 @@
uint32_t type_idx_; // Type index for Type patches.
uint32_t string_idx_; // String index for String patches.
uint32_t intrinsic_data_; // Data for IntrinsicObjects.
+ uint32_t entrypoint_offset_; // Entrypoint offset in the Thread object.
uint32_t baker_custom_value1_;
static_assert(sizeof(method_idx_) == sizeof(cmp1_), "needed by relational operators");
static_assert(sizeof(type_idx_) == sizeof(cmp1_), "needed by relational operators");
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 177d982..651a3f7 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -887,10 +887,6 @@
move_resolver_(graph->GetAllocator(), this),
assembler_(graph->GetAllocator(),
compiler_options.GetInstructionSetFeatures()->AsArm64InstructionSetFeatures()),
- uint32_literals_(std::less<uint32_t>(),
- graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
- uint64_literals_(std::less<uint64_t>(),
- graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
boot_image_method_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
method_bss_entry_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
boot_image_type_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
@@ -898,7 +894,12 @@
boot_image_string_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
string_bss_entry_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
boot_image_intrinsic_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
+ call_entrypoint_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
baker_read_barrier_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
+ uint32_literals_(std::less<uint32_t>(),
+ graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
+ uint64_literals_(std::less<uint64_t>(),
+ graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
jit_string_patches_(StringReferenceValueComparator(),
graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
jit_class_patches_(TypeReferenceValueComparator(),
@@ -1687,14 +1688,25 @@
SlowPathCode* slow_path) {
ValidateInvokeRuntime(entrypoint, instruction, slow_path);
- __ Ldr(lr, MemOperand(tr, GetThreadOffset<kArm64PointerSize>(entrypoint).Int32Value()));
- {
+ ThreadOffset64 entrypoint_offset = GetThreadOffset<kArm64PointerSize>(entrypoint);
+ // Reduce code size for AOT by using shared trampolines for slow path runtime calls across the
+ // entire oat file. This adds an extra branch and we do not want to slow down the main path.
+ // For JIT, thunk sharing is per-method, so the gains would be smaller or even negative.
+ if (slow_path == nullptr || Runtime::Current()->UseJitCompilation()) {
+ __ Ldr(lr, MemOperand(tr, entrypoint_offset.Int32Value()));
// Ensure the pc position is recorded immediately after the `blr` instruction.
ExactAssemblyScope eas(GetVIXLAssembler(), kInstructionSize, CodeBufferCheckScope::kExactSize);
__ blr(lr);
if (EntrypointRequiresStackMap(entrypoint)) {
RecordPcInfo(instruction, dex_pc, slow_path);
}
+ } else {
+ // Ensure the pc position is recorded immediately after the `bl` instruction.
+ ExactAssemblyScope eas(GetVIXLAssembler(), kInstructionSize, CodeBufferCheckScope::kExactSize);
+ EmitEntrypointThunkCall(entrypoint_offset);
+ if (EntrypointRequiresStackMap(entrypoint)) {
+ RecordPcInfo(instruction, dex_pc, slow_path);
+ }
}
}
@@ -4250,6 +4262,15 @@
return NewPcRelativePatch(&dex_file, string_index.index_, adrp_label, &string_bss_entry_patches_);
}
+void CodeGeneratorARM64::EmitEntrypointThunkCall(ThreadOffset64 entrypoint_offset) {
+ DCHECK(!__ AllowMacroInstructions()); // In ExactAssemblyScope.
+ DCHECK(!Runtime::Current()->UseJitCompilation());
+ call_entrypoint_patches_.emplace_back(/*dex_file*/ nullptr, entrypoint_offset.Uint32Value());
+ vixl::aarch64::Label* bl_label = &call_entrypoint_patches_.back().label;
+ __ bind(bl_label);
+ __ bl(static_cast<int64_t>(0)); // Placeholder, patched at link time.
+}
+
void CodeGeneratorARM64::EmitBakerReadBarrierCbnz(uint32_t custom_data) {
DCHECK(!__ AllowMacroInstructions()); // In ExactAssemblyScope.
if (Runtime::Current()->UseJitCompilation()) {
@@ -4406,6 +4427,7 @@
boot_image_string_patches_.size() +
string_bss_entry_patches_.size() +
boot_image_intrinsic_patches_.size() +
+ call_entrypoint_patches_.size() +
baker_read_barrier_patches_.size();
linker_patches->reserve(size);
if (GetCompilerOptions().IsBootImage()) {
@@ -4430,6 +4452,11 @@
type_bss_entry_patches_, linker_patches);
EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringBssEntryPatch>(
string_bss_entry_patches_, linker_patches);
+ for (const PatchInfo<vixl::aarch64::Label>& info : call_entrypoint_patches_) {
+ DCHECK(info.target_dex_file == nullptr);
+ linker_patches->push_back(linker::LinkerPatch::CallEntrypointPatch(
+ info.label.GetLocation(), info.offset_or_index));
+ }
for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) {
linker_patches->push_back(linker::LinkerPatch::BakerReadBarrierBranchPatch(
info.label.GetLocation(), info.custom_data));
@@ -4438,7 +4465,8 @@
}
bool CodeGeneratorARM64::NeedsThunkCode(const linker::LinkerPatch& patch) const {
- return patch.GetType() == linker::LinkerPatch::Type::kBakerReadBarrierBranch ||
+ return patch.GetType() == linker::LinkerPatch::Type::kCallEntrypoint ||
+ patch.GetType() == linker::LinkerPatch::Type::kBakerReadBarrierBranch ||
patch.GetType() == linker::LinkerPatch::Type::kCallRelative;
}
@@ -4458,6 +4486,14 @@
}
break;
}
+ case linker::LinkerPatch::Type::kCallEntrypoint: {
+ Offset offset(patch.EntrypointOffset());
+ assembler.JumpTo(ManagedRegister(arm64::TR), offset, ManagedRegister(arm64::IP0));
+ if (GetCompilerOptions().GenerateAnyDebugInfo()) {
+ *debug_name = "EntrypointCallThunk_" + std::to_string(offset.Uint32Value());
+ }
+ break;
+ }
case linker::LinkerPatch::Type::kBakerReadBarrierBranch: {
DCHECK_EQ(patch.GetBakerCustomValue2(), 0u);
CompileBakerReadBarrierThunk(assembler, patch.GetBakerCustomValue1(), debug_name);
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index ada5742..2680bd0 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -629,6 +629,9 @@
dex::StringIndex string_index,
vixl::aarch64::Label* adrp_label = nullptr);
+ // Emit the BL instruction for entrypoint thunk call and record the associated patch for AOT.
+ void EmitEntrypointThunkCall(ThreadOffset64 entrypoint_offset);
+
// Emit the CBNZ instruction for baker read barrier and record
// the associated patch for AOT or slow path for JIT.
void EmitBakerReadBarrierCbnz(uint32_t custom_data);
@@ -887,10 +890,6 @@
ParallelMoveResolverARM64 move_resolver_;
Arm64Assembler assembler_;
- // Deduplication map for 32-bit literals, used for non-patchable boot image addresses.
- Uint32ToLiteralMap uint32_literals_;
- // Deduplication map for 64-bit literals, used for non-patchable method address or method code.
- Uint64ToLiteralMap uint64_literals_;
// PC-relative method patch info for kBootImageLinkTimePcRelative/BootImageRelRo.
// Also used for type/string patches for kBootImageRelRo (same linker patch as for methods).
ArenaDeque<PcRelativePatchInfo> boot_image_method_patches_;
@@ -906,9 +905,15 @@
ArenaDeque<PcRelativePatchInfo> string_bss_entry_patches_;
// PC-relative patch info for IntrinsicObjects.
ArenaDeque<PcRelativePatchInfo> boot_image_intrinsic_patches_;
+ // Patch info for calls to entrypoint dispatch thunks. Used for slow paths.
+ ArenaDeque<PatchInfo<vixl::aarch64::Label>> call_entrypoint_patches_;
// Baker read barrier patch info.
ArenaDeque<BakerReadBarrierPatchInfo> baker_read_barrier_patches_;
+ // Deduplication map for 32-bit literals, used for JIT for boot image addresses.
+ Uint32ToLiteralMap uint32_literals_;
+ // Deduplication map for 64-bit literals, used for JIT for method address or method code.
+ Uint64ToLiteralMap uint64_literals_;
// Patches for string literals in JIT compiled code.
StringToLiteralMap jit_string_patches_;
// Patches for class literals in JIT compiled code.
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 19d04c9..ac09183 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -1856,8 +1856,6 @@
instruction_visitor_(graph, this),
move_resolver_(graph->GetAllocator(), this),
assembler_(graph->GetAllocator()),
- uint32_literals_(std::less<uint32_t>(),
- graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
boot_image_method_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
method_bss_entry_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
boot_image_type_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
@@ -1865,7 +1863,10 @@
boot_image_string_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
string_bss_entry_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
boot_image_intrinsic_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
+ call_entrypoint_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
baker_read_barrier_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
+ uint32_literals_(std::less<uint32_t>(),
+ graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
jit_string_patches_(StringReferenceValueComparator(),
graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
jit_class_patches_(TypeReferenceValueComparator(),
@@ -2383,15 +2384,31 @@
uint32_t dex_pc,
SlowPathCode* slow_path) {
ValidateInvokeRuntime(entrypoint, instruction, slow_path);
- __ Ldr(lr, MemOperand(tr, GetThreadOffset<kArmPointerSize>(entrypoint).Int32Value()));
- // Ensure the pc position is recorded immediately after the `blx` instruction.
- // blx in T32 has only 16bit encoding that's why a stricter check for the scope is used.
- ExactAssemblyScope aas(GetVIXLAssembler(),
- vixl32::k16BitT32InstructionSizeInBytes,
- CodeBufferCheckScope::kExactSize);
- __ blx(lr);
- if (EntrypointRequiresStackMap(entrypoint)) {
- RecordPcInfo(instruction, dex_pc, slow_path);
+
+ ThreadOffset32 entrypoint_offset = GetThreadOffset<kArmPointerSize>(entrypoint);
+ // Reduce code size for AOT by using shared trampolines for slow path runtime calls across the
+ // entire oat file. This adds an extra branch and we do not want to slow down the main path.
+ // For JIT, thunk sharing is per-method, so the gains would be smaller or even negative.
+ if (slow_path == nullptr || Runtime::Current()->UseJitCompilation()) {
+ __ Ldr(lr, MemOperand(tr, entrypoint_offset.Int32Value()));
+ // Ensure the pc position is recorded immediately after the `blx` instruction.
+ // blx in T32 has only a 16-bit encoding, which is why a stricter check for the scope is used.
+ ExactAssemblyScope aas(GetVIXLAssembler(),
+ vixl32::k16BitT32InstructionSizeInBytes,
+ CodeBufferCheckScope::kExactSize);
+ __ blx(lr);
+ if (EntrypointRequiresStackMap(entrypoint)) {
+ RecordPcInfo(instruction, dex_pc, slow_path);
+ }
+ } else {
+ // Ensure the pc position is recorded immediately after the `bl` instruction.
+ ExactAssemblyScope aas(GetVIXLAssembler(),
+ vixl32::k32BitT32InstructionSizeInBytes,
+ CodeBufferCheckScope::kExactSize);
+ EmitEntrypointThunkCall(entrypoint_offset);
+ if (EntrypointRequiresStackMap(entrypoint)) {
+ RecordPcInfo(instruction, dex_pc, slow_path);
+ }
}
}
@@ -8858,6 +8875,17 @@
return &patches->back();
}
+void CodeGeneratorARMVIXL::EmitEntrypointThunkCall(ThreadOffset32 entrypoint_offset) {
+ DCHECK(!__ AllowMacroInstructions()); // In ExactAssemblyScope.
+ DCHECK(!Runtime::Current()->UseJitCompilation());
+ call_entrypoint_patches_.emplace_back(/*dex_file*/ nullptr, entrypoint_offset.Uint32Value());
+ vixl::aarch32::Label* bl_label = &call_entrypoint_patches_.back().label;
+ __ bind(bl_label);
+ vixl32::Label placeholder_label;
+ __ bl(&placeholder_label); // Placeholder, patched at link time.
+ __ bind(&placeholder_label);
+}
+
void CodeGeneratorARMVIXL::EmitBakerReadBarrierBne(uint32_t custom_data) {
DCHECK(!__ AllowMacroInstructions()); // In ExactAssemblyScope.
if (Runtime::Current()->UseJitCompilation()) {
@@ -8980,6 +9008,7 @@
/* MOVW+MOVT for each entry */ 2u * boot_image_string_patches_.size() +
/* MOVW+MOVT for each entry */ 2u * string_bss_entry_patches_.size() +
/* MOVW+MOVT for each entry */ 2u * boot_image_intrinsic_patches_.size() +
+ call_entrypoint_patches_.size() +
baker_read_barrier_patches_.size();
linker_patches->reserve(size);
if (GetCompilerOptions().IsBootImage()) {
@@ -9004,6 +9033,11 @@
type_bss_entry_patches_, linker_patches);
EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringBssEntryPatch>(
string_bss_entry_patches_, linker_patches);
+ for (const PatchInfo<vixl32::Label>& info : call_entrypoint_patches_) {
+ DCHECK(info.target_dex_file == nullptr);
+ linker_patches->push_back(linker::LinkerPatch::CallEntrypointPatch(
+ info.label.GetLocation(), info.offset_or_index));
+ }
for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) {
linker_patches->push_back(linker::LinkerPatch::BakerReadBarrierBranchPatch(
info.label.GetLocation(), info.custom_data));
@@ -9012,7 +9046,8 @@
}
bool CodeGeneratorARMVIXL::NeedsThunkCode(const linker::LinkerPatch& patch) const {
- return patch.GetType() == linker::LinkerPatch::Type::kBakerReadBarrierBranch ||
+ return patch.GetType() == linker::LinkerPatch::Type::kCallEntrypoint ||
+ patch.GetType() == linker::LinkerPatch::Type::kBakerReadBarrierBranch ||
patch.GetType() == linker::LinkerPatch::Type::kCallRelative;
}
@@ -9021,23 +9056,30 @@
/*out*/ std::string* debug_name) {
arm::ArmVIXLAssembler assembler(GetGraph()->GetAllocator());
switch (patch.GetType()) {
- case linker::LinkerPatch::Type::kCallRelative:
+ case linker::LinkerPatch::Type::kCallRelative: {
// The thunk just uses the entry point in the ArtMethod. This works even for calls
// to the generic JNI and interpreter trampolines.
- assembler.LoadFromOffset(
- arm::kLoadWord,
- vixl32::pc,
- vixl32::r0,
- ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize).Int32Value());
+ MemberOffset offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize);
+ assembler.LoadFromOffset(arm::kLoadWord, vixl32::pc, vixl32::r0, offset.Int32Value());
assembler.GetVIXLAssembler()->Bkpt(0);
if (GetCompilerOptions().GenerateAnyDebugInfo()) {
*debug_name = "MethodCallThunk";
}
break;
- case linker::LinkerPatch::Type::kBakerReadBarrierBranch:
+ }
+ case linker::LinkerPatch::Type::kCallEntrypoint: {
+ assembler.LoadFromOffset(arm::kLoadWord, vixl32::pc, tr, patch.EntrypointOffset());
+ assembler.GetVIXLAssembler()->Bkpt(0);
+ if (GetCompilerOptions().GenerateAnyDebugInfo()) {
+ *debug_name = "EntrypointCallThunk_" + std::to_string(patch.EntrypointOffset());
+ }
+ break;
+ }
+ case linker::LinkerPatch::Type::kBakerReadBarrierBranch: {
DCHECK_EQ(patch.GetBakerCustomValue2(), 0u);
CompileBakerReadBarrierThunk(assembler, patch.GetBakerCustomValue1(), debug_name);
break;
+ }
default:
LOG(FATAL) << "Unexpected patch type " << patch.GetType();
UNREACHABLE();
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index 5edca87..4742f78 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -589,6 +589,9 @@
PcRelativePatchInfo* NewStringBssEntryPatch(const DexFile& dex_file,
dex::StringIndex string_index);
+ // Emit the BL instruction for entrypoint thunk call and record the associated patch for AOT.
+ void EmitEntrypointThunkCall(ThreadOffset32 entrypoint_offset);
+
// Emit the BNE instruction for baker read barrier and record
// the associated patch for AOT or slow path for JIT.
void EmitBakerReadBarrierBne(uint32_t custom_data);
@@ -869,8 +872,6 @@
ArmVIXLAssembler assembler_;
- // Deduplication map for 32-bit literals, used for non-patchable boot image addresses.
- Uint32ToLiteralMap uint32_literals_;
// PC-relative method patch info for kBootImageLinkTimePcRelative/kBootImageRelRo.
// Also used for type/string patches for kBootImageRelRo (same linker patch as for methods).
ArenaDeque<PcRelativePatchInfo> boot_image_method_patches_;
@@ -886,9 +887,13 @@
ArenaDeque<PcRelativePatchInfo> string_bss_entry_patches_;
// PC-relative patch info for IntrinsicObjects.
ArenaDeque<PcRelativePatchInfo> boot_image_intrinsic_patches_;
+ // Patch info for calls to entrypoint dispatch thunks. Used for slow paths.
+ ArenaDeque<PatchInfo<vixl::aarch32::Label>> call_entrypoint_patches_;
// Baker read barrier patch info.
ArenaDeque<BakerReadBarrierPatchInfo> baker_read_barrier_patches_;
+ // Deduplication map for 32-bit literals, used for JIT for boot image addresses.
+ Uint32ToLiteralMap uint32_literals_;
// Patches for string literals in JIT compiled code.
StringToLiteralMap jit_string_patches_;
// Patches for class literals in JIT compiled code.