Diffstat (limited to 'compiler/optimizing')
-rw-r--r--   compiler/optimizing/code_generator_arm.cc       | 380
-rw-r--r--   compiler/optimizing/code_generator_arm.h        |  18
-rw-r--r--   compiler/optimizing/code_generator_arm_vixl.cc  | 428
-rw-r--r--   compiler/optimizing/code_generator_arm_vixl.h   |  18
-rw-r--r--   compiler/optimizing/intrinsics_arm.cc           |   2
-rw-r--r--   compiler/optimizing/intrinsics_arm_vixl.cc      |   2
6 files changed, 748 insertions, 100 deletions
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index ebd578c5cd..3c6e277ff9 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -16,6 +16,7 @@
 #include "code_generator_arm.h"
 
+#include "arch/arm/asm_support_arm.h"
 #include "arch/arm/instruction_set_features_arm.h"
 #include "art_method.h"
 #include "code_generator_utils.h"
@@ -25,6 +26,7 @@
 #include "gc/accounting/card_table.h"
 #include "intrinsics.h"
 #include "intrinsics_arm.h"
+#include "linker/arm/relative_patcher_thumb2.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
 #include "thread.h"
@@ -60,10 +62,41 @@ static constexpr DRegister DTMP = D31;
 
 static constexpr uint32_t kPackedSwitchCompareJumpThreshold = 7;
 
+// Reference load (except object array loads) is using LDR Rt, [Rn, #offset] which can handle
+// offset < 4KiB. For offsets >= 4KiB, the load shall be emitted as two or more instructions.
+// For the Baker read barrier implementation using link-time generated thunks we need to split
+// the offset explicitly.
+constexpr uint32_t kReferenceLoadMinFarOffset = 4 * KB;
+
+// Flags controlling the use of link-time generated thunks for Baker read barriers.
+constexpr bool kBakerReadBarrierLinkTimeThunksEnableForFields = true;
+constexpr bool kBakerReadBarrierLinkTimeThunksEnableForArrays = true;
+constexpr bool kBakerReadBarrierLinkTimeThunksEnableForGcRoots = true;
+
+// The reserved entrypoint register for link-time generated thunks.
+const Register kBakerCcEntrypointRegister = R4;
+
 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
 #define __ down_cast<ArmAssembler*>(codegen->GetAssembler())->  // NOLINT
 #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, x).Int32Value()
 
+static inline void CheckLastTempIsBakerCcEntrypointRegister(HInstruction* instruction) {
+  DCHECK_EQ(static_cast<uint32_t>(kBakerCcEntrypointRegister),
+            linker::Thumb2RelativePatcher::kBakerCcEntrypointRegister);
+  DCHECK_NE(instruction->GetLocations()->GetTempCount(), 0u);
+  DCHECK_EQ(kBakerCcEntrypointRegister,
+            instruction->GetLocations()->GetTemp(
+                instruction->GetLocations()->GetTempCount() - 1u).AsRegister<Register>());
+}
+
+static inline void EmitPlaceholderBne(CodeGeneratorARM* codegen, Label* bne_label) {
+  DCHECK(down_cast<Thumb2Assembler*>(codegen->GetAssembler())->IsForced32Bit());
+  __ BindTrackedLabel(bne_label);
+  Label placeholder_label;
+  __ b(&placeholder_label, NE);  // Placeholder, patched at link-time.
+  __ Bind(&placeholder_label);
+}
+
 static constexpr int kRegListThreshold = 4;
 
 // SaveLiveRegisters and RestoreLiveRegisters from SlowPathCodeARM operate on sets of S registers,
@@ -1962,6 +1995,7 @@ CodeGeneratorARM::CodeGeneratorARM(HGraph* graph,
                          graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      baker_read_barrier_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(StringReferenceValueComparator(),
                           graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_class_patches_(TypeReferenceValueComparator(),
@@ -5281,7 +5315,18 @@ void LocationsBuilderARM::HandleFieldGet(HInstruction* instruction, const FieldI
   } else if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
     // We need a temporary register for the read barrier marking slow
     // path in CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier.
-    locations->AddTemp(Location::RequiresRegister());
+    if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
+        !Runtime::Current()->UseJitCompilation()) {
+      // If link-time thunks for the Baker read barrier are enabled, for AOT
+      // loads we need a temporary only if the offset is too big.
+      if (field_info.GetFieldOffset().Uint32Value() >= kReferenceLoadMinFarOffset) {
+        locations->AddTemp(Location::RequiresRegister());
+      }
+      // And we always need the reserved entrypoint register.
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister));
+    } else {
+      locations->AddTemp(Location::RequiresRegister());
+    }
   }
 }
 
@@ -5747,11 +5792,35 @@ void LocationsBuilderARM::VisitArrayGet(HArrayGet* instruction) {
         Location::RequiresRegister(),
         object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
-  // We need a temporary register for the read barrier marking slow
-  // path in CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier.
-  // Also need for String compression feature.
-  if ((object_array_get_with_read_barrier && kUseBakerReadBarrier)
-      || (mirror::kUseStringCompression && instruction->IsStringCharAt())) {
+  if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
+    // We need a temporary register for the read barrier marking slow
+    // path in CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier.
+    if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
+        !Runtime::Current()->UseJitCompilation() &&
+        instruction->GetIndex()->IsConstant()) {
+      // Array loads with constant index are treated as field loads.
+      // If link-time thunks for the Baker read barrier are enabled, for AOT
+      // constant index loads we need a temporary only if the offset is too big.
+      uint32_t offset = CodeGenerator::GetArrayDataOffset(instruction);
+      uint32_t index = instruction->GetIndex()->AsIntConstant()->GetValue();
+      offset += index << Primitive::ComponentSizeShift(Primitive::kPrimNot);
+      if (offset >= kReferenceLoadMinFarOffset) {
+        locations->AddTemp(Location::RequiresRegister());
+      }
+      // And we always need the reserved entrypoint register.
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister));
+    } else if (kBakerReadBarrierLinkTimeThunksEnableForArrays &&
+               !Runtime::Current()->UseJitCompilation() &&
+               !instruction->GetIndex()->IsConstant()) {
+      // We need a non-scratch temporary for the array data pointer.
+      locations->AddTemp(Location::RequiresRegister());
+      // And we always need the reserved entrypoint register.
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister));
+    } else {
+      locations->AddTemp(Location::RequiresRegister());
+    }
+  } else if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+    // Also need a temporary for String compression feature.
     locations->AddTemp(Location::RequiresRegister());
   }
 }
@@ -5863,8 +5932,20 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) {
         Location temp = locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier call.
-        codegen_->GenerateArrayLoadWithBakerReadBarrier(
-            instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ true);
+        DCHECK(!instruction->CanDoImplicitNullCheckOn(instruction->InputAt(0)));
+        if (index.IsConstant()) {
+          // Array load with a constant index can be treated as a field load.
+          data_offset += helpers::Int32ConstantFrom(index) << Primitive::ComponentSizeShift(type);
+          codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction,
+                                                          out_loc,
+                                                          obj,
+                                                          data_offset,
+                                                          locations->GetTemp(0),
+                                                          /* needs_null_check */ false);
+        } else {
+          codegen_->GenerateArrayLoadWithBakerReadBarrier(
+              instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ false);
+        }
       } else {
         Register out = out_loc.AsRegister<Register>();
         if (index.IsConstant()) {
@@ -6701,6 +6782,13 @@ void LocationsBuilderARM::VisitLoadClass(HLoadClass* cls) {
       // For non-Baker read barrier we have a temp-clobbering call.
     }
   }
+  if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) {
+    if (load_kind == HLoadClass::LoadKind::kBssEntry ||
+        (load_kind == HLoadClass::LoadKind::kReferrersClass &&
+            !Runtime::Current()->UseJitCompilation())) {
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister));
+    }
+  }
 }
 
 // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not
@@ -6880,6 +6968,9 @@ void LocationsBuilderARM::VisitLoadString(HLoadString* load) {
       // TODO: Add GetReturnLocation() to the calling convention so that we can DCHECK()
       // that the the kPrimNot result register is the same as the first argument register.
       locations->SetCustomSlowPathCallerSaves(caller_saves);
+      if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) {
+        locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister));
+      }
     } else {
       // For non-Baker read barrier we have a temp-clobbering call.
     }
@@ -7050,6 +7141,9 @@ void LocationsBuilderARM::VisitInstanceOf(HInstanceOf* instruction) {
   // Note that TypeCheckSlowPathARM uses this register too.
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
   locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind));
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    codegen_->MaybeAddBakerCcEntrypointTempForFields(locations);
+  }
 }
 
 void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) {
@@ -7923,48 +8017,93 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct
   if (kUseBakerReadBarrier) {
     // Fast path implementation of art::ReadBarrier::BarrierForRoot when
     // Baker's read barrier are used.
-    //
-    // Note that we do not actually check the value of
-    // `GetIsGcMarking()` to decide whether to mark the loaded GC
-    // root or not. Instead, we load into `temp` the read barrier
-    // mark entry point corresponding to register `root`. If `temp`
-    // is null, it means that `GetIsGcMarking()` is false, and vice
-    // versa.
-    //
-    // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-    // GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
-    // if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
-    //   // Slow path.
-    //   root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
-    // }
-
-    // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
-    Location temp = Location::RegisterLocation(LR);
-    SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(
-        instruction, root, /* entrypoint */ temp);
-    codegen_->AddSlowPath(slow_path);
+    if (kBakerReadBarrierLinkTimeThunksEnableForGcRoots &&
+        !Runtime::Current()->UseJitCompilation()) {
+      // Note that we do not actually check the value of `GetIsGcMarking()`
+      // to decide whether to mark the loaded GC root or not. Instead, we
+      // load into `temp` (actually kBakerCcEntrypointRegister) the read
+      // barrier mark introspection entrypoint. If `temp` is null, it means
+      // that `GetIsGcMarking()` is false, and vice versa.
+      //
+      // We use link-time generated thunks for the slow path. That thunk
+      // checks the reference and jumps to the entrypoint if needed.
+      //
+      //     temp = Thread::Current()->pReadBarrierMarkIntrospection
+      //     lr = &return_address;
+      //     GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+      //     if (temp != nullptr) {
+      //        goto gc_root_thunk<root_reg>(lr)
+      //     }
+      //   return_address:
+
+      CheckLastTempIsBakerCcEntrypointRegister(instruction);
+      uint32_t custom_data =
+          linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg);
+      Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data);
+
+      // entrypoint_reg =
+      //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+      DCHECK_EQ(IP, 12);
+      const int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP);
+      __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset);
+
+      Label return_address;
+      __ AdrCode(LR, &return_address);
+      __ CmpConstant(kBakerCcEntrypointRegister, 0);
+      static_assert(
+          BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET == -8,
+          "GC root LDR must be 2 32-bit instructions (8B) before the return address label.");
+      // Currently the offset is always within range. If that changes,
+      // we shall have to split the load the same way as for fields.
+      DCHECK_LT(offset, kReferenceLoadMinFarOffset);
+      ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
+      __ LoadFromOffset(kLoadWord, root_reg, obj, offset);
+      EmitPlaceholderBne(codegen_, bne_label);
+      __ Bind(&return_address);
+    } else {
+      // Note that we do not actually check the value of
+      // `GetIsGcMarking()` to decide whether to mark the loaded GC
+      // root or not. Instead, we load into `temp` the read barrier
+      // mark entry point corresponding to register `root`. If `temp`
+      // is null, it means that `GetIsGcMarking()` is false, and vice
+      // versa.
+      //
+      //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      //   GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+      //   if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+      //     // Slow path.
+      //     root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
+      //   }
+
+      // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
+      Location temp = Location::RegisterLocation(LR);
+      SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(
+          instruction, root, /* entrypoint */ temp);
+      codegen_->AddSlowPath(slow_path);
 
-    // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-    const int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
-    // Loading the entrypoint does not require a load acquire since it is only changed when
-    // threads are suspended or running a checkpoint.
-    __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset);
+      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      const int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
+      // Loading the entrypoint does not require a load acquire since it is only changed when
+      // threads are suspended or running a checkpoint.
+      __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset);
 
-    // /* GcRoot<mirror::Object> */ root = *(obj + offset)
-    __ LoadFromOffset(kLoadWord, root_reg, obj, offset);
-    static_assert(
-        sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
-        "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
-        "have different sizes.");
-    static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t),
-                  "art::mirror::CompressedReference<mirror::Object> and int32_t "
-                  "have different sizes.");
-
-    // The entrypoint is null when the GC is not marking, this prevents one load compared to
-    // checking GetIsGcMarking.
-    __ CompareAndBranchIfNonZero(temp.AsRegister<Register>(), slow_path->GetEntryLabel());
-    __ Bind(slow_path->GetExitLabel());
+      // /* GcRoot<mirror::Object> */ root = *(obj + offset)
+      __ LoadFromOffset(kLoadWord, root_reg, obj, offset);
+      static_assert(
+          sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
+          "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
+          "have different sizes.");
+      static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t),
+                    "art::mirror::CompressedReference<mirror::Object> and int32_t "
+                    "have different sizes.");
+
+      // The entrypoint is null when the GC is not marking, this prevents one load compared to
+      // checking GetIsGcMarking.
+      __ CompareAndBranchIfNonZero(temp.AsRegister<Register>(), slow_path->GetEntryLabel());
+      __ Bind(slow_path->GetExitLabel());
+    }
   } else {
     // GC root loaded through a slow path for read barriers other
     // than Baker's.
@@ -7982,6 +8121,16 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct
   }
 }
 
+void CodeGeneratorARM::MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations) {
+  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(kUseBakerReadBarrier);
+  if (kBakerReadBarrierLinkTimeThunksEnableForFields) {
+    if (!Runtime::Current()->UseJitCompilation()) {
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister));
+    }
+  }
+}
+
 void CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
                                                              Location ref,
                                                              Register obj,
@@ -7991,6 +8140,69 @@ void CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instr
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
+  if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
+      !Runtime::Current()->UseJitCompilation()) {
+    // Note that we do not actually check the value of `GetIsGcMarking()`
+    // to decide whether to mark the loaded reference or not. Instead, we
+    // load into `temp` (actually kBakerCcEntrypointRegister) the read
+    // barrier mark introspection entrypoint. If `temp` is null, it means
+    // that `GetIsGcMarking()` is false, and vice versa.
+    //
+    // We use link-time generated thunks for the slow path. That thunk checks
+    // the holder and jumps to the entrypoint if needed. If the holder is not
+    // gray, it creates a fake dependency and returns to the LDR instruction.
+    //
+    //     temp = Thread::Current()->pReadBarrierMarkIntrospection
+    //     lr = &gray_return_address;
+    //     if (temp != nullptr) {
+    //        goto field_thunk<holder_reg, base_reg>(lr)
+    //     }
+    //   not_gray_return_address:
+    //     // Original reference load. If the offset is too large to fit
+    //     // into LDR, we use an adjusted base register here.
+    //     GcRoot<mirror::Object> reference = *(obj+offset);
+    //   gray_return_address:
+
+    DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
+    Register base = obj;
+    if (offset >= kReferenceLoadMinFarOffset) {
+      base = temp.AsRegister<Register>();
+      DCHECK_NE(base, kBakerCcEntrypointRegister);
+      static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2.");
+      __ AddConstant(base, obj, offset & ~(kReferenceLoadMinFarOffset - 1u));
+      offset &= (kReferenceLoadMinFarOffset - 1u);
+    }
+    CheckLastTempIsBakerCcEntrypointRegister(instruction);
+    uint32_t custom_data =
+        linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base, obj);
+    Label* bne_label = NewBakerReadBarrierPatch(custom_data);
+
+    // entrypoint_reg =
+    //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+    DCHECK_EQ(IP, 12);
+    const int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP);
+    __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset);
+
+    Label return_address;
+    __ AdrCode(LR, &return_address);
+    __ CmpConstant(kBakerCcEntrypointRegister, 0);
+    ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
+    EmitPlaceholderBne(this, bne_label);
+    static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
+                  "Field LDR must be 1 32-bit instruction (4B) before the return address label; "
+                  " 2 32-bit instructions (8B) for heap poisoning.");
+    Register ref_reg = ref.AsRegister<Register>();
+    DCHECK_LT(offset, kReferenceLoadMinFarOffset);
+    __ LoadFromOffset(kLoadWord, ref_reg, base, offset);
+    if (needs_null_check) {
+      MaybeRecordImplicitNullCheck(instruction);
+    }
+    GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
+    __ Bind(&return_address);
+    return;
+  }
+
   // /* HeapReference<Object> */ ref = *(obj + offset)
   Location no_index = Location::NoLocation();
   ScaleFactor no_scale_factor = TIMES_1;
@@ -8011,9 +8223,67 @@ void CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instr
   static_assert(
       sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
       "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+  ScaleFactor scale_factor = TIMES_4;
+
+  if (kBakerReadBarrierLinkTimeThunksEnableForArrays &&
+      !Runtime::Current()->UseJitCompilation()) {
+    // Note that we do not actually check the value of `GetIsGcMarking()`
+    // to decide whether to mark the loaded reference or not. Instead, we
+    // load into `temp` (actually kBakerCcEntrypointRegister) the read
+    // barrier mark introspection entrypoint. If `temp` is null, it means
+    // that `GetIsGcMarking()` is false, and vice versa.
+    //
+    // We use link-time generated thunks for the slow path. That thunk checks
+    // the holder and jumps to the entrypoint if needed. If the holder is not
+    // gray, it creates a fake dependency and returns to the LDR instruction.
+    //
+    //     temp = Thread::Current()->pReadBarrierMarkIntrospection
+    //     lr = &gray_return_address;
+    //     if (temp != nullptr) {
+    //        goto field_thunk<holder_reg, base_reg>(lr)
+    //     }
+    //   not_gray_return_address:
+    //     // Original reference load. If the offset is too large to fit
+    //     // into LDR, we use an adjusted base register here.
+    //     GcRoot<mirror::Object> reference = data[index];
+    //   gray_return_address:
+
+    DCHECK(index.IsValid());
+    Register index_reg = index.AsRegister<Register>();
+    Register ref_reg = ref.AsRegister<Register>();
+    Register data_reg = temp.AsRegister<Register>();
+    DCHECK_NE(data_reg, kBakerCcEntrypointRegister);
+
+    CheckLastTempIsBakerCcEntrypointRegister(instruction);
+    uint32_t custom_data =
+        linker::Thumb2RelativePatcher::EncodeBakerReadBarrierArrayData(data_reg);
+    Label* bne_label = NewBakerReadBarrierPatch(custom_data);
+
+    // entrypoint_reg =
+    //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+    DCHECK_EQ(IP, 12);
+    const int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP);
+    __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset);
+    __ AddConstant(data_reg, obj, data_offset);
+
+    Label return_address;
+    __ AdrCode(LR, &return_address);
+    __ CmpConstant(kBakerCcEntrypointRegister, 0);
+    ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
+    EmitPlaceholderBne(this, bne_label);
+    static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
+                  "Array LDR must be 1 32-bit instruction (4B) before the return address label; "
+                  " 2 32-bit instructions (8B) for heap poisoning.");
+    __ ldr(ref_reg, Address(data_reg, index_reg, LSL, scale_factor));
+    DCHECK(!needs_null_check);  // The thunk cannot handle the null check.
+    GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
+    __ Bind(&return_address);
+    return;
+  }
+
   // /* HeapReference<Object> */ ref =
   //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
-  ScaleFactor scale_factor = TIMES_4;
   GenerateReferenceLoadWithBakerReadBarrier(
       instruction, ref, obj, data_offset, index, scale_factor, temp, needs_null_check);
 }
@@ -8379,6 +8649,11 @@ CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativePatch(
   return &patches->back();
 }
 
+Label* CodeGeneratorARM::NewBakerReadBarrierPatch(uint32_t custom_data) {
+  baker_read_barrier_patches_.emplace_back(custom_data);
+  return &baker_read_barrier_patches_.back().label;
+}
+
 Literal* CodeGeneratorARM::DeduplicateBootImageStringLiteral(const DexFile& dex_file,
                                                              dex::StringIndex string_index) {
   return boot_image_string_patches_.GetOrCreate(
@@ -8445,7 +8720,8 @@ void CodeGeneratorARM::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patche
       /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() +
       boot_image_type_patches_.size() +
       /* MOVW+MOVT for each entry */ 2u * pc_relative_type_patches_.size() +
-      /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size();
+      /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size() +
+      baker_read_barrier_patches_.size();
   linker_patches->reserve(size);
   EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_,
                                                                linker_patches);
@@ -8479,6 +8755,10 @@
                                                      target_type.dex_file,
                                                      target_type.type_index.index_));
   }
+  for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) {
+    linker_patches->push_back(LinkerPatch::BakerReadBarrierBranchPatch(info.label.Position(),
+                                                                       info.custom_data));
+  }
   DCHECK_EQ(size, linker_patches->size());
 }
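A note on the far-offset split in `GenerateFieldLoadWithBakerReadBarrier` above: the final reference load must stay a single `LDR Rt, [Rn, #imm12]` that the link-time thunk can decode, so offsets of 4KiB and above are split between an ADD into a temporary base register and a small residual LDR immediate. A minimal standalone sketch of that arithmetic (illustrative names; not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

// Mirrors the patch's split: keep the LDR offset below 4KiB so the
// link-time thunk sees a plain single-instruction load.
constexpr uint32_t kReferenceLoadMinFarOffset = 4 * 1024;

struct SplitOffset {
  uint32_t base_adjustment;  // Added to `obj` to form the adjusted base register.
  uint32_t ldr_offset;       // Immediate left for the LDR itself (< 4KiB).
};

SplitOffset SplitFarOffset(uint32_t offset) {
  static_assert((kReferenceLoadMinFarOffset & (kReferenceLoadMinFarOffset - 1)) == 0,
                "Expecting a power of 2.");
  SplitOffset result;
  result.base_adjustment = offset & ~(kReferenceLoadMinFarOffset - 1u);
  result.ldr_offset = offset & (kReferenceLoadMinFarOffset - 1u);
  return result;
}

int main() {
  // E.g. a field at offset 0x1234: base = obj + 0x1000, the LDR uses #0x234.
  SplitOffset s = SplitFarOffset(0x1234u);
  assert(s.base_adjustment == 0x1000u);
  assert(s.ldr_offset == 0x234u);
  return 0;
}
```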
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 86f2f21df7..6f007e100b 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -488,6 +488,11 @@ class CodeGeneratorARM : public CodeGenerator {
   PcRelativePatchInfo* NewTypeBssEntryPatch(const DexFile& dex_file, dex::TypeIndex type_index);
   PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file,
                                                        uint32_t element_offset);
+
+  // Add a new baker read barrier patch and return the label to be bound
+  // before the BNE instruction.
+  Label* NewBakerReadBarrierPatch(uint32_t custom_data);
+
   Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file,
                                              dex::StringIndex string_index);
   Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, dex::TypeIndex type_index);
@@ -503,6 +508,10 @@ class CodeGeneratorARM : public CodeGenerator {
 
   void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE;
 
+  // Maybe add the reserved entrypoint register as a temporary for field load. This temp
+  // is added only for AOT compilation if link-time generated thunks for fields are enabled.
+  void MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations);
+
   // Fast path implementation of ReadBarrier::Barrier for a heap
   // reference field load when Baker's read barriers are used.
   void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
@@ -616,6 +625,13 @@ class CodeGeneratorARM : public CodeGenerator {
                                          Literal*,
                                          TypeReferenceValueComparator>;
 
+  struct BakerReadBarrierPatchInfo {
+    explicit BakerReadBarrierPatchInfo(uint32_t data) : label(), custom_data(data) { }
+
+    Label label;
+    uint32_t custom_data;
+  };
+
   Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map);
   Literal* DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map);
   PcRelativePatchInfo* NewPcRelativePatch(const DexFile& dex_file,
@@ -648,6 +664,8 @@ class CodeGeneratorARM : public CodeGenerator {
   ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_;
   // PC-relative type patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_;
+  // Baker read barrier patch info.
+  ArenaDeque<BakerReadBarrierPatchInfo> baker_read_barrier_patches_;
 
   // Patches for string literals in JIT compiled code.
   StringToLiteralMap jit_string_patches_;
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 8744cc8210..86f4cd22fa 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -16,6 +16,7 @@
 #include "code_generator_arm_vixl.h"
 
+#include "arch/arm/asm_support_arm.h"
 #include "arch/arm/instruction_set_features_arm.h"
 #include "art_method.h"
 #include "code_generator_utils.h"
@@ -24,6 +25,7 @@
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "gc/accounting/card_table.h"
 #include "intrinsics_arm_vixl.h"
+#include "linker/arm/relative_patcher_thumb2.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
 #include "thread.h"
@@ -77,6 +79,20 @@ static constexpr size_t kArmBitsPerWord = kArmWordSize * kBitsPerByte;
 static constexpr int kCurrentMethodStackOffset = 0;
 static constexpr uint32_t kPackedSwitchCompareJumpThreshold = 7;
 
+// Reference load (except object array loads) is using LDR Rt, [Rn, #offset] which can handle
+// offset < 4KiB. For offsets >= 4KiB, the load shall be emitted as two or more instructions.
+// For the Baker read barrier implementation using link-time generated thunks we need to split
+// the offset explicitly.
+constexpr uint32_t kReferenceLoadMinFarOffset = 4 * KB;
+
+// Flags controlling the use of link-time generated thunks for Baker read barriers.
+constexpr bool kBakerReadBarrierLinkTimeThunksEnableForFields = true;
+constexpr bool kBakerReadBarrierLinkTimeThunksEnableForArrays = true;
+constexpr bool kBakerReadBarrierLinkTimeThunksEnableForGcRoots = true;
+
+// The reserved entrypoint register for link-time generated thunks.
+const vixl32::Register kBakerCcEntrypointRegister = r4;
+
 #ifdef __
 #error "ARM Codegen VIXL macro-assembler macro already defined."
 #endif
@@ -88,6 +104,56 @@
 // Marker that code is yet to be, and must, be implemented.
 #define TODO_VIXL32(level) LOG(level) << __PRETTY_FUNCTION__ << " unimplemented "
 
+static inline void ExcludeIPAndBakerCcEntrypointRegister(UseScratchRegisterScope* temps,
+                                                         HInstruction* instruction) {
+  DCHECK(temps->IsAvailable(ip));
+  temps->Exclude(ip);
+  DCHECK(!temps->IsAvailable(kBakerCcEntrypointRegister));
+  DCHECK_EQ(kBakerCcEntrypointRegister.GetCode(),
+            linker::Thumb2RelativePatcher::kBakerCcEntrypointRegister);
+  DCHECK_NE(instruction->GetLocations()->GetTempCount(), 0u);
+  DCHECK(RegisterFrom(instruction->GetLocations()->GetTemp(
+      instruction->GetLocations()->GetTempCount() - 1u)).Is(kBakerCcEntrypointRegister));
+}
+
+static inline void EmitPlaceholderBne(CodeGeneratorARMVIXL* codegen, vixl32::Label* patch_label) {
+  ExactAssemblyScope eas(codegen->GetVIXLAssembler(), kMaxInstructionSizeInBytes);
+  __ bind(patch_label);
+  vixl32::Label placeholder_label;
+  __ b(ne, EncodingSize(Wide), &placeholder_label);  // Placeholder, patched at link-time.
+  __ bind(&placeholder_label);
+}
+
+class EmitAdrCode {
+ public:
+  EmitAdrCode(ArmVIXLMacroAssembler* assembler, vixl32::Register rd, vixl32::Label* label)
+      : assembler_(assembler), rd_(rd), label_(label) {
+    ExactAssemblyScope aas(assembler, kMaxInstructionSizeInBytes);
+    adr_location_ = assembler->GetCursorOffset();
+    assembler->adr(EncodingSize(Wide), rd, label);
+  }
+
+  ~EmitAdrCode() {
+    DCHECK(label_->IsBound());
+    // The ADR emitted by the assembler does not set the Thumb mode bit we need.
+    // TODO: Maybe extend VIXL to allow ADR for return address?
+    uint8_t* raw_adr = assembler_->GetBuffer()->GetOffsetAddress<uint8_t*>(adr_location_);
+    // Expecting ADR encoding T3 with `(offset & 1) == 0`.
+    DCHECK_EQ(raw_adr[1] & 0xfbu, 0xf2u);           // Check bits 24-31, except 26.
+    DCHECK_EQ(raw_adr[0] & 0xffu, 0x0fu);           // Check bits 16-23.
+    DCHECK_EQ(raw_adr[3] & 0x8fu, rd_.GetCode());   // Check bits 8-11 and 15.
+    DCHECK_EQ(raw_adr[2] & 0x01u, 0x00u);           // Check bit 0, i.e. the `offset & 1`.
+    // Add the Thumb mode bit.
+    raw_adr[2] |= 0x01u;
+  }
+
+ private:
+  ArmVIXLMacroAssembler* const assembler_;
+  vixl32::Register rd_;
+  vixl32::Label* const label_;
+  int32_t adr_location_;
+};
+
 // SaveLiveRegisters and RestoreLiveRegisters from SlowPathCodeARM operate on sets of S registers,
 // for each live D registers they treat two corresponding S registers as live ones.
 //
@@ -2012,6 +2078,7 @@ CodeGeneratorARMVIXL::CodeGeneratorARMVIXL(HGraph* graph,
                          graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      baker_read_barrier_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(StringReferenceValueComparator(),
                           graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_class_patches_(TypeReferenceValueComparator(),
@@ -5289,7 +5356,18 @@ void LocationsBuilderARMVIXL::HandleFieldGet(HInstruction* instruction,
   } else if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
     // We need a temporary register for the read barrier marking slow
     // path in CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier.
-    locations->AddTemp(Location::RequiresRegister());
+    if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
+        !Runtime::Current()->UseJitCompilation()) {
+      // If link-time thunks for the Baker read barrier are enabled, for AOT
+      // loads we need a temporary only if the offset is too big.
+      if (field_info.GetFieldOffset().Uint32Value() >= kReferenceLoadMinFarOffset) {
+        locations->AddTemp(Location::RequiresRegister());
+      }
+      // And we always need the reserved entrypoint register.
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
+    } else {
+      locations->AddTemp(Location::RequiresRegister());
+    }
   }
 }
 
@@ -5756,11 +5834,35 @@ void LocationsBuilderARMVIXL::VisitArrayGet(HArrayGet* instruction) {
         Location::RequiresRegister(),
         object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
-  // We need a temporary register for the read barrier marking slow
-  // path in CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier.
-  // Also need for String compression feature.
-  if ((object_array_get_with_read_barrier && kUseBakerReadBarrier)
-      || (mirror::kUseStringCompression && instruction->IsStringCharAt())) {
+  if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
+    // We need a temporary register for the read barrier marking slow
+    // path in CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier.
+    if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
+        !Runtime::Current()->UseJitCompilation() &&
+        instruction->GetIndex()->IsConstant()) {
+      // Array loads with constant index are treated as field loads.
+      // If link-time thunks for the Baker read barrier are enabled, for AOT
+      // constant index loads we need a temporary only if the offset is too big.
+      uint32_t offset = CodeGenerator::GetArrayDataOffset(instruction);
+      uint32_t index = instruction->GetIndex()->AsIntConstant()->GetValue();
+      offset += index << Primitive::ComponentSizeShift(Primitive::kPrimNot);
+      if (offset >= kReferenceLoadMinFarOffset) {
+        locations->AddTemp(Location::RequiresRegister());
+      }
+      // And we always need the reserved entrypoint register.
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
+    } else if (kBakerReadBarrierLinkTimeThunksEnableForArrays &&
+               !Runtime::Current()->UseJitCompilation() &&
+               !instruction->GetIndex()->IsConstant()) {
+      // We need a non-scratch temporary for the array data pointer.
+      locations->AddTemp(Location::RequiresRegister());
+      // And we always need the reserved entrypoint register.
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
+    } else {
+      locations->AddTemp(Location::RequiresRegister());
+    }
+  } else if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+    // Also need a temporary for String compression feature.
     locations->AddTemp(Location::RequiresRegister());
   }
 }
@@ -5871,8 +5973,20 @@ void InstructionCodeGeneratorARMVIXL::VisitArrayGet(HArrayGet* instruction) {
         Location temp = locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier call.
-        codegen_->GenerateArrayLoadWithBakerReadBarrier(
-            instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ true);
+        DCHECK(!instruction->CanDoImplicitNullCheckOn(instruction->InputAt(0)));
+        if (index.IsConstant()) {
+          // Array load with a constant index can be treated as a field load.
+          data_offset += Int32ConstantFrom(index) << Primitive::ComponentSizeShift(type);
+          codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction,
+                                                          out_loc,
+                                                          obj,
+                                                          data_offset,
+                                                          locations->GetTemp(0),
+                                                          /* needs_null_check */ false);
+        } else {
+          codegen_->GenerateArrayLoadWithBakerReadBarrier(
+              instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ false);
+        }
       } else {
         vixl32::Register out = OutputRegister(instruction);
         if (index.IsConstant()) {
@@ -6762,6 +6876,13 @@ void LocationsBuilderARMVIXL::VisitLoadClass(HLoadClass* cls) {
      // For non-Baker read barrier we have a temp-clobbering call.
     }
   }
+  if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) {
+    if (load_kind == HLoadClass::LoadKind::kBssEntry ||
+        (load_kind == HLoadClass::LoadKind::kReferrersClass &&
+            !Runtime::Current()->UseJitCompilation())) {
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
+    }
+  }
 }
 
 // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not
@@ -6938,6 +7059,9 @@ void LocationsBuilderARMVIXL::VisitLoadString(HLoadString* load) {
       // TODO: Add GetReturnLocation() to the calling convention so that we can DCHECK()
       // that the the kPrimNot result register is the same as the first argument register.
       locations->SetCustomSlowPathCallerSaves(caller_saves);
+      if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) {
+        locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
+      }
     } else {
       // For non-Baker read barrier we have a temp-clobbering call.
     }
@@ -7100,6 +7224,9 @@ void LocationsBuilderARMVIXL::VisitInstanceOf(HInstanceOf* instruction) {
   // Note that TypeCheckSlowPathARM uses this register too.
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
   locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind));
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    codegen_->MaybeAddBakerCcEntrypointTempForFields(locations);
+  }
 }
 
 void InstructionCodeGeneratorARMVIXL::VisitInstanceOf(HInstanceOf* instruction) {
@@ -7998,48 +8125,96 @@ void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad(
   if (kUseBakerReadBarrier) {
     // Fast path implementation of art::ReadBarrier::BarrierForRoot when
     // Baker's read barrier are used.
-    //
-    // Note that we do not actually check the value of
-    // `GetIsGcMarking()` to decide whether to mark the loaded GC
-    // root or not. Instead, we load into `temp` the read barrier
-    // mark entry point corresponding to register `root`. If `temp`
-    // is null, it means that `GetIsGcMarking()` is false, and vice
-    // versa.
-    //
-    // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-    // GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
-    // if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
-    //   // Slow path.
-    //   root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
-    // }
-
-    // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
-    Location temp = LocationFrom(lr);
-    SlowPathCodeARMVIXL* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(
-        instruction, root, /* entrypoint */ temp);
-    codegen_->AddSlowPath(slow_path);
+    if (kBakerReadBarrierLinkTimeThunksEnableForGcRoots &&
+        !Runtime::Current()->UseJitCompilation()) {
+      // Note that we do not actually check the value of `GetIsGcMarking()`
+      // to decide whether to mark the loaded GC root or not. Instead, we
+      // load into `temp` (actually kBakerCcEntrypointRegister) the read
+      // barrier mark introspection entrypoint. If `temp` is null, it means
+      // that `GetIsGcMarking()` is false, and vice versa.
+      //
+      // We use link-time generated thunks for the slow path. That thunk
+      // checks the reference and jumps to the entrypoint if needed.
+      //
+      //     temp = Thread::Current()->pReadBarrierMarkIntrospection
+      //     lr = &return_address;
+      //     GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+      //     if (temp != nullptr) {
+      //        goto gc_root_thunk<root_reg>(lr)
+      //     }
+      //   return_address:
 
-    // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-    const int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
-    // Loading the entrypoint does not require a load acquire since it is only changed when
-    // threads are suspended or running a checkpoint.
-    GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset);
+      UseScratchRegisterScope temps(GetVIXLAssembler());
+      ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction);
+      uint32_t custom_data =
+          linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg.GetCode());
+      vixl32::Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data);
+
+      // entrypoint_reg =
+      //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+      DCHECK_EQ(ip.GetCode(), 12u);
+      const int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
+      __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset));
+
+      vixl::EmissionCheckScope guard(GetVIXLAssembler(),
+                                     4 * vixl32::kMaxInstructionSizeInBytes);
+      vixl32::Label return_address;
+      EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
+      __ cmp(kBakerCcEntrypointRegister, Operand(0));
+      static_assert(
+          BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET == -8,
+          "GC root LDR must be 2 32-bit instructions (8B) before the return address label.");
+      // Currently the offset is always within range. If that changes,
+      // we shall have to split the load the same way as for fields.
+      DCHECK_LT(offset, kReferenceLoadMinFarOffset);
+      __ ldr(EncodingSize(Wide), root_reg, MemOperand(obj, offset));
+      EmitPlaceholderBne(codegen_, bne_label);
+      __ Bind(&return_address);
+    } else {
+      // Note that we do not actually check the value of
+      // `GetIsGcMarking()` to decide whether to mark the loaded GC
+      // root or not. Instead, we load into `temp` the read barrier
+      // mark entry point corresponding to register `root`. If `temp`
+      // is null, it means that `GetIsGcMarking()` is false, and vice
+      // versa.
+      //
+      //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      //   GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+      //   if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+      //     // Slow path.
+      //     root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
+      //   }
+
+      // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
+      Location temp = LocationFrom(lr);
+      SlowPathCodeARMVIXL* slow_path =
+          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(
+              instruction, root, /* entrypoint */ temp);
+      codegen_->AddSlowPath(slow_path);
 
-    // /* GcRoot<mirror::Object> */ root = *(obj + offset)
-    GetAssembler()->LoadFromOffset(kLoadWord, root_reg, obj, offset);
-    static_assert(
-        sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
-        "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
-        "have different sizes.");
-    static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t),
-                  "art::mirror::CompressedReference<mirror::Object> and int32_t "
-                  "have different sizes.");
-
-    // The entrypoint is null when the GC is not marking, this prevents one load compared to
-    // checking GetIsGcMarking.
-    __ CompareAndBranchIfNonZero(RegisterFrom(temp), slow_path->GetEntryLabel());
-    __ Bind(slow_path->GetExitLabel());
+      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      const int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
+      // Loading the entrypoint does not require a load acquire since it is only changed when
+      // threads are suspended or running a checkpoint.
+      GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset);
+
+      // /* GcRoot<mirror::Object> */ root = *(obj + offset)
+      GetAssembler()->LoadFromOffset(kLoadWord, root_reg, obj, offset);
+      static_assert(
+          sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
+          "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
+          "have different sizes.");
+      static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t),
+                    "art::mirror::CompressedReference<mirror::Object> and int32_t "
+                    "have different sizes.");
+
+      // The entrypoint is null when the GC is not marking, this prevents one load compared to
+      // checking GetIsGcMarking.
+      __ CompareAndBranchIfNonZero(RegisterFrom(temp), slow_path->GetEntryLabel());
+      __ Bind(slow_path->GetExitLabel());
+    }
   } else {
     // GC root loaded through a slow path for read barriers other
     // than Baker's.
@@ -8057,6 +8232,16 @@ void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad(
   }
 }
 
+void CodeGeneratorARMVIXL::MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations) {
+  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(kUseBakerReadBarrier);
+  if (kBakerReadBarrierLinkTimeThunksEnableForFields) {
+    if (!Runtime::Current()->UseJitCompilation()) {
+      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
+    }
+  }
+}
+
 void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
                                                                  Location ref,
                                                                  vixl32::Register obj,
@@ -8066,6 +8251,75 @@ void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* i
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
+  if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
+      !Runtime::Current()->UseJitCompilation()) {
+    // Note that we do not actually check the value of `GetIsGcMarking()`
+    // to decide whether to mark the loaded reference or not. Instead, we
+    // load into `temp` (actually kBakerCcEntrypointRegister) the read
+    // barrier mark introspection entrypoint. If `temp` is null, it means
+    // that `GetIsGcMarking()` is false, and vice versa.
+    //
+    // We use link-time generated thunks for the slow path. That thunk checks
+    // the holder and jumps to the entrypoint if needed. If the holder is not
+    // gray, it creates a fake dependency and returns to the LDR instruction.
+    //
+    //     temp = Thread::Current()->pReadBarrierMarkIntrospection
+    //     lr = &gray_return_address;
+    //     if (temp != nullptr) {
+    //        goto field_thunk<holder_reg, base_reg>(lr)
+    //     }
+    //   not_gray_return_address:
+    //     // Original reference load. If the offset is too large to fit
+    //     // into LDR, we use an adjusted base register here.
+    //     GcRoot<mirror::Object> reference = *(obj+offset);
+    //   gray_return_address:
+
+    DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
+    vixl32::Register base = obj;
+    if (offset >= kReferenceLoadMinFarOffset) {
+      base = RegisterFrom(temp);
+      DCHECK(!base.Is(kBakerCcEntrypointRegister));
+      static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2.");
+      __ Add(base, obj, Operand(offset & ~(kReferenceLoadMinFarOffset - 1u)));
+      offset &= (kReferenceLoadMinFarOffset - 1u);
+    }
+    UseScratchRegisterScope temps(GetVIXLAssembler());
+    ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction);
+    uint32_t custom_data = linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(
+        base.GetCode(),
+        obj.GetCode());
+    vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data);
+
+    // entrypoint_reg =
+    //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+    DCHECK_EQ(ip.GetCode(), 12u);
+    const int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
+    __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset));
+
+    vixl::EmissionCheckScope guard(
+        GetVIXLAssembler(),
+        (kPoisonHeapReferences ? 5u : 4u) * vixl32::kMaxInstructionSizeInBytes);
+    vixl32::Label return_address;
+    EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
+    __ cmp(kBakerCcEntrypointRegister, Operand(0));
+    EmitPlaceholderBne(this, bne_label);
+    static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
+                  "Field LDR must be 1 32-bit instruction (4B) before the return address label; "
+                  " 2 32-bit instructions (8B) for heap poisoning.");
+    vixl32::Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot);
+    __ ldr(EncodingSize(Wide), ref_reg, MemOperand(base, offset));
+    if (needs_null_check) {
+      MaybeRecordImplicitNullCheck(instruction);
+    }
+    // Note: We need a Wide NEG for the unpoisoning.
+    if (kPoisonHeapReferences) {
+      __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0));
+    }
+    __ Bind(&return_address);
+    return;
+  }
+
   // /* HeapReference<Object> */ ref = *(obj + offset)
   Location no_index = Location::NoLocation();
   ScaleFactor no_scale_factor = TIMES_1;
@@ -8086,9 +8340,73 @@ void CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier(HInstruction* i
   static_assert(
       sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
      "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+  ScaleFactor scale_factor = TIMES_4;
+
+  if (kBakerReadBarrierLinkTimeThunksEnableForArrays &&
+      !Runtime::Current()->UseJitCompilation()) {
+    // Note that we do not actually check the value of `GetIsGcMarking()`
+    // to decide whether to mark the loaded reference or not. Instead, we
+    // load into `temp` (actually kBakerCcEntrypointRegister) the read
+    // barrier mark introspection entrypoint. If `temp` is null, it means
+    // that `GetIsGcMarking()` is false, and vice versa.
+    //
+    // We use link-time generated thunks for the slow path. That thunk checks
+    // the holder and jumps to the entrypoint if needed. If the holder is not
+    // gray, it creates a fake dependency and returns to the LDR instruction.
+    //
+    //     temp = Thread::Current()->pReadBarrierMarkIntrospection
+    //     lr = &gray_return_address;
+    //     if (temp != nullptr) {
+    //        goto field_thunk<holder_reg, base_reg>(lr)
+    //     }
+    //   not_gray_return_address:
+    //     // Original reference load. If the offset is too large to fit
+    //     // into LDR, we use an adjusted base register here.
+    //     GcRoot<mirror::Object> reference = data[index];
+    //   gray_return_address:
+
+    DCHECK(index.IsValid());
+    vixl32::Register index_reg = RegisterFrom(index, Primitive::kPrimInt);
+    vixl32::Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot);
+    vixl32::Register data_reg = RegisterFrom(temp, Primitive::kPrimInt);  // Raw pointer.
+    DCHECK(!data_reg.Is(kBakerCcEntrypointRegister));
+
+    UseScratchRegisterScope temps(GetVIXLAssembler());
+    ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction);
+    uint32_t custom_data =
+        linker::Thumb2RelativePatcher::EncodeBakerReadBarrierArrayData(data_reg.GetCode());
+    vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data);
+
+    // entrypoint_reg =
+    //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+    DCHECK_EQ(ip.GetCode(), 12u);
+    const int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
+    __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset));
+    __ Add(data_reg, obj, Operand(data_offset));
+
+    vixl::EmissionCheckScope guard(
+        GetVIXLAssembler(),
+        (kPoisonHeapReferences ? 5u : 4u) * vixl32::kMaxInstructionSizeInBytes);
+    vixl32::Label return_address;
+    EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
+    __ cmp(kBakerCcEntrypointRegister, Operand(0));
+    EmitPlaceholderBne(this, bne_label);
+    static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
+                  "Array LDR must be 1 32-bit instruction (4B) before the return address label; "
+                  " 2 32-bit instructions (8B) for heap poisoning.");
+    __ ldr(ref_reg, MemOperand(data_reg, index_reg, vixl32::LSL, scale_factor));
+    DCHECK(!needs_null_check);  // The thunk cannot handle the null check.
+    // Note: We need a Wide NEG for the unpoisoning.
+    if (kPoisonHeapReferences) {
+      __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0));
+    }
+    __ Bind(&return_address);
+    return;
+  }
+
   // /* HeapReference<Object> */ ref =
   //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
-  ScaleFactor scale_factor = TIMES_4;
   GenerateReferenceLoadWithBakerReadBarrier(
       instruction, ref, obj, data_offset, index, scale_factor, temp, needs_null_check);
 }
@@ -8497,6 +8815,11 @@ CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewPcRelativePa
   return &patches->back();
 }
 
+vixl::aarch32::Label* CodeGeneratorARMVIXL::NewBakerReadBarrierPatch(uint32_t custom_data) {
+  baker_read_barrier_patches_.emplace_back(custom_data);
+  return &baker_read_barrier_patches_.back().label;
+}
+
 VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateBootImageStringLiteral(
     const DexFile& dex_file,
     dex::StringIndex string_index) {
@@ -8578,7 +8901,8 @@ void CodeGeneratorARMVIXL::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_pa
       /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() +
       boot_image_type_patches_.size() +
       /* MOVW+MOVT for each entry */ 2u * pc_relative_type_patches_.size() +
-      /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size();
+      /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size() +
+      baker_read_barrier_patches_.size();
   linker_patches->reserve(size);
   EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_,
                                                                linker_patches);
@@ -8612,6 +8936,10 @@
                                                      target_type.dex_file,
                                                      target_type.type_index.index_));
   }
+  for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) {
+    linker_patches->push_back(LinkerPatch::BakerReadBarrierBranchPatch(info.label.GetLocation(),
+                                                                       info.custom_data));
+  }
   DCHECK_EQ(size, linker_patches->size());
 }
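For reference while reading the header changes below: the `BAKER_MARK_INTROSPECTION_*_LDR_OFFSET` static_asserts in the hunks above expect the LDR exactly one or two wide instructions before the bound return address because the whole fast path is emitted with every instruction forced to a 32-bit Thumb2 encoding. A standalone sanity sketch of that layout, with assumed values (the real constants live in `arch/arm/asm_support_arm.h`, so this is illustration, not the patch's code):

```cpp
// Every fast-path instruction is a wide (4-byte) Thumb2 encoding, so byte
// offsets relative to `return_address` are exact multiples of 4.
constexpr int kWide = 4;

// Field/array fast path:  ADR, CMP, BNE, LDR[, NEG], return_address.
constexpr int FieldLdrOffset(bool poison) { return poison ? -2 * kWide : -1 * kWide; }
static_assert(FieldLdrOffset(false) == -4, "LDR is the last instruction before return_address");
static_assert(FieldLdrOffset(true) == -8, "the unpoisoning NEG slips in after the LDR");

// GC root fast path:  ADR, CMP, LDR, BNE, return_address.
constexpr int kGcRootLdrOffset = -2 * kWide;
static_assert(kGcRootLdrOffset == -8, "LDR is two instructions before return_address");

int main() { return 0; }
```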
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index 1e9669dc38..9d56cc32cb 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -572,6 +572,11 @@ class CodeGeneratorARMVIXL : public CodeGenerator {
   PcRelativePatchInfo* NewTypeBssEntryPatch(const DexFile& dex_file, dex::TypeIndex type_index);
   PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file,
                                                        uint32_t element_offset);
+
+  // Add a new baker read barrier patch and return the label to be bound
+  // before the BNE instruction.
+  vixl::aarch32::Label* NewBakerReadBarrierPatch(uint32_t custom_data);
+
   VIXLUInt32Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file,
                                                        dex::StringIndex string_index);
   VIXLUInt32Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file,
@@ -589,6 +594,10 @@ class CodeGeneratorARMVIXL : public CodeGenerator {
 
   void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE;
 
+  // Maybe add the reserved entrypoint register as a temporary for field load. This temp
+  // is added only for AOT compilation if link-time generated thunks for fields are enabled.
+  void MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations);
+
   // Fast path implementation of ReadBarrier::Barrier for a heap
   // reference field load when Baker's read barriers are used.
   void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
@@ -713,6 +722,13 @@ class CodeGeneratorARMVIXL : public CodeGenerator {
                                          VIXLUInt32Literal*,
                                          TypeReferenceValueComparator>;
 
+  struct BakerReadBarrierPatchInfo {
+    explicit BakerReadBarrierPatchInfo(uint32_t data) : label(), custom_data(data) { }
+
+    vixl::aarch32::Label label;
+    uint32_t custom_data;
+  };
+
   VIXLUInt32Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map);
   VIXLUInt32Literal* DeduplicateMethodLiteral(MethodReference target_method,
                                               MethodToLiteralMap* map);
@@ -750,6 +766,8 @@ class CodeGeneratorARMVIXL : public CodeGenerator {
   ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_;
   // PC-relative type patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_;
+  // Baker read barrier patch info.
+  ArenaDeque<BakerReadBarrierPatchInfo> baker_read_barrier_patches_;
 
   // Patches for string literals in JIT compiled code.
   StringToLiteralMap jit_string_patches_;
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 750f9cc213..c784171fd7 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -1648,6 +1648,8 @@ void IntrinsicLocationsBuilderARM::VisitSystemArrayCopy(HInvoke* invoke) {
     // is clobbered by ReadBarrierMarkRegX entry points). Get an extra
     // temporary register from the register allocator.
     locations->AddTemp(Location::RequiresRegister());
+    CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen_);
+    arm_codegen->MaybeAddBakerCcEntrypointTempForFields(locations);
   }
 }
 
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index fd8a37ae05..77d870bec2 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -2026,6 +2026,8 @@ void IntrinsicLocationsBuilderARMVIXL::VisitSystemArrayCopy(HInvoke* invoke) {
     // is clobbered by ReadBarrierMarkRegX entry points). Get an extra
     // temporary register from the register allocator.
     locations->AddTemp(Location::RequiresRegister());
+    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen_);
+    arm_codegen->MaybeAddBakerCcEntrypointTempForFields(locations);
   }
 }
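As a closing illustration of the constant-index logic in the two `VisitArrayGet` location builders above: whether the extra temporary is needed is decided exactly like a field load, by comparing the effective offset `data_offset + (index << shift)` against `kReferenceLoadMinFarOffset`. A worked sketch under stated assumptions (the 12-byte object-array data offset is an assumption about the 32-bit layout, not taken from this patch):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Assumed layout: 12 bytes before the first reference element on 32-bit
  // ART (8-byte object header + 4-byte length field).
  constexpr uint32_t kObjectArrayDataOffset = 12;
  constexpr uint32_t kComponentSizeShift = 2;  // 4-byte heap references.
  constexpr uint32_t kReferenceLoadMinFarOffset = 4 * 1024;

  // Mirrors the VisitArrayGet check: offset = data_offset + (index << shift).
  auto needs_extra_temp = [&](uint32_t index) {
    uint32_t offset = kObjectArrayDataOffset + (index << kComponentSizeShift);
    return offset >= kReferenceLoadMinFarOffset;
  };

  assert(!needs_extra_temp(1020));  // 12 + 4080 = 4092 < 4096: a plain LDR reaches it.
  assert(needs_extra_temp(1021));   // 12 + 4084 = 4096: the base must be adjusted first.
  return 0;
}
```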