author:    2017-04-21 17:58:41 +0100
committer: 2017-05-05 12:51:11 +0100
commit:    eee1c0ec2b08a6be642b329dc2fe885391127da3
tree:      960bb4df48b4a320df3c58682449abb24b5fb122
parent:    c7cee403ad9a3f7097f5157a621a6a8cb991222e
ARM: Link-time generated thunks for Baker CC read barrier.
Remaining work for follow-up CLs:
- use implicit null check in field thunk,
- use 16-bit LDRs for fields and GC roots.
Test: m test-art-target-gtest
Test: testrunner.py --target on Nexus 6P.
Test: testrunner.py --target on Nexus 6P with heap poisoning enabled.
Test: Repeat the above tests with ART_USE_OLD_ARM_BACKEND=true.
Bug: 29516974
Bug: 30126666
Bug: 36141117
Change-Id: Iad5addab72d790a9d61879f61f2e75b246bcdf5a
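For orientation before the diff: each compiled reference load is preceded by a 32-bit "BNE +0" placeholder that the patcher rewrites at link time into a branch to a shared, link-time generated thunk, keyed by the kind of load and the registers involved. The thunk checks the gray bit in the holder's lock word; on the fast path it returns to the LDR in the compiled code after inserting a fake address dependency (ADD base, base, lock_word LSR #32), and on the slow path it uses introspection on the LDR instruction itself to reload the reference and tail-calls the mark entrypoint held in the reserved register R4. The following is a plain-C++ model of that decision logic, a sketch only: MarkReference, the lock-word mask value, and the lock-word position are assumptions for illustration, not the emitted Thumb2 assembly.

  #include <cstddef>
  #include <cstdint>

  // Assumed, simplified gray-bit mask; ART's real constant is
  // LockWord::kReadBarrierStateMaskShifted.
  constexpr uint32_t kReadBarrierStateMaskShifted = 0x10000000u;

  // Identity placeholder for the introspection mark entrypoint.
  uint32_t MarkReference(uint32_t ref) { return ref; }

  // holder: object whose lock word is checked; base/offset: where the field lives.
  // The lock word is assumed at offset 0 for the sketch (ART: Object::MonitorOffset()).
  uint32_t BakerFieldLoad(const uint32_t* holder, const uint32_t* base, size_t offset) {
    uint32_t lock_word = holder[0];
    if ((lock_word & kReadBarrierStateMaskShifted) != 0u) {
      // Gray: the real thunk re-reads the LDR's imm12 via LR (introspection),
      // loads the reference itself, and tail-calls the mark entrypoint in R4.
      return MarkReference(base[offset / sizeof(uint32_t)]);
    }
    // White: return to the LDR in compiled code; a fake dependency on the lock
    // word (ADD base, base, lock_word LSR #32) orders the loads without a barrier.
    return base[offset / sizeof(uint32_t)];
  }

  int main() {
    uint32_t object[4] = {0u, 0u, 0u, 42u};  // Lock word 0 (white), field at byte 12.
    return BakerFieldLoad(object, object, 12u) == 42u ? 0 : 1;  // Exercises the fast path.
  }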
-rw-r--r--  compiler/linker/arm/relative_patcher_thumb2.cc       |  281
-rw-r--r--  compiler/linker/arm/relative_patcher_thumb2.h        |   51
-rw-r--r--  compiler/linker/arm/relative_patcher_thumb2_test.cc  |  700
-rw-r--r--  compiler/optimizing/code_generator_arm.cc            |  380
-rw-r--r--  compiler/optimizing/code_generator_arm.h             |   18
-rw-r--r--  compiler/optimizing/code_generator_arm_vixl.cc       |  428
-rw-r--r--  compiler/optimizing/code_generator_arm_vixl.h        |   18
-rw-r--r--  compiler/optimizing/intrinsics_arm.cc                |    2
-rw-r--r--  compiler/optimizing/intrinsics_arm_vixl.cc           |    2
-rw-r--r--  compiler/utils/arm/assembler_arm.h                   |    3
-rw-r--r--  compiler/utils/arm/assembler_thumb2.cc               |  153
-rw-r--r--  compiler/utils/arm/assembler_thumb2.h                |   39
-rw-r--r--  runtime/arch/arch_test.cc                            |   10
-rw-r--r--  runtime/arch/arm/asm_support_arm.h                   |   22
-rw-r--r--  runtime/arch/arm/entrypoints_init_arm.cc             |   21
-rw-r--r--  runtime/arch/arm/quick_entrypoints_arm.S             |  210
-rw-r--r--  runtime/oat.h                                        |    3

17 files changed, 2142 insertions(+), 199 deletions(-)
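The first hunk below patches the 32-bit conditional branch in place. For reference, Thumb2 B<c>.W (encoding T3) stores a signed 21-bit, halfword-aligned offset as SignExtend(S:J2:J1:imm6:imm11:'0'): in the patcher's 32-bit view (first halfword in the high 16 bits, as in GetInsn32/SetInsn32), S sits at bit 26, imm6 at bits 16-21, J1 at bit 13, J2 at bit 11 and imm11 at bits 0-10. A self-contained sketch of the packing; PackBCondDisplacement is a hypothetical helper name, but the masks and shifts mirror those in PatchBakerReadBarrierBranch:

  #include <cassert>
  #include <cstdint>

  uint32_t PackBCondDisplacement(uint32_t insn, uint32_t disp) {
    assert((disp & 1u) == 0u);                             // Halfword-aligned target.
    assert((disp >> 20) == 0u || (disp >> 20) == 0xfffu);  // Signed 21-bit range.
    insn |= ((disp << (26 - 20)) & 0x04000000u)   // disp bit 20 -> bit 26, "S".
          | ((disp >> (19 - 11)) & 0x00000800u)   // disp bit 19 -> bit 11, "J2".
          | ((disp >> (18 - 13)) & 0x00002000u)   // disp bit 18 -> bit 13, "J1".
          | ((disp << (16 - 12)) & 0x003f0000u)   // disp bits 12-17 -> 16-21, "imm6".
          | ((disp >> 1)         & 0x000007ffu);  // disp bits 1-11 -> 0-10, "imm11".
    return insn;
  }

  int main() {
    // 0xf0408000 is BNE +0, the unpatched form the patcher expects to find;
    // packing a zero displacement must leave it unchanged.
    assert(PackBCondDisplacement(0xf0408000u, 0u) == 0xf0408000u);
    return 0;
  }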
diff --git a/compiler/linker/arm/relative_patcher_thumb2.cc b/compiler/linker/arm/relative_patcher_thumb2.cc index 1a5d79ce70..f2ccc4bcd3 100644 --- a/compiler/linker/arm/relative_patcher_thumb2.cc +++ b/compiler/linker/arm/relative_patcher_thumb2.cc @@ -16,9 +16,15 @@ #include "linker/arm/relative_patcher_thumb2.h" +#include "arch/arm/asm_support_arm.h" #include "art_method.h" #include "compiled_method.h" -#include "utils/arm/assembler_thumb2.h" +#include "entrypoints/quick/quick_entrypoints_enum.h" +#include "lock_word.h" +#include "mirror/object.h" +#include "mirror/array-inl.h" +#include "read_barrier.h" +#include "utils/arm/assembler_arm_vixl.h" namespace art { namespace linker { @@ -32,6 +38,12 @@ static constexpr int32_t kPcDisplacement = 4; constexpr uint32_t kMaxMethodCallPositiveDisplacement = (1u << 24) - 2 + kPcDisplacement; constexpr uint32_t kMaxMethodCallNegativeDisplacement = (1u << 24) - kPcDisplacement; +// Maximum positive and negative displacement for a conditional branch measured from the patch +// location. (Signed 21 bit displacement with the last bit 0 has range [-2^20, 2^20-2] measured +// from the Thumb2 PC pointing right after the B.cond, i.e. 4 bytes later than the patch location.) +constexpr uint32_t kMaxBcondPositiveDisplacement = (1u << 20) - 2u + kPcDisplacement; +constexpr uint32_t kMaxBcondNegativeDisplacement = (1u << 20) - kPcDisplacement; + Thumb2RelativePatcher::Thumb2RelativePatcher(RelativePatcherTargetProvider* provider) : ArmBaseRelativePatcher(provider, kThumb2) { } @@ -84,29 +96,244 @@ void Thumb2RelativePatcher::PatchPcRelativeReference(std::vector<uint8_t>* code, SetInsn32(code, literal_offset, insn); } -void Thumb2RelativePatcher::PatchBakerReadBarrierBranch(std::vector<uint8_t>* code ATTRIBUTE_UNUSED, - const LinkerPatch& patch ATTRIBUTE_UNUSED, - uint32_t patch_offset ATTRIBUTE_UNUSED) { - LOG(FATAL) << "UNIMPLEMENTED"; +void Thumb2RelativePatcher::PatchBakerReadBarrierBranch(std::vector<uint8_t>* code, + const LinkerPatch& patch, + uint32_t patch_offset) { + DCHECK_ALIGNED(patch_offset, 2u); + uint32_t literal_offset = patch.LiteralOffset(); + DCHECK_ALIGNED(literal_offset, 2u); + DCHECK_LT(literal_offset, code->size()); + uint32_t insn = GetInsn32(code, literal_offset); + DCHECK_EQ(insn, 0xf0408000); // BNE +0 (unpatched) + ThunkKey key = GetBakerReadBarrierKey(patch); + if (kIsDebugBuild) { + // Check that the next instruction matches the expected LDR. + switch (key.GetType()) { + case ThunkType::kBakerReadBarrierField: { + DCHECK_GE(code->size() - literal_offset, 8u); + uint32_t next_insn = GetInsn32(code, literal_offset + 4u); + // LDR (immediate) with correct base_reg. + CheckValidReg((next_insn >> 12) & 0xfu); // Check destination register. + CHECK_EQ(next_insn & 0xffff0000u, 0xf8d00000u | (key.GetFieldParams().base_reg << 16)); + break; + } + case ThunkType::kBakerReadBarrierArray: { + DCHECK_GE(code->size() - literal_offset, 8u); + uint32_t next_insn = GetInsn32(code, literal_offset + 4u); + // LDR (register) with correct base_reg, S=1 and option=011 (LDR Wt, [Xn, Xm, LSL #2]). + CheckValidReg((next_insn >> 12) & 0xfu); // Check destination register. + CHECK_EQ(next_insn & 0xffff0ff0u, 0xf8500020u | (key.GetArrayParams().base_reg << 16)); + CheckValidReg(next_insn & 0xf); // Check index register + break; + } + case ThunkType::kBakerReadBarrierRoot: { + DCHECK_GE(literal_offset, 4u); + uint32_t prev_insn = GetInsn32(code, literal_offset - 4u); + // LDR (immediate) with correct root_reg. 
+ CHECK_EQ(prev_insn & 0xfff0f000u, 0xf8d00000u | (key.GetRootParams().root_reg << 12)); + break; + } + default: + LOG(FATAL) << "Unexpected type: " << static_cast<uint32_t>(key.GetType()); + UNREACHABLE(); + } + } + uint32_t target_offset = GetThunkTargetOffset(key, patch_offset); + DCHECK_ALIGNED(target_offset, 4u); + uint32_t disp = target_offset - (patch_offset + kPcDisplacement); + DCHECK((disp >> 20) == 0u || (disp >> 20) == 0xfffu); // 21-bit signed. + insn |= ((disp << (26 - 20)) & 0x04000000u) | // Shift bit 20 to 26, "S". + ((disp >> (19 - 11)) & 0x00000800u) | // Shift bit 19 to 13, "J1". + ((disp >> (18 - 13)) & 0x00002000u) | // Shift bit 18 to 11, "J2". + ((disp << (16 - 12)) & 0x003f0000u) | // Shift bits 12-17 to 16-25, "imm6". + ((disp >> (1 - 0)) & 0x000007ffu); // Shift bits 1-12 to 0-11, "imm11". + SetInsn32(code, literal_offset, insn); } ArmBaseRelativePatcher::ThunkKey Thumb2RelativePatcher::GetBakerReadBarrierKey( - const LinkerPatch& patch ATTRIBUTE_UNUSED) { - LOG(FATAL) << "UNIMPLEMENTED"; - UNREACHABLE(); + const LinkerPatch& patch) { + DCHECK_EQ(patch.GetType(), LinkerPatch::Type::kBakerReadBarrierBranch); + uint32_t value = patch.GetBakerCustomValue1(); + BakerReadBarrierKind type = BakerReadBarrierKindField::Decode(value); + ThunkParams params; + switch (type) { + case BakerReadBarrierKind::kField: + params.field_params.base_reg = BakerReadBarrierFirstRegField::Decode(value); + CheckValidReg(params.field_params.base_reg); + params.field_params.holder_reg = BakerReadBarrierSecondRegField::Decode(value); + CheckValidReg(params.field_params.holder_reg); + break; + case BakerReadBarrierKind::kArray: + params.array_params.base_reg = BakerReadBarrierFirstRegField::Decode(value); + CheckValidReg(params.array_params.base_reg); + params.array_params.dummy = 0u; + DCHECK_EQ(BakerReadBarrierSecondRegField::Decode(value), kInvalidEncodedReg); + break; + case BakerReadBarrierKind::kGcRoot: + params.root_params.root_reg = BakerReadBarrierFirstRegField::Decode(value); + CheckValidReg(params.root_params.root_reg); + params.root_params.dummy = 0u; + DCHECK_EQ(BakerReadBarrierSecondRegField::Decode(value), kInvalidEncodedReg); + break; + default: + LOG(FATAL) << "Unexpected type: " << static_cast<uint32_t>(type); + UNREACHABLE(); + } + constexpr uint8_t kTypeTranslationOffset = 1u; + static_assert(static_cast<uint32_t>(BakerReadBarrierKind::kField) + kTypeTranslationOffset == + static_cast<uint32_t>(ThunkType::kBakerReadBarrierField), + "Thunk type translation check."); + static_assert(static_cast<uint32_t>(BakerReadBarrierKind::kArray) + kTypeTranslationOffset == + static_cast<uint32_t>(ThunkType::kBakerReadBarrierArray), + "Thunk type translation check."); + static_assert(static_cast<uint32_t>(BakerReadBarrierKind::kGcRoot) + kTypeTranslationOffset == + static_cast<uint32_t>(ThunkType::kBakerReadBarrierRoot), + "Thunk type translation check."); + return ThunkKey(static_cast<ThunkType>(static_cast<uint32_t>(type) + kTypeTranslationOffset), + params); +} + +#define __ assembler.GetVIXLAssembler()-> + +static void EmitGrayCheckAndFastPath(arm::ArmVIXLAssembler& assembler, + vixl::aarch32::Register base_reg, + vixl::aarch32::MemOperand& lock_word, + vixl::aarch32::Label* slow_path) { + using namespace vixl::aarch32; // NOLINT(build/namespaces) + // Load the lock word containing the rb_state. + __ Ldr(ip, lock_word); + // Given the numeric representation, it's enough to check the low bit of the rb_state. 
+ static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Tst(ip, Operand(LockWord::kReadBarrierStateMaskShifted)); + __ B(ne, slow_path, /* is_far_target */ false); + static_assert( + BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET, + "Field and array LDR offsets must be the same to reuse the same code."); + // Adjust the return address back to the LDR (1 instruction; 2 for heap poisoning). + static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4), + "Field LDR must be 1 instruction (4B) before the return address label; " + " 2 instructions (8B) for heap poisoning."); + __ Add(lr, lr, BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET); + // Introduce a dependency on the lock_word including rb_state, + // to prevent load-load reordering, and without using + // a memory barrier (which would be more expensive). + __ Add(base_reg, base_reg, Operand(ip, LSR, 32)); + __ Bx(lr); // And return back to the function. + // Note: The fake dependency is unnecessary for the slow path. } std::vector<uint8_t> Thumb2RelativePatcher::CompileThunk(const ThunkKey& key) { - DCHECK(key.GetType() == ThunkType::kMethodCall); - // The thunk just uses the entry point in the ArtMethod. This works even for calls - // to the generic JNI and interpreter trampolines. + using namespace vixl::aarch32; // NOLINT(build/namespaces) ArenaPool pool; ArenaAllocator arena(&pool); - arm::Thumb2Assembler assembler(&arena); - assembler.LoadFromOffset( - arm::kLoadWord, arm::PC, arm::R0, - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize).Int32Value()); - assembler.bkpt(0); + arm::ArmVIXLAssembler assembler(&arena); + + switch (key.GetType()) { + case ThunkType::kMethodCall: + // The thunk just uses the entry point in the ArtMethod. This works even for calls + // to the generic JNI and interpreter trampolines. + assembler.LoadFromOffset( + arm::kLoadWord, + vixl::aarch32::pc, + vixl::aarch32::r0, + ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize).Int32Value()); + __ Bkpt(0); + break; + case ThunkType::kBakerReadBarrierField: { + // Check if the holder is gray and, if not, add fake dependency to the base register + // and return to the LDR instruction to load the reference. Otherwise, use introspection + // to load the reference and call the entrypoint (in kBakerCcEntrypointRegister) + // that performs further checks on the reference and marks it if needed. + Register holder_reg(key.GetFieldParams().holder_reg); + Register base_reg(key.GetFieldParams().base_reg); + UseScratchRegisterScope temps(assembler.GetVIXLAssembler()); + temps.Exclude(ip); + // If base_reg differs from holder_reg, the offset was too large and we must have + // emitted an explicit null check before the load. Otherwise, we need to null-check + // the holder as we do not necessarily do that check before going to the thunk. + vixl::aarch32::Label throw_npe; + if (holder_reg.Is(base_reg)) { + __ CompareAndBranchIfZero(holder_reg, &throw_npe, /* is_far_target */ false); + } + vixl::aarch32::Label slow_path; + MemOperand lock_word(holder_reg, mirror::Object::MonitorOffset().Int32Value()); + EmitGrayCheckAndFastPath(assembler, base_reg, lock_word, &slow_path); + __ Bind(&slow_path); + const int32_t ldr_offset = /* Thumb state adjustment (LR contains Thumb state). 
*/ -1 + + BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET; + MemOperand ldr_half_address(lr, ldr_offset + 2); + __ Ldrh(ip, ldr_half_address); // Load the LDR immediate half-word with "Rt | imm12". + __ Ubfx(ip, ip, 0, 12); // Extract the offset imm12. + __ Ldr(ip, MemOperand(base_reg, ip)); // Load the reference. + // Do not unpoison. With heap poisoning enabled, the entrypoint expects a poisoned reference. + __ Bx(Register(kBakerCcEntrypointRegister)); // Jump to the entrypoint. + if (holder_reg.Is(base_reg)) { + // Add null check slow path. The stack map is at the address pointed to by LR. + __ Bind(&throw_npe); + int32_t offset = GetThreadOffset<kArmPointerSize>(kQuickThrowNullPointer).Int32Value(); + __ Ldr(ip, MemOperand(/* Thread* */ vixl::aarch32::r9, offset)); + __ Bx(ip); + } + break; + } + case ThunkType::kBakerReadBarrierArray: { + Register base_reg(key.GetArrayParams().base_reg); + UseScratchRegisterScope temps(assembler.GetVIXLAssembler()); + temps.Exclude(ip); + vixl::aarch32::Label slow_path; + int32_t data_offset = + mirror::Array::DataOffset(Primitive::ComponentSize(Primitive::kPrimNot)).Int32Value(); + MemOperand lock_word(base_reg, mirror::Object::MonitorOffset().Int32Value() - data_offset); + DCHECK_LT(lock_word.GetOffsetImmediate(), 0); + EmitGrayCheckAndFastPath(assembler, base_reg, lock_word, &slow_path); + __ Bind(&slow_path); + const int32_t ldr_offset = /* Thumb state adjustment (LR contains Thumb state). */ -1 + + BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET; + MemOperand ldr_address(lr, ldr_offset + 2); + __ Ldrb(ip, ldr_address); // Load the LDR (register) byte with "00 | imm2 | Rm", + // i.e. Rm+32 because the scale in imm2 is 2. + Register ep_reg(kBakerCcEntrypointRegister); // Insert ip to the entrypoint address to create + __ Bfi(ep_reg, ip, 3, 6); // a switch case target based on the index register. + __ Mov(ip, base_reg); // Move the base register to ip0. + __ Bx(ep_reg); // Jump to the entrypoint's array switch case. + break; + } + case ThunkType::kBakerReadBarrierRoot: { + // Check if the reference needs to be marked and if so (i.e. not null, not marked yet + // and it does not have a forwarding address), call the correct introspection entrypoint; + // otherwise return the reference (or the extracted forwarding address). + // There is no gray bit check for GC roots. + Register root_reg(key.GetRootParams().root_reg); + UseScratchRegisterScope temps(assembler.GetVIXLAssembler()); + temps.Exclude(ip); + vixl::aarch32::Label return_label, not_marked, forwarding_address; + __ CompareAndBranchIfZero(root_reg, &return_label, /* is_far_target */ false); + MemOperand lock_word(root_reg, mirror::Object::MonitorOffset().Int32Value()); + __ Ldr(ip, lock_word); + __ Tst(ip, LockWord::kMarkBitStateMaskShifted); + __ B(eq, ¬_marked); + __ Bind(&return_label); + __ Bx(lr); + __ Bind(¬_marked); + static_assert(LockWord::kStateShift == 30 && LockWord::kStateForwardingAddress == 3, + "To use 'CMP ip, #modified-immediate; BHS', we need the lock word state in " + " the highest bits and the 'forwarding address' state to have all bits set"); + __ Cmp(ip, Operand(0xc0000000)); + __ B(hs, &forwarding_address); + // Adjust the art_quick_read_barrier_mark_introspection address in kBakerCcEntrypointRegister + // to art_quick_read_barrier_mark_introspection_gc_roots. 
+ Register ep_reg(kBakerCcEntrypointRegister); + __ Add(ep_reg, ep_reg, Operand(BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET)); + __ Mov(ip, root_reg); + __ Bx(ep_reg); + __ Bind(&forwarding_address); + __ Lsl(root_reg, ip, LockWord::kForwardingAddressShift); + __ Bx(lr); + break; + } + } + assembler.FinalizeCode(); std::vector<uint8_t> thunk_code(assembler.CodeSize()); MemoryRegion code(thunk_code.data(), thunk_code.size()); @@ -114,14 +341,28 @@ std::vector<uint8_t> Thumb2RelativePatcher::CompileThunk(const ThunkKey& key) { return thunk_code; } +#undef __ + uint32_t Thumb2RelativePatcher::MaxPositiveDisplacement(ThunkType type) { - DCHECK(type == ThunkType::kMethodCall); - return kMaxMethodCallPositiveDisplacement; + switch (type) { + case ThunkType::kMethodCall: + return kMaxMethodCallPositiveDisplacement; + case ThunkType::kBakerReadBarrierField: + case ThunkType::kBakerReadBarrierArray: + case ThunkType::kBakerReadBarrierRoot: + return kMaxBcondPositiveDisplacement; + } } uint32_t Thumb2RelativePatcher::MaxNegativeDisplacement(ThunkType type) { - DCHECK(type == ThunkType::kMethodCall); - return kMaxMethodCallNegativeDisplacement; + switch (type) { + case ThunkType::kMethodCall: + return kMaxMethodCallNegativeDisplacement; + case ThunkType::kBakerReadBarrierField: + case ThunkType::kBakerReadBarrierArray: + case ThunkType::kBakerReadBarrierRoot: + return kMaxBcondNegativeDisplacement; + } } void Thumb2RelativePatcher::SetInsn32(std::vector<uint8_t>* code, uint32_t offset, uint32_t value) { diff --git a/compiler/linker/arm/relative_patcher_thumb2.h b/compiler/linker/arm/relative_patcher_thumb2.h index ab37802d0f..9eb06894d3 100644 --- a/compiler/linker/arm/relative_patcher_thumb2.h +++ b/compiler/linker/arm/relative_patcher_thumb2.h @@ -17,6 +17,10 @@ #ifndef ART_COMPILER_LINKER_ARM_RELATIVE_PATCHER_THUMB2_H_ #define ART_COMPILER_LINKER_ARM_RELATIVE_PATCHER_THUMB2_H_ +#include "arch/arm/registers_arm.h" +#include "base/array_ref.h" +#include "base/bit_field.h" +#include "base/bit_utils.h" #include "linker/arm/relative_patcher_arm_base.h" namespace art { @@ -24,6 +28,37 @@ namespace linker { class Thumb2RelativePatcher FINAL : public ArmBaseRelativePatcher { public: + static constexpr uint32_t kBakerCcEntrypointRegister = 4u; + + enum class BakerReadBarrierKind : uint8_t { + kField, // Field get or array get with constant offset (i.e. constant index). + kArray, // Array get with index in register. + kGcRoot, // GC root load. 
+ kLast + }; + + static uint32_t EncodeBakerReadBarrierFieldData(uint32_t base_reg, uint32_t holder_reg) { + CheckValidReg(base_reg); + CheckValidReg(holder_reg); + return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kField) | + BakerReadBarrierFirstRegField::Encode(base_reg) | + BakerReadBarrierSecondRegField::Encode(holder_reg); + } + + static uint32_t EncodeBakerReadBarrierArrayData(uint32_t base_reg) { + CheckValidReg(base_reg); + return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kArray) | + BakerReadBarrierFirstRegField::Encode(base_reg) | + BakerReadBarrierSecondRegField::Encode(kInvalidEncodedReg); + } + + static uint32_t EncodeBakerReadBarrierGcRootData(uint32_t root_reg) { + CheckValidReg(root_reg); + return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kGcRoot) | + BakerReadBarrierFirstRegField::Encode(root_reg) | + BakerReadBarrierSecondRegField::Encode(kInvalidEncodedReg); + } + explicit Thumb2RelativePatcher(RelativePatcherTargetProvider* provider); void PatchCall(std::vector<uint8_t>* code, @@ -45,6 +80,22 @@ class Thumb2RelativePatcher FINAL : public ArmBaseRelativePatcher { uint32_t MaxNegativeDisplacement(ThunkType type) OVERRIDE; private: + static constexpr uint32_t kInvalidEncodedReg = /* pc is invalid */ 15u; + + static constexpr size_t kBitsForBakerReadBarrierKind = + MinimumBitsToStore(static_cast<size_t>(BakerReadBarrierKind::kLast)); + static constexpr size_t kBitsForRegister = 4u; + using BakerReadBarrierKindField = + BitField<BakerReadBarrierKind, 0, kBitsForBakerReadBarrierKind>; + using BakerReadBarrierFirstRegField = + BitField<uint32_t, kBitsForBakerReadBarrierKind, kBitsForRegister>; + using BakerReadBarrierSecondRegField = + BitField<uint32_t, kBitsForBakerReadBarrierKind + kBitsForRegister, kBitsForRegister>; + + static void CheckValidReg(uint32_t reg) { + DCHECK(reg < 12u && reg != kBakerCcEntrypointRegister); + } + void SetInsn32(std::vector<uint8_t>* code, uint32_t offset, uint32_t value); static uint32_t GetInsn32(ArrayRef<const uint8_t> code, uint32_t offset); diff --git a/compiler/linker/arm/relative_patcher_thumb2_test.cc b/compiler/linker/arm/relative_patcher_thumb2_test.cc index f08270d934..8bc3eb4505 100644 --- a/compiler/linker/arm/relative_patcher_thumb2_test.cc +++ b/compiler/linker/arm/relative_patcher_thumb2_test.cc @@ -14,8 +14,12 @@ * limitations under the License. */ +#include "base/casts.h" #include "linker/relative_patcher_test.h" #include "linker/arm/relative_patcher_thumb2.h" +#include "lock_word.h" +#include "mirror/array-inl.h" +#include "mirror/object.h" #include "oat_quick_method_header.h" namespace art { @@ -34,13 +38,99 @@ class Thumb2RelativePatcherTest : public RelativePatcherTest { static const ArrayRef<const uint8_t> kUnpatchedPcRelativeCode; static const uint32_t kPcInsnOffset; + // The PC in Thumb mode is 4 bytes after the instruction location. + static constexpr uint32_t kPcAdjustment = 4u; + // Branches within range [-256, 256) can be created from these by adding the low 8 bits. - static constexpr uint32_t kBlPlus0 = 0xf000f800; - static constexpr uint32_t kBlMinus256 = 0xf7ffff00; + static constexpr uint32_t kBlPlus0 = 0xf000f800u; + static constexpr uint32_t kBlMinus256 = 0xf7ffff00u; // Special BL values. - static constexpr uint32_t kBlPlusMax = 0xf3ffd7ff; - static constexpr uint32_t kBlMinusMax = 0xf400d000; + static constexpr uint32_t kBlPlusMax = 0xf3ffd7ffu; + static constexpr uint32_t kBlMinusMax = 0xf400d000u; + + // BNE +0, 32-bit, encoding T3. 
Bits 0-10, 11, 13, 16-21, 26 are placeholder for target offset. + static constexpr uint32_t kBneWPlus0 = 0xf0408000u; + + // LDR immediate, 32-bit, encoding T3. Bits 0-11 are offset, 12-15 are Rt, 16-20 are Rn. + static constexpr uint32_t kLdrWInsn = 0xf8d00000u; + + // LDR immediate, negative offset, encoding T4. Bits 0-7 are the offset to subtract. + static constexpr uint32_t kLdrNegativeOffset = 0xf8500c00u; + + // LDR register, lsl #2. Bits 4-5 are the imm2, i.e. the lsl shift. + static constexpr uint32_t kLdrRegLsl2 = 0xf8500020u; + + // NOP instructions. + static constexpr uint32_t kNopInsn = 0xbf00u; + static constexpr uint32_t kNopWInsn = 0xf3af8000u; + + void InsertInsn(std::vector<uint8_t>* code, size_t pos, uint32_t insn) { + CHECK_LE(pos, code->size()); + if (IsUint<16>(insn)) { + const uint8_t insn_code[] = { + static_cast<uint8_t>(insn), + static_cast<uint8_t>(insn >> 8), + }; + static_assert(sizeof(insn_code) == 2u, "Invalid sizeof(insn_code)."); + code->insert(code->begin() + pos, insn_code, insn_code + sizeof(insn_code)); + } else { + const uint8_t insn_code[] = { + static_cast<uint8_t>(insn >> 16), + static_cast<uint8_t>(insn >> 24), + static_cast<uint8_t>(insn), + static_cast<uint8_t>(insn >> 8), + }; + static_assert(sizeof(insn_code) == 4u, "Invalid sizeof(insn_code)."); + code->insert(code->begin() + pos, insn_code, insn_code + sizeof(insn_code)); + } + } + + void PushBackInsn(std::vector<uint8_t>* code, uint32_t insn) { + InsertInsn(code, code->size(), insn); + } + + std::vector<uint8_t> GenNops(size_t num_nops) { + std::vector<uint8_t> result; + result.reserve(num_nops * 2u); + for (size_t i = 0; i != num_nops; ++i) { + PushBackInsn(&result, kNopInsn); + } + return result; + } + + std::vector<uint8_t> RawCode(std::initializer_list<uint32_t> insns) { + std::vector<uint8_t> raw_code; + size_t number_of_16_bit_insns = + std::count_if(insns.begin(), insns.end(), [](uint32_t x) { return IsUint<16>(x); }); + raw_code.reserve(insns.size() * 4u - number_of_16_bit_insns * 2u); + for (uint32_t insn : insns) { + PushBackInsn(&raw_code, insn); + } + return raw_code; + } + + uint32_t BneWWithOffset(uint32_t bne_offset, uint32_t target_offset) { + if (!IsAligned<2u>(bne_offset)) { + LOG(ERROR) << "Unaligned bne_offset: " << bne_offset; + return 0xffffffffu; // Fails code diff later. + } + if (!IsAligned<2u>(target_offset)) { + LOG(ERROR) << "Unaligned target_offset: " << target_offset; + return 0xffffffffu; // Fails code diff later. + } + uint32_t diff = target_offset - bne_offset - kPcAdjustment; + DCHECK_ALIGNED(diff, 2u); + if ((diff >> 20) != 0 && (diff >> 20) != 0xfffu) { + LOG(ERROR) << "Target out of range: " << diff; + return 0xffffffffu; // Fails code diff later. 
+ } + return kBneWPlus0 | ((diff >> 1) & 0x7ffu) // imm11 + | (((diff >> 12) & 0x3fu) << 16) // imm6 + | (((diff >> 18) & 1) << 13) // J1 + | (((diff >> 19) & 1) << 11) // J2 + | (((diff >> 20) & 1) << 26); // S + } bool Create2MethodsWithGap(const ArrayRef<const uint8_t>& method1_code, const ArrayRef<const LinkerPatch>& method1_patches, @@ -125,19 +215,57 @@ class Thumb2RelativePatcherTest : public RelativePatcherTest { std::vector<uint8_t> result; result.reserve(num_nops * 2u + 4u); for (size_t i = 0; i != num_nops; ++i) { - result.push_back(0x00); - result.push_back(0xbf); + PushBackInsn(&result, kNopInsn); } - result.push_back(static_cast<uint8_t>(bl >> 16)); - result.push_back(static_cast<uint8_t>(bl >> 24)); - result.push_back(static_cast<uint8_t>(bl)); - result.push_back(static_cast<uint8_t>(bl >> 8)); + PushBackInsn(&result, bl); return result; } void TestDexCacheReference(uint32_t dex_cache_arrays_begin, uint32_t element_offset); void TestStringReference(uint32_t string_offset); void CheckPcRelativePatch(const ArrayRef<const LinkerPatch>& patches, uint32_t target_offset); + + std::vector<uint8_t> CompileBakerOffsetThunk(uint32_t base_reg, uint32_t holder_reg) { + const LinkerPatch patch = LinkerPatch::BakerReadBarrierBranchPatch( + 0u, Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base_reg, holder_reg)); + auto* patcher = down_cast<Thumb2RelativePatcher*>(patcher_.get()); + ArmBaseRelativePatcher::ThunkKey key = patcher->GetBakerReadBarrierKey(patch); + return patcher->CompileThunk(key); + } + + std::vector<uint8_t> CompileBakerArrayThunk(uint32_t base_reg) { + LinkerPatch patch = LinkerPatch::BakerReadBarrierBranchPatch( + 0u, Thumb2RelativePatcher::EncodeBakerReadBarrierArrayData(base_reg)); + auto* patcher = down_cast<Thumb2RelativePatcher*>(patcher_.get()); + ArmBaseRelativePatcher::ThunkKey key = patcher->GetBakerReadBarrierKey(patch); + return patcher->CompileThunk(key); + } + + std::vector<uint8_t> CompileBakerGcRootThunk(uint32_t root_reg) { + LinkerPatch patch = LinkerPatch::BakerReadBarrierBranchPatch( + 0u, Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg)); + auto* patcher = down_cast<Thumb2RelativePatcher*>(patcher_.get()); + ArmBaseRelativePatcher::ThunkKey key = patcher->GetBakerReadBarrierKey(patch); + return patcher->CompileThunk(key); + } + + uint32_t GetOutputInsn32(uint32_t offset) { + CHECK_LE(offset, output_.size()); + CHECK_GE(output_.size() - offset, 4u); + return (static_cast<uint32_t>(output_[offset]) << 16) | + (static_cast<uint32_t>(output_[offset + 1]) << 24) | + (static_cast<uint32_t>(output_[offset + 2]) << 0) | + (static_cast<uint32_t>(output_[offset + 3]) << 8); + } + + uint16_t GetOutputInsn16(uint32_t offset) { + CHECK_LE(offset, output_.size()); + CHECK_GE(output_.size() - offset, 2u); + return (static_cast<uint32_t>(output_[offset]) << 0) | + (static_cast<uint32_t>(output_[offset + 1]) << 8); + } + + void TestBakerField(uint32_t offset, uint32_t ref_reg); }; const uint8_t Thumb2RelativePatcherTest::kCallRawCode[] = { @@ -164,7 +292,7 @@ const uint32_t Thumb2RelativePatcherTest::kPcInsnOffset = 8u; void Thumb2RelativePatcherTest::TestDexCacheReference(uint32_t dex_cache_arrays_begin, uint32_t element_offset) { dex_cache_arrays_begin_ = dex_cache_arrays_begin; - LinkerPatch patches[] = { + const LinkerPatch patches[] = { LinkerPatch::DexCacheArrayPatch(0u, nullptr, kPcInsnOffset, element_offset), LinkerPatch::DexCacheArrayPatch(4u, nullptr, kPcInsnOffset, element_offset), }; @@ -175,7 +303,7 @@ void 
Thumb2RelativePatcherTest::TestDexCacheReference(uint32_t dex_cache_arrays_ void Thumb2RelativePatcherTest::TestStringReference(uint32_t string_offset) { constexpr uint32_t kStringIndex = 1u; string_index_to_offset_map_.Put(kStringIndex, string_offset); - LinkerPatch patches[] = { + const LinkerPatch patches[] = { LinkerPatch::RelativeStringPatch(0u, nullptr, kPcInsnOffset, kStringIndex), LinkerPatch::RelativeStringPatch(4u, nullptr, kPcInsnOffset, kStringIndex), }; @@ -214,7 +342,7 @@ void Thumb2RelativePatcherTest::CheckPcRelativePatch(const ArrayRef<const Linker } TEST_F(Thumb2RelativePatcherTest, CallSelf) { - LinkerPatch patches[] = { + const LinkerPatch patches[] = { LinkerPatch::RelativeCodePatch(0u, nullptr, 1u), }; AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(patches)); @@ -227,11 +355,11 @@ TEST_F(Thumb2RelativePatcherTest, CallSelf) { } TEST_F(Thumb2RelativePatcherTest, CallOther) { - LinkerPatch method1_patches[] = { + const LinkerPatch method1_patches[] = { LinkerPatch::RelativeCodePatch(0u, nullptr, 2u), }; AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(method1_patches)); - LinkerPatch method2_patches[] = { + const LinkerPatch method2_patches[] = { LinkerPatch::RelativeCodePatch(0u, nullptr, 1u), }; AddCompiledMethod(MethodRef(2u), kCallCode, ArrayRef<const LinkerPatch>(method2_patches)); @@ -254,7 +382,7 @@ TEST_F(Thumb2RelativePatcherTest, CallOther) { } TEST_F(Thumb2RelativePatcherTest, CallTrampoline) { - LinkerPatch patches[] = { + const LinkerPatch patches[] = { LinkerPatch::RelativeCodePatch(0u, nullptr, 2u), }; AddCompiledMethod(MethodRef(1u), kCallCode, ArrayRef<const LinkerPatch>(patches)); @@ -274,7 +402,7 @@ TEST_F(Thumb2RelativePatcherTest, CallTrampolineTooFar) { constexpr uint32_t bl_offset_in_method3 = 3u * 2u; // After NOPs. ArrayRef<const uint8_t> method3_code(method3_raw_code); ASSERT_EQ(bl_offset_in_method3 + 4u, method3_code.size()); - LinkerPatch method3_patches[] = { + const LinkerPatch method3_patches[] = { LinkerPatch::RelativeCodePatch(bl_offset_in_method3, nullptr, missing_method_index), }; @@ -303,7 +431,7 @@ TEST_F(Thumb2RelativePatcherTest, CallOtherAlmostTooFarAfter) { constexpr uint32_t bl_offset_in_method1 = 3u * 2u; // After NOPs. ArrayRef<const uint8_t> method1_code(method1_raw_code); ASSERT_EQ(bl_offset_in_method1 + 4u, method1_code.size()); - LinkerPatch method1_patches[] = { + const LinkerPatch method1_patches[] = { LinkerPatch::RelativeCodePatch(bl_offset_in_method1, nullptr, 3u), }; @@ -325,7 +453,7 @@ TEST_F(Thumb2RelativePatcherTest, CallOtherAlmostTooFarBefore) { constexpr uint32_t bl_offset_in_method3 = 2u * 2u; // After NOPs. ArrayRef<const uint8_t> method3_code(method3_raw_code); ASSERT_EQ(bl_offset_in_method3 + 4u, method3_code.size()); - LinkerPatch method3_patches[] = { + const LinkerPatch method3_patches[] = { LinkerPatch::RelativeCodePatch(bl_offset_in_method3, nullptr, 1u), }; @@ -347,7 +475,7 @@ TEST_F(Thumb2RelativePatcherTest, CallOtherJustTooFarAfter) { constexpr uint32_t bl_offset_in_method1 = 2u * 2u; // After NOPs. ArrayRef<const uint8_t> method1_code(method1_raw_code); ASSERT_EQ(bl_offset_in_method1 + 4u, method1_code.size()); - LinkerPatch method1_patches[] = { + const LinkerPatch method1_patches[] = { LinkerPatch::RelativeCodePatch(bl_offset_in_method1, nullptr, 3u), }; @@ -382,7 +510,7 @@ TEST_F(Thumb2RelativePatcherTest, CallOtherJustTooFarBefore) { constexpr uint32_t bl_offset_in_method3 = 3u * 2u; // After NOPs. 
ArrayRef<const uint8_t> method3_code(method3_raw_code); ASSERT_EQ(bl_offset_in_method3 + 4u, method3_code.size()); - LinkerPatch method3_patches[] = { + const LinkerPatch method3_patches[] = { LinkerPatch::RelativeCodePatch(bl_offset_in_method3, nullptr, 1u), }; @@ -445,5 +573,535 @@ TEST_F(Thumb2RelativePatcherTest, StringReference4) { ASSERT_LT(GetMethodOffset(1u), 0xfcu); } +void Thumb2RelativePatcherTest::TestBakerField(uint32_t offset, uint32_t ref_reg) { + uint32_t valid_regs[] = { + 0, 1, 2, 3, 5, 6, 7, // R4 is reserved for entrypoint address. + 8, 9, 10, 11, // IP, SP, LR and PC are reserved. + }; + DCHECK_ALIGNED(offset, 4u); + DCHECK_LT(offset, 4 * KB); + constexpr size_t kMethodCodeSize = 8u; + constexpr size_t kLiteralOffset = 0u; + uint32_t method_idx = 0u; + for (uint32_t base_reg : valid_regs) { + for (uint32_t holder_reg : valid_regs) { + uint32_t ldr = kLdrWInsn | offset | (base_reg << 16) | (ref_reg << 12); + const std::vector<uint8_t> raw_code = RawCode({kBneWPlus0, ldr}); + ASSERT_EQ(kMethodCodeSize, raw_code.size()); + ArrayRef<const uint8_t> code(raw_code); + uint32_t encoded_data = + Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base_reg, holder_reg); + const LinkerPatch patches[] = { + LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset, encoded_data), + }; + ++method_idx; + AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches)); + } + } + Link(); + + // All thunks are at the end. + uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment); + method_idx = 0u; + for (uint32_t base_reg : valid_regs) { + for (uint32_t holder_reg : valid_regs) { + ++method_idx; + uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset); + uint32_t ldr = kLdrWInsn | offset | (base_reg << 16) | (ref_reg << 12); + const std::vector<uint8_t> expected_code = RawCode({bne, ldr}); + ASSERT_EQ(kMethodCodeSize, expected_code.size()) << "bne=0x" << std::hex << bne; + ASSERT_TRUE( + CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code))); + + std::vector<uint8_t> expected_thunk = CompileBakerOffsetThunk(base_reg, holder_reg); + ASSERT_GT(output_.size(), thunk_offset); + ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size()); + ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset, + expected_thunk.size()); + if (ArrayRef<const uint8_t>(expected_thunk) != compiled_thunk) { + DumpDiff(ArrayRef<const uint8_t>(expected_thunk), compiled_thunk); + ASSERT_TRUE(false); + } + + size_t gray_check_offset = thunk_offset; + if (holder_reg == base_reg) { + // Verify that the null-check uses the correct register, i.e. holder_reg. + if (holder_reg < 8) { + ASSERT_GE(output_.size() - gray_check_offset, 2u); + ASSERT_EQ(0xb100 | holder_reg, GetOutputInsn16(thunk_offset) & 0xfd07u); + gray_check_offset +=2u; + } else { + ASSERT_GE(output_.size() - gray_check_offset, 6u); + ASSERT_EQ(0xf1b00f00u | (holder_reg << 16), GetOutputInsn32(thunk_offset) & 0xfbff8f00u); + ASSERT_EQ(0xd000u, GetOutputInsn16(thunk_offset + 4u) & 0xff00u); // BEQ + gray_check_offset += 6u; + } + } + // Verify that the lock word for gray bit check is loaded from the holder address. 
+ ASSERT_GE(output_.size() - gray_check_offset, + 4u * /* 32-bit instructions */ 4u + 2u * /* 16-bit instructions */ 2u); + const uint32_t load_lock_word = + kLdrWInsn | + (holder_reg << 16) | + (/* IP */ 12 << 12) | + mirror::Object::MonitorOffset().Uint32Value(); + ASSERT_EQ(load_lock_word, GetOutputInsn32(gray_check_offset)); + // Verify the gray bit check. + DCHECK_GE(LockWord::kReadBarrierStateShift, 8u); // ROR modified immediate. + uint32_t ror_shift = 7 + (32 - LockWord::kReadBarrierStateShift); + const uint32_t tst_gray_bit_without_offset = + 0xf0100f00 | (/* IP */ 12 << 16) + | (((ror_shift >> 4) & 1) << 26) // i + | (((ror_shift >> 1) & 7) << 12) // imm3 + | ((ror_shift & 1) << 7); // imm8, ROR('1':imm8<7:0>, ror_shift). + EXPECT_EQ(tst_gray_bit_without_offset, GetOutputInsn32(gray_check_offset + 4u)); + EXPECT_EQ(0xd100u, GetOutputInsn16(gray_check_offset + 8u) & 0xff00u); // BNE + // Verify the fake dependency (skip "ADD LR, LR, #ldr_offset"). + const uint32_t fake_dependency = + 0xeb000010 | // ADD Rd, Rn, Rm, LSR 32 (type=01, imm3=000, imm2=00) + (/* IP */ 12) | // Rm = IP + (base_reg << 16) | // Rn = base_reg + (base_reg << 8); // Rd = base_reg + EXPECT_EQ(fake_dependency, GetOutputInsn32(gray_check_offset + 14u)); + // Do not check the rest of the implementation. + + // The next thunk follows on the next aligned offset. + thunk_offset += RoundUp(expected_thunk.size(), kArmAlignment); + } + } +} + +#define TEST_BAKER_FIELD(offset, ref_reg) \ + TEST_F(Thumb2RelativePatcherTest, \ + BakerOffset##offset##_##ref_reg) { \ + TestBakerField(offset, ref_reg); \ + } + +TEST_BAKER_FIELD(/* offset */ 0, /* ref_reg */ 0) +TEST_BAKER_FIELD(/* offset */ 8, /* ref_reg */ 7) +TEST_BAKER_FIELD(/* offset */ 0xffc, /* ref_reg */ 11) + +TEST_F(Thumb2RelativePatcherTest, BakerOffsetThunkInTheMiddle) { + // One thunk in the middle with maximum distance branches to it from both sides. + // Use offset = 0, base_reg = 0, ref_reg = 0, the LDR is simply `kLdrWInsn`. + constexpr uint32_t kLiteralOffset1 = 6u; + const std::vector<uint8_t> raw_code1 = RawCode({kNopWInsn, kNopInsn, kBneWPlus0, kLdrWInsn}); + ArrayRef<const uint8_t> code1(raw_code1); + uint32_t encoded_data = + Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(/* base_reg */ 0, /* holder_reg */ 0); + const LinkerPatch patches1[] = { + LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset1, encoded_data), + }; + AddCompiledMethod(MethodRef(1u), code1, ArrayRef<const LinkerPatch>(patches1)); + + constexpr uint32_t expected_thunk_offset = + kLiteralOffset1 + kPcAdjustment + /* kMaxBcondPositiveDisplacement */ ((1 << 20) - 2u); + static_assert(IsAligned<kArmAlignment>(expected_thunk_offset), "Target offset must be aligned."); + size_t filler1_size = expected_thunk_offset - + RoundUp(raw_code1.size() + sizeof(OatQuickMethodHeader), kArmAlignment); + std::vector<uint8_t> raw_filler1_code = GenNops(filler1_size / 2u); + ArrayRef<const uint8_t> filler1_code(raw_filler1_code); + AddCompiledMethod(MethodRef(2u), filler1_code); + + // Enforce thunk reservation with a tiny method. + AddCompiledMethod(MethodRef(3u), kNopCode); + + constexpr uint32_t kLiteralOffset2 = 4; + static_assert(IsAligned<kArmAlignment>(kLiteralOffset2 + kPcAdjustment), + "PC for BNE must be aligned."); + + // Allow reaching the thunk from the very beginning of a method almost 1MiB away. Backward branch + // reaches the full 1MiB but we need to take PC adjustment into account. 
Things to subtract: + // - thunk size and method 3 pre-header, rounded up (padding in between if needed) + // - method 3 code and method 4 pre-header, rounded up (padding in between if needed) + // - method 4 header (let there be no padding between method 4 code and method 5 pre-header). + size_t thunk_size = CompileBakerOffsetThunk(/* base_reg */ 0, /* holder_reg */ 0).size(); + size_t filler2_size = + 1 * MB - (kLiteralOffset2 + kPcAdjustment) + - RoundUp(thunk_size + sizeof(OatQuickMethodHeader), kArmAlignment) + - RoundUp(kNopCode.size() + sizeof(OatQuickMethodHeader), kArmAlignment) + - sizeof(OatQuickMethodHeader); + std::vector<uint8_t> raw_filler2_code = GenNops(filler2_size / 2u); + ArrayRef<const uint8_t> filler2_code(raw_filler2_code); + AddCompiledMethod(MethodRef(4u), filler2_code); + + const std::vector<uint8_t> raw_code2 = RawCode({kNopWInsn, kBneWPlus0, kLdrWInsn}); + ArrayRef<const uint8_t> code2(raw_code2); + const LinkerPatch patches2[] = { + LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset2, encoded_data), + }; + AddCompiledMethod(MethodRef(5u), code2, ArrayRef<const LinkerPatch>(patches2)); + + Link(); + + uint32_t first_method_offset = GetMethodOffset(1u); + uint32_t last_method_offset = GetMethodOffset(5u); + EXPECT_EQ(2 * MB, last_method_offset - first_method_offset); + + const uint32_t bne_max_forward = kBneWPlus0 | 0x003f2fff; + const uint32_t bne_max_backward = kBneWPlus0 | 0x04000000; + const std::vector<uint8_t> expected_code1 = + RawCode({kNopWInsn, kNopInsn, bne_max_forward, kLdrWInsn}); + const std::vector<uint8_t> expected_code2 = RawCode({kNopWInsn, bne_max_backward, kLdrWInsn}); + ASSERT_TRUE(CheckLinkedMethod(MethodRef(1), ArrayRef<const uint8_t>(expected_code1))); + ASSERT_TRUE(CheckLinkedMethod(MethodRef(5), ArrayRef<const uint8_t>(expected_code2))); +} + +TEST_F(Thumb2RelativePatcherTest, BakerOffsetThunkBeforeFiller) { + // Based on the first part of BakerOffsetThunkInTheMiddle but the BNE is one instruction + // earlier, so the thunk is emitted before the filler. + // Use offset = 0, base_reg = 0, ref_reg = 0, the LDR is simply `kLdrWInsn`. 
+ constexpr uint32_t kLiteralOffset1 = 4u; + const std::vector<uint8_t> raw_code1 = RawCode({kNopWInsn, kBneWPlus0, kLdrWInsn, kNopInsn}); + ArrayRef<const uint8_t> code1(raw_code1); + uint32_t encoded_data = + Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(/* base_reg */ 0, /* holder_reg */ 0); + const LinkerPatch patches1[] = { + LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset1, encoded_data), + }; + AddCompiledMethod(MethodRef(1u), code1, ArrayRef<const LinkerPatch>(patches1)); + + constexpr uint32_t expected_thunk_offset = + kLiteralOffset1 + kPcAdjustment + /* kMaxBcondPositiveDisplacement + 2 */ (1u << 20); + static_assert(IsAligned<kArmAlignment>(expected_thunk_offset), "Target offset must be aligned."); + size_t filler1_size = expected_thunk_offset - + RoundUp(raw_code1.size() + sizeof(OatQuickMethodHeader), kArmAlignment); + std::vector<uint8_t> raw_filler1_code = GenNops(filler1_size / 2u); + ArrayRef<const uint8_t> filler1_code(raw_filler1_code); + AddCompiledMethod(MethodRef(2u), filler1_code); + + Link(); + + const uint32_t bne = BneWWithOffset(kLiteralOffset1, RoundUp(raw_code1.size(), kArmAlignment)); + const std::vector<uint8_t> expected_code1 = RawCode({kNopWInsn, bne, kLdrWInsn, kNopInsn}); + ASSERT_TRUE(CheckLinkedMethod(MethodRef(1), ArrayRef<const uint8_t>(expected_code1))); +} + +TEST_F(Thumb2RelativePatcherTest, BakerOffsetThunkInTheMiddleUnreachableFromLast) { + // Based on the BakerOffsetThunkInTheMiddle but the BNE in the last method is preceded + // by NOP and cannot reach the thunk in the middle, so we emit an extra thunk at the end. + // Use offset = 0, base_reg = 0, ref_reg = 0, the LDR is simply `kLdrWInsn`. + constexpr uint32_t kLiteralOffset1 = 6u; + const std::vector<uint8_t> raw_code1 = RawCode({kNopWInsn, kNopInsn, kBneWPlus0, kLdrWInsn}); + ArrayRef<const uint8_t> code1(raw_code1); + uint32_t encoded_data = + Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(/* base_reg */ 0, /* holder_reg */ 0); + const LinkerPatch patches1[] = { + LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset1, encoded_data), + }; + AddCompiledMethod(MethodRef(1u), code1, ArrayRef<const LinkerPatch>(patches1)); + + constexpr uint32_t expected_thunk_offset = + kLiteralOffset1 + kPcAdjustment + /* kMaxBcondPositiveDisplacement */ ((1 << 20) - 2u); + static_assert(IsAligned<kArmAlignment>(expected_thunk_offset), "Target offset must be aligned."); + size_t filler1_size = expected_thunk_offset - + RoundUp(raw_code1.size() + sizeof(OatQuickMethodHeader), kArmAlignment); + std::vector<uint8_t> raw_filler1_code = GenNops(filler1_size / 2u); + ArrayRef<const uint8_t> filler1_code(raw_filler1_code); + AddCompiledMethod(MethodRef(2u), filler1_code); + + // Enforce thunk reservation with a tiny method. + AddCompiledMethod(MethodRef(3u), kNopCode); + + constexpr uint32_t kReachableFromOffset2 = 4; + constexpr uint32_t kLiteralOffset2 = kReachableFromOffset2 + 2; + static_assert(IsAligned<kArmAlignment>(kReachableFromOffset2 + kPcAdjustment), + "PC for BNE must be aligned."); + + // If not for the extra NOP, this would allow reaching the thunk from the BNE + // of a method 1MiB away. Backward branch reaches the full 1MiB but we need to take + // PC adjustment into account. Things to subtract: + // - thunk size and method 3 pre-header, rounded up (padding in between if needed) + // - method 3 code and method 4 pre-header, rounded up (padding in between if needed) + // - method 4 header (let there be no padding between method 4 code and method 5 pre-header). 
+ size_t thunk_size = CompileBakerOffsetThunk(/* base_reg */ 0, /* holder_reg */ 0).size(); + size_t filler2_size = + 1 * MB - (kReachableFromOffset2 + kPcAdjustment) + - RoundUp(thunk_size + sizeof(OatQuickMethodHeader), kArmAlignment) + - RoundUp(kNopCode.size() + sizeof(OatQuickMethodHeader), kArmAlignment) + - sizeof(OatQuickMethodHeader); + std::vector<uint8_t> raw_filler2_code = GenNops(filler2_size / 2u); + ArrayRef<const uint8_t> filler2_code(raw_filler2_code); + AddCompiledMethod(MethodRef(4u), filler2_code); + + // Extra 16-bit NOP compared to BakerOffsetThunkInTheMiddle. + const std::vector<uint8_t> raw_code2 = RawCode({kNopWInsn, kNopInsn, kBneWPlus0, kLdrWInsn}); + ArrayRef<const uint8_t> code2(raw_code2); + const LinkerPatch patches2[] = { + LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset2, encoded_data), + }; + AddCompiledMethod(MethodRef(5u), code2, ArrayRef<const LinkerPatch>(patches2)); + + Link(); + + uint32_t first_method_offset = GetMethodOffset(1u); + uint32_t last_method_offset = GetMethodOffset(5u); + EXPECT_EQ(2 * MB, last_method_offset - first_method_offset); + + const uint32_t bne_max_forward = kBneWPlus0 | 0x003f2fff; + const uint32_t bne_last = + BneWWithOffset(kLiteralOffset2, RoundUp(raw_code2.size(), kArmAlignment)); + const std::vector<uint8_t> expected_code1 = + RawCode({kNopWInsn, kNopInsn, bne_max_forward, kLdrWInsn}); + const std::vector<uint8_t> expected_code2 = + RawCode({kNopWInsn, kNopInsn, bne_last, kLdrWInsn}); + ASSERT_TRUE(CheckLinkedMethod(MethodRef(1), ArrayRef<const uint8_t>(expected_code1))); + ASSERT_TRUE(CheckLinkedMethod(MethodRef(5), ArrayRef<const uint8_t>(expected_code2))); +} + +TEST_F(Thumb2RelativePatcherTest, BakerArray) { + uint32_t valid_regs[] = { + 0, 1, 2, 3, 5, 6, 7, // R4 is reserved for entrypoint address. + 8, 9, 10, 11, // IP, SP, LR and PC are reserved. + }; + auto ldr = [](uint32_t base_reg) { + uint32_t index_reg = (base_reg == 0u) ? 1u : 0u; + uint32_t ref_reg = (base_reg == 2) ? 3u : 2u; + return kLdrRegLsl2 | index_reg | (base_reg << 16) | (ref_reg << 12); + }; + constexpr size_t kMethodCodeSize = 8u; + constexpr size_t kLiteralOffset = 0u; + uint32_t method_idx = 0u; + for (uint32_t base_reg : valid_regs) { + ++method_idx; + const std::vector<uint8_t> raw_code = RawCode({kBneWPlus0, ldr(base_reg)}); + ASSERT_EQ(kMethodCodeSize, raw_code.size()); + ArrayRef<const uint8_t> code(raw_code); + const LinkerPatch patches[] = { + LinkerPatch::BakerReadBarrierBranchPatch( + kLiteralOffset, Thumb2RelativePatcher::EncodeBakerReadBarrierArrayData(base_reg)), + }; + AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches)); + } + Link(); + + // All thunks are at the end. 
+ uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment); + method_idx = 0u; + for (uint32_t base_reg : valid_regs) { + ++method_idx; + uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset); + const std::vector<uint8_t> expected_code = RawCode({bne, ldr(base_reg)}); + ASSERT_EQ(kMethodCodeSize, expected_code.size()); + EXPECT_TRUE(CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code))); + + std::vector<uint8_t> expected_thunk = CompileBakerArrayThunk(base_reg); + ASSERT_GT(output_.size(), thunk_offset); + ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size()); + ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset, + expected_thunk.size()); + if (ArrayRef<const uint8_t>(expected_thunk) != compiled_thunk) { + DumpDiff(ArrayRef<const uint8_t>(expected_thunk), compiled_thunk); + ASSERT_TRUE(false); + } + + // Verify that the lock word for gray bit check is loaded from the correct address + // before the base_reg which points to the array data. + ASSERT_GE(output_.size() - thunk_offset, + 4u * /* 32-bit instructions */ 4u + 2u * /* 16-bit instructions */ 2u); + int32_t data_offset = + mirror::Array::DataOffset(Primitive::ComponentSize(Primitive::kPrimNot)).Int32Value(); + int32_t offset = mirror::Object::MonitorOffset().Int32Value() - data_offset; + ASSERT_LT(offset, 0); + ASSERT_GT(offset, -256); + const uint32_t load_lock_word = + kLdrNegativeOffset | + (-offset & 0xffu) | + (base_reg << 16) | + (/* IP */ 12 << 12); + EXPECT_EQ(load_lock_word, GetOutputInsn32(thunk_offset)); + // Verify the gray bit check. + DCHECK_GE(LockWord::kReadBarrierStateShift, 8u); // ROR modified immediate. + uint32_t ror_shift = 7 + (32 - LockWord::kReadBarrierStateShift); + const uint32_t tst_gray_bit_without_offset = + 0xf0100f00 | (/* IP */ 12 << 16) + | (((ror_shift >> 4) & 1) << 26) // i + | (((ror_shift >> 1) & 7) << 12) // imm3 + | ((ror_shift & 1) << 7); // imm8, ROR('1':imm8<7:0>, ror_shift). + EXPECT_EQ(tst_gray_bit_without_offset, GetOutputInsn32(thunk_offset + 4u)); + EXPECT_EQ(0xd100u, GetOutputInsn16(thunk_offset + 8u) & 0xff00u); // BNE + // Verify the fake dependency. + const uint32_t fake_dependency = + 0xeb000010 | // ADD Rd, Rn, Rm, LSR 32 (type=01, imm3=000, imm2=00) + (/* IP */ 12) | // Rm = IP + (base_reg << 16) | // Rn = base_reg + (base_reg << 8); // Rd = base_reg + EXPECT_EQ(fake_dependency, GetOutputInsn32(thunk_offset + 14u)); + // Do not check the rest of the implementation. + + // The next thunk follows on the next aligned offset. + thunk_offset += RoundUp(expected_thunk.size(), kArmAlignment); + } +} + +TEST_F(Thumb2RelativePatcherTest, BakerGcRoot) { + uint32_t valid_regs[] = { + 0, 1, 2, 3, 5, 6, 7, // R4 is reserved for entrypoint address. + 8, 9, 10, 11, // IP, SP, LR and PC are reserved. 
+ }; + constexpr size_t kMethodCodeSize = 8u; + constexpr size_t kLiteralOffset = 4u; + uint32_t method_idx = 0u; + for (uint32_t root_reg : valid_regs) { + ++method_idx; + uint32_t ldr = kLdrWInsn | (/* offset */ 8) | (/* base_reg */ 0 << 16) | (root_reg << 12); + const std::vector<uint8_t> raw_code = RawCode({ldr, kBneWPlus0}); + ASSERT_EQ(kMethodCodeSize, raw_code.size()); + ArrayRef<const uint8_t> code(raw_code); + const LinkerPatch patches[] = { + LinkerPatch::BakerReadBarrierBranchPatch( + kLiteralOffset, Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg)), + }; + AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches)); + } + Link(); + + // All thunks are at the end. + uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment); + method_idx = 0u; + for (uint32_t root_reg : valid_regs) { + ++method_idx; + uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset); + uint32_t ldr = kLdrWInsn | (/* offset */ 8) | (/* base_reg */ 0 << 16) | (root_reg << 12); + const std::vector<uint8_t> expected_code = RawCode({ldr, bne}); + ASSERT_EQ(kMethodCodeSize, expected_code.size()); + EXPECT_TRUE(CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code))); + + std::vector<uint8_t> expected_thunk = CompileBakerGcRootThunk(root_reg); + ASSERT_GT(output_.size(), thunk_offset); + ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size()); + ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset, + expected_thunk.size()); + if (ArrayRef<const uint8_t>(expected_thunk) != compiled_thunk) { + DumpDiff(ArrayRef<const uint8_t>(expected_thunk), compiled_thunk); + ASSERT_TRUE(false); + } + + // Verify that the fast-path null-check CBZ uses the correct register, i.e. root_reg. + if (root_reg < 8) { + ASSERT_GE(output_.size() - thunk_offset, 2u); + ASSERT_EQ(0xb100 | root_reg, GetOutputInsn16(thunk_offset) & 0xfd07u); + } else { + ASSERT_GE(output_.size() - thunk_offset, 6u); + ASSERT_EQ(0xf1b00f00u | (root_reg << 16), GetOutputInsn32(thunk_offset) & 0xfbff8f00u); + ASSERT_EQ(0xd000u, GetOutputInsn16(thunk_offset + 4u) & 0xff00u); // BEQ + } + // Do not check the rest of the implementation. + + // The next thunk follows on the next aligned offset. + thunk_offset += RoundUp(expected_thunk.size(), kArmAlignment); + } +} + +TEST_F(Thumb2RelativePatcherTest, BakerGcRootOffsetBits) { + // Test 1MiB of patches to the same thunk to stress-test different large offsets. + // (The low bits are not that important but the location of the high bits is easy to get wrong.) + std::vector<uint8_t> code; + code.reserve(1 * MB); + const size_t num_patches = 1 * MB / 8u; + std::vector<LinkerPatch> patches; + patches.reserve(num_patches); + const uint32_t ldr = + kLdrWInsn | (/* offset */ 8) | (/* base_reg */ 0 << 16) | (/* root_reg */ 0 << 12); + uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 0); + for (size_t i = 0; i != num_patches; ++i) { + PushBackInsn(&code, ldr); + PushBackInsn(&code, kBneWPlus0); + patches.push_back(LinkerPatch::BakerReadBarrierBranchPatch(8u * i + 4u, encoded_data)); + } + ASSERT_EQ(1 * MB, code.size()); + ASSERT_EQ(num_patches, patches.size()); + AddCompiledMethod(MethodRef(1u), + ArrayRef<const uint8_t>(code), + ArrayRef<const LinkerPatch>(patches)); + Link(); + + // The thunk is right after the method code. 
+ DCHECK_ALIGNED(1 * MB, kArmAlignment); + std::vector<uint8_t> expected_code; + for (size_t i = 0; i != num_patches; ++i) { + PushBackInsn(&expected_code, ldr); + PushBackInsn(&expected_code, BneWWithOffset(8u * i + 4u, 1 * MB)); + patches.push_back(LinkerPatch::BakerReadBarrierBranchPatch(8u * i + 4u, encoded_data)); + } + EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code))); +} + +TEST_F(Thumb2RelativePatcherTest, BakerAndMethodCallInteraction) { + // During development, there was a `DCHECK_LE(MaxNextOffset(), next_thunk.MaxNextOffset());` + // in `ArmBaseRelativePatcher::ThunkData::MakeSpaceBefore()` which does not necessarily + // hold when we're reserving thunks of different sizes. This test exposes the situation + // by using Baker thunks and a method call thunk. + + // Add a method call patch that can reach to method 1 offset + 16MiB. + uint32_t method_idx = 0u; + constexpr size_t kMethodCallLiteralOffset = 2u; + constexpr uint32_t kMissingMethodIdx = 2u; + const std::vector<uint8_t> raw_code1 = RawCode({kNopInsn, kBlPlus0}); + const LinkerPatch method1_patches[] = { + LinkerPatch::RelativeCodePatch(kMethodCallLiteralOffset, nullptr, 2u), + }; + ArrayRef<const uint8_t> code1(raw_code1); + ++method_idx; + AddCompiledMethod(MethodRef(1u), code1, ArrayRef<const LinkerPatch>(method1_patches)); + + // Skip kMissingMethodIdx. + ++method_idx; + ASSERT_EQ(kMissingMethodIdx, method_idx); + // Add a method with the right size that the method code for the next one starts 1MiB + // after code for method 1. + size_t filler_size = + 1 * MB - RoundUp(raw_code1.size() + sizeof(OatQuickMethodHeader), kArmAlignment) + - sizeof(OatQuickMethodHeader); + std::vector<uint8_t> filler_code = GenNops(filler_size / 2u); + ++method_idx; + AddCompiledMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(filler_code)); + // Add 14 methods with 1MiB code+header, making the code for the next method start 1MiB + // before the currently scheduled MaxNextOffset() for the method call thunk. + for (uint32_t i = 0; i != 14; ++i) { + filler_size = 1 * MB - sizeof(OatQuickMethodHeader); + filler_code = GenNops(filler_size / 2u); + ++method_idx; + AddCompiledMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(filler_code)); + } + + // Add 2 Baker GC root patches to the last method, one that would allow the thunk at + // 1MiB + kArmAlignment, i.e. kArmAlignment after the method call thunk, and the + // second that needs it kArmAlignment after that. Given the size of the GC root thunk + // is more than the space required by the method call thunk plus kArmAlignment, + // this pushes the first GC root thunk's pending MaxNextOffset() before the method call + // thunk's pending MaxNextOffset() which needs to be adjusted. + ASSERT_LT(RoundUp(CompileMethodCallThunk().size(), kArmAlignment) + kArmAlignment, + CompileBakerGcRootThunk(/* root_reg */ 0).size()); + static_assert(kArmAlignment == 8, "Code below assumes kArmAlignment == 8"); + constexpr size_t kBakerLiteralOffset1 = kArmAlignment + 2u - kPcAdjustment; + constexpr size_t kBakerLiteralOffset2 = kBakerLiteralOffset1 + kArmAlignment; + // Use offset = 0, base_reg = 0, the LDR is simply `kLdrWInsn | (root_reg << 12)`. + const uint32_t ldr1 = kLdrWInsn | (/* root_reg */ 1 << 12); + const uint32_t ldr2 = kLdrWInsn | (/* root_reg */ 2 << 12); + const std::vector<uint8_t> last_method_raw_code = RawCode({ + kNopInsn, // Padding before first GC root read barrier. + ldr1, kBneWPlus0, // First GC root LDR with read barrier. 
+ ldr2, kBneWPlus0, // Second GC root LDR with read barrier. + }); + uint32_t encoded_data1 = + Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 1); + uint32_t encoded_data2 = + Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 2); + const LinkerPatch last_method_patches[] = { + LinkerPatch::BakerReadBarrierBranchPatch(kBakerLiteralOffset1, encoded_data1), + LinkerPatch::BakerReadBarrierBranchPatch(kBakerLiteralOffset2, encoded_data2), + }; + ++method_idx; + AddCompiledMethod(MethodRef(method_idx), + ArrayRef<const uint8_t>(last_method_raw_code), + ArrayRef<const LinkerPatch>(last_method_patches)); + + // The main purpose of the test is to check that Link() does not cause a crash. + Link(); + + ASSERT_EQ(15 * MB, GetMethodOffset(method_idx) - GetMethodOffset(1u)); +} + } // namespace linker } // namespace art diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index ebd578c5cd..3c6e277ff9 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -16,6 +16,7 @@ #include "code_generator_arm.h" +#include "arch/arm/asm_support_arm.h" #include "arch/arm/instruction_set_features_arm.h" #include "art_method.h" #include "code_generator_utils.h" @@ -25,6 +26,7 @@ #include "gc/accounting/card_table.h" #include "intrinsics.h" #include "intrinsics_arm.h" +#include "linker/arm/relative_patcher_thumb2.h" #include "mirror/array-inl.h" #include "mirror/class-inl.h" #include "thread.h" @@ -60,10 +62,41 @@ static constexpr DRegister DTMP = D31; static constexpr uint32_t kPackedSwitchCompareJumpThreshold = 7; +// Reference load (except object array loads) is using LDR Rt, [Rn, #offset] which can handle +// offset < 4KiB. For offsets >= 4KiB, the load shall be emitted as two or more instructions. +// For the Baker read barrier implementation using link-generated thunks we need to split +// the offset explicitly. +constexpr uint32_t kReferenceLoadMinFarOffset = 4 * KB; + +// Flags controlling the use of link-time generated thunks for Baker read barriers. +constexpr bool kBakerReadBarrierLinkTimeThunksEnableForFields = true; +constexpr bool kBakerReadBarrierLinkTimeThunksEnableForArrays = true; +constexpr bool kBakerReadBarrierLinkTimeThunksEnableForGcRoots = true; + +// The reserved entrypoint register for link-time generated thunks. +const Register kBakerCcEntrypointRegister = R4; + // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. #define __ down_cast<ArmAssembler*>(codegen->GetAssembler())-> // NOLINT #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, x).Int32Value() +static inline void CheckLastTempIsBakerCcEntrypointRegister(HInstruction* instruction) { + DCHECK_EQ(static_cast<uint32_t>(kBakerCcEntrypointRegister), + linker::Thumb2RelativePatcher::kBakerCcEntrypointRegister); + DCHECK_NE(instruction->GetLocations()->GetTempCount(), 0u); + DCHECK_EQ(kBakerCcEntrypointRegister, + instruction->GetLocations()->GetTemp( + instruction->GetLocations()->GetTempCount() - 1u).AsRegister<Register>()); +} + +static inline void EmitPlaceholderBne(CodeGeneratorARM* codegen, Label* bne_label) { + DCHECK(down_cast<Thumb2Assembler*>(codegen->GetAssembler())->IsForced32Bit()); + __ BindTrackedLabel(bne_label); + Label placeholder_label; + __ b(&placeholder_label, NE); // Placeholder, patched at link-time. 
+ __ Bind(&placeholder_label); +} + static constexpr int kRegListThreshold = 4; // SaveLiveRegisters and RestoreLiveRegisters from SlowPathCodeARM operate on sets of S registers, @@ -1962,6 +1995,7 @@ CodeGeneratorARM::CodeGeneratorARM(HGraph* graph, graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + baker_read_barrier_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(StringReferenceValueComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_class_patches_(TypeReferenceValueComparator(), @@ -5281,7 +5315,18 @@ void LocationsBuilderARM::HandleFieldGet(HInstruction* instruction, const FieldI } else if (object_field_get_with_read_barrier && kUseBakerReadBarrier) { // We need a temporary register for the read barrier marking slow // path in CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier. - locations->AddTemp(Location::RequiresRegister()); + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !Runtime::Current()->UseJitCompilation()) { + // If link-time thunks for the Baker read barrier are enabled, for AOT + // loads we need a temporary only if the offset is too big. + if (field_info.GetFieldOffset().Uint32Value() >= kReferenceLoadMinFarOffset) { + locations->AddTemp(Location::RequiresRegister()); + } + // And we always need the reserved entrypoint register. + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister)); + } else { + locations->AddTemp(Location::RequiresRegister()); + } } } @@ -5747,11 +5792,35 @@ void LocationsBuilderARM::VisitArrayGet(HArrayGet* instruction) { Location::RequiresRegister(), object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap); } - // We need a temporary register for the read barrier marking slow - // path in CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier. - // Also need for String compression feature. - if ((object_array_get_with_read_barrier && kUseBakerReadBarrier) - || (mirror::kUseStringCompression && instruction->IsStringCharAt())) { + if (object_array_get_with_read_barrier && kUseBakerReadBarrier) { + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier. + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !Runtime::Current()->UseJitCompilation() && + instruction->GetIndex()->IsConstant()) { + // Array loads with constant index are treated as field loads. + // If link-time thunks for the Baker read barrier are enabled, for AOT + // constant index loads we need a temporary only if the offset is too big. + uint32_t offset = CodeGenerator::GetArrayDataOffset(instruction); + uint32_t index = instruction->GetIndex()->AsIntConstant()->GetValue(); + offset += index << Primitive::ComponentSizeShift(Primitive::kPrimNot); + if (offset >= kReferenceLoadMinFarOffset) { + locations->AddTemp(Location::RequiresRegister()); + } + // And we always need the reserved entrypoint register. + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister)); + } else if (kBakerReadBarrierLinkTimeThunksEnableForArrays && + !Runtime::Current()->UseJitCompilation() && + !instruction->GetIndex()->IsConstant()) { + // We need a non-scratch temporary for the array data pointer. + locations->AddTemp(Location::RequiresRegister()); + // And we always need the reserved entrypoint register. 
+ locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister)); + } else { + locations->AddTemp(Location::RequiresRegister()); + } + } else if (mirror::kUseStringCompression && instruction->IsStringCharAt()) { + // Also need a temporary for String compression feature. locations->AddTemp(Location::RequiresRegister()); } } @@ -5863,8 +5932,20 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { Location temp = locations->GetTemp(0); // Note that a potential implicit null check is handled in this // CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier call. - codegen_->GenerateArrayLoadWithBakerReadBarrier( - instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ true); + DCHECK(!instruction->CanDoImplicitNullCheckOn(instruction->InputAt(0))); + if (index.IsConstant()) { + // Array load with a constant index can be treated as a field load. + data_offset += helpers::Int32ConstantFrom(index) << Primitive::ComponentSizeShift(type); + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out_loc, + obj, + data_offset, + locations->GetTemp(0), + /* needs_null_check */ false); + } else { + codegen_->GenerateArrayLoadWithBakerReadBarrier( + instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ false); + } } else { Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { @@ -6701,6 +6782,13 @@ void LocationsBuilderARM::VisitLoadClass(HLoadClass* cls) { // For non-Baker read barrier we have a temp-clobbering call. } } + if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) { + if (load_kind == HLoadClass::LoadKind::kBssEntry || + (load_kind == HLoadClass::LoadKind::kReferrersClass && + !Runtime::Current()->UseJitCompilation())) { + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister)); + } + } } // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not @@ -6880,6 +6968,9 @@ void LocationsBuilderARM::VisitLoadString(HLoadString* load) { // TODO: Add GetReturnLocation() to the calling convention so that we can DCHECK() // that the the kPrimNot result register is the same as the first argument register. locations->SetCustomSlowPathCallerSaves(caller_saves); + if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) { + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister)); + } } else { // For non-Baker read barrier we have a temp-clobbering call. } @@ -7050,6 +7141,9 @@ void LocationsBuilderARM::VisitInstanceOf(HInstanceOf* instruction) { // Note that TypeCheckSlowPathARM uses this register too. locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + codegen_->MaybeAddBakerCcEntrypointTempForFields(locations); + } } void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { @@ -7923,48 +8017,93 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct if (kUseBakerReadBarrier) { // Fast path implementation of art::ReadBarrier::BarrierForRoot when // Baker's read barrier are used. - // - // Note that we do not actually check the value of - // `GetIsGcMarking()` to decide whether to mark the loaded GC - // root or not. Instead, we load into `temp` the read barrier - // mark entry point corresponding to register `root`. 
If `temp` - // is null, it means that `GetIsGcMarking()` is false, and vice - // versa. - // - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. - // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() - // // Slow path. - // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. - // } - - // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. - Location temp = Location::RegisterLocation(LR); - SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM( - instruction, root, /* entrypoint */ temp); - codegen_->AddSlowPath(slow_path); + if (kBakerReadBarrierLinkTimeThunksEnableForGcRoots && + !Runtime::Current()->UseJitCompilation()) { + // Note that we do not actually check the value of `GetIsGcMarking()` + // to decide whether to mark the loaded GC root or not. Instead, we + // load into `temp` (actually kBakerCcEntrypointRegister) the read + // barrier mark introspection entrypoint. If `temp` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. + // + // We use link-time generated thunks for the slow path. That thunk + // checks the reference and jumps to the entrypoint if needed. + // + // temp = Thread::Current()->pReadBarrierMarkIntrospection + // lr = &return_address; + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { + // goto gc_root_thunk<root_reg>(lr) + // } + // return_address: + + CheckLastTempIsBakerCcEntrypointRegister(instruction); + uint32_t custom_data = + linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg); + Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data); + + // entrypoint_reg = + // Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection. + DCHECK_EQ(IP, 12); + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP); + __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset); + + Label return_address; + __ AdrCode(LR, &return_address); + __ CmpConstant(kBakerCcEntrypointRegister, 0); + static_assert( + BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET == -8, + "GC root LDR must be 2 32-bit instructions (8B) before the return address label."); + // Currently the offset is always within range. If that changes, + // we shall have to split the load the same way as for fields. + DCHECK_LT(offset, kReferenceLoadMinFarOffset); + ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler())); + __ LoadFromOffset(kLoadWord, root_reg, obj, offset); + EmitPlaceholderBne(codegen_, bne_label); + __ Bind(&return_address); + } else { + // Note that we do not actually check the value of + // `GetIsGcMarking()` to decide whether to mark the loaded GC + // root or not. Instead, we load into `temp` the read barrier + // mark entry point corresponding to register `root`. If `temp` + // is null, it means that `GetIsGcMarking()` is false, and vice + // versa. + // + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. + // } + + // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. 
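      // (Illustrative note, not part of this patch.) There is one
      // pReadBarrierMarkRegNN entrypoint per core register; picking the one
      // that matches `root` lets the runtime stub take and return the
      // reference in that same register, so the fast path needs no register
      // shuffling. Register 12 (IP) is a scratch register that does not hold
      // references here, which is presumably why the thunk-based path above
      // can reuse its entrypoint slot for pReadBarrierMarkIntrospection.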
+ Location temp = Location::RegisterLocation(LR); + SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM( + instruction, root, /* entrypoint */ temp); + codegen_->AddSlowPath(slow_path); - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); - // Loading the entrypoint does not require a load acquire since it is only changed when - // threads are suspended or running a checkpoint. - __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset); + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset); - // /* GcRoot<mirror::Object> */ root = *(obj + offset) - __ LoadFromOffset(kLoadWord, root_reg, obj, offset); - static_assert( - sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), - "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " - "have different sizes."); - static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), - "art::mirror::CompressedReference<mirror::Object> and int32_t " - "have different sizes."); - - // The entrypoint is null when the GC is not marking, this prevents one load compared to - // checking GetIsGcMarking. - __ CompareAndBranchIfNonZero(temp.AsRegister<Register>(), slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + __ LoadFromOffset(kLoadWord, root_reg, obj, offset); + static_assert( + sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), + "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " + "have different sizes."); + static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::CompressedReference<mirror::Object> and int32_t " + "have different sizes."); + + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ CompareAndBranchIfNonZero(temp.AsRegister<Register>(), slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); + } } else { // GC root loaded through a slow path for read barriers other // than Baker's. 
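// (Illustrative layout, not part of this patch.) In the thunk-based fast path
// above, the forced-32-bit sequence lines up with the
// BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET == -8 contract as follows:
//
//     ldr   r4, [tr, #entry_point_offset]   @ null when the GC is not marking
//     adr   lr, return_address              @ LR = return_address | 1 (Thumb)
//     cmp   r4, #0
//     ldr.w root_reg, [obj, #offset]        @ return_address - 8
//     bne.w <gc_root_thunk<root_reg>>       @ return_address - 4, patched
//   return_address:
//
// The thunk decodes root_reg from the LDR at LR - 8, checks the loaded
// reference and jumps to the mark entrypoint only if it needs marking. If GC
// root offsets ever reached 4KiB, the load would be split as for fields, e.g.
// offset 0x1234 becoming ADD base, obj, #0x1000 then LDR.W root, [base, #0x234],
// so the thunk would still see a plain LDR (immediate) to introspect.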
@@ -7982,6 +8121,16 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct } } +void CodeGeneratorARM::MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + if (kBakerReadBarrierLinkTimeThunksEnableForFields) { + if (!Runtime::Current()->UseJitCompilation()) { + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister)); + } + } +} + void CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, Register obj, @@ -7991,6 +8140,69 @@ void CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instr DCHECK(kEmitCompilerReadBarrier); DCHECK(kUseBakerReadBarrier); + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !Runtime::Current()->UseJitCompilation()) { + // Note that we do not actually check the value of `GetIsGcMarking()` + // to decide whether to mark the loaded reference or not. Instead, we + // load into `temp` (actually kBakerCcEntrypointRegister) the read + // barrier mark introspection entrypoint. If `temp` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. + // + // We use link-time generated thunks for the slow path. That thunk checks + // the holder and jumps to the entrypoint if needed. If the holder is not + // gray, it creates a fake dependency and returns to the LDR instruction. + // + // temp = Thread::Current()->pReadBarrierMarkIntrospection + // lr = &gray_return_address; + // if (temp != nullptr) { + // goto field_thunk<holder_reg, base_reg>(lr) + // } + // not_gray_return_address: + // // Original reference load. If the offset is too large to fit + // // into LDR, we use an adjusted base register here. + // GcRoot<mirror::Object> reference = *(obj+offset); + // gray_return_address: + + DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>)); + Register base = obj; + if (offset >= kReferenceLoadMinFarOffset) { + base = temp.AsRegister<Register>(); + DCHECK_NE(base, kBakerCcEntrypointRegister); + static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2."); + __ AddConstant(base, obj, offset & ~(kReferenceLoadMinFarOffset - 1u)); + offset &= (kReferenceLoadMinFarOffset - 1u); + } + CheckLastTempIsBakerCcEntrypointRegister(instruction); + uint32_t custom_data = + linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base, obj); + Label* bne_label = NewBakerReadBarrierPatch(custom_data); + + // entrypoint_reg = + // Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection. + DCHECK_EQ(IP, 12); + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP); + __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset); + + Label return_address; + __ AdrCode(LR, &return_address); + __ CmpConstant(kBakerCcEntrypointRegister, 0); + ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler())); + EmitPlaceholderBne(this, bne_label); + static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? 
-8 : -4),
+                  "Field LDR must be 1 32-bit instruction (4B) before the return address label; "
+                  " 2 32-bit instructions (8B) for heap poisoning.");
+    Register ref_reg = ref.AsRegister<Register>();
+    DCHECK_LT(offset, kReferenceLoadMinFarOffset);
+    __ LoadFromOffset(kLoadWord, ref_reg, base, offset);
+    if (needs_null_check) {
+      MaybeRecordImplicitNullCheck(instruction);
+    }
+    GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
+    __ Bind(&return_address);
+    return;
+  }
+
   // /* HeapReference<Object> */ ref = *(obj + offset)
   Location no_index = Location::NoLocation();
   ScaleFactor no_scale_factor = TIMES_1;
@@ -8011,9 +8223,67 @@ void CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier(HInstr
   static_assert(
       sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
       "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+  ScaleFactor scale_factor = TIMES_4;
+
+  if (kBakerReadBarrierLinkTimeThunksEnableForArrays &&
+      !Runtime::Current()->UseJitCompilation()) {
+    // Note that we do not actually check the value of `GetIsGcMarking()`
+    // to decide whether to mark the loaded reference or not. Instead, we
+    // load into `temp` (actually kBakerCcEntrypointRegister) the read
+    // barrier mark introspection entrypoint. If `temp` is null, it means
+    // that `GetIsGcMarking()` is false, and vice versa.
+    //
+    // We use link-time generated thunks for the slow path. That thunk checks
+    // the holder and jumps to the entrypoint if needed. If the holder is not
+    // gray, it creates a fake dependency and returns to the LDR instruction.
+    //
+    // temp = Thread::Current()->pReadBarrierMarkIntrospection
+    // lr = &gray_return_address;
+    // if (temp != nullptr) {
+    //    goto array_thunk<base_reg>(lr)
+    // }
+    // not_gray_return_address:
+    //   // Original reference load. If the offset is too large to fit
+    //   // into LDR, we use an adjusted base register here.
+    //   GcRoot<mirror::Object> reference = data[index];
+    // gray_return_address:
+
+    DCHECK(index.IsValid());
+    Register index_reg = index.AsRegister<Register>();
+    Register ref_reg = ref.AsRegister<Register>();
+    Register data_reg = temp.AsRegister<Register>();
+    DCHECK_NE(data_reg, kBakerCcEntrypointRegister);
+
+    CheckLastTempIsBakerCcEntrypointRegister(instruction);
+    uint32_t custom_data =
+        linker::Thumb2RelativePatcher::EncodeBakerReadBarrierArrayData(data_reg);
+    Label* bne_label = NewBakerReadBarrierPatch(custom_data);
+
+    // entrypoint_reg =
+    //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+    DCHECK_EQ(IP, 12);
+    const int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP);
+    __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset);
+    __ AddConstant(data_reg, obj, data_offset);
+
+    Label return_address;
+    __ AdrCode(LR, &return_address);
+    __ CmpConstant(kBakerCcEntrypointRegister, 0);
+    ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
+    EmitPlaceholderBne(this, bne_label);
+    static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
+                  "Array LDR must be 1 32-bit instruction (4B) before the return address label; "
+                  " 2 32-bit instructions (8B) for heap poisoning.");
+    __ ldr(ref_reg, Address(data_reg, index_reg, LSL, scale_factor));
+    DCHECK(!needs_null_check);  // The thunk cannot handle the null check.
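    // (Illustrative layout, not part of this patch; registers are the ones
    // set up above.) Relative to the ADR'd return address, this satisfies
    // BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET: -4 here, or -8 with heap
    // poisoning, where the UNPOISON follows the LDR:
    //
    //     ldr   r4, [tr, #entry_point_offset]
    //     add   data_reg, obj, #data_offset
    //     adr   lr, gray_return_address
    //     cmp   r4, #0
    //     bne.w <array_thunk<data_reg>>              @ patched at link time
    //     ldr.w ref_reg, [data_reg, index, lsl #2]   @ gray_return_address - 4
    //   gray_return_address:
    //
    // Unlike the GC root case, the BNE precedes the LDR: for a gray holder
    // the original load is skipped, and the entrypoint instead introspects
    // the LDR, performs the load itself, marks the result and returns to LR.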
+ GetAssembler()->MaybeUnpoisonHeapReference(ref_reg); + __ Bind(&return_address); + return; + } + // /* HeapReference<Object> */ ref = // *(obj + data_offset + index * sizeof(HeapReference<Object>)) - ScaleFactor scale_factor = TIMES_4; GenerateReferenceLoadWithBakerReadBarrier( instruction, ref, obj, data_offset, index, scale_factor, temp, needs_null_check); } @@ -8379,6 +8649,11 @@ CodeGeneratorARM::PcRelativePatchInfo* CodeGeneratorARM::NewPcRelativePatch( return &patches->back(); } +Label* CodeGeneratorARM::NewBakerReadBarrierPatch(uint32_t custom_data) { + baker_read_barrier_patches_.emplace_back(custom_data); + return &baker_read_barrier_patches_.back().label; +} + Literal* CodeGeneratorARM::DeduplicateBootImageStringLiteral(const DexFile& dex_file, dex::StringIndex string_index) { return boot_image_string_patches_.GetOrCreate( @@ -8445,7 +8720,8 @@ void CodeGeneratorARM::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patche /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() + boot_image_type_patches_.size() + /* MOVW+MOVT for each entry */ 2u * pc_relative_type_patches_.size() + - /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size(); + /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size() + + baker_read_barrier_patches_.size(); linker_patches->reserve(size); EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_, linker_patches); @@ -8479,6 +8755,10 @@ void CodeGeneratorARM::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patche target_type.dex_file, target_type.type_index.index_)); } + for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) { + linker_patches->push_back(LinkerPatch::BakerReadBarrierBranchPatch(info.label.Position(), + info.custom_data)); + } DCHECK_EQ(size, linker_patches->size()); } diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index 86f2f21df7..6f007e100b 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -488,6 +488,11 @@ class CodeGeneratorARM : public CodeGenerator { PcRelativePatchInfo* NewTypeBssEntryPatch(const DexFile& dex_file, dex::TypeIndex type_index); PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, uint32_t element_offset); + + // Add a new baker read barrier patch and return the label to be bound + // before the BNE instruction. + Label* NewBakerReadBarrierPatch(uint32_t custom_data); + Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file, dex::StringIndex string_index); Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, dex::TypeIndex type_index); @@ -503,6 +508,10 @@ class CodeGeneratorARM : public CodeGenerator { void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE; + // Maybe add the reserved entrypoint register as a temporary for field load. This temp + // is added only for AOT compilation if link-time generated thunks for fields are enabled. + void MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations); + // Fast path implementation of ReadBarrier::Barrier for a heap // reference field load when Baker's read barriers are used. 
void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, @@ -616,6 +625,13 @@ class CodeGeneratorARM : public CodeGenerator { Literal*, TypeReferenceValueComparator>; + struct BakerReadBarrierPatchInfo { + explicit BakerReadBarrierPatchInfo(uint32_t data) : label(), custom_data(data) { } + + Label label; + uint32_t custom_data; + }; + Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map); Literal* DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map); PcRelativePatchInfo* NewPcRelativePatch(const DexFile& dex_file, @@ -648,6 +664,8 @@ class CodeGeneratorARM : public CodeGenerator { ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; + // Baker read barrier patch info. + ArenaDeque<BakerReadBarrierPatchInfo> baker_read_barrier_patches_; // Patches for string literals in JIT compiled code. StringToLiteralMap jit_string_patches_; diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index 8744cc8210..86f4cd22fa 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -16,6 +16,7 @@ #include "code_generator_arm_vixl.h" +#include "arch/arm/asm_support_arm.h" #include "arch/arm/instruction_set_features_arm.h" #include "art_method.h" #include "code_generator_utils.h" @@ -24,6 +25,7 @@ #include "entrypoints/quick/quick_entrypoints.h" #include "gc/accounting/card_table.h" #include "intrinsics_arm_vixl.h" +#include "linker/arm/relative_patcher_thumb2.h" #include "mirror/array-inl.h" #include "mirror/class-inl.h" #include "thread.h" @@ -77,6 +79,20 @@ static constexpr size_t kArmBitsPerWord = kArmWordSize * kBitsPerByte; static constexpr int kCurrentMethodStackOffset = 0; static constexpr uint32_t kPackedSwitchCompareJumpThreshold = 7; +// Reference load (except object array loads) is using LDR Rt, [Rn, #offset] which can handle +// offset < 4KiB. For offsets >= 4KiB, the load shall be emitted as two or more instructions. +// For the Baker read barrier implementation using link-generated thunks we need to split +// the offset explicitly. +constexpr uint32_t kReferenceLoadMinFarOffset = 4 * KB; + +// Flags controlling the use of link-time generated thunks for Baker read barriers. +constexpr bool kBakerReadBarrierLinkTimeThunksEnableForFields = true; +constexpr bool kBakerReadBarrierLinkTimeThunksEnableForArrays = true; +constexpr bool kBakerReadBarrierLinkTimeThunksEnableForGcRoots = true; + +// The reserved entrypoint register for link-time generated thunks. +const vixl32::Register kBakerCcEntrypointRegister = r4; + #ifdef __ #error "ARM Codegen VIXL macro-assembler macro already defined." #endif @@ -88,6 +104,56 @@ static constexpr uint32_t kPackedSwitchCompareJumpThreshold = 7; // Marker that code is yet to be, and must, be implemented. 
#define TODO_VIXL32(level) LOG(level) << __PRETTY_FUNCTION__ << " unimplemented " +static inline void ExcludeIPAndBakerCcEntrypointRegister(UseScratchRegisterScope* temps, + HInstruction* instruction) { + DCHECK(temps->IsAvailable(ip)); + temps->Exclude(ip); + DCHECK(!temps->IsAvailable(kBakerCcEntrypointRegister)); + DCHECK_EQ(kBakerCcEntrypointRegister.GetCode(), + linker::Thumb2RelativePatcher::kBakerCcEntrypointRegister); + DCHECK_NE(instruction->GetLocations()->GetTempCount(), 0u); + DCHECK(RegisterFrom(instruction->GetLocations()->GetTemp( + instruction->GetLocations()->GetTempCount() - 1u)).Is(kBakerCcEntrypointRegister)); +} + +static inline void EmitPlaceholderBne(CodeGeneratorARMVIXL* codegen, vixl32::Label* patch_label) { + ExactAssemblyScope eas(codegen->GetVIXLAssembler(), kMaxInstructionSizeInBytes); + __ bind(patch_label); + vixl32::Label placeholder_label; + __ b(ne, EncodingSize(Wide), &placeholder_label); // Placeholder, patched at link-time. + __ bind(&placeholder_label); +} + +class EmitAdrCode { + public: + EmitAdrCode(ArmVIXLMacroAssembler* assembler, vixl32::Register rd, vixl32::Label* label) + : assembler_(assembler), rd_(rd), label_(label) { + ExactAssemblyScope aas(assembler, kMaxInstructionSizeInBytes); + adr_location_ = assembler->GetCursorOffset(); + assembler->adr(EncodingSize(Wide), rd, label); + } + + ~EmitAdrCode() { + DCHECK(label_->IsBound()); + // The ADR emitted by the assembler does not set the Thumb mode bit we need. + // TODO: Maybe extend VIXL to allow ADR for return address? + uint8_t* raw_adr = assembler_->GetBuffer()->GetOffsetAddress<uint8_t*>(adr_location_); + // Expecting ADR encoding T3 with `(offset & 1) == 0`. + DCHECK_EQ(raw_adr[1] & 0xfbu, 0xf2u); // Check bits 24-31, except 26. + DCHECK_EQ(raw_adr[0] & 0xffu, 0x0fu); // Check bits 16-23. + DCHECK_EQ(raw_adr[3] & 0x8fu, rd_.GetCode()); // Check bits 8-11 and 15. + DCHECK_EQ(raw_adr[2] & 0x01u, 0x00u); // Check bit 0, i.e. the `offset & 1`. + // Add the Thumb mode bit. + raw_adr[2] |= 0x01u; + } + + private: + ArmVIXLMacroAssembler* const assembler_; + vixl32::Register rd_; + vixl32::Label* const label_; + int32_t adr_location_; +}; + // SaveLiveRegisters and RestoreLiveRegisters from SlowPathCodeARM operate on sets of S registers, // for each live D registers they treat two corresponding S registers as live ones. // @@ -2012,6 +2078,7 @@ CodeGeneratorARMVIXL::CodeGeneratorARMVIXL(HGraph* graph, graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + baker_read_barrier_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(StringReferenceValueComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_class_patches_(TypeReferenceValueComparator(), @@ -5289,7 +5356,18 @@ void LocationsBuilderARMVIXL::HandleFieldGet(HInstruction* instruction, } else if (object_field_get_with_read_barrier && kUseBakerReadBarrier) { // We need a temporary register for the read barrier marking slow // path in CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier. - locations->AddTemp(Location::RequiresRegister()); + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !Runtime::Current()->UseJitCompilation()) { + // If link-time thunks for the Baker read barrier are enabled, for AOT + // loads we need a temporary only if the offset is too big. 
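    // (Illustrative note, not part of this patch.) The EmitAdrCode helper
    // above patches the assembled ADR because the address it produces is used
    // as a return target for the thunk's BX LR, which requires the Thumb
    // state bit. In ADR encoding T3 the low byte of the second halfword holds
    // the low eight immediate bits, so `raw_adr[2] |= 0x01u` (bit 0 was
    // DCHECKed to be clear) turns the computed `return_address` into
    // `return_address + 1`, a valid Thumb-mode code pointer.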
+ if (field_info.GetFieldOffset().Uint32Value() >= kReferenceLoadMinFarOffset) { + locations->AddTemp(Location::RequiresRegister()); + } + // And we always need the reserved entrypoint register. + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode())); + } else { + locations->AddTemp(Location::RequiresRegister()); + } } } @@ -5756,11 +5834,35 @@ void LocationsBuilderARMVIXL::VisitArrayGet(HArrayGet* instruction) { Location::RequiresRegister(), object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap); } - // We need a temporary register for the read barrier marking slow - // path in CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier. - // Also need for String compression feature. - if ((object_array_get_with_read_barrier && kUseBakerReadBarrier) - || (mirror::kUseStringCompression && instruction->IsStringCharAt())) { + if (object_array_get_with_read_barrier && kUseBakerReadBarrier) { + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier. + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !Runtime::Current()->UseJitCompilation() && + instruction->GetIndex()->IsConstant()) { + // Array loads with constant index are treated as field loads. + // If link-time thunks for the Baker read barrier are enabled, for AOT + // constant index loads we need a temporary only if the offset is too big. + uint32_t offset = CodeGenerator::GetArrayDataOffset(instruction); + uint32_t index = instruction->GetIndex()->AsIntConstant()->GetValue(); + offset += index << Primitive::ComponentSizeShift(Primitive::kPrimNot); + if (offset >= kReferenceLoadMinFarOffset) { + locations->AddTemp(Location::RequiresRegister()); + } + // And we always need the reserved entrypoint register. + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode())); + } else if (kBakerReadBarrierLinkTimeThunksEnableForArrays && + !Runtime::Current()->UseJitCompilation() && + !instruction->GetIndex()->IsConstant()) { + // We need a non-scratch temporary for the array data pointer. + locations->AddTemp(Location::RequiresRegister()); + // And we always need the reserved entrypoint register. + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode())); + } else { + locations->AddTemp(Location::RequiresRegister()); + } + } else if (mirror::kUseStringCompression && instruction->IsStringCharAt()) { + // Also need a temporary for String compression feature. locations->AddTemp(Location::RequiresRegister()); } } @@ -5871,8 +5973,20 @@ void InstructionCodeGeneratorARMVIXL::VisitArrayGet(HArrayGet* instruction) { Location temp = locations->GetTemp(0); // Note that a potential implicit null check is handled in this // CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier call. - codegen_->GenerateArrayLoadWithBakerReadBarrier( - instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ true); + DCHECK(!instruction->CanDoImplicitNullCheckOn(instruction->InputAt(0))); + if (index.IsConstant()) { + // Array load with a constant index can be treated as a field load. 
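        // (Illustrative arithmetic, not part of this patch.) For an object
        // array on this 32-bit target, data_offset is
        // mirror::Array::DataOffset(4), i.e. 12, so a constant index i loads
        // from obj + 12 + (i << 2). Index 100 gives offset 412, far below
        // kReferenceLoadMinFarOffset; only from index 1021 (offset 4096) does
        // the locations builder above reserve the extra temp for a pre-added
        // base register.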
+ data_offset += Int32ConstantFrom(index) << Primitive::ComponentSizeShift(type); + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out_loc, + obj, + data_offset, + locations->GetTemp(0), + /* needs_null_check */ false); + } else { + codegen_->GenerateArrayLoadWithBakerReadBarrier( + instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ false); + } } else { vixl32::Register out = OutputRegister(instruction); if (index.IsConstant()) { @@ -6762,6 +6876,13 @@ void LocationsBuilderARMVIXL::VisitLoadClass(HLoadClass* cls) { // For non-Baker read barrier we have a temp-clobbering call. } } + if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) { + if (load_kind == HLoadClass::LoadKind::kBssEntry || + (load_kind == HLoadClass::LoadKind::kReferrersClass && + !Runtime::Current()->UseJitCompilation())) { + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode())); + } + } } // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not @@ -6938,6 +7059,9 @@ void LocationsBuilderARMVIXL::VisitLoadString(HLoadString* load) { // TODO: Add GetReturnLocation() to the calling convention so that we can DCHECK() // that the the kPrimNot result register is the same as the first argument register. locations->SetCustomSlowPathCallerSaves(caller_saves); + if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) { + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode())); + } } else { // For non-Baker read barrier we have a temp-clobbering call. } @@ -7100,6 +7224,9 @@ void LocationsBuilderARMVIXL::VisitInstanceOf(HInstanceOf* instruction) { // Note that TypeCheckSlowPathARM uses this register too. locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + codegen_->MaybeAddBakerCcEntrypointTempForFields(locations); + } } void InstructionCodeGeneratorARMVIXL::VisitInstanceOf(HInstanceOf* instruction) { @@ -7998,48 +8125,96 @@ void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad( if (kUseBakerReadBarrier) { // Fast path implementation of art::ReadBarrier::BarrierForRoot when // Baker's read barrier are used. - // - // Note that we do not actually check the value of - // `GetIsGcMarking()` to decide whether to mark the loaded GC - // root or not. Instead, we load into `temp` the read barrier - // mark entry point corresponding to register `root`. If `temp` - // is null, it means that `GetIsGcMarking()` is false, and vice - // versa. - // - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. - // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() - // // Slow path. - // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. - // } - - // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. - Location temp = LocationFrom(lr); - SlowPathCodeARMVIXL* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL( - instruction, root, /* entrypoint */ temp); - codegen_->AddSlowPath(slow_path); + if (kBakerReadBarrierLinkTimeThunksEnableForGcRoots && + !Runtime::Current()->UseJitCompilation()) { + // Note that we do not actually check the value of `GetIsGcMarking()` + // to decide whether to mark the loaded GC root or not. 
Instead, we + // load into `temp` (actually kBakerCcEntrypointRegister) the read + // barrier mark introspection entrypoint. If `temp` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. + // + // We use link-time generated thunks for the slow path. That thunk + // checks the reference and jumps to the entrypoint if needed. + // + // temp = Thread::Current()->pReadBarrierMarkIntrospection + // lr = &return_address; + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { + // goto gc_root_thunk<root_reg>(lr) + // } + // return_address: - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); - // Loading the entrypoint does not require a load acquire since it is only changed when - // threads are suspended or running a checkpoint. - GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset); + UseScratchRegisterScope temps(GetVIXLAssembler()); + ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction); + uint32_t custom_data = + linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg.GetCode()); + vixl32::Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data); + + // entrypoint_reg = + // Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection. + DCHECK_EQ(ip.GetCode(), 12u); + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode()); + __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset)); + + vixl::EmissionCheckScope guard(GetVIXLAssembler(), + 4 * vixl32::kMaxInstructionSizeInBytes); + vixl32::Label return_address; + EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address); + __ cmp(kBakerCcEntrypointRegister, Operand(0)); + static_assert( + BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET == -8, + "GC root LDR must be 2 32-bit instructions (8B) before the return address label."); + // Currently the offset is always within range. If that changes, + // we shall have to split the load the same way as for fields. + DCHECK_LT(offset, kReferenceLoadMinFarOffset); + __ ldr(EncodingSize(Wide), root_reg, MemOperand(obj, offset)); + EmitPlaceholderBne(codegen_, bne_label); + __ Bind(&return_address); + } else { + // Note that we do not actually check the value of + // `GetIsGcMarking()` to decide whether to mark the loaded GC + // root or not. Instead, we load into `temp` the read barrier + // mark entry point corresponding to register `root`. If `temp` + // is null, it means that `GetIsGcMarking()` is false, and vice + // versa. + // + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. + // } + + // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. 
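      // (Illustrative note, not part of this patch.) In the thunk-based
      // branch above, the EmissionCheckScope sized at
      // 4 * kMaxInstructionSizeInBytes is what keeps the contract honest:
      // VIXL could otherwise interleave literal pools or veneers, and any
      // extra word between the ADR, CMP, LDR and BNE would break the fixed
      // -8 distance from the LDR to the return address that the thunk
      // relies on.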
+ Location temp = LocationFrom(lr); + SlowPathCodeARMVIXL* slow_path = + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL( + instruction, root, /* entrypoint */ temp); + codegen_->AddSlowPath(slow_path); - // /* GcRoot<mirror::Object> */ root = *(obj + offset) - GetAssembler()->LoadFromOffset(kLoadWord, root_reg, obj, offset); - static_assert( - sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), - "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " - "have different sizes."); - static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), - "art::mirror::CompressedReference<mirror::Object> and int32_t " - "have different sizes."); - - // The entrypoint is null when the GC is not marking, this prevents one load compared to - // checking GetIsGcMarking. - __ CompareAndBranchIfNonZero(RegisterFrom(temp), slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset); + + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + GetAssembler()->LoadFromOffset(kLoadWord, root_reg, obj, offset); + static_assert( + sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), + "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " + "have different sizes."); + static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::CompressedReference<mirror::Object> and int32_t " + "have different sizes."); + + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ CompareAndBranchIfNonZero(RegisterFrom(temp), slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); + } } else { // GC root loaded through a slow path for read barriers other // than Baker's. @@ -8057,6 +8232,16 @@ void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad( } } +void CodeGeneratorARMVIXL::MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + if (kBakerReadBarrierLinkTimeThunksEnableForFields) { + if (!Runtime::Current()->UseJitCompilation()) { + locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode())); + } + } +} + void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, vixl32::Register obj, @@ -8066,6 +8251,75 @@ void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* i DCHECK(kEmitCompilerReadBarrier); DCHECK(kUseBakerReadBarrier); + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !Runtime::Current()->UseJitCompilation()) { + // Note that we do not actually check the value of `GetIsGcMarking()` + // to decide whether to mark the loaded reference or not. Instead, we + // load into `temp` (actually kBakerCcEntrypointRegister) the read + // barrier mark introspection entrypoint. If `temp` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. + // + // We use link-time generated thunks for the slow path. 
That thunk checks + // the holder and jumps to the entrypoint if needed. If the holder is not + // gray, it creates a fake dependency and returns to the LDR instruction. + // + // temp = Thread::Current()->pReadBarrierMarkIntrospection + // lr = &gray_return_address; + // if (temp != nullptr) { + // goto field_thunk<holder_reg, base_reg>(lr) + // } + // not_gray_return_address: + // // Original reference load. If the offset is too large to fit + // // into LDR, we use an adjusted base register here. + // GcRoot<mirror::Object> reference = *(obj+offset); + // gray_return_address: + + DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>)); + vixl32::Register base = obj; + if (offset >= kReferenceLoadMinFarOffset) { + base = RegisterFrom(temp); + DCHECK(!base.Is(kBakerCcEntrypointRegister)); + static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2."); + __ Add(base, obj, Operand(offset & ~(kReferenceLoadMinFarOffset - 1u))); + offset &= (kReferenceLoadMinFarOffset - 1u); + } + UseScratchRegisterScope temps(GetVIXLAssembler()); + ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction); + uint32_t custom_data = linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData( + base.GetCode(), + obj.GetCode()); + vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data); + + // entrypoint_reg = + // Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection. + DCHECK_EQ(ip.GetCode(), 12u); + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode()); + __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset)); + + vixl::EmissionCheckScope guard( + GetVIXLAssembler(), + (kPoisonHeapReferences ? 5u : 4u) * vixl32::kMaxInstructionSizeInBytes); + vixl32::Label return_address; + EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address); + __ cmp(kBakerCcEntrypointRegister, Operand(0)); + EmitPlaceholderBne(this, bne_label); + static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4), + "Field LDR must be 1 32-bit instruction (4B) before the return address label; " + " 2 32-bit instructions (8B) for heap poisoning."); + vixl32::Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot); + __ ldr(EncodingSize(Wide), ref_reg, MemOperand(base, offset)); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } + // Note: We need a Wide NEG for the unpoisoning. + if (kPoisonHeapReferences) { + __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0)); + } + __ Bind(&return_address); + return; + } + // /* HeapReference<Object> */ ref = *(obj + offset) Location no_index = Location::NoLocation(); ScaleFactor no_scale_factor = TIMES_1; @@ -8086,9 +8340,73 @@ void CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier(HInstruction* i static_assert( sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + ScaleFactor scale_factor = TIMES_4; + + if (kBakerReadBarrierLinkTimeThunksEnableForArrays && + !Runtime::Current()->UseJitCompilation()) { + // Note that we do not actually check the value of `GetIsGcMarking()` + // to decide whether to mark the loaded reference or not. Instead, we + // load into `temp` (actually kBakerCcEntrypointRegister) the read + // barrier mark introspection entrypoint. If `temp` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. 
+    //
+    // We use link-time generated thunks for the slow path. That thunk checks
+    // the holder and jumps to the entrypoint if needed. If the holder is not
+    // gray, it creates a fake dependency and returns to the LDR instruction.
+    //
+    // temp = Thread::Current()->pReadBarrierMarkIntrospection
+    // lr = &gray_return_address;
+    // if (temp != nullptr) {
+    //    goto array_thunk<base_reg>(lr)
+    // }
+    // not_gray_return_address:
+    //   // Original reference load. If the offset is too large to fit
+    //   // into LDR, we use an adjusted base register here.
+    //   GcRoot<mirror::Object> reference = data[index];
+    // gray_return_address:
+
+    DCHECK(index.IsValid());
+    vixl32::Register index_reg = RegisterFrom(index, Primitive::kPrimInt);
+    vixl32::Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot);
+    vixl32::Register data_reg = RegisterFrom(temp, Primitive::kPrimInt);  // Raw pointer.
+    DCHECK(!data_reg.Is(kBakerCcEntrypointRegister));
+
+    UseScratchRegisterScope temps(GetVIXLAssembler());
+    ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction);
+    uint32_t custom_data =
+        linker::Thumb2RelativePatcher::EncodeBakerReadBarrierArrayData(data_reg.GetCode());
+    vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data);
+
+    // entrypoint_reg =
+    //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+    DCHECK_EQ(ip.GetCode(), 12u);
+    const int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
+    __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset));
+    __ Add(data_reg, obj, Operand(data_offset));
+
+    vixl::EmissionCheckScope guard(
+        GetVIXLAssembler(),
+        (kPoisonHeapReferences ? 5u : 4u) * vixl32::kMaxInstructionSizeInBytes);
+    vixl32::Label return_address;
+    EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
+    __ cmp(kBakerCcEntrypointRegister, Operand(0));
+    EmitPlaceholderBne(this, bne_label);
+    static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
+                  "Array LDR must be 1 32-bit instruction (4B) before the return address label; "
+                  " 2 32-bit instructions (8B) for heap poisoning.");
+    __ ldr(ref_reg, MemOperand(data_reg, index_reg, vixl32::LSL, scale_factor));
+    DCHECK(!needs_null_check);  // The thunk cannot handle the null check.
+    // Note: We need a Wide NEG for the unpoisoning.
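    // (Illustrative note, not part of this patch.) With kPoisonHeapReferences
    // a reference field holds the two's-complement negation of the pointer,
    // so `rsb ref, ref, #0` (ref = 0 - ref) unpoisons it. Forcing the wide
    // encoding keeps the tail of the sequence at LDR (4B) + RSB (4B),
    // matching the -8 value asserted above for
    // BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET.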
+ if (kPoisonHeapReferences) { + __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0)); + } + __ Bind(&return_address); + return; + } + // /* HeapReference<Object> */ ref = // *(obj + data_offset + index * sizeof(HeapReference<Object>)) - ScaleFactor scale_factor = TIMES_4; GenerateReferenceLoadWithBakerReadBarrier( instruction, ref, obj, data_offset, index, scale_factor, temp, needs_null_check); } @@ -8497,6 +8815,11 @@ CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewPcRelativePa return &patches->back(); } +vixl::aarch32::Label* CodeGeneratorARMVIXL::NewBakerReadBarrierPatch(uint32_t custom_data) { + baker_read_barrier_patches_.emplace_back(custom_data); + return &baker_read_barrier_patches_.back().label; +} + VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateBootImageStringLiteral( const DexFile& dex_file, dex::StringIndex string_index) { @@ -8578,7 +8901,8 @@ void CodeGeneratorARMVIXL::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_pa /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() + boot_image_type_patches_.size() + /* MOVW+MOVT for each entry */ 2u * pc_relative_type_patches_.size() + - /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size(); + /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size() + + baker_read_barrier_patches_.size(); linker_patches->reserve(size); EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_, linker_patches); @@ -8612,6 +8936,10 @@ void CodeGeneratorARMVIXL::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_pa target_type.dex_file, target_type.type_index.index_)); } + for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) { + linker_patches->push_back(LinkerPatch::BakerReadBarrierBranchPatch(info.label.GetLocation(), + info.custom_data)); + } DCHECK_EQ(size, linker_patches->size()); } diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h index 1e9669dc38..9d56cc32cb 100644 --- a/compiler/optimizing/code_generator_arm_vixl.h +++ b/compiler/optimizing/code_generator_arm_vixl.h @@ -572,6 +572,11 @@ class CodeGeneratorARMVIXL : public CodeGenerator { PcRelativePatchInfo* NewTypeBssEntryPatch(const DexFile& dex_file, dex::TypeIndex type_index); PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, uint32_t element_offset); + + // Add a new baker read barrier patch and return the label to be bound + // before the BNE instruction. + vixl::aarch32::Label* NewBakerReadBarrierPatch(uint32_t custom_data); + VIXLUInt32Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file, dex::StringIndex string_index); VIXLUInt32Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, @@ -589,6 +594,10 @@ class CodeGeneratorARMVIXL : public CodeGenerator { void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE; + // Maybe add the reserved entrypoint register as a temporary for field load. This temp + // is added only for AOT compilation if link-time generated thunks for fields are enabled. + void MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations); + // Fast path implementation of ReadBarrier::Barrier for a heap // reference field load when Baker's read barriers are used. 
void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, @@ -713,6 +722,13 @@ class CodeGeneratorARMVIXL : public CodeGenerator { VIXLUInt32Literal*, TypeReferenceValueComparator>; + struct BakerReadBarrierPatchInfo { + explicit BakerReadBarrierPatchInfo(uint32_t data) : label(), custom_data(data) { } + + vixl::aarch32::Label label; + uint32_t custom_data; + }; + VIXLUInt32Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map); VIXLUInt32Literal* DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map); @@ -750,6 +766,8 @@ class CodeGeneratorARMVIXL : public CodeGenerator { ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; + // Baker read barrier patch info. + ArenaDeque<BakerReadBarrierPatchInfo> baker_read_barrier_patches_; // Patches for string literals in JIT compiled code. StringToLiteralMap jit_string_patches_; diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc index 750f9cc213..c784171fd7 100644 --- a/compiler/optimizing/intrinsics_arm.cc +++ b/compiler/optimizing/intrinsics_arm.cc @@ -1648,6 +1648,8 @@ void IntrinsicLocationsBuilderARM::VisitSystemArrayCopy(HInvoke* invoke) { // is clobbered by ReadBarrierMarkRegX entry points). Get an extra // temporary register from the register allocator. locations->AddTemp(Location::RequiresRegister()); + CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen_); + arm_codegen->MaybeAddBakerCcEntrypointTempForFields(locations); } } diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc index fd8a37ae05..77d870bec2 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.cc +++ b/compiler/optimizing/intrinsics_arm_vixl.cc @@ -2026,6 +2026,8 @@ void IntrinsicLocationsBuilderARMVIXL::VisitSystemArrayCopy(HInvoke* invoke) { // is clobbered by ReadBarrierMarkRegX entry points). Get an extra // temporary register from the register allocator. locations->AddTemp(Location::RequiresRegister()); + CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen_); + arm_codegen->MaybeAddBakerCcEntrypointTempForFields(locations); } } diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h index 0ed8a35338..0f24e81be2 100644 --- a/compiler/utils/arm/assembler_arm.h +++ b/compiler/utils/arm/assembler_arm.h @@ -652,6 +652,9 @@ class ArmAssembler : public Assembler { virtual void blx(Register rm, Condition cond = AL) = 0; virtual void bx(Register rm, Condition cond = AL) = 0; + // ADR instruction loading register for branching to the label. + virtual void AdrCode(Register rt, Label* label) = 0; + // Memory barriers. virtual void dmb(DmbOptions flavor) = 0; diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc index 1e71d06b49..d7096b3c87 100644 --- a/compiler/utils/arm/assembler_thumb2.cc +++ b/compiler/utils/arm/assembler_thumb2.cc @@ -214,14 +214,14 @@ void Thumb2Assembler::EmitFixups(uint32_t adjusted_code_size) { DCHECK_GE(dest_end, src_end); for (auto i = fixups_.rbegin(), end = fixups_.rend(); i != end; ++i) { Fixup* fixup = &*i; + size_t old_fixup_location = fixup->GetLocation(); if (fixup->GetOriginalSize() == fixup->GetSize()) { // The size of this Fixup didn't change. To avoid moving the data // in small chunks, emit the code to its original position. 
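      // (Illustrative note, not part of this patch.) The reordering matters:
      // Finalize() first shifts the fixup's own location_ to its final
      // position so that GetOffset(), called from Emit(), measures the
      // displacement from where the instruction will end up; Emit() then
      // takes the store position explicitly so the bytes can still be written
      // at the old location, from which a preceding size-changing fixup will
      // later move them as part of a larger block.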
- fixup->Emit(&buffer_, adjusted_code_size); fixup->Finalize(dest_end - src_end); + fixup->Emit(old_fixup_location, &buffer_, adjusted_code_size); } else { // Move the data between the end of the fixup and src_end to its final location. - size_t old_fixup_location = fixup->GetLocation(); size_t src_begin = old_fixup_location + fixup->GetOriginalSizeInBytes(); size_t data_size = src_end - src_begin; size_t dest_begin = dest_end - data_size; @@ -230,7 +230,7 @@ void Thumb2Assembler::EmitFixups(uint32_t adjusted_code_size) { dest_end = dest_begin - fixup->GetSizeInBytes(); // Finalize the Fixup and emit the data to the new location. fixup->Finalize(dest_end - src_end); - fixup->Emit(&buffer_, adjusted_code_size); + fixup->Emit(fixup->GetLocation(), &buffer_, adjusted_code_size); } } CHECK_EQ(src_end, dest_end); @@ -1895,6 +1895,9 @@ inline size_t Thumb2Assembler::Fixup::SizeInBytes(Size size) { case kCbxz48Bit: return 6u; + case kCodeAddr4KiB: + return 4u; + case kLiteral1KiB: return 2u; case kLiteral4KiB: @@ -1973,6 +1976,15 @@ inline int32_t Thumb2Assembler::Fixup::GetOffset(uint32_t current_code_size) con diff -= 2; // Extra CMP Rn, #0, 16-bit. break; + case kCodeAddr4KiB: + // The ADR instruction rounds down the PC+4 to a multiple of 4, so if the PC + // isn't a multiple of 2, we need to adjust. + DCHECK_ALIGNED(diff, 2); + diff += location_ & 2; + // Add the Thumb mode bit. + diff += 1; + break; + case kLiteral1KiB: case kLiteral4KiB: case kLongOrFPLiteral1KiB: @@ -1987,8 +1999,8 @@ inline int32_t Thumb2Assembler::Fixup::GetOffset(uint32_t current_code_size) con diff = diff + (diff & 2); DCHECK_GE(diff, 0); break; - case kLiteral1MiB: case kLiteral64KiB: + case kLiteral1MiB: case kLongOrFPLiteral64KiB: case kLiteralAddr64KiB: DCHECK_GE(diff, 4); // The target must be at least 4 bytes after the ADD rX, PC. @@ -2041,6 +2053,10 @@ bool Thumb2Assembler::Fixup::IsCandidateForEmitEarly() const { // We don't support conditional branches beyond +-1MiB. return true; + case kCodeAddr4KiB: + // ADR uses the aligned PC and as such the offset cannot be calculated early. + return false; + case kLiteral1KiB: case kLiteral4KiB: case kLiteral64KiB: @@ -2087,6 +2103,10 @@ uint32_t Thumb2Assembler::Fixup::AdjustSizeIfNeeded(uint32_t current_code_size) // We don't support conditional branches beyond +-1MiB. break; + case kCodeAddr4KiB: + // We don't support Code address ADR beyond +4KiB. 
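      // (Illustrative arithmetic, not part of this patch.) ADR computes
      // AlignDown(PC + 4, 4) + imm with PC + 4 = location_ + 4, which is what
      // the kCodeAddr4KiB case of GetOffset() above compensates for. E.g.
      // with location_ = 0x102 and the label at 0x150: raw diff = 0x4a, plus
      // (location_ & 2) = 2 for the alignment lost to the round-down, plus 1
      // for the Thumb bit, gives imm = 0x4d, and ADR then yields
      // 0x104 + 0x4d = 0x151 = label | 1.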
@@ -2041,6 +2053,10 @@ bool Thumb2Assembler::Fixup::IsCandidateForEmitEarly() const {
       // We don't support conditional branches beyond +-1MiB.
       return true;
 
+    case kCodeAddr4KiB:
+      // ADR uses the aligned PC and as such the offset cannot be calculated early.
+      return false;
+
     case kLiteral1KiB:
     case kLiteral4KiB:
     case kLiteral64KiB:
@@ -2087,6 +2103,10 @@ uint32_t Thumb2Assembler::Fixup::AdjustSizeIfNeeded(uint32_t current_code_size)
       // We don't support conditional branches beyond +-1MiB.
       break;
 
+    case kCodeAddr4KiB:
+      // We don't support Code address ADR beyond +4KiB.
+      break;
+
     case kLiteral1KiB:
       DCHECK(!IsHighRegister(rn_));
       if (IsUint<10>(GetOffset(current_code_size))) {
@@ -2159,13 +2179,15 @@ uint32_t Thumb2Assembler::Fixup::AdjustSizeIfNeeded(uint32_t current_code_size)
   return current_code_size - old_code_size;
 }
 
-void Thumb2Assembler::Fixup::Emit(AssemblerBuffer* buffer, uint32_t code_size) const {
+void Thumb2Assembler::Fixup::Emit(uint32_t emit_location,
+                                  AssemblerBuffer* buffer,
+                                  uint32_t code_size) const {
   switch (GetSize()) {
     case kBranch16Bit: {
       DCHECK(type_ == kUnconditional || type_ == kConditional);
       DCHECK_EQ(type_ == kConditional, cond_ != AL);
       int16_t encoding = BEncoding16(GetOffset(code_size), cond_);
-      buffer->Store<int16_t>(location_, encoding);
+      buffer->Store<int16_t>(emit_location, encoding);
       break;
     }
     case kBranch32Bit: {
@@ -2180,15 +2202,15 @@ void Thumb2Assembler::Fixup::Emit(AssemblerBuffer* buffer, uint32_t code_size) c
         DCHECK_NE(encoding & B12, 0);
         encoding ^= B14 | B12;
       }
-      buffer->Store<int16_t>(location_, encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(encoding & 0xffff));
       break;
     }
     case kCbxz16Bit: {
       DCHECK(type_ == kCompareAndBranchXZero);
       int16_t encoding = CbxzEncoding16(rn_, GetOffset(code_size), cond_);
-      buffer->Store<int16_t>(location_, encoding);
+      buffer->Store<int16_t>(emit_location, encoding);
       break;
     }
     case kCbxz32Bit: {
@@ -2196,8 +2218,8 @@ void Thumb2Assembler::Fixup::Emit(AssemblerBuffer* buffer, uint32_t code_size) c
       DCHECK(cond_ == EQ || cond_ == NE);
       int16_t cmp_encoding = CmpRnImm8Encoding16(rn_, 0);
       int16_t b_encoding = BEncoding16(GetOffset(code_size), cond_);
-      buffer->Store<int16_t>(location_, cmp_encoding);
-      buffer->Store<int16_t>(location_ + 2, b_encoding);
+      buffer->Store<int16_t>(emit_location, cmp_encoding);
+      buffer->Store<int16_t>(emit_location + 2, b_encoding);
       break;
     }
     case kCbxz48Bit: {
@@ -2205,24 +2227,32 @@ void Thumb2Assembler::Fixup::Emit(AssemblerBuffer* buffer, uint32_t code_size) c
       DCHECK(cond_ == EQ || cond_ == NE);
       int16_t cmp_encoding = CmpRnImm8Encoding16(rn_, 0);
       int32_t b_encoding = BEncoding32(GetOffset(code_size), cond_);
-      buffer->Store<int16_t>(location_, cmp_encoding);
-      buffer->Store<int16_t>(location_ + 2u, b_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 4u, static_cast<int16_t>(b_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, cmp_encoding);
+      buffer->Store<int16_t>(emit_location + 2u, b_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 4u, static_cast<int16_t>(b_encoding & 0xffff));
+      break;
+    }
+
+    case kCodeAddr4KiB: {
+      DCHECK(type_ == kLoadCodeAddr);
+      int32_t encoding = AdrEncoding32(rn_, GetOffset(code_size));
+      buffer->Store<int16_t>(emit_location, encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(encoding & 0xffff));
       break;
     }
 
     case kLiteral1KiB: {
       DCHECK(type_ == kLoadLiteralNarrow);
       int16_t encoding = LdrLitEncoding16(rn_, GetOffset(code_size));
-      buffer->Store<int16_t>(location_, encoding);
+      buffer->Store<int16_t>(emit_location, encoding);
       break;
     }
     case kLiteral4KiB: {
       DCHECK(type_ == kLoadLiteralNarrow);
       // GetOffset() uses PC+4 but load literal uses AlignDown(PC+4, 4). Adjust offset accordingly.
       int32_t encoding = LdrLitEncoding32(rn_, GetOffset(code_size));
-      buffer->Store<int16_t>(location_, encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(encoding & 0xffff));
       break;
     }
     case kLiteral64KiB: {
@@ -2242,11 +2272,11 @@ void Thumb2Assembler::Fixup::Emit(AssemblerBuffer* buffer, uint32_t code_size) c
       int32_t mov_encoding = MovModImmEncoding32(rn_, offset & ~0xfff);
       int16_t add_pc_encoding = AddRdnRmEncoding16(rn_, PC);
       int32_t ldr_encoding = LdrRtRnImm12Encoding(rn_, rn_, offset & 0xfff);
-      buffer->Store<int16_t>(location_, mov_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 4u, add_pc_encoding);
-      buffer->Store<int16_t>(location_ + 6u, ldr_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 8u, static_cast<int16_t>(ldr_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, mov_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 4u, add_pc_encoding);
+      buffer->Store<int16_t>(emit_location + 6u, ldr_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 8u, static_cast<int16_t>(ldr_encoding & 0xffff));
       break;
     }
     case kLiteralFar: {
@@ -2256,36 +2286,36 @@ void Thumb2Assembler::Fixup::Emit(AssemblerBuffer* buffer, uint32_t code_size) c
       int32_t movt_encoding = MovtEncoding32(rn_, offset & ~0xffff);
       int16_t add_pc_encoding = AddRdnRmEncoding16(rn_, PC);
       int32_t ldr_encoding = LdrRtRnImm12Encoding(rn_, rn_, 0);
-      buffer->Store<int16_t>(location_, movw_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(movw_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 4u, movt_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 6u, static_cast<int16_t>(movt_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 8u, add_pc_encoding);
-      buffer->Store<int16_t>(location_ + 10u, ldr_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 12u, static_cast<int16_t>(ldr_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, movw_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(movw_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 4u, movt_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 6u, static_cast<int16_t>(movt_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 8u, add_pc_encoding);
+      buffer->Store<int16_t>(emit_location + 10u, ldr_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 12u, static_cast<int16_t>(ldr_encoding & 0xffff));
       break;
     }
     case kLiteralAddr1KiB: {
       DCHECK(type_ == kLoadLiteralAddr);
       int16_t encoding = AdrEncoding16(rn_, GetOffset(code_size));
-      buffer->Store<int16_t>(location_, encoding);
+      buffer->Store<int16_t>(emit_location, encoding);
       break;
     }
     case kLiteralAddr4KiB: {
       DCHECK(type_ == kLoadLiteralAddr);
       int32_t encoding = AdrEncoding32(rn_, GetOffset(code_size));
-      buffer->Store<int16_t>(location_, encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(encoding & 0xffff));
       break;
     }
     case kLiteralAddr64KiB: {
       DCHECK(type_ == kLoadLiteralAddr);
       int32_t mov_encoding = MovwEncoding32(rn_, GetOffset(code_size));
       int16_t add_pc_encoding = AddRdnRmEncoding16(rn_, PC);
-      buffer->Store<int16_t>(location_, mov_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 4u, add_pc_encoding);
+      buffer->Store<int16_t>(emit_location, mov_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 4u, add_pc_encoding);
       break;
     }
     case kLiteralAddrFar: {
@@ -2294,29 +2324,29 @@ void Thumb2Assembler::Fixup::Emit(AssemblerBuffer* buffer, uint32_t code_size) c
       int32_t movw_encoding = MovwEncoding32(rn_, offset & 0xffff);
       int32_t movt_encoding = MovtEncoding32(rn_, offset & ~0xffff);
       int16_t add_pc_encoding = AddRdnRmEncoding16(rn_, PC);
-      buffer->Store<int16_t>(location_, movw_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(movw_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 4u, movt_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 6u, static_cast<int16_t>(movt_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 8u, add_pc_encoding);
+      buffer->Store<int16_t>(emit_location, movw_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(movw_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 4u, movt_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 6u, static_cast<int16_t>(movt_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 8u, add_pc_encoding);
       break;
     }
     case kLongOrFPLiteral1KiB: {
       int32_t encoding = LoadWideOrFpEncoding(PC, GetOffset(code_size));  // DCHECKs type_.
-      buffer->Store<int16_t>(location_, encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(encoding & 0xffff));
      break;
     }
     case kLongOrFPLiteral64KiB: {
       int32_t mov_encoding = MovwEncoding32(IP, GetOffset(code_size));
       int16_t add_pc_encoding = AddRdnRmEncoding16(IP, PC);
       int32_t ldr_encoding = LoadWideOrFpEncoding(IP, 0u);    // DCHECKs type_.
-      buffer->Store<int16_t>(location_, mov_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 4u, add_pc_encoding);
-      buffer->Store<int16_t>(location_ + 6u, ldr_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 8u, static_cast<int16_t>(ldr_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, mov_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 4u, add_pc_encoding);
+      buffer->Store<int16_t>(emit_location + 6u, ldr_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 8u, static_cast<int16_t>(ldr_encoding & 0xffff));
       break;
     }
     case kLongOrFPLiteralFar: {
@@ -2325,13 +2355,13 @@ void Thumb2Assembler::Fixup::Emit(AssemblerBuffer* buffer, uint32_t code_size) c
       int32_t movt_encoding = MovtEncoding32(IP, offset & ~0xffff);
       int16_t add_pc_encoding = AddRdnRmEncoding16(IP, PC);
       int32_t ldr_encoding = LoadWideOrFpEncoding(IP, 0);    // DCHECKs type_.
-      buffer->Store<int16_t>(location_, movw_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(movw_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 4u, movt_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 6u, static_cast<int16_t>(movt_encoding & 0xffff));
-      buffer->Store<int16_t>(location_ + 8u, add_pc_encoding);
-      buffer->Store<int16_t>(location_ + 10u, ldr_encoding >> 16);
-      buffer->Store<int16_t>(location_ + 12u, static_cast<int16_t>(ldr_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location, movw_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 2u, static_cast<int16_t>(movw_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 4u, movt_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 6u, static_cast<int16_t>(movt_encoding & 0xffff));
+      buffer->Store<int16_t>(emit_location + 8u, add_pc_encoding);
+      buffer->Store<int16_t>(emit_location + 10u, ldr_encoding >> 16);
+      buffer->Store<int16_t>(emit_location + 12u, static_cast<int16_t>(ldr_encoding & 0xffff));
       break;
     }
   }
@@ -3331,6 +3361,19 @@ void Thumb2Assembler::bx(Register rm, Condition cond) {
 }
 
+void Thumb2Assembler::AdrCode(Register rt, Label* label) {
+  uint32_t pc = buffer_.Size();
+  FixupId branch_id = AddFixup(Fixup::LoadCodeAddress(pc, rt));
+  CHECK(!label->IsBound());
+  // ADR target must be an unbound label. Add it to a singly-linked list maintained within
+  // the code with the label serving as the head.
+  Emit16(static_cast<uint16_t>(label->position_));
+  label->LinkTo(branch_id);
+  Emit16(0);
+  DCHECK_EQ(buffer_.Size() - pc, GetFixup(branch_id)->GetSizeInBytes());
+}
+
+
 void Thumb2Assembler::Push(Register rd, Condition cond) {
   str(rd, Address(SP, -kRegisterSize, Address::PreIndex), cond);
 }
@@ -3405,7 +3448,7 @@ void Thumb2Assembler::Bind(Label* label) {
         break;
       }
     }
-    last_fixup.Emit(&buffer_, buffer_.Size());
+    last_fixup.Emit(last_fixup.GetLocation(), &buffer_, buffer_.Size());
    fixups_.pop_back();
  }
 }
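AdrCode() requires a forward, still-unbound label and materializes that label's address with the Thumb mode bit set, so the result can be used directly as an indirect branch target. A hedged usage sketch; the surrounding sequence and names are illustrative, not the exact code generator pattern:

    // Assumes an existing Thumb2Assembler; the label must be unbound here.
    void EmitCodeAddressLoad(art::arm::Thumb2Assembler* assembler) {
      art::Label thunk_entry;
      assembler->AdrCode(art::arm::R4, &thunk_entry);  // R4 = &thunk_entry | 1 (Thumb bit).
      // Up to 4KiB of code may follow; kCodeAddr4KiB has no multi-instruction expansion.
      assembler->Bind(&thunk_entry);                   // Bind within the +4KiB ADR range.
      // At runtime, `bx r4` would enter thunk_entry in Thumb state.
    }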
diff --git a/compiler/utils/arm/assembler_thumb2.h b/compiler/utils/arm/assembler_thumb2.h
index 1c495aa7a7..5c36110cf6 100644
--- a/compiler/utils/arm/assembler_thumb2.h
+++ b/compiler/utils/arm/assembler_thumb2.h
@@ -268,6 +268,9 @@ class Thumb2Assembler FINAL : public ArmAssembler {
   void blx(Register rm, Condition cond = AL) OVERRIDE;
   void bx(Register rm, Condition cond = AL) OVERRIDE;
 
+  // ADR instruction loading register for branching to the label, including the Thumb mode bit.
+  void AdrCode(Register rt, Label* label) OVERRIDE;
+
   virtual void Lsl(Register rd, Register rm, uint32_t shift_imm,
                    Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
   virtual void Lsr(Register rd, Register rm, uint32_t shift_imm,
@@ -377,6 +380,10 @@ class Thumb2Assembler FINAL : public ArmAssembler {
     force_32bit_ = true;
   }
 
+  void Allow16Bit() {
+    force_32bit_ = false;
+  }
+
   // Emit an ADR (or a sequence of instructions) to load the jump table address into base_reg. This
   // will generate a fixup.
   JumpTable* CreateJumpTable(std::vector<Label*>&& labels, Register base_reg) OVERRIDE;
@@ -422,6 +429,7 @@ class Thumb2Assembler FINAL : public ArmAssembler {
     kUnconditionalLink,         // BL.
     kUnconditionalLinkX,        // BLX.
     kCompareAndBranchXZero,     // cbz/cbnz.
+    kLoadCodeAddr,              // Get address of a code label, used for Baker read barriers.
     kLoadLiteralNarrow,         // Load narrow integer literal.
     kLoadLiteralWide,           // Load wide integer literal.
     kLoadLiteralAddr,           // Load address of literal (used for jump table).
@@ -442,6 +450,10 @@ class Thumb2Assembler FINAL : public ArmAssembler {
     kCbxz32Bit,   // CMP rX, #0 + Bcc label; X < 8; 16-bit Bcc; +-8-bit offset.
     kCbxz48Bit,   // CMP rX, #0 + Bcc label; X < 8; 32-bit Bcc; up to +-1MiB offset.
 
+    // ADR variants.
+    kCodeAddr4KiB,   // ADR rX, <label>; label must be after the ADR but within 4KiB range.
+                     // Multi-instruction expansion is not supported.
+
     // Load integer literal variants.
     // LDR rX, label; X < 8; 16-bit variant up to 1KiB offset; 2 bytes.
     kLiteral1KiB,
@@ -492,6 +504,12 @@ class Thumb2Assembler FINAL : public ArmAssembler {
                    cond, kCompareAndBranchXZero, kCbxz16Bit, location);
     }
 
+    // Code address.
+    static Fixup LoadCodeAddress(uint32_t location, Register rt) {
+      return Fixup(rt, kNoRegister, kNoSRegister, kNoDRegister,
+                   AL, kLoadCodeAddr, kCodeAddr4KiB, location);
+    }
+
     // Load narrow literal.
     static Fixup LoadNarrowLiteral(uint32_t location, Register rt, Size size) {
       DCHECK(size == kLiteral1KiB || size == kLiteral4KiB || size == kLiteral64KiB ||
@@ -550,6 +568,7 @@ class Thumb2Assembler FINAL : public ArmAssembler {
       switch (GetOriginalSize()) {
         case kBranch32Bit:
         case kCbxz48Bit:
+        case kCodeAddr4KiB:
        case kLiteralFar:
         case kLiteralAddrFar:
         case kLongOrFPLiteralFar:
@@ -623,7 +642,7 @@ class Thumb2Assembler FINAL : public ArmAssembler {
 
     // Emit the branch instruction into the assembler buffer.  This does the
     // encoding into the thumb instruction.
-    void Emit(AssemblerBuffer* buffer, uint32_t code_size) const;
+    void Emit(uint32_t emit_location, AssemblerBuffer* buffer, uint32_t code_size) const;
 
    private:
     Fixup(Register rn, Register rt2, SRegister sd, DRegister dd,
@@ -903,6 +922,24 @@ class Thumb2Assembler FINAL : public ArmAssembler {
   FixupId last_fixup_id_;
 };
 
+class ScopedForce32Bit {
+ public:
+  explicit ScopedForce32Bit(Thumb2Assembler* assembler)
+      : assembler_(assembler), old_force_32bit_(assembler->IsForced32Bit()) {
+    assembler->Force32Bit();
+  }
+
+  ~ScopedForce32Bit() {
+    if (!old_force_32bit_) {
+      assembler_->Allow16Bit();
+    }
+  }
+
+ private:
+  Thumb2Assembler* const assembler_;
+  const bool old_force_32bit_;
+};
+
 }  // namespace arm
 }  // namespace art
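ScopedForce32Bit is an RAII guard: encodings are forced to 32 bits for its scope, and 16-bit encodings are re-allowed on destruction only if they were allowed on entry. A hedged usage sketch; the emitted instruction is illustrative:

    // Keep instruction sizes stable across the span the linker later patches,
    // e.g. the distance between a Baker read barrier BNE and its LDR.
    void EmitWithStableEncodings(art::arm::Thumb2Assembler* assembler, art::Label* patch_label) {
      art::arm::ScopedForce32Bit force_32bit(assembler);
      assembler->b(patch_label, art::arm::NE);  // Illustrative emission of the patched BNE.
      // On scope exit, 16-bit encodings are re-enabled unless they were
      // already disabled before the guard was constructed.
    }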
diff --git a/runtime/arch/arch_test.cc b/runtime/arch/arch_test.cc
index a857976021..1a5e39f0f7 100644
--- a/runtime/arch/arch_test.cc
+++ b/runtime/arch/arch_test.cc
@@ -71,6 +71,11 @@ static constexpr size_t kFrameSizeSaveRefsAndArgs = FRAME_SIZE_SAVE_REFS_AND_ARG
 #undef FRAME_SIZE_SAVE_REFS_AND_ARGS
 static constexpr size_t kFrameSizeSaveEverything = FRAME_SIZE_SAVE_EVERYTHING;
 #undef FRAME_SIZE_SAVE_EVERYTHING
+#undef BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET
+#undef BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET
+#undef BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET
 }  // namespace arm
 
 namespace arm64 {
@@ -83,6 +88,11 @@ static constexpr size_t kFrameSizeSaveRefsAndArgs = FRAME_SIZE_SAVE_REFS_AND_ARG
 #undef FRAME_SIZE_SAVE_REFS_AND_ARGS
 static constexpr size_t kFrameSizeSaveEverything = FRAME_SIZE_SAVE_EVERYTHING;
 #undef FRAME_SIZE_SAVE_EVERYTHING
+#undef BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET
+#undef BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET
+#undef BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET
 }  // namespace arm64
 
 namespace mips {
diff --git a/runtime/arch/arm/asm_support_arm.h b/runtime/arch/arm/asm_support_arm.h
index c03bcae526..f1f1766ad4 100644
--- a/runtime/arch/arm/asm_support_arm.h
+++ b/runtime/arch/arm/asm_support_arm.h
@@ -24,6 +24,28 @@
 #define FRAME_SIZE_SAVE_REFS_AND_ARGS 112
 #define FRAME_SIZE_SAVE_EVERYTHING 192
 
+// The offset from art_quick_read_barrier_mark_introspection to the array switch cases,
+// i.e. art_quick_read_barrier_mark_introspection_arrays.
+#define BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET 0x100
+// The offset from art_quick_read_barrier_mark_introspection to the GC root entrypoint,
+// i.e. art_quick_read_barrier_mark_introspection_gc_roots.
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET 0xc0
+
+// The offset of the reference load LDR from the return address in LR for field loads.
+#ifdef USE_HEAP_POISONING
+#define BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET -8
+#else
+#define BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET -4
+#endif
+// The offset of the reference load LDR from the return address in LR for array loads.
+#ifdef USE_HEAP_POISONING
+#define BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET -8
+#else
+#define BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET -4
+#endif
+// The offset of the reference load LDR from the return address in LR for GC root loads.
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET -8
+
 // Flag for enabling R4 optimization in arm runtime
 // #define ARM_R4_SUSPEND_FLAG
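All of these LDR offsets are relative to the return address in LR, which carries the Thumb mode bit. A small sketch (hypothetical helper) of how the slow path locates the reference-loading LDR and extracts its destination register, mirroring the `ldrh r4, [lr, #(-1 + offset + 2)]; lsr r4, r4, #12` sequence used below:

    #include <cstdint>

    uint32_t ExtractRefRegFromReturnAddress(uintptr_t lr, int32_t ldr_offset) {
      uintptr_t ldr_address = (lr - 1u) + ldr_offset;  // Strip Thumb bit; -4, or -8 with poisoning.
      // The second halfword of the 32-bit LDR holds Rt in bits 15:12.
      uint16_t second_halfword = *reinterpret_cast<const uint16_t*>(ldr_address + 2u);
      return second_halfword >> 12;  // `ref_reg`, the register the LDR would load.
    }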
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index d21d0c07a2..6b7247773a 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -17,6 +17,7 @@
 #include <math.h>
 #include <string.h>
 
+#include "arch/arm/asm_support_arm.h"
 #include "entrypoints/jni/jni_entrypoints.h"
 #include "entrypoints/quick/quick_alloc_entrypoints.h"
 #include "entrypoints/quick/quick_default_externs.h"
@@ -51,6 +52,10 @@ extern "C" mirror::Object* art_quick_read_barrier_mark_reg10(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg11(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg12(mirror::Object*);
 
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection(mirror::Object*);
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_arrays(mirror::Object*);
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_gc_roots(mirror::Object*);
+
 // Used by soft float.
 // Single-precision FP arithmetics.
 extern "C" float fmodf(float a, float b);      // REM_FLOAT[_2ADDR]
@@ -80,6 +85,22 @@ void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_active) {
   qpoints->pReadBarrierMarkReg09 = is_active ? art_quick_read_barrier_mark_reg09 : nullptr;
   qpoints->pReadBarrierMarkReg10 = is_active ? art_quick_read_barrier_mark_reg10 : nullptr;
   qpoints->pReadBarrierMarkReg11 = is_active ? art_quick_read_barrier_mark_reg11 : nullptr;
+
+  // Check that array switch cases are at appropriate offsets from the introspection entrypoint.
+  // For the alignment check, strip the Thumb mode bit.
+  DCHECK_ALIGNED(reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection) - 1u, 256u);
+  intptr_t array_diff =
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_arrays) -
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection);
+  DCHECK_EQ(BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET, array_diff);
+  // Check that the GC root entrypoint is at appropriate offset from the introspection entrypoint.
+  intptr_t gc_roots_diff =
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_gc_roots) -
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection);
+  DCHECK_EQ(BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET, gc_roots_diff);
+  // The register 12, i.e. IP, is reserved, so there is no art_quick_read_barrier_mark_reg12.
+  // We're using the entry to hold a pointer to the introspection entrypoint instead.
+  qpoints->pReadBarrierMarkReg12 = is_active ? art_quick_read_barrier_mark_introspection : nullptr;
 }
 
 void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
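The DCHECKs above pin down the entrypoint layout that the link-time thunks rely on. A standalone restatement of those invariants, as a hypothetical helper over raw Thumb addresses:

    #include <cassert>
    #include <cstdint>

    void CheckIntrospectionLayout(uintptr_t main, uintptr_t gc_roots, uintptr_t arrays) {
      assert(((main - 1u) & 255u) == 0u);  // 256-byte aligned once the Thumb bit is stripped.
      assert(gc_roots - main == 0xc0u);    // BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET.
      assert(arrays - main == 0x100u);     // BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET.
    }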
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index a277edfa29..fa21208fcb 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -2146,6 +2146,216 @@ READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg09, r9
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg10, r10
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, r11
 
+// Helper macros for Baker CC read barrier mark introspection (BRBMI).
+.macro BRBMI_FOR_12_REGISTERS macro_for_register, macro_for_reserved_register
+    \macro_for_register r0
+    \macro_for_register r1
+    \macro_for_register r2
+    \macro_for_register r3
+    \macro_for_reserved_register  // R4 is reserved for the entrypoint address.
+    \macro_for_register r5
+    \macro_for_register r6
+    \macro_for_register r7
+    \macro_for_register r8
+    \macro_for_register r9
+    \macro_for_register r10
+    \macro_for_register r11
+.endm
+
+.macro BRBMI_FOR_REGISTERS macro_for_register, macro_for_reserved_register
+    BRBMI_FOR_12_REGISTERS \macro_for_register, \macro_for_reserved_register
+    \macro_for_reserved_register  // IP is reserved.
+    \macro_for_reserved_register  // SP is reserved.
+    \macro_for_reserved_register  // LR is reserved.
+    \macro_for_reserved_register  // PC is reserved.
+.endm
+
+.macro BRBMI_RETURN_SWITCH_CASE reg
+.Lmark_introspection_return_switch_case_\reg:
+    mov \reg, ip
+    bx lr
+.endm
+
+.macro BRBMI_BAD_RETURN_SWITCH_CASE
+.Lmark_introspection_return_switch_case_bad:
+    BRBMI_BKPT_FILL_4B
+.endm
+
+.macro BRBMI_RETURN_SWITCH_CASE_OFFSET reg
+    .byte (.Lmark_introspection_return_switch_case_\reg - .Lmark_introspection_return_table) / 2
+.endm
+
+.macro BRBMI_BAD_RETURN_SWITCH_CASE_OFFSET
+    .byte (.Lmark_introspection_return_switch_case_bad - .Lmark_introspection_return_table) / 2
+.endm
+
+#if BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET != BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET
+#error "Array and field introspection code sharing requires same LDR offset."
+#endif
+.macro BRBMI_ARRAY_LOAD index_reg
+    ldr ip, [ip, \index_reg, lsl #2]               // 4 bytes.
+    b art_quick_read_barrier_mark_introspection    // Should be 2 bytes, encoding T2.
+    .balign 8                                      // Add padding to 8 bytes.
+.endm
+
+.macro BRBMI_BKPT_FILL_4B
+    bkpt 0
+    bkpt 0
+.endm
+
+.macro BRBMI_BKPT_FILL_8B
+    BRBMI_BKPT_FILL_4B
+    BRBMI_BKPT_FILL_4B
+.endm
+
+.macro BRBMI_SLOW_PATH ldr_offset
+    push {r0-r3, r7, lr}            // Save return address and caller-save registers.
+    .cfi_adjust_cfa_offset 24
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset r1, 4
+    .cfi_rel_offset r2, 8
+    .cfi_rel_offset r3, 12
+    .cfi_rel_offset r7, 16
+    .cfi_rel_offset lr, 20
+
+    mov r0, ip                      // Pass the reference.
+    vpush {s0-s15}                  // save floating-point caller-save registers
+    .cfi_adjust_cfa_offset 64
+    bl artReadBarrierMark           // r0 <- artReadBarrierMark(obj)
+    vpop {s0-s15}                   // restore floating-point registers
+    .cfi_adjust_cfa_offset -64
+    mov ip, r0                      // Move reference to ip in preparation for return switch.
+
+    pop {r0-r3, r7, lr}             // Restore registers.
+    .cfi_adjust_cfa_offset -24
+    .cfi_restore r0
+    .cfi_restore r1
+    .cfi_restore r2
+    .cfi_restore r3
+    .cfi_restore r7
+    .cfi_restore lr
+
+    // Load the half of the instruction that contains Rt. Adjust for the thumb state in LR.
+    ldrh r4, [lr, #(-1 + \ldr_offset + 2)]
+    lsr r4, r4, #12                 // Extract `ref_reg`.
+    b .Lmark_introspection_return_switch
+.endm
+
+    /*
+     * Use introspection to load a reference from the same address as the LDR
+     * instruction in generated code would load (unless loaded by the thunk,
+     * see below), call ReadBarrier::Mark() with that reference if needed
+     * and return it in the same register as the LDR instruction would load.
+     *
+     * The entrypoint is called through a thunk that differs across load kinds.
+     * For field and array loads the LDR instruction in generated code follows
+     * the branch to the thunk, i.e. the LDR is at [LR, #(-4 - 1)] where the -1
+     * is an adjustment for the Thumb mode bit in LR, and the thunk knows the
+     * holder and performs the gray bit check, returning to the LDR instruction
+     * if the object is not gray, so this entrypoint no longer needs to know
+     * anything about the holder. For GC root loads, the LDR instruction in
+     * generated code precedes the branch to the thunk, i.e. the LDR is at
+     * [LR, #(-8 - 1)] where the -1 is again the Thumb mode bit adjustment, and
+     * the thunk does not do the gray bit check.
+     *
+     * For field accesses and array loads with a constant index the thunk loads
+     * the reference into IP using introspection and calls the main entrypoint,
+     * art_quick_read_barrier_mark_introspection. With heap poisoning enabled,
+     * the passed reference is poisoned.
+     *
+     * For array accesses with non-constant index, the thunk inserts the bits
+     * 0-5 of the LDR instruction to the entrypoint address, effectively
+     * calculating a switch case label based on the index register (bits 0-3)
+     * and adding an extra offset (bits 4-5 hold the shift which is always 2
+     * for reference loads) to differentiate from the main entrypoint, then
+     * moves the base register to IP and jumps to the switch case. Therefore
+     * we need to align the main entrypoint to 512 bytes, accounting for
+     * a 256-byte offset followed by 16 array entrypoints starting at
+     * art_quick_read_barrier_mark_introspection_arrays, each containing an LDR
+     * (register) and a branch to the main entrypoint.
+     *
+     * For GC root accesses we cannot use the main entrypoint because of the
+     * different offset where the LDR instruction in generated code is located.
+     * (And even with heap poisoning enabled, GC roots are not poisoned.)
+     * To re-use the same entrypoint pointer in generated code, we make sure
+     * that the gc root entrypoint (a copy of the entrypoint with a different
+     * offset for introspection loads) is located at a known offset (192 bytes,
+     * or BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET) from the main
+     * entrypoint and the GC root thunk adjusts the entrypoint pointer, moves
+     * the root register to IP and jumps to the customized entrypoint,
+     * art_quick_read_barrier_mark_introspection_gc_roots. The thunk also
+     * performs all the fast-path checks, so we need just the slow path.
+     *
+     * The code structure is
+     *   art_quick_read_barrier_mark_introspection:
+     *     Over 128 bytes for the main entrypoint code.
+     *     Padding to 192 bytes if needed.
+     *   art_quick_read_barrier_mark_introspection_gc_roots:
+     *     GC root entrypoint code.
+     *     Padding to 256 bytes if needed.
+     *   art_quick_read_barrier_mark_introspection_arrays:
+     *     Exactly 128 bytes for array load switch cases (16x2 instructions).
+     */
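The array-thunk dispatch described in the comment above reduces to a single address computation; a sketch with a hypothetical helper, assuming the Thumb2 LDR (register) encoding where the second halfword holds the index register in bits 0-3 and the shift in bits 4-5:

    #include <cstdint>

    // Inserting bits 0-5 of the LDR at bit 3 of the 512-byte-aligned entrypoint
    // lands in the 16-entry, 8-byte-stride switch table 256 bytes in: the shift
    // bits (always 2 for reference loads) contribute the 0x100 offset and the
    // index register selects the case.
    uint32_t ArraySwitchCaseAddress(uint32_t main_entrypoint, uint16_t ldr_second_halfword) {
      return main_entrypoint + ((ldr_second_halfword & 0x3fu) << 3);  // 0x100 + index_reg * 8.
    }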
+    .balign 512
+ENTRY art_quick_read_barrier_mark_introspection
+    // At this point, IP contains the reference, R4 can be freely used.
+    // (R4 is reserved for the entrypoint address.)
+    // For heap poisoning, the reference is poisoned, so unpoison it first.
+    UNPOISON_HEAP_REF ip
+    // If reference is null, just return it in the right register.
+    cmp ip, #0
+    beq .Lmark_introspection_return
+    // Use R4 as temp and check the mark bit of the reference.
+    ldr r4, [ip, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    tst r4, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
+    beq .Lmark_introspection_unmarked
+.Lmark_introspection_return:
+    // Load the half of the instruction that contains Rt. Adjust for the thumb state in LR.
+    ldrh r4, [lr, #(-1 + BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET + 2)]
+    lsr r4, r4, #12                 // Extract `ref_reg`.
+.Lmark_introspection_return_switch:
+    tbb [pc, r4]                    // Jump to the switch case.
+.Lmark_introspection_return_table:
+    BRBMI_FOR_REGISTERS BRBMI_RETURN_SWITCH_CASE_OFFSET, BRBMI_BAD_RETURN_SWITCH_CASE_OFFSET
+    .balign 16
+    BRBMI_FOR_12_REGISTERS BRBMI_RETURN_SWITCH_CASE, BRBMI_BAD_RETURN_SWITCH_CASE
+
+    .balign 16
+.Lmark_introspection_unmarked:
+    // Check if the top two bits are one, if this is the case it is a forwarding address.
+#if (LOCK_WORD_STATE_SHIFT != 30) || (LOCK_WORD_STATE_FORWARDING_ADDRESS != 3)
+    // To use "CMP ip, #modified-immediate; BHS", we need the lock word state in
+    // the highest bits and the "forwarding address" state to have all bits set.
+#error "Unexpected lock word state shift or forwarding address state value."
+#endif
+    cmp r4, #(LOCK_WORD_STATE_FORWARDING_ADDRESS << LOCK_WORD_STATE_SHIFT)
+    bhs .Lmark_introspection_forwarding_address
+    BRBMI_SLOW_PATH BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET
+
+    .balign 8
+.Lmark_introspection_forwarding_address:
+    // Shift left by the forwarding address shift. This clears out the state bits since they are
+    // in the top 2 bits of the lock word.
+    lsl ip, r4, #LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT
+    b .Lmark_introspection_return
+
+    .balign 64
+    .thumb_func
+    .type art_quick_read_barrier_mark_introspection_gc_roots, #function
+    .hidden art_quick_read_barrier_mark_introspection_gc_roots
+    .global art_quick_read_barrier_mark_introspection_gc_roots
+art_quick_read_barrier_mark_introspection_gc_roots:
+    BRBMI_SLOW_PATH BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET
+
+    .balign 256
+    .thumb_func
+    .type art_quick_read_barrier_mark_introspection_arrays, #function
+    .hidden art_quick_read_barrier_mark_introspection_arrays
+    .global art_quick_read_barrier_mark_introspection_arrays
+art_quick_read_barrier_mark_introspection_arrays:
+    BRBMI_FOR_REGISTERS BRBMI_ARRAY_LOAD, BRBMI_BKPT_FILL_8B
+END art_quick_read_barrier_mark_introspection
+
 .extern artInvokePolymorphic
ENTRY art_quick_invoke_polymorphic
     SETUP_SAVE_REFS_AND_ARGS_FRAME r2
diff --git a/runtime/oat.h b/runtime/oat.h
index 9b2227bc0c..924f77c65b 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,7 @@ class InstructionSetFeatures;
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  // Revert concurrent graying for immune spaces.
-  static constexpr uint8_t kOatVersion[] = { '1', '2', '2', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '1', '2', '3', '\0' };  // ARM Baker link-time thunks.
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";