author     2017-05-11 14:04:03 +0100
committer  2017-07-11 17:43:27 +0100
commit     97c46466aea25ab63a99b3d1afc558f0d9f55abb
tree       afd225f51d28a77329bc2590a025400e088f260c
parent     00cca3a275562d110a8b35094b9b12fac37f67ab
Introduce a Marking Register in ARM64 code generation.
When generating code for ARM64, maintain the status of
Thread::Current()->GetIsGcMarking() in register X20,
dubbed MR (Marking Register), and check the value of that
register (instead of loading and checking a read barrier
marking entrypoint) in read barriers.
Test: m test-art-target
Test: m test-art-target with tree built with ART_USE_READ_BARRIER=false
Test: ARM64 device boot test
Bug: 37707231
Change-Id: Ibe9bc5c99a2176b0a0476e9e9ad7fcc9f745017b
33 files changed, 426 insertions, 236 deletions
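Conceptually, the patch replaces a per-barrier entrypoint load with a test of the Marking Register: the fast path of every Baker read barrier now checks MR (X20, mirroring Thread::Current()->GetIsGcMarking()), and the ReadBarrierMarkRegXX entrypoint is loaded only inside the slow path. A rough, self-contained C++ sketch of the before/after shape (illustrative only; the struct and function names below are hypothetical, and the real change emits ARM64 machine code through VIXL and the assembly macros touched in the diff):

    #include <cstddef>

    // Hypothetical, simplified model of the relevant per-thread state.
    struct Thread {
      bool is_gc_marking;                    // Mirrored into MR (X20) on ARM64.
      void* (*mark_entrypoints[30])(void*);  // pReadBarrierMarkRegXX; null when the GC is not marking.
    };

    // Before this change: load the per-register mark entrypoint on every
    // read barrier and branch on it being non-null (null <=> not marking).
    inline void* ReadBarrierOld(Thread* self, void* ref, size_t reg) {
      void* (*entrypoint)(void*) = self->mark_entrypoints[reg];  // one extra load per barrier
      if (entrypoint != nullptr) {
        ref = entrypoint(ref);               // slow path: ReadBarrier::Mark(ref)
      }
      return ref;
    }

    // After this change: the fast path only tests the Marking Register;
    // the slow path loads the entrypoint itself.
    inline void* ReadBarrierNew(Thread* self, void* ref, size_t reg) {
      if (self->is_gc_marking) {             // single test of MR in generated code
        void* (*entrypoint)(void*) = self->mark_entrypoints[reg];
        ref = entrypoint(ref);               // slow path: ReadBarrier::Mark(ref)
      }
      return ref;
    }

This is also why the assembly changes below reserve X20, drop it from the callee-save set when Baker read barriers are enabled, and refresh wMR from THREAD_IS_GC_MARKING_OFFSET (REFRESH_MARKING_REGISTER) after any runtime call or long jump that may have suspended the thread.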
diff --git a/compiler/jni/jni_cfi_test_expected.inc b/compiler/jni/jni_cfi_test_expected.inc index 2710ae9b53..acb8a57bec 100644 --- a/compiler/jni/jni_cfi_test_expected.inc +++ b/compiler/jni/jni_cfi_test_expected.inc @@ -89,7 +89,8 @@ static constexpr uint8_t expected_asm_kArm64[] = { 0xF3, 0x53, 0x46, 0xA9, 0xF5, 0x5B, 0x47, 0xA9, 0xF7, 0x63, 0x48, 0xA9, 0xF9, 0x6B, 0x49, 0xA9, 0xFB, 0x73, 0x4A, 0xA9, 0xFD, 0x7B, 0x4B, 0xA9, 0xE8, 0x27, 0x42, 0x6D, 0xEA, 0x2F, 0x43, 0x6D, 0xEC, 0x37, 0x44, 0x6D, - 0xEE, 0x3F, 0x45, 0x6D, 0xFF, 0x03, 0x03, 0x91, 0xC0, 0x03, 0x5F, 0xD6, + 0xEE, 0x3F, 0x45, 0x6D, 0x74, 0x36, 0x40, 0xB9, 0xFF, 0x03, 0x03, 0x91, + 0xC0, 0x03, 0x5F, 0xD6, }; static constexpr uint8_t expected_cfi_kArm64[] = { 0x44, 0x0E, 0xC0, 0x01, 0x44, 0x93, 0x18, 0x94, 0x16, 0x44, 0x95, 0x14, @@ -101,7 +102,7 @@ static constexpr uint8_t expected_cfi_kArm64[] = { 0xD3, 0xD4, 0x44, 0xD5, 0xD6, 0x44, 0xD7, 0xD8, 0x44, 0xD9, 0xDA, 0x44, 0xDB, 0xDC, 0x44, 0xDD, 0xDE, 0x44, 0x06, 0x48, 0x06, 0x49, 0x44, 0x06, 0x4A, 0x06, 0x4B, 0x44, 0x06, 0x4C, 0x06, 0x4D, 0x44, 0x06, 0x4E, 0x06, - 0x4F, 0x44, 0x0E, 0x00, 0x44, 0x0B, 0x0E, 0xC0, 0x01, + 0x4F, 0x48, 0x0E, 0x00, 0x44, 0x0B, 0x0E, 0xC0, 0x01, }; // 0x00000000: sub sp, sp, #0xc0 (192) // 0x00000004: .cfi_def_cfa_offset: 192 @@ -175,11 +176,12 @@ static constexpr uint8_t expected_cfi_kArm64[] = { // 0x0000006c: ldp d14, d15, [sp, #80] // 0x00000070: .cfi_restore_extended: r78 // 0x00000070: .cfi_restore_extended: r79 -// 0x00000070: add sp, sp, #0xc0 (192) -// 0x00000074: .cfi_def_cfa_offset: 0 -// 0x00000074: ret -// 0x00000078: .cfi_restore_state -// 0x00000078: .cfi_def_cfa_offset: 192 +// 0x00000070: ldr w20, [tr, #52] ; is_gc_marking +// 0x00000074: add sp, sp, #0xc0 (192) +// 0x00000078: .cfi_def_cfa_offset: 0 +// 0x00000078: ret +// 0x0000007c: .cfi_restore_state +// 0x0000007c: .cfi_def_cfa_offset: 192 static constexpr uint8_t expected_asm_kX86[] = { 0x57, 0x56, 0x55, 0x83, 0xC4, 0xE4, 0x50, 0x89, 0x4C, 0x24, 0x34, 0xF3, diff --git a/compiler/jni/jni_compiler_test.cc b/compiler/jni/jni_compiler_test.cc index b34d9385c8..6ce7d75da6 100644 --- a/compiler/jni/jni_compiler_test.cc +++ b/compiler/jni/jni_compiler_test.cc @@ -49,6 +49,9 @@ extern "C" JNIEXPORT jint JNICALL Java_MyClassNatives_sbar(JNIEnv*, jclass, jint return count + 1; } +// TODO: In the Baker read barrier configuration, add checks to ensure +// the Marking Register's value is correct. + namespace art { enum class JniKind { diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc index 33f4d77fc2..e086455620 100644 --- a/compiler/jni/quick/arm64/calling_convention_arm64.cc +++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc @@ -108,11 +108,25 @@ static constexpr uint32_t kFpCalleeSpillMask = CalculateFpCalleeSpillMask(); // Calling convention ManagedRegister Arm64ManagedRuntimeCallingConvention::InterproceduralScratchRegister() { - return Arm64ManagedRegister::FromXRegister(X20); // saved on entry restored on exit + // X20 is safe to use as a scratch register: + // - with Baker read barriers, it is reserved as Marking Register, + // and thus does not actually need to be saved/restored; it is + // refreshed on exit (see Arm64JNIMacroAssembler::RemoveFrame); + // - in other cases, it is saved on entry (in + // Arm64JNIMacroAssembler::BuildFrame) and restored on exit (in + // Arm64JNIMacroAssembler::RemoveFrame). 
+ return Arm64ManagedRegister::FromXRegister(X20); } ManagedRegister Arm64JniCallingConvention::InterproceduralScratchRegister() { - return Arm64ManagedRegister::FromXRegister(X20); // saved on entry restored on exit + // X20 is safe to use as a scratch register: + // - with Baker read barriers, it is reserved as Marking Register, + // and thus does not actually need to be saved/restored; it is + // refreshed on exit (see Arm64JNIMacroAssembler::RemoveFrame); + // - in other cases, it is saved on entry (in + // Arm64JNIMacroAssembler::BuildFrame) and restored on exit (in + // Arm64JNIMacroAssembler::RemoveFrame). + return Arm64ManagedRegister::FromXRegister(X20); } static ManagedRegister ReturnRegisterForShorty(const char* shorty) { diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc index bc21607c5b..38c732b8ba 100644 --- a/compiler/linker/arm64/relative_patcher_arm64.cc +++ b/compiler/linker/arm64/relative_patcher_arm64.cc @@ -381,6 +381,21 @@ static void EmitGrayCheckAndFastPath(arm64::Arm64Assembler& assembler, // Note: The fake dependency is unnecessary for the slow path. } +// Load the read barrier introspection entrypoint in register `entrypoint`. +static void LoadReadBarrierMarkIntrospectionEntrypoint(arm64::Arm64Assembler& assembler, + vixl::aarch64::Register entrypoint) { + using vixl::aarch64::MemOperand; + using vixl::aarch64::ip0; + // Thread Register. + const vixl::aarch64::Register tr = vixl::aarch64::x19; + + // entrypoint = Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection. + DCHECK_EQ(ip0.GetCode(), 16u); + const int32_t entry_point_offset = + Thread::ReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ip0.GetCode()); + __ Ldr(entrypoint, MemOperand(tr, entry_point_offset)); +} + void Arm64RelativePatcher::CompileBakerReadBarrierThunk(arm64::Arm64Assembler& assembler, uint32_t encoded_data) { using namespace vixl::aarch64; // NOLINT(build/namespaces) @@ -412,6 +427,7 @@ void Arm64RelativePatcher::CompileBakerReadBarrierThunk(arm64::Arm64Assembler& a __ Bind(&slow_path); MemOperand ldr_address(lr, BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET); __ Ldr(ip0.W(), ldr_address); // Load the LDR (immediate) unsigned offset. + LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ip1); __ Ubfx(ip0.W(), ip0.W(), 10, 12); // Extract the offset. __ Ldr(ip0.W(), MemOperand(base_reg, ip0, LSL, 2)); // Load the reference. // Do not unpoison. With heap poisoning enabled, the entrypoint expects a poisoned reference. @@ -441,6 +457,7 @@ void Arm64RelativePatcher::CompileBakerReadBarrierThunk(arm64::Arm64Assembler& a __ Bind(&slow_path); MemOperand ldr_address(lr, BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET); __ Ldr(ip0.W(), ldr_address); // Load the LDR (register) unsigned offset. + LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ip1); __ Ubfx(ip0, ip0, 16, 6); // Extract the index register, plus 32 (bit 21 is set). __ Bfi(ip1, ip0, 3, 6); // Insert ip0 to the entrypoint address to create // a switch case target based on the index register. @@ -469,6 +486,7 @@ void Arm64RelativePatcher::CompileBakerReadBarrierThunk(arm64::Arm64Assembler& a __ Bind(¬_marked); __ Tst(ip0.W(), Operand(ip0.W(), LSL, 1)); __ B(&forwarding_address, mi); + LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ip1); // Adjust the art_quick_read_barrier_mark_introspection address in IP1 to // art_quick_read_barrier_mark_introspection_gc_roots. 
__ Add(ip1, ip1, Operand(BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET)); diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index 7bf43f7971..73202b4fd1 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -404,17 +404,6 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { // accessing the String's `value` field in String intrinsics. static uint32_t GetArrayDataOffset(HArrayGet* array_get); - // Return the entry point offset for ReadBarrierMarkRegX, where X is `reg`. - template <PointerSize pointer_size> - static int32_t GetReadBarrierMarkEntryPointsOffset(size_t reg) { - // The entry point list defines 30 ReadBarrierMarkRegX entry points. - DCHECK_LT(reg, 30u); - // The ReadBarrierMarkRegX entry points are ordered by increasing - // register number in Thread::tls_Ptr_.quick_entrypoints. - return QUICK_ENTRYPOINT_OFFSET(pointer_size, pReadBarrierMarkReg00).Int32Value() - + static_cast<size_t>(pointer_size) * reg; - } - void EmitParallelMoves(Location from1, Location to1, Primitive::Type type1, diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index 6b9f232e8f..92467fe101 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -729,7 +729,7 @@ class ReadBarrierMarkSlowPathBaseARM : public SlowPathCodeARM { } else { // Entrypoint is not already loaded, load from the thread. int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg); // This runtime call does not require a stack map. arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); } @@ -8428,7 +8428,7 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct // Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection. DCHECK_EQ(IP, 12); const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP); __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset); Label return_address; @@ -8469,7 +8469,7 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); // Loading the entrypoint does not require a load acquire since it is only changed when // threads are suspended or running a checkpoint. __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset); @@ -8572,7 +8572,7 @@ void CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instr // Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection. DCHECK_EQ(IP, 12); const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP); __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset); Label return_address; @@ -8655,7 +8655,7 @@ void CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instr // Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection. 
DCHECK_EQ(IP, 12); const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP); __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset); __ AddConstant(data_reg, obj, data_offset); @@ -8736,7 +8736,7 @@ void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg()); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg()); // Loading the entrypoint does not require a load acquire since it is only changed when // threads are suspended or running a checkpoint. __ LoadFromOffset(kLoadWord, temp2.AsRegister<Register>(), TR, entry_point_offset); @@ -8805,7 +8805,7 @@ void CodeGeneratorARM::UpdateReferenceFieldWithBakerReadBarrier(HInstruction* in // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg()); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg()); // Loading the entrypoint does not require a load acquire since it is only changed when // threads are suspended or running a checkpoint. __ LoadFromOffset(kLoadWord, temp3.AsRegister<Register>(), TR, entry_point_offset); diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 2561ed0762..7e5b1a0fd1 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -672,7 +672,9 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) { // `ref`. // // Argument `entrypoint` must be a register location holding the read -// barrier marking runtime entry point to be invoked. +// barrier marking runtime entry point to be invoked or an empty +// location; in the latter case, the read barrier marking runtime +// entry point will be loaded by the slow path code itself. class ReadBarrierMarkSlowPathBaseARM64 : public SlowPathCodeARM64 { protected: ReadBarrierMarkSlowPathBaseARM64(HInstruction* instruction, Location ref, Location entrypoint) @@ -716,7 +718,7 @@ class ReadBarrierMarkSlowPathBaseARM64 : public SlowPathCodeARM64 { } else { // Entrypoint is not already loaded, load from the thread. int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg()); + Thread::ReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg()); // This runtime call does not require a stack map. arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); } @@ -743,9 +745,10 @@ class ReadBarrierMarkSlowPathBaseARM64 : public SlowPathCodeARM64 { // another thread, or if another thread installed another object // reference (different from `ref`) in `obj.field`). // -// If `entrypoint` is a valid location it is assumed to already be -// holding the entrypoint. The case where the entrypoint is passed in -// is when the decision to mark is based on whether the GC is marking. +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked or an empty +// location; in the latter case, the read barrier marking runtime +// entry point will be loaded by the slow path code itself. 
class ReadBarrierMarkSlowPathARM64 : public ReadBarrierMarkSlowPathBaseARM64 { public: ReadBarrierMarkSlowPathARM64(HInstruction* instruction, @@ -791,7 +794,9 @@ class ReadBarrierMarkSlowPathARM64 : public ReadBarrierMarkSlowPathBaseARM64 { // reference (different from `ref`) in `obj.field`). // // Argument `entrypoint` must be a register location holding the read -// barrier marking runtime entry point to be invoked. +// barrier marking runtime entry point to be invoked or an empty +// location; in the latter case, the read barrier marking runtime +// entry point will be loaded by the slow path code itself. class LoadReferenceWithBakerReadBarrierSlowPathARM64 : public ReadBarrierMarkSlowPathBaseARM64 { public: LoadReferenceWithBakerReadBarrierSlowPathARM64(HInstruction* instruction, @@ -803,7 +808,7 @@ class LoadReferenceWithBakerReadBarrierSlowPathARM64 : public ReadBarrierMarkSlo bool needs_null_check, bool use_load_acquire, Register temp, - Location entrypoint) + Location entrypoint = Location::NoLocation()) : ReadBarrierMarkSlowPathBaseARM64(instruction, ref, entrypoint), obj_(obj), offset_(offset), @@ -947,20 +952,23 @@ class LoadReferenceWithBakerReadBarrierSlowPathARM64 : public ReadBarrierMarkSlo // another object reference (different from `ref`) in `obj.field`). // // Argument `entrypoint` must be a register location holding the read -// barrier marking runtime entry point to be invoked. +// barrier marking runtime entry point to be invoked or an empty +// location; in the latter case, the read barrier marking runtime +// entry point will be loaded by the slow path code itself. class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64 : public ReadBarrierMarkSlowPathBaseARM64 { public: - LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64(HInstruction* instruction, - Location ref, - Register obj, - uint32_t offset, - Location index, - size_t scale_factor, - bool needs_null_check, - bool use_load_acquire, - Register temp, - Location entrypoint) + LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64( + HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + size_t scale_factor, + bool needs_null_check, + bool use_load_acquire, + Register temp, + Location entrypoint = Location::NoLocation()) : ReadBarrierMarkSlowPathBaseARM64(instruction, ref, entrypoint), obj_(obj), offset_(offset), @@ -1655,7 +1663,7 @@ void CodeGeneratorARM64::SetupBlockedRegisters() const { // Blocked core registers: // lr : Runtime reserved. // tr : Runtime reserved. - // xSuspend : Runtime reserved. TODO: Unblock this when the runtime stops using it. + // mr : Runtime reserved. // ip1 : VIXL core temp. // ip0 : VIXL core temp. // @@ -5921,20 +5929,17 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad( // Baker's read barrier are used. if (kBakerReadBarrierLinkTimeThunksEnableForGcRoots && !Runtime::Current()->UseJitCompilation()) { - // Note that we do not actually check the value of `GetIsGcMarking()` - // to decide whether to mark the loaded GC root or not. Instead, we - // load into `temp` (actually IP1) the read barrier mark introspection - // entrypoint. If `temp` is null, it means that `GetIsGcMarking()` is - // false, and vice versa. + // Query `art::Thread::Current()->GetIsGcMarking()` (stored in + // the Marking Register) to decide whether we need to enter + // the slow path to mark the GC root. // // We use link-time generated thunks for the slow path. 
That thunk // checks the reference and jumps to the entrypoint if needed. // - // temp = Thread::Current()->pReadBarrierMarkIntrospection // lr = &return_address; // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. - // if (temp != nullptr) { - // goto gc_root_thunk<root_reg>(lr) + // if (mr) { // Thread::Current()->GetIsGcMarking() + // goto gc_root_thunk<root_reg>(lr) // } // return_address: @@ -5946,11 +5951,6 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad( linker::Arm64RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg.GetCode()); vixl::aarch64::Label* cbnz_label = codegen_->NewBakerReadBarrierPatch(custom_data); - // ip1 = Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection. - DCHECK_EQ(ip0.GetCode(), 16u); - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ip0.GetCode()); - __ Ldr(ip1, MemOperand(tr, entry_point_offset)); EmissionCheckScope guard(GetVIXLAssembler(), 3 * vixl::aarch64::kInstructionSize); vixl::aarch64::Label return_address; __ adr(lr, &return_address); @@ -5961,36 +5961,26 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad( "GC root LDR must be 2 instruction (8B) before the return address label."); __ ldr(root_reg, MemOperand(obj.X(), offset)); __ Bind(cbnz_label); - __ cbnz(ip1, static_cast<int64_t>(0)); // Placeholder, patched at link-time. + __ cbnz(mr, static_cast<int64_t>(0)); // Placeholder, patched at link-time. __ Bind(&return_address); } else { - // Note that we do not actually check the value of - // `GetIsGcMarking()` to decide whether to mark the loaded GC - // root or not. Instead, we load into `temp` the read barrier - // mark entry point corresponding to register `root`. If `temp` - // is null, it means that `GetIsGcMarking()` is false, and vice - // versa. + // Query `art::Thread::Current()->GetIsGcMarking()` (stored in + // the Marking Register) to decide whether we need to enter + // the slow path to mark the GC root. // - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. - // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // if (mr) { // Thread::Current()->GetIsGcMarking() // // Slow path. - // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. + // entrypoint = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // root = entrypoint(root); // root = ReadBarrier::Mark(root); // Entry point call. // } - // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. - Register temp = lr; - SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64( - instruction, root, /* entrypoint */ LocationFrom(temp)); + // Slow path marking the GC root `root`. The entrypoint will + // be loaded by the slow path code. + SlowPathCodeARM64* slow_path = + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, root); codegen_->AddSlowPath(slow_path); - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(root.reg()); - // Loading the entrypoint does not require a load acquire since it is only changed when - // threads are suspended or running a checkpoint. 
- __ Ldr(temp, MemOperand(tr, entry_point_offset)); - // /* GcRoot<mirror::Object> */ root = *(obj + offset) if (fixup_label == nullptr) { __ Ldr(root_reg, MemOperand(obj, offset)); @@ -6005,9 +5995,7 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad( "art::mirror::CompressedReference<mirror::Object> and int32_t " "have different sizes."); - // The entrypoint is null when the GC is not marking, this prevents one load compared to - // checking GetIsGcMarking. - __ Cbnz(temp, slow_path->GetEntryLabel()); + __ Cbnz(mr, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } } else { @@ -6048,20 +6036,19 @@ void CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier(HInstruction* ins if (kBakerReadBarrierLinkTimeThunksEnableForFields && !use_load_acquire && !Runtime::Current()->UseJitCompilation()) { - // Note that we do not actually check the value of `GetIsGcMarking()` - // to decide whether to mark the loaded reference or not. Instead, we - // load into `temp` (actually IP1) the read barrier mark introspection - // entrypoint. If `temp` is null, it means that `GetIsGcMarking()` is - // false, and vice versa. + // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the + // Marking Register) to decide whether we need to enter the slow + // path to mark the reference. Then, in the slow path, check the + // gray bit in the lock word of the reference's holder (`obj`) to + // decide whether to mark `ref` or not. // // We use link-time generated thunks for the slow path. That thunk checks // the holder and jumps to the entrypoint if needed. If the holder is not // gray, it creates a fake dependency and returns to the LDR instruction. // - // temp = Thread::Current()->pReadBarrierMarkIntrospection // lr = &gray_return_address; - // if (temp != nullptr) { - // goto field_thunk<holder_reg, base_reg>(lr) + // if (mr) { // Thread::Current()->GetIsGcMarking() + // goto field_thunk<holder_reg, base_reg>(lr) // } // not_gray_return_address: // // Original reference load. If the offset is too large to fit @@ -6087,17 +6074,12 @@ void CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier(HInstruction* ins obj.GetCode()); vixl::aarch64::Label* cbnz_label = NewBakerReadBarrierPatch(custom_data); - // ip1 = Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection. - DCHECK_EQ(ip0.GetCode(), 16u); - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ip0.GetCode()); - __ Ldr(ip1, MemOperand(tr, entry_point_offset)); EmissionCheckScope guard(GetVIXLAssembler(), (kPoisonHeapReferences ? 4u : 3u) * vixl::aarch64::kInstructionSize); vixl::aarch64::Label return_address; __ adr(lr, &return_address); __ Bind(cbnz_label); - __ cbnz(ip1, static_cast<int64_t>(0)); // Placeholder, patched at link-time. + __ cbnz(mr, static_cast<int64_t>(0)); // Placeholder, patched at link-time. static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4), "Field LDR must be 1 instruction (4B) before the return address label; " " 2 instructions (8B) for heap poisoning."); @@ -6143,20 +6125,19 @@ void CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier(HInstruction* ins if (kBakerReadBarrierLinkTimeThunksEnableForArrays && !Runtime::Current()->UseJitCompilation()) { - // Note that we do not actually check the value of `GetIsGcMarking()` - // to decide whether to mark the loaded reference or not. 
Instead, we - // load into `temp` (actually IP1) the read barrier mark introspection - // entrypoint. If `temp` is null, it means that `GetIsGcMarking()` is - // false, and vice versa. + // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the + // Marking Register) to decide whether we need to enter the slow + // path to mark the reference. Then, in the slow path, check the + // gray bit in the lock word of the reference's holder (`obj`) to + // decide whether to mark `ref` or not. // // We use link-time generated thunks for the slow path. That thunk checks // the holder and jumps to the entrypoint if needed. If the holder is not // gray, it creates a fake dependency and returns to the LDR instruction. // - // temp = Thread::Current()->pReadBarrierMarkIntrospection // lr = &gray_return_address; - // if (temp != nullptr) { - // goto field_thunk<holder_reg, base_reg>(lr) + // if (mr) { // Thread::Current()->GetIsGcMarking() + // goto array_thunk<base_reg>(lr) // } // not_gray_return_address: // // Original reference load. If the offset is too large to fit @@ -6176,18 +6157,13 @@ void CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier(HInstruction* ins linker::Arm64RelativePatcher::EncodeBakerReadBarrierArrayData(temp.GetCode()); vixl::aarch64::Label* cbnz_label = NewBakerReadBarrierPatch(custom_data); - // ip1 = Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection. - DCHECK_EQ(ip0.GetCode(), 16u); - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ip0.GetCode()); - __ Ldr(ip1, MemOperand(tr, entry_point_offset)); __ Add(temp.X(), obj.X(), Operand(data_offset)); EmissionCheckScope guard(GetVIXLAssembler(), (kPoisonHeapReferences ? 4u : 3u) * vixl::aarch64::kInstructionSize); vixl::aarch64::Label return_address; __ adr(lr, &return_address); __ Bind(cbnz_label); - __ cbnz(ip1, static_cast<int64_t>(0)); // Placeholder, patched at link-time. + __ cbnz(mr, static_cast<int64_t>(0)); // Placeholder, patched at link-time. static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4), "Array LDR must be 1 instruction (4B) before the return address label; " " 2 instructions (8B) for heap poisoning."); @@ -6231,35 +6207,28 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* // `instruction->IsArrayGet()` => `!use_load_acquire`. DCHECK(!instruction->IsArrayGet() || !use_load_acquire); - // Query `art::Thread::Current()->GetIsGcMarking()` to decide - // whether we need to enter the slow path to mark the reference. - // Then, in the slow path, check the gray bit in the lock word of - // the reference's holder (`obj`) to decide whether to mark `ref` or - // not. + // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the + // Marking Register) to decide whether we need to enter the slow + // path to mark the reference. Then, in the slow path, check the + // gray bit in the lock word of the reference's holder (`obj`) to + // decide whether to mark `ref` or not. // - // Note that we do not actually check the value of `GetIsGcMarking()`; - // instead, we load into `temp2` the read barrier mark entry point - // corresponding to register `ref`. If `temp2` is null, it means - // that `GetIsGcMarking()` is false, and vice versa. - // - // temp2 = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // if (temp2 != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // if (mr) { // Thread::Current()->GetIsGcMarking() // // Slow path. 
// uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); // lfence; // Load fence or artificial data dependency to prevent load-load reordering // HeapReference<mirror::Object> ref = *src; // Original reference load. // bool is_gray = (rb_state == ReadBarrier::GrayState()); // if (is_gray) { - // ref = temp2(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // entrypoint = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // ref = entrypoint(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. // } // } else { // HeapReference<mirror::Object> ref = *src; // Original reference load. // } // Slow path marking the object `ref` when the GC is marking. The - // entrypoint will already be loaded in `temp2`. - Register temp2 = lr; - Location temp2_loc = LocationFrom(temp2); + // entrypoint will be loaded by the slow path code. SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM64( instruction, @@ -6270,19 +6239,10 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* scale_factor, needs_null_check, use_load_acquire, - temp, - /* entrypoint */ temp2_loc); + temp); AddSlowPath(slow_path); - // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref.reg()); - // Loading the entrypoint does not require a load acquire since it is only changed when - // threads are suspended or running a checkpoint. - __ Ldr(temp2, MemOperand(tr, entry_point_offset)); - // The entrypoint is null when the GC is not marking, this prevents one load compared to - // checking GetIsGcMarking. - __ Cbnz(temp2, slow_path->GetEntryLabel()); + __ Cbnz(mr, slow_path->GetEntryLabel()); // Fast path: the GC is not marking: just load the reference. GenerateRawReferenceLoad( instruction, ref, obj, offset, index, scale_factor, needs_null_check, use_load_acquire); @@ -6303,19 +6263,14 @@ void CodeGeneratorARM64::UpdateReferenceFieldWithBakerReadBarrier(HInstruction* // `instruction->IsArrayGet()` => `!use_load_acquire`. DCHECK(!instruction->IsArrayGet() || !use_load_acquire); - // Query `art::Thread::Current()->GetIsGcMarking()` to decide - // whether we need to enter the slow path to update the reference - // field within `obj`. Then, in the slow path, check the gray bit - // in the lock word of the reference's holder (`obj`) to decide - // whether to mark `ref` and update the field or not. - // - // Note that we do not actually check the value of `GetIsGcMarking()`; - // instead, we load into `temp2` the read barrier mark entry point - // corresponding to register `ref`. If `temp2` is null, it means - // that `GetIsGcMarking()` is false, and vice versa. + // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the + // Marking Register) to decide whether we need to enter the slow + // path to update the reference field within `obj`. Then, in the + // slow path, check the gray bit in the lock word of the reference's + // holder (`obj`) to decide whether to mark `ref` and update the + // field or not. // - // temp2 = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // if (temp2 != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // if (mr) { // Thread::Current()->GetIsGcMarking() // // Slow path. 
// uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); // lfence; // Load fence or artificial data dependency to prevent load-load reordering @@ -6323,15 +6278,14 @@ void CodeGeneratorARM64::UpdateReferenceFieldWithBakerReadBarrier(HInstruction* // bool is_gray = (rb_state == ReadBarrier::GrayState()); // if (is_gray) { // old_ref = ref; - // ref = temp2(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // entrypoint = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // ref = entrypoint(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. // compareAndSwapObject(obj, field_offset, old_ref, ref); // } // } // Slow path updating the object reference at address `obj + field_offset` - // when the GC is marking. The entrypoint will already be loaded in `temp2`. - Register temp2 = lr; - Location temp2_loc = LocationFrom(temp2); + // when the GC is marking. The entrypoint will be loaded by the slow path code. SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64( instruction, @@ -6342,19 +6296,10 @@ void CodeGeneratorARM64::UpdateReferenceFieldWithBakerReadBarrier(HInstruction* /* scale_factor */ 0u /* "times 1" */, needs_null_check, use_load_acquire, - temp, - /* entrypoint */ temp2_loc); + temp); AddSlowPath(slow_path); - // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref.reg()); - // Loading the entrypoint does not require a load acquire since it is only changed when - // threads are suspended or running a checkpoint. - __ Ldr(temp2, MemOperand(tr, entry_point_offset)); - // The entrypoint is null when the GC is not marking, this prevents one load compared to - // checking GetIsGcMarking. - __ Cbnz(temp2, slow_path->GetEntryLabel()); + __ Cbnz(mr, slow_path->GetEntryLabel()); // Fast path: the GC is not marking: nothing to do (the field is // up-to-date, and we don't need to load the reference). __ Bind(slow_path->GetExitLabel()); diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index d9c49d19bb..584eead81b 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -70,21 +70,32 @@ static const vixl::aarch64::FPRegister kParameterFPRegisters[] = { }; static constexpr size_t kParameterFPRegistersLength = arraysize(kParameterFPRegisters); -// Thread Register +// Thread Register. const vixl::aarch64::Register tr = vixl::aarch64::x19; +// Marking Register. +const vixl::aarch64::Register mr = vixl::aarch64::x20; // Method register on invoke. static const vixl::aarch64::Register kArtMethodRegister = vixl::aarch64::x0; const vixl::aarch64::CPURegList vixl_reserved_core_registers(vixl::aarch64::ip0, vixl::aarch64::ip1); const vixl::aarch64::CPURegList vixl_reserved_fp_registers(vixl::aarch64::d31); -const vixl::aarch64::CPURegList runtime_reserved_core_registers(tr, vixl::aarch64::lr); - -// Callee-saved registers AAPCS64 (without x19 - Thread Register) -const vixl::aarch64::CPURegList callee_saved_core_registers(vixl::aarch64::CPURegister::kRegister, - vixl::aarch64::kXRegSize, - vixl::aarch64::x20.GetCode(), - vixl::aarch64::x30.GetCode()); +const vixl::aarch64::CPURegList runtime_reserved_core_registers = + vixl::aarch64::CPURegList( + tr, + // Reserve X20 as Marking Register when emitting Baker read barriers. 
+ ((kEmitCompilerReadBarrier && kUseBakerReadBarrier) ? mr : vixl::aarch64::NoCPUReg), + vixl::aarch64::lr); + +// Callee-save registers AAPCS64, without x19 (Thread Register) (nor +// x20 (Marking Register) when emitting Baker read barriers). +const vixl::aarch64::CPURegList callee_saved_core_registers( + vixl::aarch64::CPURegister::kRegister, + vixl::aarch64::kXRegSize, + ((kEmitCompilerReadBarrier && kUseBakerReadBarrier) + ? vixl::aarch64::x21.GetCode() + : vixl::aarch64::x20.GetCode()), + vixl::aarch64::x30.GetCode()); const vixl::aarch64::CPURegList callee_saved_fp_registers(vixl::aarch64::CPURegister::kFPRegister, vixl::aarch64::kDRegSize, vixl::aarch64::d8.GetCode(), diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index 9a2402be04..7334678f99 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -786,7 +786,7 @@ class ReadBarrierMarkSlowPathBaseARMVIXL : public SlowPathCodeARMVIXL { } else { // Entrypoint is not already loaded, load from the thread. int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode()); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode()); // This runtime call does not require a stack map. arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); } @@ -8559,7 +8559,7 @@ void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad( // Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection. DCHECK_EQ(ip.GetCode(), 12u); const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode()); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode()); __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset)); vixl::EmissionCheckScope guard(GetVIXLAssembler(), @@ -8601,7 +8601,7 @@ void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad( // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); // Loading the entrypoint does not require a load acquire since it is only changed when // threads are suspended or running a checkpoint. GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset); @@ -8705,7 +8705,7 @@ void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* i // Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection. DCHECK_EQ(ip.GetCode(), 12u); const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode()); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode()); __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset)); vixl::EmissionCheckScope guard( @@ -8797,7 +8797,7 @@ void CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier(HInstruction* i // Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection. 
DCHECK_EQ(ip.GetCode(), 12u); const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode()); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode()); __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset)); __ Add(data_reg, obj, Operand(data_offset)); @@ -8883,7 +8883,7 @@ void CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier(HInstructio // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg()); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg()); // Loading the entrypoint does not require a load acquire since it is only changed when // threads are suspended or running a checkpoint. GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp2), tr, entry_point_offset); @@ -8951,7 +8951,7 @@ void CodeGeneratorARMVIXL::UpdateReferenceFieldWithBakerReadBarrier(HInstruction // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg()); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg()); // Loading the entrypoint does not require a load acquire since it is only changed when // threads are suspended or running a checkpoint. GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp3), tr, entry_point_offset); diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index abe1d70216..be8f9e9cf8 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -656,7 +656,7 @@ class ReadBarrierMarkSlowPathMIPS : public SlowPathCodeMIPS { __ NopIfNoReordering(); } else { int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(ref_reg - 1); + Thread::ReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(ref_reg - 1); // This runtime call does not require a stack map. mips_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, @@ -750,7 +750,7 @@ class ReadBarrierMarkAndUpdateFieldSlowPathMIPS : public SlowPathCodeMIPS { // rX <- ReadBarrierMarkRegX(rX) // int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(ref_reg - 1); + Thread::ReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(ref_reg - 1); // This runtime call does not require a stack map. mips_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, @@ -6497,7 +6497,7 @@ void InstructionCodeGeneratorMIPS::GenerateGcRootFieldLoad(HInstruction* instruc // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(root.reg() - 1); + Thread::ReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(root.reg() - 1); // Loading the entrypoint does not require a load acquire since it is only changed when // threads are suspended or running a checkpoint. 
__ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset); diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index 232241c5ad..cf6b3d5805 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -606,7 +606,7 @@ class ReadBarrierMarkSlowPathMIPS64 : public SlowPathCodeMIPS64 { __ Nop(); } else { int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(ref_reg - 1); + Thread::ReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(ref_reg - 1); // This runtime call does not require a stack map. mips64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, @@ -699,7 +699,7 @@ class ReadBarrierMarkAndUpdateFieldSlowPathMIPS64 : public SlowPathCodeMIPS64 { // rX <- ReadBarrierMarkRegX(rX) // int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(ref_reg - 1); + Thread::ReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(ref_reg - 1); // This runtime call does not require a stack map. mips64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, @@ -4421,7 +4421,7 @@ void InstructionCodeGeneratorMIPS64::GenerateGcRootFieldLoad( // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(root.reg() - 1); + Thread::ReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(root.reg() - 1); // Loading the entrypoint does not require a load acquire since it is only changed when // threads are suspended or running a checkpoint. __ LoadFromOffset(kLoadDoubleword, temp.AsRegister<GpuRegister>(), TR, entry_point_offset); diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 79fccfeaef..af0e6462a2 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -509,8 +509,7 @@ class ReadBarrierMarkSlowPathX86 : public SlowPathCode { // // rX <- ReadBarrierMarkRegX(rX) // - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(ref_reg); + int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kX86PointerSize>(ref_reg); // This runtime call does not require a stack map. x86_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); __ jmp(GetExitLabel()); @@ -595,8 +594,7 @@ class ReadBarrierMarkAndUpdateFieldSlowPathX86 : public SlowPathCode { // // rX <- ReadBarrierMarkRegX(rX) // - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(ref_reg); + int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kX86PointerSize>(ref_reg); // This runtime call does not require a stack map. x86_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); @@ -7153,7 +7151,7 @@ void InstructionCodeGeneratorX86::GenerateGcRootFieldLoad( // Test the entrypoint (`Thread::Current()->pReadBarrierMarkReg ## root.reg()`). const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(root.reg()); + Thread::ReadBarrierMarkEntryPointsOffset<kX86PointerSize>(root.reg()); __ fs()->cmpl(Address::Absolute(entry_point_offset), Immediate(0)); // The entrypoint is null when the GC is not marking. 
__ j(kNotEqual, slow_path->GetEntryLabel()); diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index 57319ce735..86f6d51734 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -524,7 +524,7 @@ class ReadBarrierMarkSlowPathX86_64 : public SlowPathCode { // rX <- ReadBarrierMarkRegX(rX) // int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(ref_reg); + Thread::ReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(ref_reg); // This runtime call does not require a stack map. x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); __ jmp(GetExitLabel()); @@ -615,7 +615,7 @@ class ReadBarrierMarkAndUpdateFieldSlowPathX86_64 : public SlowPathCode { // rX <- ReadBarrierMarkRegX(rX) // int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(ref_reg); + Thread::ReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(ref_reg); // This runtime call does not require a stack map. x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); @@ -6540,7 +6540,7 @@ void InstructionCodeGeneratorX86_64::GenerateGcRootFieldLoad( // Test the `Thread::Current()->pReadBarrierMarkReg ## root.reg()` entrypoint. const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(root.reg()); + Thread::ReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(root.reg()); __ gs()->cmpl(Address::Absolute(entry_point_offset, /* no_rip */ true), Immediate(0)); // The entrypoint is null when the GC is not marking. __ j(kNotEqual, slow_path->GetEntryLabel()); diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc index ae5f8d1760..37958660e1 100644 --- a/compiler/optimizing/intrinsics_arm.cc +++ b/compiler/optimizing/intrinsics_arm.cc @@ -154,8 +154,7 @@ class ReadBarrierSystemArrayCopySlowPathARM : public SlowPathCode { DCHECK(0 <= tmp && tmp < kNumberOfCoreRegisters) << tmp; // TODO: Load the entrypoint once before the loop, instead of // loading it at every iteration. - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp); + int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp); // This runtime call does not require a stack map. arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); __ MaybePoisonHeapReference(tmp); diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 37d79814be..aec1ec7669 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -205,7 +205,7 @@ class ReadBarrierSystemArrayCopySlowPathARM64 : public SlowPathCodeARM64 { // TODO: Load the entrypoint once before the loop, instead of // loading it at every iteration. int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(tmp_.reg()); + Thread::ReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(tmp_.reg()); // This runtime call does not require a stack map. 
codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); codegen->GetAssembler()->MaybePoisonHeapReference(tmp_reg); diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc index 3c9b613803..ced931b36b 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.cc +++ b/compiler/optimizing/intrinsics_arm_vixl.cc @@ -226,7 +226,7 @@ class ReadBarrierSystemArrayCopySlowPathARMVIXL : public SlowPathCodeARMVIXL { // TODO: Load the entrypoint once before the loop, instead of // loading it at every iteration. int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp.GetCode()); + Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp.GetCode()); // This runtime call does not require a stack map. arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); assembler->MaybePoisonHeapReference(tmp); diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc index 6b4851d541..a18b0cc400 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -143,8 +143,7 @@ class ReadBarrierSystemArrayCopySlowPathX86 : public SlowPathCode { // explanations.) DCHECK_NE(temp2, ESP); DCHECK(0 <= temp2 && temp2 < kNumberOfCpuRegisters) << temp2; - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(temp2); + int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kX86PointerSize>(temp2); // This runtime call does not require a stack map. x86_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); __ MaybePoisonHeapReference(temp2); diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index ef98b7be30..5abdb1d1bd 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -105,8 +105,7 @@ class ReadBarrierSystemArrayCopySlowPathX86_64 : public SlowPathCode { // No need to save live registers; it's taken care of by the // entrypoint. Also, there is no need to update the stack mask, // as this runtime call will not trigger a garbage collection. - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(TMP); + int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(TMP); // This runtime call does not require a stack map. 
x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); __ MaybePoisonHeapReference(CpuRegister(TMP)); diff --git a/compiler/optimizing/optimizing_cfi_test_expected.inc b/compiler/optimizing/optimizing_cfi_test_expected.inc index 60af2b4201..abab431bb2 100644 --- a/compiler/optimizing/optimizing_cfi_test_expected.inc +++ b/compiler/optimizing/optimizing_cfi_test_expected.inc @@ -31,21 +31,21 @@ static constexpr uint8_t expected_cfi_kThumb2[] = { // 0x00000010: .cfi_def_cfa_offset: 64 static constexpr uint8_t expected_asm_kArm64[] = { - 0xFF, 0x03, 0x01, 0xD1, 0xF4, 0x17, 0x00, 0xF9, 0xF5, 0x7B, 0x03, 0xA9, - 0xE8, 0xA7, 0x01, 0x6D, 0xE8, 0xA7, 0x41, 0x6D, 0xF4, 0x17, 0x40, 0xF9, - 0xF5, 0x7B, 0x43, 0xA9, 0xFF, 0x03, 0x01, 0x91, 0xC0, 0x03, 0x5F, 0xD6, + 0xFF, 0x03, 0x01, 0xD1, 0xF5, 0x17, 0x00, 0xF9, 0xF6, 0x7B, 0x03, 0xA9, + 0xE8, 0xA7, 0x01, 0x6D, 0xE8, 0xA7, 0x41, 0x6D, 0xF5, 0x17, 0x40, 0xF9, + 0xF6, 0x7B, 0x43, 0xA9, 0xFF, 0x03, 0x01, 0x91, 0xC0, 0x03, 0x5F, 0xD6, }; static constexpr uint8_t expected_cfi_kArm64[] = { - 0x44, 0x0E, 0x40, 0x44, 0x94, 0x06, 0x44, 0x95, 0x04, 0x9E, 0x02, 0x44, + 0x44, 0x0E, 0x40, 0x44, 0x95, 0x06, 0x44, 0x96, 0x04, 0x9E, 0x02, 0x44, 0x05, 0x48, 0x0A, 0x05, 0x49, 0x08, 0x0A, 0x44, 0x06, 0x48, 0x06, 0x49, - 0x44, 0xD4, 0x44, 0xD5, 0xDE, 0x44, 0x0E, 0x00, 0x44, 0x0B, 0x0E, 0x40, + 0x44, 0xD5, 0x44, 0xD6, 0xDE, 0x44, 0x0E, 0x00, 0x44, 0x0B, 0x0E, 0x40, }; // 0x00000000: sub sp, sp, #0x40 (64) // 0x00000004: .cfi_def_cfa_offset: 64 -// 0x00000004: str x20, [sp, #40] -// 0x00000008: .cfi_offset: r20 at cfa-24 -// 0x00000008: stp x21, lr, [sp, #48] -// 0x0000000c: .cfi_offset: r21 at cfa-16 +// 0x00000004: str x21, [sp, #40] +// 0x00000008: .cfi_offset: r21 at cfa-24 +// 0x00000008: stp x22, lr, [sp, #48] +// 0x0000000c: .cfi_offset: r22 at cfa-16 // 0x0000000c: .cfi_offset: r30 at cfa-8 // 0x0000000c: stp d8, d9, [sp, #24] // 0x00000010: .cfi_offset_extended: r72 at cfa-40 @@ -54,10 +54,10 @@ static constexpr uint8_t expected_cfi_kArm64[] = { // 0x00000010: ldp d8, d9, [sp, #24] // 0x00000014: .cfi_restore_extended: r72 // 0x00000014: .cfi_restore_extended: r73 -// 0x00000014: ldr x20, [sp, #40] -// 0x00000018: .cfi_restore: r20 -// 0x00000018: ldp x21, lr, [sp, #48] -// 0x0000001c: .cfi_restore: r21 +// 0x00000014: ldr x21, [sp, #40] +// 0x00000018: .cfi_restore: r21 +// 0x00000018: ldp x22, lr, [sp, #48] +// 0x0000001c: .cfi_restore: r22 // 0x0000001c: .cfi_restore: r30 // 0x0000001c: add sp, sp, #0x40 (64) // 0x00000020: .cfi_def_cfa_offset: 0 diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc index 9cd6884cbe..c436fd902c 100644 --- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc +++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc @@ -772,6 +772,13 @@ void Arm64JNIMacroAssembler::RemoveFrame(size_t frame_size, asm_.UnspillRegisters(core_reg_list, frame_size - core_reg_size); asm_.UnspillRegisters(fp_reg_list, frame_size - core_reg_size - fp_reg_size); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // Refresh Mark Register. + // TODO: Refresh MR only if suspend is taken. + ___ Ldr(reg_w(MR), + MemOperand(reg_x(TR), Thread::IsGcMarkingOffset<kArm64PointerSize>().Int32Value())); + } + // Decrease frame size to start of callee saved regs. 
DecreaseFrameSize(frame_size); diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S index 676efc4a77..b909bda3f7 100644 --- a/runtime/arch/arm/quick_entrypoints_arm.S +++ b/runtime/arch/arm/quick_entrypoints_arm.S @@ -1331,7 +1331,7 @@ GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_initialized_tlab, art // r0: type r1: component_count r2: total_size r9: Thread::Current, r3, r12: free. // Need to preserve r0 and r1 to the slow path. .macro ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE slowPathLabel - and r2, r2, #OBJECT_ALIGNMENT_MASK_TOGGLED // Apply alignemnt mask + and r2, r2, #OBJECT_ALIGNMENT_MASK_TOGGLED // Apply alignment mask // (addr + 7) & ~7. // Load thread_local_pos (r3) and diff --git a/runtime/arch/arm64/asm_support_arm64.S b/runtime/arch/arm64/asm_support_arm64.S index bcf55e339e..715fc35ff4 100644 --- a/runtime/arch/arm64/asm_support_arm64.S +++ b/runtime/arch/arm64/asm_support_arm64.S @@ -33,6 +33,12 @@ #define xIP1 x17 #define wIP1 w17 +#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER) +// Marking Register, holding Thread::Current()->GetIsGcMarking(). +// Only used with the Concurrent Copying (CC) garbage +// collector, with the Baker read barrier configuration. +#define wMR w20 +#endif .macro ENTRY name .type \name, #function @@ -55,14 +61,14 @@ END \name .endm -// Macros to poison (negate) the reference for heap poisoning. +// Macro to poison (negate) the reference for heap poisoning. .macro POISON_HEAP_REF rRef #ifdef USE_HEAP_POISONING neg \rRef, \rRef #endif // USE_HEAP_POISONING .endm -// Macros to unpoison (negate) the reference for heap poisoning. +// Macro to unpoison (negate) the reference for heap poisoning. .macro UNPOISON_HEAP_REF rRef #ifdef USE_HEAP_POISONING neg \rRef, \rRef diff --git a/runtime/arch/arm64/context_arm64.cc b/runtime/arch/arm64/context_arm64.cc index 0465c1e79d..0f0814a675 100644 --- a/runtime/arch/arm64/context_arm64.cc +++ b/runtime/arch/arm64/context_arm64.cc @@ -137,7 +137,9 @@ void Arm64Context::DoLongJump() { for (size_t i = 0; i < kNumberOfDRegisters; ++i) { fprs[i] = fprs_[i] != nullptr ? *fprs_[i] : Arm64Context::kBadFprBase + i; } + // Ensure the Thread Register contains the address of the current thread. DCHECK_EQ(reinterpret_cast<uintptr_t>(Thread::Current()), gprs[TR]); + // The Marking Register will be updated by art_quick_do_long_jump. art_quick_do_long_jump(gprs, fprs); } diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S index 138dbf9495..e097a336d4 100644 --- a/runtime/arch/arm64/quick_entrypoints_arm64.S +++ b/runtime/arch/arm64/quick_entrypoints_arm64.S @@ -39,6 +39,18 @@ .cfi_restore \reg .endm +.macro SAVE_REG_INCREASE_FRAME reg, frame_adjustment + str \reg, [sp, #-(\frame_adjustment)]! + .cfi_adjust_cfa_offset (\frame_adjustment) + .cfi_rel_offset \reg, 0 +.endm + +.macro RESTORE_REG_DECREASE_FRAME reg, frame_adjustment + ldr \reg, [sp], #(\frame_adjustment) + .cfi_restore \reg + .cfi_adjust_cfa_offset -(\frame_adjustment) +.endm + .macro SAVE_TWO_REGS reg1, reg2, offset stp \reg1, \reg2, [sp, #(\offset)] .cfi_rel_offset \reg1, (\offset) @@ -140,6 +152,9 @@ SAVE_TWO_REGS x29, xLR, 80 // Store ArtMethod* Runtime::callee_save_methods_[kSaveRefsOnly]. + // Note: We could avoid saving X20 in the case of Baker read + // barriers, as it is overwritten by REFRESH_MARKING_REGISTER + // later; but it's not worth handling this special case. 
stp xIP0, x20, [sp] .cfi_rel_offset x20, 8 @@ -151,6 +166,9 @@ // TODO: Probably no need to restore registers preserved by aapcs64. .macro RESTORE_SAVE_REFS_ONLY_FRAME // Callee-saves. + // Note: Likewise, we could avoid restoring X20 in the case of Baker + // read barriers, as it is overwritten by REFRESH_MARKING_REGISTER + // later; but it's not worth handling this special case. RESTORE_REG x20, 8 RESTORE_TWO_REGS x21, x22, 16 RESTORE_TWO_REGS x23, x24, 32 @@ -165,11 +183,6 @@ DECREASE_FRAME 96 .endm -.macro RESTORE_SAVE_REFS_ONLY_FRAME_AND_RETURN - RESTORE_SAVE_REFS_ONLY_FRAME - ret -.endm - .macro SETUP_SAVE_REFS_AND_ARGS_FRAME_INTERNAL INCREASE_FRAME 224 @@ -192,6 +205,9 @@ SAVE_TWO_REGS x5, x6, 112 // x7, Callee-saves. + // Note: We could avoid saving X20 in the case of Baker read + // barriers, as it is overwritten by REFRESH_MARKING_REGISTER + // later; but it's not worth handling this special case. SAVE_TWO_REGS x7, x20, 128 SAVE_TWO_REGS x21, x22, 144 SAVE_TWO_REGS x23, x24, 160 @@ -250,6 +266,9 @@ RESTORE_TWO_REGS x5, x6, 112 // x7, Callee-saves. + // Note: Likewise, we could avoid restoring X20 in the case of Baker + // read barriers, as it is overwritten by REFRESH_MARKING_REGISTER + // later; but it's not worth handling this special case. RESTORE_TWO_REGS x7, x20, 128 RESTORE_TWO_REGS x21, x22, 144 RESTORE_TWO_REGS x23, x24, 160 @@ -358,7 +377,7 @@ ldp d29, d30, [sp, #240] ldr d31, [sp, #256] - // Restore core registers. + // Restore core registers, except x0. RESTORE_TWO_REGS x1, x2, 272 RESTORE_TWO_REGS x3, x4, 288 RESTORE_TWO_REGS x5, x6, 304 @@ -379,10 +398,21 @@ .endm .macro RESTORE_SAVE_EVERYTHING_FRAME - RESTORE_REG x0, 264 + RESTORE_REG x0, 264 RESTORE_SAVE_EVERYTHING_FRAME_KEEP_X0 .endm +// Macro to refresh the Marking Register (W20). +// +// This macro must be called at the end of functions implementing +// entrypoints that possibly (directly or indirectly) perform a +// suspend check (before they return). +.macro REFRESH_MARKING_REGISTER +#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER) + ldr wMR, [xSELF, #THREAD_IS_GC_MARKING_OFFSET] +#endif +.endm + .macro RETURN_IF_RESULT_IS_ZERO cbnz x0, 1f // result non-zero branch over ret // return @@ -562,6 +592,7 @@ NO_ARG_RUNTIME_EXCEPTION art_quick_throw_stack_overflow, artThrowStackOverflowFr bl \cxx_name // (method_idx, this, Thread*, SP) mov xIP0, x1 // save Method*->code_ RESTORE_SAVE_REFS_AND_ARGS_FRAME + REFRESH_MARKING_REGISTER cbz x0, 1f // did we find the target? if not go to exception delivery br xIP0 // tail call to target 1: @@ -661,13 +692,15 @@ SAVE_SIZE_AND_METHOD=SAVE_SIZE+8 .macro INVOKE_STUB_CALL_AND_RETURN + REFRESH_MARKING_REGISTER + // load method-> METHOD_QUICK_CODE_OFFSET ldr x9, [x0, #ART_METHOD_QUICK_CODE_OFFSET_64] // Branch to method. blr x9 // Restore return value address and shorty address. - ldp x4,x5, [xFP, #16] + ldp x4, x5, [xFP, #16] .cfi_restore x4 .cfi_restore x5 @@ -1046,6 +1079,7 @@ SAVE_SIZE=15*8 // x3, x4, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, SP stp x3, x4, [sp, #16] // Save result and shorty addresses. stp xFP, xLR, [sp] // Store LR & FP. mov xSELF, x5 // Move thread pointer into SELF register. 
+ REFRESH_MARKING_REGISTER sub sp, sp, #16 str xzr, [sp] // Store null for ArtMethod* slot @@ -1152,7 +1186,7 @@ ENTRY art_quick_do_long_jump ldp x24, x25, [x0], #-16 ldp x22, x23, [x0], #-16 ldp x20, x21, [x0], #-16 - ldp x18, x19, [x0], #-16 + ldp x18, x19, [x0], #-16 // X18 & xSELF ldp x16, x17, [x0], #-16 ldp x14, x15, [x0], #-16 ldp x12, x13, [x0], #-16 @@ -1163,6 +1197,8 @@ ENTRY art_quick_do_long_jump ldp x2, x3, [x0], #-16 mov sp, x1 + REFRESH_MARKING_REGISTER + // Need to load PC, it's at the end (after the space for the unused XZR). Use x1. ldr x1, [x0, #33*8] // And the value of x0. @@ -1213,6 +1249,7 @@ ENTRY art_quick_lock_object mov x1, xSELF // pass Thread::Current bl artLockObjectFromCode // (Object* obj, Thread*) RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER RETURN_IF_W0_IS_ZERO_OR_DELIVER END art_quick_lock_object @@ -1221,6 +1258,7 @@ ENTRY art_quick_lock_object_no_inline mov x1, xSELF // pass Thread::Current bl artLockObjectFromCode // (Object* obj, Thread*) RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER RETURN_IF_W0_IS_ZERO_OR_DELIVER END art_quick_lock_object_no_inline @@ -1275,6 +1313,7 @@ ENTRY art_quick_unlock_object mov x1, xSELF // pass Thread::Current bl artUnlockObjectFromCode // (Object* obj, Thread*) RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER RETURN_IF_W0_IS_ZERO_OR_DELIVER END art_quick_unlock_object @@ -1283,6 +1322,7 @@ ENTRY art_quick_unlock_object_no_inline mov x1, xSELF // pass Thread::Current bl artUnlockObjectFromCode // (Object* obj, Thread*) RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER RETURN_IF_W0_IS_ZERO_OR_DELIVER END art_quick_unlock_object_no_inline @@ -1356,7 +1396,7 @@ END art_quick_check_instance_of */ .macro READ_BARRIER xDest, wDest, xObj, xTemp, wTemp, offset, number #ifdef USE_READ_BARRIER -#ifdef USE_BAKER_READ_BARRIER +# ifdef USE_BAKER_READ_BARRIER ldr \wTemp, [\xObj, #MIRROR_OBJECT_LOCK_WORD_OFFSET] tbnz \wTemp, #LOCK_WORD_READ_BARRIER_STATE_SHIFT, .Lrb_slowpath\number // False dependency to avoid needing load/load fence. @@ -1364,7 +1404,7 @@ END art_quick_check_instance_of ldr \wDest, [\xObj, #\offset] // Heap reference = 32b. This also zero-extends to \xDest. UNPOISON_HEAP_REF \wDest b .Lrb_exit\number -#endif +# endif // USE_BAKER_READ_BARRIER .Lrb_slowpath\number: // Store registers used in art_quick_aput_obj (x0-x4, LR), stack is 16B aligned. 
SAVE_TWO_REGS_INCREASE_FRAME x0, x1, 48 @@ -1471,6 +1511,7 @@ ENTRY \name mov x1, xSELF // pass Thread::Current bl \entrypoint // (uint32_t type_idx, Method* method, Thread*) RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER \return END \name .endm @@ -1483,6 +1524,7 @@ ENTRY \name mov x2, xSELF // pass Thread::Current bl \entrypoint // (uint32_t type_idx, Method* method, Thread*) RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER \return END \name .endm @@ -1495,6 +1537,7 @@ ENTRY \name mov x3, xSELF // pass Thread::Current bl \entrypoint RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER \return END \name .endm @@ -1507,8 +1550,8 @@ ENTRY \name mov x4, xSELF // pass Thread::Current bl \entrypoint // RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER \return - DELIVER_PENDING_EXCEPTION END \name .endm @@ -1520,6 +1563,7 @@ ENTRY \name mov x1, xSELF // pass Thread::Current bl \entrypoint // (uint32_t type_idx, Thread*) RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER \return END \name .endm @@ -1531,6 +1575,7 @@ ENTRY \name mov x2, xSELF // pass Thread::Current bl \entrypoint RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER \return END \name .endm @@ -1542,6 +1587,7 @@ ENTRY \name mov x3, xSELF // pass Thread::Current bl \entrypoint RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER \return END \name .endm @@ -1556,6 +1602,7 @@ ENTRY \name cbz w0, 1f // If result is null, deliver the OOME. .cfi_remember_state RESTORE_SAVE_EVERYTHING_FRAME_KEEP_X0 + REFRESH_MARKING_REGISTER ret // return .cfi_restore_state .cfi_def_cfa_offset FRAME_SIZE_SAVE_EVERYTHING // workaround for clang bug: 31975598 @@ -1588,6 +1635,9 @@ ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_initialize_type, artInitializeTypeFro ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_initialize_type_and_verify_access, artInitializeTypeAndVerifyAccessFromCode ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_resolve_string, artResolveStringFromCode +// Note: Functions `art{Get,Set}<Kind>{Static,Instance>FromCompiledCode` are +// defined by macros in runtime/entrypoints/quick/quick_field_entrypoints.cc. + ONE_ARG_REF_DOWNCALL art_quick_get_boolean_static, artGetBooleanStaticFromCompiledCode, RETURN_OR_DELIVER_PENDING_EXCEPTION_X1 ONE_ARG_REF_DOWNCALL art_quick_get_byte_static, artGetByteStaticFromCompiledCode, RETURN_OR_DELIVER_PENDING_EXCEPTION_X1 ONE_ARG_REF_DOWNCALL art_quick_get_char_static, artGetCharStaticFromCompiledCode, RETURN_OR_DELIVER_PENDING_EXCEPTION_X1 @@ -1752,6 +1802,7 @@ ENTRY \c_name mov x1, xSELF // pass Thread::Current bl \cxx_name RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER END \c_name .endm @@ -1815,6 +1866,7 @@ ENTRY \name mov x1, xSELF // Pass Thread::Current. bl \entrypoint // (mirror::Class*, Thread*) RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER END \name .endm @@ -1825,7 +1877,7 @@ GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_resolved_tlab, artAll GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_initialized_tlab, artAllocObjectFromCodeInitializedTLAB, /* isInitialized */ 1 .macro ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE slowPathLabel, xClass, wClass, xCount, wCount, xTemp0, wTemp0, xTemp1, wTemp1, xTemp2, wTemp2 - and \xTemp1, \xTemp1, #OBJECT_ALIGNMENT_MASK_TOGGLED64 // Apply alignemnt mask + and \xTemp1, \xTemp1, #OBJECT_ALIGNMENT_MASK_TOGGLED64 // Apply alignment mask // (addr + 7) & ~7. 
The mask must // be 64 bits to keep high bits in // case of overflow. @@ -1887,6 +1939,7 @@ ENTRY \name mov x2, xSELF // pass Thread::Current bl \entrypoint RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER END \name .endm @@ -1937,8 +1990,8 @@ END \name add \xTemp1, \xTemp1, #(MIRROR_WIDE_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK) .endm -# TODO(ngeoffray): art_quick_alloc_array_resolved_region_tlab is not used for arm64, remove -# the entrypoint once all backends have been updated to use the size variants. +// TODO(ngeoffray): art_quick_alloc_array_resolved_region_tlab is not used for arm64, remove +// the entrypoint once all backends have been updated to use the size variants. GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_8 GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_16 @@ -1959,6 +2012,7 @@ ENTRY art_quick_test_suspend mov x0, xSELF bl artTestSuspendFromCode // (Thread*) RESTORE_SAVE_EVERYTHING_FRAME + REFRESH_MARKING_REGISTER ret END art_quick_test_suspend @@ -1966,7 +2020,9 @@ ENTRY art_quick_implicit_suspend mov x0, xSELF SETUP_SAVE_REFS_ONLY_FRAME // save callee saves for stack crawl bl artTestSuspendFromCode // (Thread*) - RESTORE_SAVE_REFS_ONLY_FRAME_AND_RETURN + RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER + ret END art_quick_implicit_suspend /* @@ -1983,6 +2039,7 @@ ENTRY art_quick_proxy_invoke_handler ldr x2, [xSELF, THREAD_EXCEPTION_OFFSET] cbnz x2, .Lexception_in_proxy // success if no exception is pending RESTORE_SAVE_REFS_AND_ARGS_FRAME // Restore frame + REFRESH_MARKING_REGISTER fmov d0, x0 // Store result in d0 in case it was float or double ret // return on success .Lexception_in_proxy: @@ -2035,6 +2092,7 @@ ENTRY art_quick_resolution_trampoline mov xIP0, x0 // Remember returned code pointer in xIP0. ldr x0, [sp, #0] // artQuickResolutionTrampoline puts called method in *SP. RESTORE_SAVE_REFS_AND_ARGS_FRAME + REFRESH_MARKING_REGISTER br xIP0 1: RESTORE_SAVE_REFS_AND_ARGS_FRAME @@ -2170,6 +2228,7 @@ ENTRY art_quick_generic_jni_trampoline // Tear down the callee-save frame. RESTORE_SAVE_REFS_AND_ARGS_FRAME + REFRESH_MARKING_REGISTER // store into fpr, for when it's a fpr return... fmov d0, x0 @@ -2202,6 +2261,7 @@ ENTRY art_quick_to_interpreter_bridge bl artQuickToInterpreterBridge RESTORE_SAVE_REFS_AND_ARGS_FRAME // TODO: no need to restore arguments in this case. + REFRESH_MARKING_REGISTER fmov d0, x0 @@ -2231,6 +2291,7 @@ ENTRY art_quick_instrumentation_entry mov x0, x20 // Reload method reference. RESTORE_SAVE_REFS_AND_ARGS_FRAME // Note: will restore xSELF + REFRESH_MARKING_REGISTER cbz xIP0, 1f // Deliver the pending exception if method is null. adr xLR, art_quick_instrumentation_exit br xIP0 // Tail-call method with lr set to art_quick_instrumentation_exit. @@ -2263,6 +2324,7 @@ ENTRY art_quick_instrumentation_exit .cfi_adjust_cfa_offset -16 RESTORE_SAVE_REFS_ONLY_FRAME + REFRESH_MARKING_REGISTER cbz xIP0, 1f // Handle error br xIP0 // Tail-call out. 
1: @@ -2831,6 +2893,7 @@ ENTRY art_quick_invoke_polymorphic .Lcleanup_and_return: DECREASE_FRAME 16 RESTORE_SAVE_REFS_AND_ARGS_FRAME + REFRESH_MARKING_REGISTER RETURN_OR_DELIVER_PENDING_EXCEPTION_X1 .section .rodata // Place handler table in read-only section away from text. diff --git a/runtime/arch/arm64/registers_arm64.h b/runtime/arch/arm64/registers_arm64.h index 4683fc3fdd..d4c919220d 100644 --- a/runtime/arch/arm64/registers_arm64.h +++ b/runtime/arch/arm64/registers_arm64.h @@ -61,6 +61,7 @@ enum XRegister { kNumberOfXRegisters = 33, // Aliases. TR = X19, // ART Thread Register - Managed Runtime (Callee Saved Reg) + MR = X20, // ART Marking Register - Managed Runtime (Callee Saved Reg) IP0 = X16, // Used as scratch by VIXL. IP1 = X17, // Used as scratch by ART JNI Assembler. FP = X29, diff --git a/runtime/arch/quick_alloc_entrypoints.S b/runtime/arch/quick_alloc_entrypoints.S index 2b3525b189..fbfa7564a7 100644 --- a/runtime/arch/quick_alloc_entrypoints.S +++ b/runtime/arch/quick_alloc_entrypoints.S @@ -53,7 +53,7 @@ GENERATE_ALLOC_ENTRYPOINTS _region_tlab_instrumented, RegionTLABInstrumented .endm // Generate the allocation entrypoints for each allocator. This is used as an alternative to -// GNERATE_ALL_ALLOC_ENTRYPOINTS for selectively implementing allocation fast paths in +// GENERATE_ALL_ALLOC_ENTRYPOINTS for selectively implementing allocation fast paths in // hand-written assembly. #define GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(c_suffix, cxx_suffix) \ ONE_ARG_DOWNCALL art_quick_alloc_object_resolved ## c_suffix, artAllocObjectFromCodeResolved ## cxx_suffix, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc index 458e830eda..8d3c62f3d0 100644 --- a/runtime/gc/collector/concurrent_copying.cc +++ b/runtime/gc/collector/concurrent_copying.cc @@ -166,7 +166,7 @@ void ConcurrentCopying::RunPhases() { } if (kUseBakerReadBarrier && kGrayDirtyImmuneObjects) { // Switch to read barrier mark entrypoints before we gray the objects. This is required in case - // a mutator sees a gray bit and dispatches on the entrpoint. (b/37876887). + // a mutator sees a gray bit and dispatches on the entrypoint. (b/37876887). ActivateReadBarrierEntrypoints(); // Gray dirty immune objects concurrently to reduce GC pause times. We re-process gray cards in // the pause. diff --git a/runtime/thread.cc b/runtime/thread.cc index 36ecd3398c..3a3a5a0dde 100644 --- a/runtime/thread.cc +++ b/runtime/thread.cc @@ -2862,6 +2862,7 @@ void Thread::DumpThreadOffset(std::ostream& os, uint32_t offset) { DO_THREAD_OFFSET(SelfOffset<ptr_size>(), "self") DO_THREAD_OFFSET(StackEndOffset<ptr_size>(), "stack_end") DO_THREAD_OFFSET(ThinLockIdOffset<ptr_size>(), "thin_lock_thread_id") + DO_THREAD_OFFSET(IsGcMarkingOffset<ptr_size>(), "is_gc_marking") DO_THREAD_OFFSET(TopOfManagedStackOffset<ptr_size>(), "top_quick_frame_method") DO_THREAD_OFFSET(TopShadowFrameOffset<ptr_size>(), "top_shadow_frame") DO_THREAD_OFFSET(TopHandleScopeOffset<ptr_size>(), "top_handle_scope") diff --git a/runtime/thread.h b/runtime/thread.h index e785ddc803..24d126f2d1 100644 --- a/runtime/thread.h +++ b/runtime/thread.h @@ -656,6 +656,17 @@ class Thread { OFFSETOF_MEMBER(tls_ptr_sized_values, jni_entrypoints) + jni_entrypoint_offset); } + // Return the entry point offset integer value for ReadBarrierMarkRegX, where X is `reg`. 
+ template <PointerSize pointer_size> + static int32_t ReadBarrierMarkEntryPointsOffset(size_t reg) { + // The entry point list defines 30 ReadBarrierMarkRegX entry points. + DCHECK_LT(reg, 30u); + // The ReadBarrierMarkRegX entry points are ordered by increasing + // register number in Thread::tls_Ptr_.quick_entrypoints. + return QUICK_ENTRYPOINT_OFFSET(pointer_size, pReadBarrierMarkReg00).Int32Value() + + static_cast<size_t>(pointer_size) * reg; + } + template<PointerSize pointer_size> static ThreadOffset<pointer_size> SelfOffset() { return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, self)); diff --git a/test/990-method-handle-and-mr/build b/test/990-method-handle-and-mr/build new file mode 100755 index 0000000000..5e5f36e24c --- /dev/null +++ b/test/990-method-handle-and-mr/build @@ -0,0 +1,25 @@ +#!/bin/bash +# +# Copyright 2017 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Exit on failure. +set -e + +if [[ $@ != *"--jvm"* ]]; then + # Don't do anything with jvm. + export USE_JACK=true +fi + +./default-build "$@" --experimental method-handles diff --git a/test/990-method-handle-and-mr/expected.txt b/test/990-method-handle-and-mr/expected.txt new file mode 100644 index 0000000000..8483fb5045 --- /dev/null +++ b/test/990-method-handle-and-mr/expected.txt @@ -0,0 +1,4 @@ +Test +Test +Test +passed diff --git a/test/990-method-handle-and-mr/info.txt b/test/990-method-handle-and-mr/info.txt new file mode 100644 index 0000000000..85a957ceea --- /dev/null +++ b/test/990-method-handle-and-mr/info.txt @@ -0,0 +1,2 @@ +Test stressing code generated for invoke-polymorphic instructions with +respect to Marking Register (on architectures supporting MR). diff --git a/test/990-method-handle-and-mr/src/Main.java b/test/990-method-handle-and-mr/src/Main.java new file mode 100644 index 0000000000..739b8eb551 --- /dev/null +++ b/test/990-method-handle-and-mr/src/Main.java @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This test was inspired by benchmarks.MicroMethodHandles.java.MicroMethodHandles. 
+ +import java.io.PrintStream; +import java.lang.invoke.MethodHandle; +import java.lang.invoke.MethodHandles; +import java.lang.invoke.MethodType; + +class A { + public Long binaryFunction(int x, double y) { + return 1000l; + } +} + +class Test { + Test() throws Throwable { + this.handle = MethodHandles.lookup().findVirtual(A.class, "binaryFunction", + MethodType.methodType(Long.class, int.class, + double.class)); + this.a = new A(); + this.x = new Integer(72); + this.y = new Double(-1.39e-31); + } + + void execute() { + try { + executeFor(2000); + System.out.println(getName()); + } catch (Throwable t) { + System.err.println("Exception during the execution of " + getName()); + System.err.println(t); + t.printStackTrace(new PrintStream(System.err)); + System.exit(1); + } + } + + void executeFor(long timeMinimumMillis) throws Throwable { + long startTime = System.currentTimeMillis(); + long elapsed = 0; + while (elapsed < timeMinimumMillis) { + exercise(); + elapsed = System.currentTimeMillis() - startTime; + } + } + + void exercise() throws Throwable { + for (int i = 0; i < EXERCISE_ITERATIONS; ++i) { + run(); + } + } + + void run() throws Throwable { + long result = (long) handle.invoke(a, x, y); + } + + String getName() { + return getClass().getSimpleName(); + } + + private static final int EXERCISE_ITERATIONS = 500; + + private MethodHandle handle; + private A a; + private Integer x; + private Double y; +} + +public class Main { + public static void main(String[] args) throws Throwable { + Test[] tests = new Test[] { new Test(), new Test(), new Test() }; + for (Test test : tests) { + test.execute(); + } + System.out.println("passed"); + } +}
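The REFRESH_MARKING_REGISTER macro in quick_entrypoints_arm64.S and the matching Ldr emitted in Arm64JNIMacroAssembler::RemoveFrame maintain one invariant: wMR (W20) holds a copy of the current thread's is_gc_marking flag, so it has to be reloaded from [xSELF, #THREAD_IS_GC_MARKING_OFFSET] after anything that may have crossed a suspend point. The following is a minimal standalone C++ sketch of that invariant; the Thread struct and the refresh_marking_register() and suspend_point() names are hypothetical stand-ins for the real ART types and offsets, not ART code.

// Minimal model of the Marking Register invariant kept by this patch.
// All names here (Thread, refresh_marking_register, suspend_point) are
// hypothetical; the real code keeps the flag on the Thread object (the
// "is_gc_marking" slot reported by Thread::DumpThreadOffset) and caches
// it in ARM64 register W20 (wMR).
#include <cassert>
#include <cstdint>
#include <iostream>

struct Thread {
  int32_t is_gc_marking = 0;  // stand-in for Thread::Current()->GetIsGcMarking()
};

// Equivalent of REFRESH_MARKING_REGISTER / the Ldr emitted in
// Arm64JNIMacroAssembler::RemoveFrame: a single load from the thread.
inline void refresh_marking_register(const Thread& self, int32_t& mr) {
  mr = self.is_gc_marking;  // ldr wMR, [xSELF, #THREAD_IS_GC_MARKING_OFFSET]
}

// A call that may reach a suspend check: while the thread is suspended the
// concurrent copying collector can toggle is_gc_marking, so a cached copy
// is stale afterwards and must be refreshed before returning to managed code.
void suspend_point(Thread& self) {
  self.is_gc_marking ^= 1;  // pretend a GC phase change happened here
}

int main() {
  Thread self;
  int32_t mr = self.is_gc_marking;  // cached copy set up on entry

  suspend_point(self);              // e.g. an entrypoint that can suspend
  refresh_marking_register(self, mr);

  assert(mr == self.is_gc_marking); // the invariant the refresh restores
  std::cout << "MR == is_gc_marking: " << mr << std::endl;
  return 0;
}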
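The Thread::ReadBarrierMarkEntryPointsOffset helper added to runtime/thread.h relies on the 30 ReadBarrierMarkRegX entry points being stored contiguously, one pointer apart and in register order. Below is a self-contained sketch of that arithmetic only; kHypotheticalReg00Offset is a made-up placeholder for QUICK_ENTRYPOINT_OFFSET(pointer_size, pReadBarrierMarkReg00).Int32Value(), which is not reproduced here.

// Sketch of the offset computation performed by
// Thread::ReadBarrierMarkEntryPointsOffset. kHypotheticalReg00Offset is a
// placeholder value, not the real offset of pReadBarrierMarkReg00.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>

constexpr int32_t kHypotheticalReg00Offset = 0x200;  // placeholder base offset

// The entry point list defines 30 ReadBarrierMarkRegX entry points, laid out
// one pointer apart and ordered by increasing register number.
int32_t ReadBarrierMarkEntryPointsOffset(size_t pointer_size, size_t reg) {
  assert(reg < 30u);  // mirrors the DCHECK_LT(reg, 30u) in the patch
  return kHypotheticalReg00Offset + static_cast<int32_t>(pointer_size * reg);
}

int main() {
  // With 8-byte pointers (ARM64), the marking entry point for register 5
  // lives five pointer slots after the one for register 0: 0x200 + 40.
  std::cout << ReadBarrierMarkEntryPointsOffset(8u, 5u) << std::endl;
  return 0;
}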