Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/code_generator.cc                 |   3
-rw-r--r--  compiler/optimizing/code_generator_arm.cc             |   9
-rw-r--r--  compiler/optimizing/code_generator_arm64.cc           |   9
-rw-r--r--  compiler/optimizing/code_generator_x86.cc             |  17
-rw-r--r--  compiler/optimizing/code_generator_x86.h              |   3
-rw-r--r--  compiler/optimizing/code_generator_x86_64.cc          |   3
-rw-r--r--  compiler/optimizing/codegen_test.cc                   |  61
-rw-r--r--  compiler/optimizing/intrinsics_arm.cc                 | 466
-rw-r--r--  compiler/optimizing/intrinsics_arm64.cc               | 463
-rw-r--r--  compiler/optimizing/intrinsics_x86.cc                 | 492
-rw-r--r--  compiler/optimizing/intrinsics_x86_64.cc              | 343
-rw-r--r--  compiler/optimizing/optimizing_cfi_test.cc            |  18
-rw-r--r--  compiler/optimizing/optimizing_compiler.cc            |  26
-rw-r--r--  compiler/optimizing/register_allocator_graph_color.cc | 232
-rw-r--r--  compiler/optimizing/register_allocator_graph_color.h  |  18
15 files changed, 1711 insertions(+), 452 deletions(-)
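
Most of the added lines enable the SystemArrayCopy intrinsic under Baker-style read barriers on ARM, ARM64, x86 and x86-64: before copying, the generated code inspects the source object's lock word and branches to a marking slow path when the object is gray, otherwise it performs a raw reference copy. Below is a minimal C++-style sketch of that copy strategy, distilled from the pseudocode comments in the hunks that follow; the types and helpers here (Ref, LoadLockWord, Mark) are illustrative stand-ins rather than ART APIs, and the bit-layout constant is an assumption.

    // Sketch of the Baker read-barrier fast/slow path split used by the
    // SystemArrayCopy intrinsic (see the pseudocode comments in the diff).
    #include <cstdint>

    using Ref = uintptr_t;                          // stand-in for a heap reference

    extern uint32_t LoadLockWord(const void* obj);  // hypothetical: reads obj->monitor_
    extern Ref Mark(Ref ref);                       // hypothetical: ReadBarrierMarkRegX entry point

    constexpr uint32_t kReadBarrierStateShift = 28; // illustrative bit position only

    inline bool IsGray(uint32_t lock_word) {
      // Given the numeric representation (white == 0, gray == 1), checking the
      // low rb_state bit is enough to distinguish gray from white/black.
      return (lock_word >> kReadBarrierStateShift) & 1u;
    }

    void CopyReferenceRange(const void* src_obj, Ref* src_ptr, Ref* end_ptr, Ref* dest_ptr) {
      if (src_ptr == end_ptr) {
        return;  // Don't enter the copy loop if length == 0.
      }
      // The emitted code also adds an artificial address dependency on the lock
      // word (src += lock_word >> 32) instead of issuing a memory barrier; that
      // detail is omitted here.
      uint32_t lock_word = LoadLockWord(src_obj);
      if (IsGray(lock_word)) {
        // Slow-path copy: mark every reference via the runtime entry point
        // (with unpoison/poison around the call when heap poisoning is on).
        do {
          *dest_ptr++ = Mark(*src_ptr++);
        } while (src_ptr != end_ptr);
      } else {
        // Fast-path copy: raw copy of the references, no poison/unpoison needed.
        do {
          *dest_ptr++ = *src_ptr++;
        } while (src_ptr != end_ptr);
      }
    }

The per-architecture code below emits exactly this shape: the gray check jumps to a ReadBarrierSystemArrayCopySlowPath, and the slow path deliberately avoids IP/IP0, which the ReadBarrierMarkRegX entry points clobber as a scratch register.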
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index 5152075499..c532e72465 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -1228,7 +1228,8 @@ void CodeGenerator::ValidateInvokeRuntimeWithoutRecordingPcInfo(HInstruction* in instruction->IsLoadString() || instruction->IsInstanceOf() || instruction->IsCheckCast() || - (instruction->IsInvokeVirtual() && instruction->GetLocations()->Intrinsified())) + (instruction->IsInvokeVirtual() && instruction->GetLocations()->Intrinsified()) || + (instruction->IsInvokeStaticOrDirect() && instruction->GetLocations()->Intrinsified())) << "instruction->DebugName()=" << instruction->DebugName() << " slow_path->GetDescription()=" << slow_path->GetDescription(); } diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index 870d1fbd29..404f044cef 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -429,7 +429,8 @@ class ReadBarrierMarkSlowPathARM : public SlowPathCode { instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking slow path: " << instruction_->DebugName(); @@ -441,6 +442,9 @@ class ReadBarrierMarkSlowPathARM : public SlowPathCode { DCHECK_NE(reg, SP); DCHECK_NE(reg, LR); DCHECK_NE(reg, PC); + // IP is used internally by the ReadBarrierMarkRegX entry point + // as a temporary, it cannot be the entry point's input/output. + DCHECK_NE(reg, IP); DCHECK(0 <= reg && reg < kNumberOfCoreRegisters) << reg; // "Compact" slow path, saving two moves. // @@ -6482,7 +6486,8 @@ void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // Introduce a dependency on the lock_word including the rb_state, // which shall prevent load-load reordering without using // a memory barrier (which would be more expensive). - // obj is unchanged by this operation, but its value now depends on temp_reg. + // `obj` is unchanged by this operation, but its value now depends + // on `temp_reg`. __ add(obj, obj, ShifterOperand(temp_reg, LSR, 32)); // The actual reference load. diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 004d427511..122c174eae 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -595,7 +595,8 @@ class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking slow path: " << instruction_->DebugName(); @@ -607,7 +608,8 @@ class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { DCHECK_NE(obj_.reg(), LR); DCHECK_NE(obj_.reg(), WSP); DCHECK_NE(obj_.reg(), WZR); - // WIP0 is used by the slow path as a temp, it can not be the object register. 
+ // IP0 is used internally by the ReadBarrierMarkRegX entry point + // as a temporary, it cannot be the entry point's input/output. DCHECK_NE(obj_.reg(), IP0); DCHECK(0 <= obj_.reg() && obj_.reg() < kNumberOfWRegisters) << obj_.reg(); // "Compact" slow path, saving two moves. @@ -5190,7 +5192,8 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* // Introduce a dependency on the lock_word including rb_state, // to prevent load-load reordering, and without using // a memory barrier (which would be more expensive). - // obj is unchanged by this operation, but its value now depends on temp. + // `obj` is unchanged by this operation, but its value now depends + // on `temp`. __ Add(obj.X(), obj.X(), Operand(temp.X(), LSR, 32)); // The actual reference load. diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 0305d6a030..7aca16f867 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -464,7 +464,8 @@ class ReadBarrierMarkSlowPathX86 : public SlowPathCode { instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking slow path: " << instruction_->DebugName(); @@ -1578,15 +1579,15 @@ void LocationsBuilderX86::VisitSelect(HSelect* select) { locations->SetOut(Location::SameAsFirstInput()); } -void InstructionCodeGeneratorX86::GenerateIntCompare(Location lhs, Location rhs) { +void CodeGeneratorX86::GenerateIntCompare(Location lhs, Location rhs) { Register lhs_reg = lhs.AsRegister<Register>(); if (rhs.IsConstant()) { int32_t value = CodeGenerator::GetInt32ValueOf(rhs.GetConstant()); - codegen_->Compare32BitValue(lhs_reg, value); + Compare32BitValue(lhs_reg, value); } else if (rhs.IsStackSlot()) { - __ cmpl(lhs_reg, Address(ESP, rhs.GetStackIndex())); + assembler_.cmpl(lhs_reg, Address(ESP, rhs.GetStackIndex())); } else { - __ cmpl(lhs_reg, rhs.AsRegister<Register>()); + assembler_.cmpl(lhs_reg, rhs.AsRegister<Register>()); } } @@ -1619,7 +1620,7 @@ void InstructionCodeGeneratorX86::VisitSelect(HSelect* select) { DCHECK_NE(condition->InputAt(0)->GetType(), Primitive::kPrimLong); DCHECK(!Primitive::IsFloatingPointType(condition->InputAt(0)->GetType())); LocationSummary* cond_locations = condition->GetLocations(); - GenerateIntCompare(cond_locations->InAt(0), cond_locations->InAt(1)); + codegen_->GenerateIntCompare(cond_locations->InAt(0), cond_locations->InAt(1)); cond = X86Condition(condition->GetCondition()); } } else { @@ -1728,7 +1729,7 @@ void InstructionCodeGeneratorX86::HandleCondition(HCondition* cond) { // Clear output register: setb only sets the low byte. 
__ xorl(reg, reg); - GenerateIntCompare(lhs, rhs); + codegen_->GenerateIntCompare(lhs, rhs); __ setb(X86Condition(cond->GetCondition()), reg); return; } @@ -4210,7 +4211,7 @@ void InstructionCodeGeneratorX86::VisitCompare(HCompare* compare) { case Primitive::kPrimShort: case Primitive::kPrimChar: case Primitive::kPrimInt: { - GenerateIntCompare(left, right); + codegen_->GenerateIntCompare(left, right); break; } case Primitive::kPrimLong: { diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index f306b33247..894f2e8f40 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -295,7 +295,6 @@ class InstructionCodeGeneratorX86 : public InstructionCodeGenerator { HBasicBlock* default_block); void GenerateFPCompare(Location lhs, Location rhs, HInstruction* insn, bool is_double); - void GenerateIntCompare(Location lhs, Location rhs); X86Assembler* const assembler_; CodeGeneratorX86* const codegen_; @@ -431,6 +430,8 @@ class CodeGeneratorX86 : public CodeGenerator { Register value, bool value_can_be_null); + void GenerateIntCompare(Location lhs, Location rhs); + void GenerateMemoryBarrier(MemBarrierKind kind); Label* GetLabelOf(HBasicBlock* block) const { diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index 9ecd14ec5b..0c55ae44de 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -485,7 +485,8 @@ class ReadBarrierMarkSlowPathX86_64 : public SlowPathCode { instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking slow path: " << instruction_->DebugName(); diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc index 18db507c48..fe6c0a305e 100644 --- a/compiler/optimizing/codegen_test.cc +++ b/compiler/optimizing/codegen_test.cc @@ -29,12 +29,6 @@ #include "arch/x86_64/instruction_set_features_x86_64.h" #include "base/macros.h" #include "builder.h" -#include "code_generator_arm.h" -#include "code_generator_arm64.h" -#include "code_generator_mips.h" -#include "code_generator_mips64.h" -#include "code_generator_x86.h" -#include "code_generator_x86_64.h" #include "code_simulator_container.h" #include "common_compiler_test.h" #include "dex_file.h" @@ -52,10 +46,35 @@ #include "utils/mips64/managed_register_mips64.h" #include "utils/x86/managed_register_x86.h" +#ifdef ART_ENABLE_CODEGEN_arm +#include "code_generator_arm.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_arm64 +#include "code_generator_arm64.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_x86 +#include "code_generator_x86.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_x86_64 +#include "code_generator_x86_64.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_mips +#include "code_generator_mips.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_mips64 +#include "code_generator_mips64.h" +#endif + #include "gtest/gtest.h" namespace art { +#ifdef ART_ENABLE_CODEGEN_arm // Provide our own codegen, that ensures the C calling conventions // are preserved. Currently, ART and C do not match as R4 is caller-save // in ART, and callee-save in C. 
Alternatively, we could use or write @@ -80,7 +99,9 @@ class TestCodeGeneratorARM : public arm::CodeGeneratorARM { blocked_register_pairs_[arm::R6_R7] = false; } }; +#endif +#ifdef ART_ENABLE_CODEGEN_x86 class TestCodeGeneratorX86 : public x86::CodeGeneratorX86 { public: TestCodeGeneratorX86(HGraph* graph, @@ -105,6 +126,7 @@ class TestCodeGeneratorX86 : public x86::CodeGeneratorX86 { blocked_register_pairs_[x86::ECX_EDI] = false; } }; +#endif class InternalCodeAllocator : public CodeAllocator { public: @@ -234,37 +256,54 @@ static void RunCode(InstructionSet target_isa, bool has_result, Expected expected) { CompilerOptions compiler_options; +#ifdef ART_ENABLE_CODEGEN_arm if (target_isa == kArm || target_isa == kThumb2) { std::unique_ptr<const ArmInstructionSetFeatures> features_arm( ArmInstructionSetFeatures::FromCppDefines()); TestCodeGeneratorARM codegenARM(graph, *features_arm.get(), compiler_options); RunCode(&codegenARM, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kArm64) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_arm64 + if (target_isa == kArm64) { std::unique_ptr<const Arm64InstructionSetFeatures> features_arm64( Arm64InstructionSetFeatures::FromCppDefines()); arm64::CodeGeneratorARM64 codegenARM64(graph, *features_arm64.get(), compiler_options); RunCode(&codegenARM64, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kX86) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_x86 + if (target_isa == kX86) { std::unique_ptr<const X86InstructionSetFeatures> features_x86( X86InstructionSetFeatures::FromCppDefines()); TestCodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options); RunCode(&codegenX86, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kX86_64) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_x86_64 + if (target_isa == kX86_64) { std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64( X86_64InstructionSetFeatures::FromCppDefines()); x86_64::CodeGeneratorX86_64 codegenX86_64(graph, *features_x86_64.get(), compiler_options); RunCode(&codegenX86_64, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kMips) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_mips + if (target_isa == kMips) { std::unique_ptr<const MipsInstructionSetFeatures> features_mips( MipsInstructionSetFeatures::FromCppDefines()); mips::CodeGeneratorMIPS codegenMIPS(graph, *features_mips.get(), compiler_options); RunCode(&codegenMIPS, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kMips64) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_mips64 + if (target_isa == kMips64) { std::unique_ptr<const Mips64InstructionSetFeatures> features_mips64( Mips64InstructionSetFeatures::FromCppDefines()); mips64::CodeGeneratorMIPS64 codegenMIPS64(graph, *features_mips64.get(), compiler_options); RunCode(&codegenMIPS64, graph, hook_before_codegen, has_result, expected); } +#endif } static ::std::vector<InstructionSet> GetTargetISAs() { diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc index 27d9d48560..0bbc0e54bc 100644 --- a/compiler/optimizing/intrinsics_arm.cc +++ b/compiler/optimizing/intrinsics_arm.cc @@ -41,6 +41,92 @@ ArenaAllocator* IntrinsicCodeGeneratorARM::GetAllocator() { using IntrinsicSlowPathARM = IntrinsicSlowPath<InvokeDexCallingConventionVisitorARM>; +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
+#define __ down_cast<ArmAssembler*>(codegen->GetAssembler())-> // NOLINT + +// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers. +class ReadBarrierSystemArrayCopySlowPathARM : public SlowPathCode { + public: + explicit ReadBarrierSystemArrayCopySlowPathARM(HInstruction* instruction) + : SlowPathCode(instruction) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(instruction_->IsInvokeStaticOrDirect()) + << "Unexpected instruction in read barrier arraycopy slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy); + + int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + uint32_t element_size_shift = Primitive::ComponentSizeShift(Primitive::kPrimNot); + uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); + + Register dest = locations->InAt(2).AsRegister<Register>(); + Location dest_pos = locations->InAt(3); + Register src_curr_addr = locations->GetTemp(0).AsRegister<Register>(); + Register dst_curr_addr = locations->GetTemp(1).AsRegister<Register>(); + Register src_stop_addr = locations->GetTemp(2).AsRegister<Register>(); + Register tmp = locations->GetTemp(3).AsRegister<Register>(); + + __ Bind(GetEntryLabel()); + // Compute the base destination address in `dst_curr_addr`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ AddConstant(dst_curr_addr, dest, element_size * constant + offset); + } else { + __ add(dst_curr_addr, + dest, + ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift)); + __ AddConstant(dst_curr_addr, offset); + } + + Label loop; + __ Bind(&loop); + __ ldr(tmp, Address(src_curr_addr, element_size, Address::PostIndex)); + __ MaybeUnpoisonHeapReference(tmp); + // TODO: Inline the mark bit check before calling the runtime? + // tmp = ReadBarrier::Mark(tmp); + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + // (See ReadBarrierMarkSlowPathARM::EmitNativeCode for more + // explanations.) + DCHECK_NE(tmp, SP); + DCHECK_NE(tmp, LR); + DCHECK_NE(tmp, PC); + // IP is used internally by the ReadBarrierMarkRegX entry point + // as a temporary (and not preserved). It thus cannot be used by + // any live register in this slow path. + DCHECK_NE(src_curr_addr, IP); + DCHECK_NE(dst_curr_addr, IP); + DCHECK_NE(src_stop_addr, IP); + DCHECK_NE(tmp, IP); + DCHECK(0 <= tmp && tmp < kNumberOfCoreRegisters) << tmp; + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp); + // This runtime call does not require a stack map. 
+ arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + __ MaybePoisonHeapReference(tmp); + __ str(tmp, Address(dst_curr_addr, element_size, Address::PostIndex)); + __ cmp(src_curr_addr, ShifterOperand(src_stop_addr)); + __ b(&loop, NE); + __ b(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathARM"; } + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathARM); +}; + +#undef __ + bool IntrinsicLocationsBuilderARM::TryDispatch(HInvoke* invoke) { Dispatch(invoke); LocationSummary* res = invoke->GetLocations(); @@ -1337,9 +1423,9 @@ void IntrinsicCodeGeneratorARM::VisitStringNewStringFromString(HInvoke* invoke) } void IntrinsicLocationsBuilderARM::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - if (kEmitCompilerReadBarrier) { + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { return; } @@ -1362,6 +1448,13 @@ void IntrinsicLocationsBuilderARM::VisitSystemArrayCopy(HInvoke* invoke) { if (length != nullptr && !assembler_->ShifterOperandCanAlwaysHold(length->GetValue())) { locations->SetInAt(4, Location::RequiresRegister()); } + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // Temporary register IP cannot be used in + // ReadBarrierSystemArrayCopySlowPathARM64 (because that register + // is clobbered by ReadBarrierMarkRegX entry points). Get an extra + // temporary register from the register allocator. + locations->AddTemp(Location::RequiresRegister()); + } } static void CheckPosition(ArmAssembler* assembler, @@ -1427,9 +1520,9 @@ static void CheckPosition(ArmAssembler* assembler, } void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - DCHECK(!kEmitCompilerReadBarrier); + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. 
+ DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); ArmAssembler* assembler = GetAssembler(); LocationSummary* locations = invoke->GetLocations(); @@ -1438,18 +1531,22 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); Register src = locations->InAt(0).AsRegister<Register>(); Location src_pos = locations->InAt(1); Register dest = locations->InAt(2).AsRegister<Register>(); Location dest_pos = locations->InAt(3); Location length = locations->InAt(4); - Register temp1 = locations->GetTemp(0).AsRegister<Register>(); - Register temp2 = locations->GetTemp(1).AsRegister<Register>(); - Register temp3 = locations->GetTemp(2).AsRegister<Register>(); + Location temp1_loc = locations->GetTemp(0); + Register temp1 = temp1_loc.AsRegister<Register>(); + Location temp2_loc = locations->GetTemp(1); + Register temp2 = temp2_loc.AsRegister<Register>(); + Location temp3_loc = locations->GetTemp(2); + Register temp3 = temp3_loc.AsRegister<Register>(); - SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathARM(invoke); - codegen_->AddSlowPath(slow_path); + SlowPathCode* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathARM(invoke); + codegen_->AddSlowPath(intrinsic_slow_path); Label conditions_on_positions_validated; SystemArrayCopyOptimizations optimizations(invoke); @@ -1465,7 +1562,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { DCHECK_GE(src_pos_constant, dest_pos_constant); } else if (src_pos_constant < dest_pos_constant) { __ cmp(src, ShifterOperand(dest)); - __ b(slow_path->GetEntryLabel(), EQ); + __ b(intrinsic_slow_path->GetEntryLabel(), EQ); } // Checked when building locations. @@ -1477,7 +1574,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { __ b(&conditions_on_positions_validated, NE); } __ cmp(dest_pos.AsRegister<Register>(), ShifterOperand(src_pos_constant)); - __ b(slow_path->GetEntryLabel(), GT); + __ b(intrinsic_slow_path->GetEntryLabel(), GT); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -1490,19 +1587,19 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { } else { __ cmp(src_pos.AsRegister<Register>(), ShifterOperand(dest_pos.AsRegister<Register>())); } - __ b(slow_path->GetEntryLabel(), LT); + __ b(intrinsic_slow_path->GetEntryLabel(), LT); } __ Bind(&conditions_on_positions_validated); if (!optimizations.GetSourceIsNotNull()) { // Bail out if the source is null. - __ CompareAndBranchIfZero(src, slow_path->GetEntryLabel()); + __ CompareAndBranchIfZero(src, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) { // Bail out if the destination is null. - __ CompareAndBranchIfZero(dest, slow_path->GetEntryLabel()); + __ CompareAndBranchIfZero(dest, intrinsic_slow_path->GetEntryLabel()); } // If the length is negative, bail out. 
@@ -1511,7 +1608,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { !optimizations.GetCountIsSourceLength() && !optimizations.GetCountIsDestinationLength()) { __ cmp(length.AsRegister<Register>(), ShifterOperand(0)); - __ b(slow_path->GetEntryLabel(), LT); + __ b(intrinsic_slow_path->GetEntryLabel(), LT); } // Validity checks: source. @@ -1519,7 +1616,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { src_pos, src, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsSourceLength()); @@ -1528,7 +1625,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { dest_pos, dest, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsDestinationLength()); @@ -1537,112 +1634,287 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { // type of the destination array. We do two checks: the classes are the same, // or the destination is Object[]. If none of these checks succeed, we go to the // slow path. - __ LoadFromOffset(kLoadWord, temp1, dest, class_offset); - __ LoadFromOffset(kLoadWord, temp2, src, class_offset); - bool did_unpoison = false; - if (!optimizations.GetDestinationIsNonPrimitiveArray() || - !optimizations.GetSourceIsNonPrimitiveArray()) { - // One or two of the references need to be unpoisoned. Unpoison them - // both to make the identity check valid. - __ MaybeUnpoisonHeapReference(temp1); - __ MaybeUnpoisonHeapReference(temp2); - did_unpoison = true; - } - if (!optimizations.GetDestinationIsNonPrimitiveArray()) { - // Bail out if the destination is not a non primitive array. - // /* HeapReference<Class> */ temp3 = temp1->component_type_ - __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset); - __ CompareAndBranchIfZero(temp3, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp3); - __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); - static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ CompareAndBranchIfNonZero(temp3, slow_path->GetEntryLabel()); - } + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + if (!optimizations.GetSourceIsNonPrimitiveArray()) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false); + // Bail out if the source is not a non primitive array. + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + __ CompareAndBranchIfZero(temp1, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp1` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + // /* uint16_t */ temp1 = static_cast<uint16>(temp1->primitive_type_); + __ LoadFromOffset(kLoadUnsignedHalfword, temp1, temp1, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel()); + } - if (!optimizations.GetSourceIsNonPrimitiveArray()) { - // Bail out if the source is not a non primitive array. 
- // /* HeapReference<Class> */ temp3 = temp2->component_type_ - __ LoadFromOffset(kLoadWord, temp3, temp2, component_offset); - __ CompareAndBranchIfZero(temp3, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp3); - __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); - static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ CompareAndBranchIfNonZero(temp3, slow_path->GetEntryLabel()); - } + // /* HeapReference<Class> */ temp1 = dest->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, dest, class_offset, temp2_loc, /* needs_null_check */ false); + + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // + // Register `temp1` is not trashed by the read barrier emitted + // by GenerateFieldLoadWithBakerReadBarrier below, as that + // method produces a call to a ReadBarrierMarkRegX entry point, + // which saves all potentially live registers, including + // temporaries such a `temp1`. + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + __ CompareAndBranchIfZero(temp2, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp2` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + // /* uint16_t */ temp2 = static_cast<uint16>(temp2->primitive_type_); + __ LoadFromOffset(kLoadUnsignedHalfword, temp2, temp2, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ CompareAndBranchIfNonZero(temp2, intrinsic_slow_path->GetEntryLabel()); + } + + // For the same reason given earlier, `temp1` is not trashed by the + // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below. + // /* HeapReference<Class> */ temp2 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, src, class_offset, temp3_loc, /* needs_null_check */ false); + // Note: if heap poisoning is on, we are comparing two unpoisoned references here. + __ cmp(temp1, ShifterOperand(temp2)); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + Label do_copy; + __ b(&do_copy, EQ); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + // We do not need to emit a read barrier for the following + // heap reference load, as `temp1` is only used in a + // comparison with null below, and this reference is not + // kept afterwards. + __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset); + __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ b(intrinsic_slow_path->GetEntryLabel(), NE); + } + } else { + // Non read barrier code. + + // /* HeapReference<Class> */ temp1 = dest->klass_ + __ LoadFromOffset(kLoadWord, temp1, dest, class_offset); + // /* HeapReference<Class> */ temp2 = src->klass_ + __ LoadFromOffset(kLoadWord, temp2, src, class_offset); + bool did_unpoison = false; + if (!optimizations.GetDestinationIsNonPrimitiveArray() || + !optimizations.GetSourceIsNonPrimitiveArray()) { + // One or two of the references need to be unpoisoned. Unpoison them + // both to make the identity check valid. 
+ __ MaybeUnpoisonHeapReference(temp1); + __ MaybeUnpoisonHeapReference(temp2); + did_unpoison = true; + } - __ cmp(temp1, ShifterOperand(temp2)); + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // /* HeapReference<Class> */ temp3 = temp1->component_type_ + __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset); + __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp3); + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); + __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel()); + } + + if (!optimizations.GetSourceIsNonPrimitiveArray()) { + // Bail out if the source is not a non primitive array. + // /* HeapReference<Class> */ temp3 = temp2->component_type_ + __ LoadFromOffset(kLoadWord, temp3, temp2, component_offset); + __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp3); + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); + __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel()); + } - if (optimizations.GetDestinationIsTypedObjectArray()) { - Label do_copy; - __ b(&do_copy, EQ); - if (!did_unpoison) { + __ cmp(temp1, ShifterOperand(temp2)); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + Label do_copy; + __ b(&do_copy, EQ); + if (!did_unpoison) { + __ MaybeUnpoisonHeapReference(temp1); + } + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset); __ MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset); + // No need to unpoison the result, we're comparing against null. + __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ b(intrinsic_slow_path->GetEntryLabel(), NE); } - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp1 = temp1->super_class_ - __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset); - // No need to unpoison the result, we're comparing against null. - __ CompareAndBranchIfNonZero(temp1, slow_path->GetEntryLabel()); - __ Bind(&do_copy); - } else { - __ b(slow_path->GetEntryLabel(), NE); } } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); // Bail out if the source is not a non primitive array. 
- // /* HeapReference<Class> */ temp1 = src->klass_ - __ LoadFromOffset(kLoadWord, temp1, src, class_offset); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp3 = temp1->component_type_ - __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset); - __ CompareAndBranchIfZero(temp3, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp3); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false); + // /* HeapReference<Class> */ temp3 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp3_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp3` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ temp1 = src->klass_ + __ LoadFromOffset(kLoadWord, temp1, src, class_offset); + __ MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ temp3 = temp1->component_type_ + __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset); + __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp3); + } + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ CompareAndBranchIfNonZero(temp3, slow_path->GetEntryLabel()); + __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel()); } - // Compute base source address, base destination address, and end source address. - int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + uint32_t element_size_shift = Primitive::ComponentSizeShift(Primitive::kPrimNot); uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); + + // Compute the base source address in `temp1`. if (src_pos.IsConstant()) { int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue(); __ AddConstant(temp1, src, element_size * constant + offset); } else { - __ add(temp1, src, ShifterOperand(src_pos.AsRegister<Register>(), LSL, 2)); + __ add(temp1, src, ShifterOperand(src_pos.AsRegister<Register>(), LSL, element_size_shift)); __ AddConstant(temp1, offset); } - if (dest_pos.IsConstant()) { - int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); - __ AddConstant(temp2, dest, element_size * constant + offset); - } else { - __ add(temp2, dest, ShifterOperand(dest_pos.AsRegister<Register>(), LSL, 2)); - __ AddConstant(temp2, offset); - } - + // Compute the end source address in `temp3`. if (length.IsConstant()) { int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); __ AddConstant(temp3, temp1, element_size * constant); } else { - __ add(temp3, temp1, ShifterOperand(length.AsRegister<Register>(), LSL, 2)); + __ add(temp3, temp1, ShifterOperand(length.AsRegister<Register>(), LSL, element_size_shift)); } - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. 
- Label loop, done; - __ cmp(temp1, ShifterOperand(temp3)); - __ b(&done, EQ); - __ Bind(&loop); - __ ldr(IP, Address(temp1, element_size, Address::PostIndex)); - __ str(IP, Address(temp2, element_size, Address::PostIndex)); - __ cmp(temp1, ShifterOperand(temp3)); - __ b(&loop, NE); - __ Bind(&done); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // The base destination address is computed later, as `temp2` is + // used for intermediate computations. + + // SystemArrayCopy implementation for Baker read barriers (see + // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier): + // + // if (src_ptr != end_ptr) { + // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // bool is_gray = (rb_state == ReadBarrier::gray_ptr_); + // if (is_gray) { + // // Slow-path copy. + // do { + // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); + // } while (src_ptr != end_ptr) + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + // } + + Label loop, done; + + // Don't enter copy loop if `length == 0`. + __ cmp(temp1, ShifterOperand(temp3)); + __ b(&done, EQ); + + // /* int32_t */ monitor = src->monitor_ + __ LoadFromOffset(kLoadWord, temp2, src, monitor_offset); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using + // a memory barrier (which would be more expensive). + // `src` is unchanged by this operation, but its value now depends + // on `temp2`. + __ add(src, src, ShifterOperand(temp2, LSR, 32)); + + // Slow path used to copy array when `src` is gray. + SlowPathCode* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM(invoke); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ Lsrs(temp2, temp2, LockWord::kReadBarrierStateShift + 1); + // Carry flag is the last bit shifted out by LSRS. + __ b(read_barrier_slow_path->GetEntryLabel(), CS); + + // Fast-path copy. + + // Compute the base destination address in `temp2`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ AddConstant(temp2, dest, element_size * constant + offset); + } else { + __ add(temp2, dest, ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift)); + __ AddConstant(temp2, offset); + } + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + __ Bind(&loop); + __ ldr(IP, Address(temp1, element_size, Address::PostIndex)); + __ str(IP, Address(temp2, element_size, Address::PostIndex)); + __ cmp(temp1, ShifterOperand(temp3)); + __ b(&loop, NE); + + __ Bind(read_barrier_slow_path->GetExitLabel()); + __ Bind(&done); + } else { + // Non read barrier code. 
+ + // Compute the base destination address in `temp2`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ AddConstant(temp2, dest, element_size * constant + offset); + } else { + __ add(temp2, dest, ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift)); + __ AddConstant(temp2, offset); + } + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + Label loop, done; + __ cmp(temp1, ShifterOperand(temp3)); + __ b(&done, EQ); + __ Bind(&loop); + __ ldr(IP, Address(temp1, element_size, Address::PostIndex)); + __ str(IP, Address(temp2, element_size, Address::PostIndex)); + __ cmp(temp1, ShifterOperand(temp3)); + __ b(&loop, NE); + __ Bind(&done); + } // We only need one card marking on the destination array. codegen_->MarkGCCard(temp1, @@ -1651,7 +1923,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { Register(kNoRegister), /* value_can_be_null */ false); - __ Bind(slow_path->GetExitLabel()); + __ Bind(intrinsic_slow_path->GetExitLabel()); } static void CreateFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 9cfe3ce569..91374b3108 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -144,6 +144,73 @@ class IntrinsicSlowPathARM64 : public SlowPathCodeARM64 { DISALLOW_COPY_AND_ASSIGN(IntrinsicSlowPathARM64); }; +// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers. +class ReadBarrierSystemArrayCopySlowPathARM64 : public SlowPathCodeARM64 { + public: + ReadBarrierSystemArrayCopySlowPathARM64(HInstruction* instruction, Location tmp) + : SlowPathCodeARM64(instruction), tmp_(tmp) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen_in) OVERRIDE { + CodeGeneratorARM64* codegen = down_cast<CodeGeneratorARM64*>(codegen_in); + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(instruction_->IsInvokeStaticOrDirect()) + << "Unexpected instruction in read barrier arraycopy slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy); + + const int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + + Register src_curr_addr = XRegisterFrom(locations->GetTemp(0)); + Register dst_curr_addr = XRegisterFrom(locations->GetTemp(1)); + Register src_stop_addr = XRegisterFrom(locations->GetTemp(2)); + Register tmp_reg = WRegisterFrom(tmp_); + + __ Bind(GetEntryLabel()); + vixl::aarch64::Label slow_copy_loop; + __ Bind(&slow_copy_loop); + __ Ldr(tmp_reg, MemOperand(src_curr_addr, element_size, PostIndex)); + codegen->GetAssembler()->MaybeUnpoisonHeapReference(tmp_reg); + // TODO: Inline the mark bit check before calling the runtime? + // tmp_reg = ReadBarrier::Mark(tmp_reg); + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + // (See ReadBarrierMarkSlowPathARM64::EmitNativeCode for more + // explanations.) 
+ DCHECK_NE(tmp_.reg(), LR); + DCHECK_NE(tmp_.reg(), WSP); + DCHECK_NE(tmp_.reg(), WZR); + // IP0 is used internally by the ReadBarrierMarkRegX entry point + // as a temporary (and not preserved). It thus cannot be used by + // any live register in this slow path. + DCHECK_NE(LocationFrom(src_curr_addr).reg(), IP0); + DCHECK_NE(LocationFrom(dst_curr_addr).reg(), IP0); + DCHECK_NE(LocationFrom(src_stop_addr).reg(), IP0); + DCHECK_NE(tmp_.reg(), IP0); + DCHECK(0 <= tmp_.reg() && tmp_.reg() < kNumberOfWRegisters) << tmp_.reg(); + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(tmp_.reg()); + // This runtime call does not require a stack map. + codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + codegen->GetAssembler()->MaybePoisonHeapReference(tmp_reg); + __ Str(tmp_reg, MemOperand(dst_curr_addr, element_size, PostIndex)); + __ Cmp(src_curr_addr, src_stop_addr); + __ B(&slow_copy_loop, ne); + __ B(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathARM64"; } + + private: + Location tmp_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathARM64); +}; #undef __ bool IntrinsicLocationsBuilderARM64::TryDispatch(HInvoke* invoke) { @@ -2035,9 +2102,9 @@ static constexpr int32_t kSystemArrayCopyThreshold = 128; // We want to use two temporary registers in order to reduce the register pressure in arm64. // So we don't use the CodeGenerator::CreateSystemArrayCopyLocationSummary. void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - if (kEmitCompilerReadBarrier) { + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { return; } @@ -2090,12 +2157,20 @@ void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) { locations->AddTemp(Location::RequiresRegister()); locations->AddTemp(Location::RequiresRegister()); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // Temporary register IP0, obtained from the VIXL scratch register + // pool, cannot be used in ReadBarrierSystemArrayCopySlowPathARM64 + // (because that register is clobbered by ReadBarrierMarkRegX + // entry points). Get an extra temporary register from the + // register allocator. + locations->AddTemp(Location::RequiresRegister()); + } } void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - DCHECK(!kEmitCompilerReadBarrier); + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. 
+ DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); @@ -2104,6 +2179,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); Register src = XRegisterFrom(locations->InAt(0)); Location src_pos = locations->InAt(1); @@ -2111,10 +2187,12 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { Location dest_pos = locations->InAt(3); Location length = locations->InAt(4); Register temp1 = WRegisterFrom(locations->GetTemp(0)); + Location temp1_loc = LocationFrom(temp1); Register temp2 = WRegisterFrom(locations->GetTemp(1)); + Location temp2_loc = LocationFrom(temp2); - SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke); - codegen_->AddSlowPath(slow_path); + SlowPathCodeARM64* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke); + codegen_->AddSlowPath(intrinsic_slow_path); vixl::aarch64::Label conditions_on_positions_validated; SystemArrayCopyOptimizations optimizations(invoke); @@ -2130,7 +2208,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { DCHECK_GE(src_pos_constant, dest_pos_constant); } else if (src_pos_constant < dest_pos_constant) { __ Cmp(src, dest); - __ B(slow_path->GetEntryLabel(), eq); + __ B(intrinsic_slow_path->GetEntryLabel(), eq); } // Checked when building locations. DCHECK(!optimizations.GetDestinationIsSource() @@ -2141,7 +2219,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { __ B(&conditions_on_positions_validated, ne); } __ Cmp(WRegisterFrom(dest_pos), src_pos_constant); - __ B(slow_path->GetEntryLabel(), gt); + __ B(intrinsic_slow_path->GetEntryLabel(), gt); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -2150,19 +2228,19 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { } __ Cmp(RegisterFrom(src_pos, invoke->InputAt(1)->GetType()), OperandFrom(dest_pos, invoke->InputAt(3)->GetType())); - __ B(slow_path->GetEntryLabel(), lt); + __ B(intrinsic_slow_path->GetEntryLabel(), lt); } __ Bind(&conditions_on_positions_validated); if (!optimizations.GetSourceIsNotNull()) { // Bail out if the source is null. - __ Cbz(src, slow_path->GetEntryLabel()); + __ Cbz(src, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) { // Bail out if the destination is null. - __ Cbz(dest, slow_path->GetEntryLabel()); + __ Cbz(dest, intrinsic_slow_path->GetEntryLabel()); } // We have already checked in the LocationsBuilder for the constant case. @@ -2170,17 +2248,17 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { !optimizations.GetCountIsSourceLength() && !optimizations.GetCountIsDestinationLength()) { // If the length is negative, bail out. - __ Tbnz(WRegisterFrom(length), kWRegSize - 1, slow_path->GetEntryLabel()); + __ Tbnz(WRegisterFrom(length), kWRegSize - 1, intrinsic_slow_path->GetEntryLabel()); // If the length >= 128 then (currently) prefer native implementation. 
__ Cmp(WRegisterFrom(length), kSystemArrayCopyThreshold); - __ B(slow_path->GetEntryLabel(), ge); + __ B(intrinsic_slow_path->GetEntryLabel(), ge); } // Validity checks: source. CheckSystemArrayCopyPosition(masm, src_pos, src, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsSourceLength()); @@ -2189,90 +2267,236 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { dest_pos, dest, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsDestinationLength()); { // We use a block to end the scratch scope before the write barrier, thus // freeing the temporary registers so they can be used in `MarkGCCard`. UseScratchRegisterScope temps(masm); + // Note: Because it is acquired from VIXL's scratch register pool, + // `temp3` might be IP0, and thus cannot be used as `ref` argument + // of CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier + // calls below (see ReadBarrierMarkSlowPathARM64 for more details). Register temp3 = temps.AcquireW(); + if (!optimizations.GetDoesNotNeedTypeCheck()) { // Check whether all elements of the source array are assignable to the component // type of the destination array. We do two checks: the classes are the same, // or the destination is Object[]. If none of these checks succeed, we go to the // slow path. - __ Ldr(temp1, MemOperand(dest, class_offset)); - __ Ldr(temp2, MemOperand(src, class_offset)); - bool did_unpoison = false; - if (!optimizations.GetDestinationIsNonPrimitiveArray() || - !optimizations.GetSourceIsNonPrimitiveArray()) { - // One or two of the references need to be unpoisoned. Unpoison them - // both to make the identity check valid. - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2); - did_unpoison = true; - } - if (!optimizations.GetDestinationIsNonPrimitiveArray()) { - // Bail out if the destination is not a non primitive array. - // /* HeapReference<Class> */ temp3 = temp1->component_type_ - __ Ldr(temp3, HeapOperand(temp1, component_offset)); - __ Cbz(temp3, slow_path->GetEntryLabel()); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3); - __ Ldrh(temp3, HeapOperand(temp3, primitive_offset)); - static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ Cbnz(temp3, slow_path->GetEntryLabel()); - } + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + if (!optimizations.GetSourceIsNonPrimitiveArray()) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp1_loc, + src.W(), + class_offset, + temp2, + /* needs_null_check */ false, + /* use_load_acquire */ false); + // Bail out if the source is not a non primitive array. + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp1_loc, + temp1, + component_offset, + temp2, + /* needs_null_check */ false, + /* use_load_acquire */ false); + __ Cbz(temp1, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp1` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + // /* uint16_t */ temp1 = static_cast<uint16>(temp1->primitive_type_); + __ Ldrh(temp1, HeapOperand(temp1, primitive_offset)); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel()); + } - if (!optimizations.GetSourceIsNonPrimitiveArray()) { - // Bail out if the source is not a non primitive array. 
- // /* HeapReference<Class> */ temp3 = temp2->component_type_ - __ Ldr(temp3, HeapOperand(temp2, component_offset)); - __ Cbz(temp3, slow_path->GetEntryLabel()); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3); - __ Ldrh(temp3, HeapOperand(temp3, primitive_offset)); - static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ Cbnz(temp3, slow_path->GetEntryLabel()); - } + // /* HeapReference<Class> */ temp1 = dest->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp1_loc, + dest.W(), + class_offset, + temp2, + /* needs_null_check */ false, + /* use_load_acquire */ false); + + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // + // Register `temp1` is not trashed by the read barrier emitted + // by GenerateFieldLoadWithBakerReadBarrier below, as that + // method produces a call to a ReadBarrierMarkRegX entry point, + // which saves all potentially live registers, including + // temporaries such a `temp1`. + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp2_loc, + temp1, + component_offset, + temp3, + /* needs_null_check */ false, + /* use_load_acquire */ false); + __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp2` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + // /* uint16_t */ temp2 = static_cast<uint16>(temp2->primitive_type_); + __ Ldrh(temp2, HeapOperand(temp2, primitive_offset)); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Cbnz(temp2, intrinsic_slow_path->GetEntryLabel()); + } - __ Cmp(temp1, temp2); + // For the same reason given earlier, `temp1` is not trashed by the + // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below. + // /* HeapReference<Class> */ temp2 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp2_loc, + src.W(), + class_offset, + temp3, + /* needs_null_check */ false, + /* use_load_acquire */ false); + // Note: if heap poisoning is on, we are comparing two unpoisoned references here. + __ Cmp(temp1, temp2); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + vixl::aarch64::Label do_copy; + __ B(&do_copy, eq); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp1_loc, + temp1, + component_offset, + temp2, + /* needs_null_check */ false, + /* use_load_acquire */ false); + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + // We do not need to emit a read barrier for the following + // heap reference load, as `temp1` is only used in a + // comparison with null below, and this reference is not + // kept afterwards. + __ Ldr(temp1, HeapOperand(temp1, super_offset)); + __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ B(intrinsic_slow_path->GetEntryLabel(), ne); + } + } else { + // Non read barrier code. + + // /* HeapReference<Class> */ temp1 = dest->klass_ + __ Ldr(temp1, MemOperand(dest, class_offset)); + // /* HeapReference<Class> */ temp2 = src->klass_ + __ Ldr(temp2, MemOperand(src, class_offset)); + bool did_unpoison = false; + if (!optimizations.GetDestinationIsNonPrimitiveArray() || + !optimizations.GetSourceIsNonPrimitiveArray()) { + // One or two of the references need to be unpoisoned. Unpoison them + // both to make the identity check valid. 
+ codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2); + did_unpoison = true; + } + + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // /* HeapReference<Class> */ temp3 = temp1->component_type_ + __ Ldr(temp3, HeapOperand(temp1, component_offset)); + __ Cbz(temp3, intrinsic_slow_path->GetEntryLabel()); + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3); + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); + __ Ldrh(temp3, HeapOperand(temp3, primitive_offset)); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Cbnz(temp3, intrinsic_slow_path->GetEntryLabel()); + } - if (optimizations.GetDestinationIsTypedObjectArray()) { - vixl::aarch64::Label do_copy; - __ B(&do_copy, eq); - if (!did_unpoison) { + if (!optimizations.GetSourceIsNonPrimitiveArray()) { + // Bail out if the source is not a non primitive array. + // /* HeapReference<Class> */ temp3 = temp2->component_type_ + __ Ldr(temp3, HeapOperand(temp2, component_offset)); + __ Cbz(temp3, intrinsic_slow_path->GetEntryLabel()); + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3); + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); + __ Ldrh(temp3, HeapOperand(temp3, primitive_offset)); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Cbnz(temp3, intrinsic_slow_path->GetEntryLabel()); + } + + __ Cmp(temp1, temp2); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + vixl::aarch64::Label do_copy; + __ B(&do_copy, eq); + if (!did_unpoison) { + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); + } + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ Ldr(temp1, HeapOperand(temp1, component_offset)); codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + __ Ldr(temp1, HeapOperand(temp1, super_offset)); + // No need to unpoison the result, we're comparing against null. + __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ B(intrinsic_slow_path->GetEntryLabel(), ne); } - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ Ldr(temp1, HeapOperand(temp1, component_offset)); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp1 = temp1->super_class_ - __ Ldr(temp1, HeapOperand(temp1, super_offset)); - // No need to unpoison the result, we're comparing against null. - __ Cbnz(temp1, slow_path->GetEntryLabel()); - __ Bind(&do_copy); - } else { - __ B(slow_path->GetEntryLabel(), ne); } } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); // Bail out if the source is not a non primitive array. 
-    // /* HeapReference<Class> */ temp1 = src->klass_
-    __ Ldr(temp1, HeapOperand(src.W(), class_offset));
-    codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
-    // /* HeapReference<Class> */ temp3 = temp1->component_type_
-    __ Ldr(temp3, HeapOperand(temp1, component_offset));
-    __ Cbz(temp3, slow_path->GetEntryLabel());
-    codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3);
-    __ Ldrh(temp3, HeapOperand(temp3, primitive_offset));
+    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      // /* HeapReference<Class> */ temp1 = src->klass_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
+                                                      temp1_loc,
+                                                      src.W(),
+                                                      class_offset,
+                                                      temp2,
+                                                      /* needs_null_check */ false,
+                                                      /* use_load_acquire */ false);
+      // /* HeapReference<Class> */ temp2 = temp1->component_type_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
+                                                      temp2_loc,
+                                                      temp1,
+                                                      component_offset,
+                                                      temp3,
+                                                      /* needs_null_check */ false,
+                                                      /* use_load_acquire */ false);
+      __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel());
+      // If heap poisoning is enabled, `temp2` has been unpoisoned
+      // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+    } else {
+      // /* HeapReference<Class> */ temp1 = src->klass_
+      __ Ldr(temp1, HeapOperand(src.W(), class_offset));
+      codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
+      // /* HeapReference<Class> */ temp2 = temp1->component_type_
+      __ Ldr(temp2, HeapOperand(temp1, component_offset));
+      __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel());
+      codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
+    }
+    // /* uint16_t */ temp2 = static_cast<uint16>(temp2->primitive_type_);
+    __ Ldrh(temp2, HeapOperand(temp2, primitive_offset));
     static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
-    __ Cbnz(temp3, slow_path->GetEntryLabel());
+    __ Cbnz(temp2, intrinsic_slow_path->GetEntryLabel());
   }

   Register src_curr_addr = temp1.X();
   Register dst_curr_addr = temp2.X();
-  Register src_stop_addr = temp3.X();
+  Register src_stop_addr;
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    // Temporary register IP0, obtained from the VIXL scratch
+    // register pool as `temp3`, cannot be used in
+    // ReadBarrierSystemArrayCopySlowPathARM64 (because that
+    // register is clobbered by ReadBarrierMarkRegX entry points).
+    // So another temporary register, allocated by the register
+    // allocator, is used instead.
+    DCHECK_EQ(LocationFrom(temp3).reg(), IP0);
+    src_stop_addr = XRegisterFrom(locations->GetTemp(2));
+  } else {
+    src_stop_addr = temp3.X();
+  }

   GenSystemArrayCopyAddresses(masm,
                               Primitive::kPrimNot,
@@ -2285,25 +2509,98 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) {
                               dst_curr_addr,
                               src_stop_addr);

-  // Iterate over the arrays and do a raw copy of the objects. We don't need to
-  // poison/unpoison.
-  vixl::aarch64::Label loop, done;
   const int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot);
-  __ Bind(&loop);
-  __ Cmp(src_curr_addr, src_stop_addr);
-  __ B(&done, eq);
-  {
+
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    // SystemArrayCopy implementation for Baker read barriers (see
+    // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier):
+    //
+    //   if (src_ptr != end_ptr) {
+    //     uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState();
+    //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+    //     bool is_gray = (rb_state == ReadBarrier::gray_ptr_);
+    //     if (is_gray) {
+    //       // Slow-path copy.
+ // do { + // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); + // } while (src_ptr != end_ptr) + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + // } + + vixl::aarch64::Label loop, done; + + // Don't enter copy loop if `length == 0`. + __ Cmp(src_curr_addr, src_stop_addr); + __ B(&done, eq); + Register tmp = temps.AcquireW(); + // Make sure `tmp` is not IP0, as it is clobbered by + // ReadBarrierMarkRegX entry points in + // ReadBarrierSystemArrayCopySlowPathARM64. + DCHECK_NE(LocationFrom(tmp).reg(), IP0); + + // /* int32_t */ monitor = src->monitor_ + __ Ldr(tmp, HeapOperand(src.W(), monitor_offset)); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including rb_state, + // to prevent load-load reordering, and without using + // a memory barrier (which would be more expensive). + // `src` is unchanged by this operation, but its value now depends + // on `tmp`. + __ Add(src.X(), src.X(), Operand(tmp.X(), LSR, 32)); + + // Slow path used to copy array when `src` is gray. + SlowPathCodeARM64* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM64(invoke, LocationFrom(tmp)); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the rb_state. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ Tbnz(tmp, LockWord::kReadBarrierStateShift, read_barrier_slow_path->GetEntryLabel()); + + // Fast-path copy. + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + __ Bind(&loop); __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex)); __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex)); + __ Cmp(src_curr_addr, src_stop_addr); + __ B(&loop, ne); + + __ Bind(read_barrier_slow_path->GetExitLabel()); + __ Bind(&done); + } else { + // Non read barrier code. + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + vixl::aarch64::Label loop, done; + __ Bind(&loop); + __ Cmp(src_curr_addr, src_stop_addr); + __ B(&done, eq); + { + Register tmp = temps.AcquireW(); + __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex)); + __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex)); + } + __ B(&loop); + __ Bind(&done); } - __ B(&loop); - __ Bind(&done); } // We only need one card marking on the destination array. codegen_->MarkGCCard(dest.W(), Register(), /* value_can_be_null */ false); - __ Bind(slow_path->GetExitLabel()); + __ Bind(intrinsic_slow_path->GetExitLabel()); } static void GenIsInfinite(LocationSummary* locations, diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc index 22f4181b92..49d6c1952c 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -70,6 +70,105 @@ static void MoveArguments(HInvoke* invoke, CodeGeneratorX86* codegen) { using IntrinsicSlowPathX86 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86>; +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
+#define __ down_cast<X86Assembler*>(codegen->GetAssembler())-> // NOLINT + +// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers. +class ReadBarrierSystemArrayCopySlowPathX86 : public SlowPathCode { + public: + explicit ReadBarrierSystemArrayCopySlowPathX86(HInstruction* instruction) + : SlowPathCode(instruction) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen); + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(instruction_->IsInvokeStaticOrDirect()) + << "Unexpected instruction in read barrier arraycopy slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy); + + int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); + + Register src = locations->InAt(0).AsRegister<Register>(); + Location src_pos = locations->InAt(1); + Register dest = locations->InAt(2).AsRegister<Register>(); + Location dest_pos = locations->InAt(3); + Location length = locations->InAt(4); + Location temp1_loc = locations->GetTemp(0); + Register temp1 = temp1_loc.AsRegister<Register>(); + Register temp2 = locations->GetTemp(1).AsRegister<Register>(); + Register temp3 = locations->GetTemp(2).AsRegister<Register>(); + + __ Bind(GetEntryLabel()); + // In this code path, registers `temp1`, `temp2`, and `temp3` + // (resp.) are not used for the base source address, the base + // destination address, and the end source address (resp.), as in + // other SystemArrayCopy intrinsic code paths. Instead they are + // (resp.) used for: + // - the loop index (`i`); + // - the source index (`src_index`) and the loaded (source) + // reference (`value`); and + // - the destination index (`dest_index`). + + // i = 0 + __ xorl(temp1, temp1); + NearLabel loop; + __ Bind(&loop); + // value = src_array[i + src_pos] + if (src_pos.IsConstant()) { + int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue(); + int32_t adjusted_offset = offset + constant * element_size; + __ movl(temp2, Address(src, temp1, ScaleFactor::TIMES_4, adjusted_offset)); + } else { + __ leal(temp2, Address(src_pos.AsRegister<Register>(), temp1, ScaleFactor::TIMES_1, 0)); + __ movl(temp2, Address(src, temp2, ScaleFactor::TIMES_4, offset)); + } + __ MaybeUnpoisonHeapReference(temp2); + // TODO: Inline the mark bit check before calling the runtime? + // value = ReadBarrier::Mark(value) + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + // (See ReadBarrierMarkSlowPathX86::EmitNativeCode for more + // explanations.) + DCHECK_NE(temp2, ESP); + DCHECK(0 <= temp2 && temp2 < kNumberOfCpuRegisters) << temp2; + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(temp2); + // This runtime call does not require a stack map. 
+ x86_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + __ MaybePoisonHeapReference(temp2); + // dest_array[i + dest_pos] = value + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + int32_t adjusted_offset = offset + constant * element_size; + __ movl(Address(dest, temp1, ScaleFactor::TIMES_4, adjusted_offset), temp2); + } else { + __ leal(temp3, Address(dest_pos.AsRegister<Register>(), temp1, ScaleFactor::TIMES_1, 0)); + __ movl(Address(dest, temp3, ScaleFactor::TIMES_4, offset), temp2); + } + // ++i + __ addl(temp1, Immediate(1)); + // if (i != length) goto loop + x86_codegen->GenerateIntCompare(temp1_loc, length); + __ j(kNotEqual, &loop); + __ jmp(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathX86"; } + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86); +}; + +#undef __ + #define __ assembler-> static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke, bool is64bit) { @@ -2678,9 +2777,9 @@ static bool IsSameInput(HInstruction* instruction, size_t input0, size_t input1) } void IntrinsicLocationsBuilderX86::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - if (kEmitCompilerReadBarrier) { + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { return; } @@ -2710,9 +2809,9 @@ void IntrinsicLocationsBuilderX86::VisitSystemArrayCopy(HInvoke* invoke) { } void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - DCHECK(!kEmitCompilerReadBarrier); + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. 
+ DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); X86Assembler* assembler = GetAssembler(); LocationSummary* locations = invoke->GetLocations(); @@ -2721,17 +2820,21 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); Register src = locations->InAt(0).AsRegister<Register>(); Location src_pos = locations->InAt(1); Register dest = locations->InAt(2).AsRegister<Register>(); Location dest_pos = locations->InAt(3); - Location length = locations->InAt(4); - Register temp1 = locations->GetTemp(0).AsRegister<Register>(); - Register temp2 = locations->GetTemp(1).AsRegister<Register>(); + Location length_arg = locations->InAt(4); + Location length = length_arg; + Location temp1_loc = locations->GetTemp(0); + Register temp1 = temp1_loc.AsRegister<Register>(); + Location temp2_loc = locations->GetTemp(1); + Register temp2 = temp2_loc.AsRegister<Register>(); - SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke); - codegen_->AddSlowPath(slow_path); + SlowPathCode* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke); + codegen_->AddSlowPath(intrinsic_slow_path); NearLabel conditions_on_positions_validated; SystemArrayCopyOptimizations optimizations(invoke); @@ -2747,7 +2850,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { DCHECK_GE(src_pos_constant, dest_pos_constant); } else if (src_pos_constant < dest_pos_constant) { __ cmpl(src, dest); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -2755,7 +2858,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { __ j(kNotEqual, &conditions_on_positions_validated); } __ cmpl(dest_pos.AsRegister<Register>(), Immediate(src_pos_constant)); - __ j(kGreater, slow_path->GetEntryLabel()); + __ j(kGreater, intrinsic_slow_path->GetEntryLabel()); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -2765,10 +2868,10 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { if (dest_pos.IsConstant()) { int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); __ cmpl(src_pos.AsRegister<Register>(), Immediate(dest_pos_constant)); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } else { __ cmpl(src_pos.AsRegister<Register>(), dest_pos.AsRegister<Register>()); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } } @@ -2777,16 +2880,17 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { if (!optimizations.GetSourceIsNotNull()) { // Bail out if the source is null. __ testl(src, src); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) { // Bail out if the destination is null. 
__ testl(dest, dest); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } - Register temp3 = locations->GetTemp(2).AsRegister<Register>(); + Location temp3_loc = locations->GetTemp(2); + Register temp3 = temp3_loc.AsRegister<Register>(); if (length.IsStackSlot()) { __ movl(temp3, Address(ESP, length.GetStackIndex())); length = Location::RegisterLocation(temp3); @@ -2798,7 +2902,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { !optimizations.GetCountIsSourceLength() && !optimizations.GetCountIsDestinationLength()) { __ testl(length.AsRegister<Register>(), length.AsRegister<Register>()); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } // Validity checks: source. @@ -2806,7 +2910,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { src_pos, src, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsSourceLength()); @@ -2815,7 +2919,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { dest_pos, dest, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsDestinationLength()); @@ -2824,72 +2928,159 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { // type of the destination array. We do two checks: the classes are the same, // or the destination is Object[]. If none of these checks succeed, we go to the // slow path. + if (!optimizations.GetSourceIsNonPrimitiveArray()) { - // /* HeapReference<Class> */ temp1 = temp1->klass_ - __ movl(temp1, Address(src, class_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ movl(temp1, Address(temp1, component_offset)); - __ testl(temp1, temp1); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp1); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false); + // Bail out if the source is not a non primitive array. + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + __ testl(temp1, temp1); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp1` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ temp1 = src->klass_ + __ movl(temp1, Address(src, class_offset)); + __ MaybeUnpoisonHeapReference(temp1); + // Bail out if the source is not a non primitive array. 
+ // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ movl(temp1, Address(temp1, component_offset)); + __ testl(temp1, temp1); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp1); + } __ cmpw(Address(temp1, primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } - if (!optimizations.GetDestinationIsNonPrimitiveArray()) { - // /* HeapReference<Class> */ temp1 = temp1->klass_ - __ movl(temp1, Address(dest, class_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // Bail out if the destination is not a non primitive array. - // /* HeapReference<Class> */ temp2 = temp1->component_type_ - __ movl(temp2, Address(temp1, component_offset)); - __ testl(temp2, temp2); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp2); - __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); - // Re-poison the heap reference to make the compare instruction below - // compare two poisoned references. - __ PoisonHeapReference(temp1); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + if (length.Equals(Location::RegisterLocation(temp3))) { + // When Baker read barriers are enabled, register `temp3`, + // which in the present case contains the `length` parameter, + // will be overwritten below. Make the `length` location + // reference the original stack location; it will be moved + // back to `temp3` later if necessary. + DCHECK(length_arg.IsStackSlot()); + length = length_arg; + } + + // /* HeapReference<Class> */ temp1 = dest->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, dest, class_offset, temp2_loc, /* needs_null_check */ false); + + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // + // Register `temp1` is not trashed by the read barrier emitted + // by GenerateFieldLoadWithBakerReadBarrier below, as that + // method produces a call to a ReadBarrierMarkRegX entry point, + // which saves all potentially live registers, including + // temporaries such a `temp1`. + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + __ testl(temp2, temp2); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp2` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot)); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + } + + // For the same reason given earlier, `temp1` is not trashed by the + // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below. + // /* HeapReference<Class> */ temp2 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, src, class_offset, temp3_loc, /* needs_null_check */ false); + // Note: if heap poisoning is on, we are comparing two unpoisoned references here. 
+ __ cmpl(temp1, temp2); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + NearLabel do_copy; + __ j(kEqual, &do_copy); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + // We do not need to emit a read barrier for the following + // heap reference load, as `temp1` is only used in a + // comparison with null below, and this reference is not + // kept afterwards. + __ cmpl(Address(temp1, super_offset), Immediate(0)); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + } } else { - // /* HeapReference<Class> */ temp1 = temp1->klass_ - __ movl(temp1, Address(dest, class_offset)); - } + // Non read barrier code. - // Note: if poisoning is on, we are here comparing two poisoned references. - __ cmpl(temp1, Address(src, class_offset)); + // /* HeapReference<Class> */ temp1 = dest->klass_ + __ movl(temp1, Address(dest, class_offset)); + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + __ MaybeUnpoisonHeapReference(temp1); + // Bail out if the destination is not a non primitive array. + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + __ movl(temp2, Address(temp1, component_offset)); + __ testl(temp2, temp2); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp2); + __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot)); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + // Re-poison the heap reference to make the compare instruction below + // compare two poisoned references. + __ PoisonHeapReference(temp1); + } - if (optimizations.GetDestinationIsTypedObjectArray()) { - NearLabel do_copy; - __ j(kEqual, &do_copy); + // Note: if heap poisoning is on, we are comparing two poisoned references here. + __ cmpl(temp1, Address(src, class_offset)); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + NearLabel do_copy; + __ j(kEqual, &do_copy); + __ MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ movl(temp1, Address(temp1, component_offset)); + __ MaybeUnpoisonHeapReference(temp1); + __ cmpl(Address(temp1, super_offset), Immediate(0)); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + } + } + } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { + DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); + // Bail out if the source is not a non primitive array. + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + __ testl(temp1, temp1); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp1` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. 
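// [Editor's note] Illustrative sketch only: the Baker-read-barrier checks
// emitted above boil down to the object-level logic below. The types and the
// helper name (ClassSketch, CanSkipPerElementChecks) are hypothetical
// simplifications, not the real art::mirror::Class API.
struct ClassSketch {
  ClassSketch* component_type;  // null for non-array classes
  ClassSketch* super_class;     // null only for the root class, java.lang.Object
  bool is_primitive;            // true for the component type of int[], char[], ...
};

static bool CanSkipPerElementChecks(const ClassSketch* src_klass,
                                    const ClassSketch* dest_klass) {
  // Both source and destination must be arrays of references.
  const ClassSketch* src_component = src_klass->component_type;
  const ClassSketch* dest_component = dest_klass->component_type;
  if (src_component == nullptr || src_component->is_primitive ||
      dest_component == nullptr || dest_component->is_primitive) {
    return false;  // Take the intrinsic slow path (generic arraycopy).
  }
  // Copying is trivially safe when both arrays have the same class, or when
  // the destination is Object[], i.e. its component type has no superclass.
  return src_klass == dest_klass || dest_component->super_class == nullptr;
}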
+ } else { + // /* HeapReference<Class> */ temp1 = src->klass_ + __ movl(temp1, Address(src, class_offset)); __ MaybeUnpoisonHeapReference(temp1); // /* HeapReference<Class> */ temp1 = temp1->component_type_ __ movl(temp1, Address(temp1, component_offset)); + __ testl(temp1, temp1); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); __ MaybeUnpoisonHeapReference(temp1); - __ cmpl(Address(temp1, super_offset), Immediate(0)); - __ j(kNotEqual, slow_path->GetEntryLabel()); - __ Bind(&do_copy); - } else { - __ j(kNotEqual, slow_path->GetEntryLabel()); } - } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { - DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); - // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ temp1 = src->klass_ - __ movl(temp1, Address(src, class_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ movl(temp1, Address(temp1, component_offset)); - __ testl(temp1, temp1); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp1); __ cmpw(Address(temp1, primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } - // Compute base source address, base destination address, and end source address. + // Compute the base source address in `temp1`. int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); DCHECK_EQ(element_size, 4); uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); @@ -2900,35 +3091,136 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { __ leal(temp1, Address(src, src_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); } - if (dest_pos.IsConstant()) { - int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); - __ leal(temp2, Address(dest, element_size * constant + offset)); - } else { - __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); - } + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // If it is needed (in the case of the fast-path loop), the base + // destination address is computed later, as `temp2` is used for + // intermediate computations. - if (length.IsConstant()) { - int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); - __ leal(temp3, Address(temp1, element_size * constant)); + // Compute the end source address in `temp3`. + if (length.IsConstant()) { + int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); + __ leal(temp3, Address(temp1, element_size * constant)); + } else { + if (length.IsStackSlot()) { + // Location `length` is again pointing at a stack slot, as + // register `temp3` (which was containing the length parameter + // earlier) has been overwritten; restore it now + DCHECK(length.Equals(length_arg)); + __ movl(temp3, Address(ESP, length.GetStackIndex())); + length = Location::RegisterLocation(temp3); + } + __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0)); + } + + // SystemArrayCopy implementation for Baker read barriers (see + // also CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier): + // + // if (src_ptr != end_ptr) { + // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // bool is_gray = (rb_state == ReadBarrier::gray_ptr_); + // if (is_gray) { + // // Slow-path copy. 
+ // for (size_t i = 0; i != length; ++i) { + // dest_array[dest_pos + i] = + // MaybePoison(ReadBarrier::Mark(MaybeUnpoison(src_array[src_pos + i]))); + // } + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + // } + + NearLabel loop, done; + + // Don't enter copy loop if `length == 0`. + __ cmpl(temp1, temp3); + __ j(kEqual, &done); + + // /* int32_t */ monitor = src->monitor_ + __ movl(temp2, Address(src, monitor_offset)); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Load fence to prevent load-load reordering. + // Note that this is a no-op, thanks to the x86 memory model. + codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + + // Slow path used to copy array when `src` is gray. + SlowPathCode* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathX86(invoke); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with SHR. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ shrl(temp2, Immediate(LockWord::kReadBarrierStateShift + 1)); + __ j(kCarrySet, read_barrier_slow_path->GetEntryLabel()); + + // Fast-path copy. + + // Set the base destination address in `temp2`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ leal(temp2, Address(dest, element_size * constant + offset)); + } else { + __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); + } + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + __ Bind(&loop); + __ pushl(Address(temp1, 0)); + __ cfi().AdjustCFAOffset(4); + __ popl(Address(temp2, 0)); + __ cfi().AdjustCFAOffset(-4); + __ addl(temp1, Immediate(element_size)); + __ addl(temp2, Immediate(element_size)); + __ cmpl(temp1, temp3); + __ j(kNotEqual, &loop); + + __ Bind(read_barrier_slow_path->GetExitLabel()); + __ Bind(&done); } else { - __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0)); - } - - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. - NearLabel loop, done; - __ cmpl(temp1, temp3); - __ j(kEqual, &done); - __ Bind(&loop); - __ pushl(Address(temp1, 0)); - __ cfi().AdjustCFAOffset(4); - __ popl(Address(temp2, 0)); - __ cfi().AdjustCFAOffset(-4); - __ addl(temp1, Immediate(element_size)); - __ addl(temp2, Immediate(element_size)); - __ cmpl(temp1, temp3); - __ j(kNotEqual, &loop); - __ Bind(&done); + // Non read barrier code. + + // Compute the base destination address in `temp2`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ leal(temp2, Address(dest, element_size * constant + offset)); + } else { + __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); + } + + // Compute the end source address in `temp3`. 
+ if (length.IsConstant()) { + int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); + __ leal(temp3, Address(temp1, element_size * constant)); + } else { + __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0)); + } + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + NearLabel loop, done; + __ cmpl(temp1, temp3); + __ j(kEqual, &done); + __ Bind(&loop); + __ pushl(Address(temp1, 0)); + __ cfi().AdjustCFAOffset(4); + __ popl(Address(temp2, 0)); + __ cfi().AdjustCFAOffset(-4); + __ addl(temp1, Immediate(element_size)); + __ addl(temp2, Immediate(element_size)); + __ cmpl(temp1, temp3); + __ j(kNotEqual, &loop); + __ Bind(&done); + } // We only need one card marking on the destination array. codegen_->MarkGCCard(temp1, @@ -2937,7 +3229,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { Register(kNoRegister), /* value_can_be_null */ false); - __ Bind(slow_path->GetExitLabel()); + __ Bind(intrinsic_slow_path->GetExitLabel()); } UNIMPLEMENTED_INTRINSIC(X86, MathRoundDouble) diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index ab8b05c3d4..311e1cd6eb 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -64,6 +64,65 @@ static void MoveArguments(HInvoke* invoke, CodeGeneratorX86_64* codegen) { using IntrinsicSlowPathX86_64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86_64>; +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())-> // NOLINT + +// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers. +class ReadBarrierSystemArrayCopySlowPathX86_64 : public SlowPathCode { + public: + explicit ReadBarrierSystemArrayCopySlowPathX86_64(HInstruction* instruction) + : SlowPathCode(instruction) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen); + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(instruction_->IsInvokeStaticOrDirect()) + << "Unexpected instruction in read barrier arraycopy slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy); + + int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + + CpuRegister src_curr_addr = locations->GetTemp(0).AsRegister<CpuRegister>(); + CpuRegister dst_curr_addr = locations->GetTemp(1).AsRegister<CpuRegister>(); + CpuRegister src_stop_addr = locations->GetTemp(2).AsRegister<CpuRegister>(); + + __ Bind(GetEntryLabel()); + NearLabel loop; + __ Bind(&loop); + __ movl(CpuRegister(TMP), Address(src_curr_addr, 0)); + __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + // TODO: Inline the mark bit check before calling the runtime? + // TMP = ReadBarrier::Mark(TMP); + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(TMP); + // This runtime call does not require a stack map. 
+ x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + __ MaybePoisonHeapReference(CpuRegister(TMP)); + __ movl(Address(dst_curr_addr, 0), CpuRegister(TMP)); + __ addl(src_curr_addr, Immediate(element_size)); + __ addl(dst_curr_addr, Immediate(element_size)); + __ cmpl(src_curr_addr, src_stop_addr); + __ j(kNotEqual, &loop); + __ jmp(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathX86_64"; } + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86_64); +}; + +#undef __ + #define __ assembler-> static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { @@ -1053,9 +1112,9 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) { void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - if (kEmitCompilerReadBarrier) { + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { return; } @@ -1063,9 +1122,9 @@ void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) { } void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - DCHECK(!kEmitCompilerReadBarrier); + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); X86_64Assembler* assembler = GetAssembler(); LocationSummary* locations = invoke->GetLocations(); @@ -1074,18 +1133,23 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>(); Location src_pos = locations->InAt(1); CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>(); Location dest_pos = locations->InAt(3); Location length = locations->InAt(4); - CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>(); - CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>(); - CpuRegister temp3 = locations->GetTemp(2).AsRegister<CpuRegister>(); + Location temp1_loc = locations->GetTemp(0); + CpuRegister temp1 = temp1_loc.AsRegister<CpuRegister>(); + Location temp2_loc = locations->GetTemp(1); + CpuRegister temp2 = temp2_loc.AsRegister<CpuRegister>(); + Location temp3_loc = locations->GetTemp(2); + CpuRegister temp3 = temp3_loc.AsRegister<CpuRegister>(); + Location TMP_loc = Location::RegisterLocation(TMP); - SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke); - codegen_->AddSlowPath(slow_path); + SlowPathCode* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke); + codegen_->AddSlowPath(intrinsic_slow_path); NearLabel conditions_on_positions_validated; SystemArrayCopyOptimizations optimizations(invoke); @@ -1101,7 +1165,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { DCHECK_GE(src_pos_constant, dest_pos_constant); } 
else if (src_pos_constant < dest_pos_constant) { __ cmpl(src, dest); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -1109,7 +1173,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { __ j(kNotEqual, &conditions_on_positions_validated); } __ cmpl(dest_pos.AsRegister<CpuRegister>(), Immediate(src_pos_constant)); - __ j(kGreater, slow_path->GetEntryLabel()); + __ j(kGreater, intrinsic_slow_path->GetEntryLabel()); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -1119,10 +1183,10 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { if (dest_pos.IsConstant()) { int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); __ cmpl(src_pos.AsRegister<CpuRegister>(), Immediate(dest_pos_constant)); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } else { __ cmpl(src_pos.AsRegister<CpuRegister>(), dest_pos.AsRegister<CpuRegister>()); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } } @@ -1131,13 +1195,13 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { if (!optimizations.GetSourceIsNotNull()) { // Bail out if the source is null. __ testl(src, src); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) { // Bail out if the destination is null. __ testl(dest, dest); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } // If the length is negative, bail out. @@ -1146,7 +1210,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { !optimizations.GetCountIsSourceLength() && !optimizations.GetCountIsDestinationLength()) { __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>()); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } // Validity checks: source. @@ -1154,7 +1218,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { src_pos, src, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsSourceLength()); @@ -1163,7 +1227,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { dest_pos, dest, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsDestinationLength()); @@ -1172,38 +1236,80 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { // type of the destination array. We do two checks: the classes are the same, // or the destination is Object[]. If none of these checks succeed, we go to the // slow path. - __ movl(temp1, Address(dest, class_offset)); - __ movl(temp2, Address(src, class_offset)); + bool did_unpoison = false; - if (!optimizations.GetDestinationIsNonPrimitiveArray() || - !optimizations.GetSourceIsNonPrimitiveArray()) { - // One or two of the references need to be unpoisoned. Unpoison them - // both to make the identity check valid. 
- __ MaybeUnpoisonHeapReference(temp1); - __ MaybeUnpoisonHeapReference(temp2); - did_unpoison = true; + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = dest->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, dest, class_offset, temp3_loc, /* needs_null_check */ false); + // Register `temp1` is not trashed by the read barrier emitted + // by GenerateFieldLoadWithBakerReadBarrier below, as that + // method produces a call to a ReadBarrierMarkRegX entry point, + // which saves all potentially live registers, including + // temporaries such a `temp1`. + // /* HeapReference<Class> */ temp2 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, src, class_offset, temp3_loc, /* needs_null_check */ false); + // If heap poisoning is enabled, `temp1` and `temp2` have been + // unpoisoned by the the previous calls to + // GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ temp1 = dest->klass_ + __ movl(temp1, Address(dest, class_offset)); + // /* HeapReference<Class> */ temp2 = src->klass_ + __ movl(temp2, Address(src, class_offset)); + if (!optimizations.GetDestinationIsNonPrimitiveArray() || + !optimizations.GetSourceIsNonPrimitiveArray()) { + // One or two of the references need to be unpoisoned. Unpoison them + // both to make the identity check valid. + __ MaybeUnpoisonHeapReference(temp1); + __ MaybeUnpoisonHeapReference(temp2); + did_unpoison = true; + } } if (!optimizations.GetDestinationIsNonPrimitiveArray()) { // Bail out if the destination is not a non primitive array. - // /* HeapReference<Class> */ TMP = temp1->component_type_ - __ movl(CpuRegister(TMP), Address(temp1, component_offset)); - __ testl(CpuRegister(TMP), CpuRegister(TMP)); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ TMP = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, TMP_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `TMP` has been unpoisoned by + // the the previous call to GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ TMP = temp1->component_type_ + __ movl(CpuRegister(TMP), Address(temp1, component_offset)); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + } __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetSourceIsNonPrimitiveArray()) { // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ TMP = temp2->component_type_ - __ movl(CpuRegister(TMP), Address(temp2, component_offset)); - __ testl(CpuRegister(TMP), CpuRegister(TMP)); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // For the same reason given earlier, `temp1` is not trashed by the + // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below. 
+ // /* HeapReference<Class> */ TMP = temp2->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, TMP_loc, temp2, component_offset, temp3_loc, /* needs_null_check */ false); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `TMP` has been unpoisoned by + // the the previous call to GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ TMP = temp2->component_type_ + __ movl(CpuRegister(TMP), Address(temp2, component_offset)); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + } __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } __ cmpl(temp1, temp2); @@ -1211,34 +1317,56 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { if (optimizations.GetDestinationIsTypedObjectArray()) { NearLabel do_copy; __ j(kEqual, &do_copy); - if (!did_unpoison) { + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + // We do not need to emit a read barrier for the following + // heap reference load, as `temp1` is only used in a + // comparison with null below, and this reference is not + // kept afterwards. + __ cmpl(Address(temp1, super_offset), Immediate(0)); + } else { + if (!did_unpoison) { + __ MaybeUnpoisonHeapReference(temp1); + } + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ movl(temp1, Address(temp1, component_offset)); __ MaybeUnpoisonHeapReference(temp1); + // No need to unpoison the following heap reference load, as + // we're comparing against null. + __ cmpl(Address(temp1, super_offset), Immediate(0)); } - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ movl(temp1, Address(temp1, component_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp1 = temp1->super_class_ - __ movl(temp1, Address(temp1, super_offset)); - // No need to unpoison the result, we're comparing against null. - __ testl(temp1, temp1); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); __ Bind(&do_copy); } else { - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); // Bail out if the source is not a non primitive array. 
- // /* HeapReference<Class> */ temp1 = src->klass_ - __ movl(temp1, Address(src, class_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ TMP = temp1->component_type_ - __ movl(CpuRegister(TMP), Address(temp1, component_offset)); - __ testl(CpuRegister(TMP), CpuRegister(TMP)); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp3_loc, /* needs_null_check */ false); + // /* HeapReference<Class> */ TMP = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, TMP_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + } else { + // /* HeapReference<Class> */ temp1 = src->klass_ + __ movl(temp1, Address(src, class_offset)); + __ MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ TMP = temp1->component_type_ + __ movl(CpuRegister(TMP), Address(temp1, component_offset)); + // No need to unpoison `TMP` now, as we're comparing against null. + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + } __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } // Compute base source address, base destination address, and end source address. @@ -1266,19 +1394,86 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { __ leal(temp3, Address(temp1, length.AsRegister<CpuRegister>(), ScaleFactor::TIMES_4, 0)); } - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. - NearLabel loop, done; - __ cmpl(temp1, temp3); - __ j(kEqual, &done); - __ Bind(&loop); - __ movl(CpuRegister(TMP), Address(temp1, 0)); - __ movl(Address(temp2, 0), CpuRegister(TMP)); - __ addl(temp1, Immediate(element_size)); - __ addl(temp2, Immediate(element_size)); - __ cmpl(temp1, temp3); - __ j(kNotEqual, &loop); - __ Bind(&done); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // SystemArrayCopy implementation for Baker read barriers (see + // also CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier): + // + // if (src_ptr != end_ptr) { + // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // bool is_gray = (rb_state == ReadBarrier::gray_ptr_); + // if (is_gray) { + // // Slow-path copy. + // do { + // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); + // } while (src_ptr != end_ptr) + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + // } + + NearLabel loop, done; + + // Don't enter copy loop if `length == 0`. + __ cmpl(temp1, temp3); + __ j(kEqual, &done); + + // /* int32_t */ monitor = src->monitor_ + __ movl(CpuRegister(TMP), Address(src, monitor_offset)); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Load fence to prevent load-load reordering. 
+ // Note that this is a no-op, thanks to the x86-64 memory model. + codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + + // Slow path used to copy array when `src` is gray. + SlowPathCode* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathX86_64(invoke); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with SHR. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ shrl(CpuRegister(TMP), Immediate(LockWord::kReadBarrierStateShift + 1)); + __ j(kCarrySet, read_barrier_slow_path->GetEntryLabel()); + + // Fast-path copy. + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + __ Bind(&loop); + __ movl(CpuRegister(TMP), Address(temp1, 0)); + __ movl(Address(temp2, 0), CpuRegister(TMP)); + __ addl(temp1, Immediate(element_size)); + __ addl(temp2, Immediate(element_size)); + __ cmpl(temp1, temp3); + __ j(kNotEqual, &loop); + + __ Bind(read_barrier_slow_path->GetExitLabel()); + __ Bind(&done); + } else { + // Non read barrier code. + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + NearLabel loop, done; + __ cmpl(temp1, temp3); + __ j(kEqual, &done); + __ Bind(&loop); + __ movl(CpuRegister(TMP), Address(temp1, 0)); + __ movl(Address(temp2, 0), CpuRegister(TMP)); + __ addl(temp1, Immediate(element_size)); + __ addl(temp2, Immediate(element_size)); + __ cmpl(temp1, temp3); + __ j(kNotEqual, &loop); + __ Bind(&done); + } // We only need one card marking on the destination array. codegen_->MarkGCCard(temp1, @@ -1287,7 +1482,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { CpuRegister(kNoRegister), /* value_can_be_null */ false); - __ Bind(slow_path->GetExitLabel()); + __ Bind(intrinsic_slow_path->GetExitLabel()); } void IntrinsicLocationsBuilderX86_64::VisitStringCompareTo(HInvoke* invoke) { diff --git a/compiler/optimizing/optimizing_cfi_test.cc b/compiler/optimizing/optimizing_cfi_test.cc index a6d234d739..8c0231e1aa 100644 --- a/compiler/optimizing/optimizing_cfi_test.cc +++ b/compiler/optimizing/optimizing_cfi_test.cc @@ -157,13 +157,26 @@ class OptimizingCFITest : public CFITest { TestImpl(isa, #isa, expected_asm, expected_cfi); \ } +#ifdef ART_ENABLE_CODEGEN_arm TEST_ISA(kThumb2) +#endif +#ifdef ART_ENABLE_CODEGEN_arm64 TEST_ISA(kArm64) +#endif +#ifdef ART_ENABLE_CODEGEN_x86 TEST_ISA(kX86) +#endif +#ifdef ART_ENABLE_CODEGEN_x86_64 TEST_ISA(kX86_64) +#endif +#ifdef ART_ENABLE_CODEGEN_mips TEST_ISA(kMips) +#endif +#ifdef ART_ENABLE_CODEGEN_mips64 TEST_ISA(kMips64) +#endif +#ifdef ART_ENABLE_CODEGEN_arm TEST_F(OptimizingCFITest, kThumb2Adjust) { std::vector<uint8_t> expected_asm( expected_asm_kThumb2_adjust, @@ -184,7 +197,9 @@ TEST_F(OptimizingCFITest, kThumb2Adjust) { Finish(); Check(kThumb2, "kThumb2_adjust", expected_asm, expected_cfi); } +#endif +#ifdef ART_ENABLE_CODEGEN_mips TEST_F(OptimizingCFITest, kMipsAdjust) { // One NOP in delay slot, 1 << 15 NOPS have size 1 << 17 which exceeds 18-bit signed maximum. 
static constexpr size_t kNumNops = 1u + (1u << 15); @@ -212,7 +227,9 @@ TEST_F(OptimizingCFITest, kMipsAdjust) { Finish(); Check(kMips, "kMips_adjust", expected_asm, expected_cfi); } +#endif +#ifdef ART_ENABLE_CODEGEN_mips64 TEST_F(OptimizingCFITest, kMips64Adjust) { // One NOP in forbidden slot, 1 << 15 NOPS have size 1 << 17 which exceeds 18-bit signed maximum. static constexpr size_t kNumNops = 1u + (1u << 15); @@ -240,6 +257,7 @@ TEST_F(OptimizingCFITest, kMips64Adjust) { Finish(); Check(kMips64, "kMips64_adjust", expected_asm, expected_cfi); } +#endif #endif // ART_TARGET_ANDROID diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 6aaa15fa02..a1da20bae4 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -180,6 +180,7 @@ class PassObserver : public ValueObject { private: void StartPass(const char* pass_name) { + VLOG(compiler) << "Starting pass: " << pass_name; // Dump graph first, then start timer. if (visualizer_enabled_) { visualizer_.DumpGraph(pass_name, /* is_after_pass */ false, graph_in_bad_state_); @@ -438,11 +439,7 @@ static HOptimization* BuildOptimization( StackHandleScopeCollection* handles, SideEffectsAnalysis* most_recent_side_effects, HInductionVarAnalysis* most_recent_induction) { - if (opt_name == arm::InstructionSimplifierArm::kInstructionSimplifierArmPassName) { - return new (arena) arm::InstructionSimplifierArm(graph, stats); - } else if (opt_name == arm64::InstructionSimplifierArm64::kInstructionSimplifierArm64PassName) { - return new (arena) arm64::InstructionSimplifierArm64(graph, stats); - } else if (opt_name == BoundsCheckElimination::kBoundsCheckEliminationPassName) { + if (opt_name == BoundsCheckElimination::kBoundsCheckEliminationPassName) { CHECK(most_recent_side_effects != nullptr && most_recent_induction != nullptr); return new (arena) BoundsCheckElimination(graph, *most_recent_side_effects, @@ -482,16 +479,30 @@ static HOptimization* BuildOptimization( } else if (opt_name == LoadStoreElimination::kLoadStoreEliminationPassName) { CHECK(most_recent_side_effects != nullptr); return new (arena) LoadStoreElimination(graph, *most_recent_side_effects); + } else if (opt_name == SideEffectsAnalysis::kSideEffectsAnalysisPassName) { + return new (arena) SideEffectsAnalysis(graph); +#ifdef ART_ENABLE_CODEGEN_arm + } else if (opt_name == arm::DexCacheArrayFixups::kDexCacheArrayFixupsArmPassName) { + return new (arena) arm::DexCacheArrayFixups(graph, stats); + } else if (opt_name == arm::InstructionSimplifierArm::kInstructionSimplifierArmPassName) { + return new (arena) arm::InstructionSimplifierArm(graph, stats); +#endif +#ifdef ART_ENABLE_CODEGEN_arm64 + } else if (opt_name == arm64::InstructionSimplifierArm64::kInstructionSimplifierArm64PassName) { + return new (arena) arm64::InstructionSimplifierArm64(graph, stats); +#endif +#ifdef ART_ENABLE_CODEGEN_mips } else if (opt_name == mips::DexCacheArrayFixups::kDexCacheArrayFixupsMipsPassName) { return new (arena) mips::DexCacheArrayFixups(graph, codegen, stats); } else if (opt_name == mips::PcRelativeFixups::kPcRelativeFixupsMipsPassName) { return new (arena) mips::PcRelativeFixups(graph, codegen, stats); - } else if (opt_name == SideEffectsAnalysis::kSideEffectsAnalysisPassName) { - return new (arena) SideEffectsAnalysis(graph); +#endif +#ifdef ART_ENABLE_CODEGEN_x86 } else if (opt_name == x86::PcRelativeFixups::kPcRelativeFixupsX86PassName) { return new (arena) x86::PcRelativeFixups(graph, codegen, stats); 
   } else if (opt_name == x86::X86MemoryOperandGeneration::kX86MemoryOperandGenerationPassName) {
     return new (arena) x86::X86MemoryOperandGeneration(graph, codegen, stats);
+#endif
   }
   return nullptr;
 }
@@ -581,6 +592,7 @@ void OptimizingCompiler::RunArchOptimizations(InstructionSet instruction_set,
                                               HGraph* graph,
                                               CodeGenerator* codegen,
                                               PassObserver* pass_observer) const {
+  UNUSED(codegen);  // To avoid compilation error when compiling for svelte
   OptimizingCompilerStats* stats = compilation_stats_.get();
   ArenaAllocator* arena = graph->GetArena();
   switch (instruction_set) {
diff --git a/compiler/optimizing/register_allocator_graph_color.cc b/compiler/optimizing/register_allocator_graph_color.cc
index cfdb41ab62..a21595fe03 100644
--- a/compiler/optimizing/register_allocator_graph_color.cc
+++ b/compiler/optimizing/register_allocator_graph_color.cc
@@ -227,7 +227,8 @@ class InterferenceNode : public ArenaObject<kArenaAllocRegisterAllocator> {
         out_degree_(interval->HasRegister() ? std::numeric_limits<size_t>::max() : 0),
         alias_(this),
         spill_weight_(ComputeSpillWeight(interval, liveness)),
-        requires_color_(interval->RequiresRegister()) {
+        requires_color_(interval->RequiresRegister()),
+        needs_spill_slot_(false) {
     DCHECK(!interval->IsHighInterval()) << "Pair nodes should be represented by the low interval";
   }
 
@@ -342,6 +343,14 @@ class InterferenceNode : public ArenaObject<kArenaAllocRegisterAllocator> {
     return (IsPair() || other->IsPair()) ? 2 : 1;
   }
 
+  bool NeedsSpillSlot() const {
+    return needs_spill_slot_;
+  }
+
+  void SetNeedsSpillSlot() {
+    needs_spill_slot_ = true;
+  }
+
   // The current stage of this node, indicating which worklist it belongs to.
   NodeStage stage;
 
@@ -376,6 +385,8 @@ class InterferenceNode : public ArenaObject<kArenaAllocRegisterAllocator> {
 
   const bool requires_color_;
 
+  bool needs_spill_slot_;
+
   DISALLOW_COPY_AND_ASSIGN(InterferenceNode);
 };
 
@@ -549,10 +560,10 @@ RegisterAllocatorGraphColor::RegisterAllocatorGraphColor(ArenaAllocator* allocat
         safepoints_(allocator->Adapter(kArenaAllocRegisterAllocator)),
         physical_core_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)),
         physical_fp_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)),
-        int_spill_slot_counter_(0),
-        double_spill_slot_counter_(0),
-        float_spill_slot_counter_(0),
-        long_spill_slot_counter_(0),
+        num_int_spill_slots_(0),
+        num_double_spill_slots_(0),
+        num_float_spill_slots_(0),
+        num_long_spill_slots_(0),
         catch_phi_spill_slot_counter_(0),
         reserved_art_method_slots_(ComputeReservedArtMethodSlots(*codegen)),
         reserved_out_slots_(codegen->GetGraph()->GetMaximumNumberOfOutVRegs()),
@@ -653,6 +664,9 @@ void RegisterAllocatorGraphColor::AllocateRegisters() {
     }
    if (successful) {
+      // Assign spill slots.
+      AllocateSpillSlots(iteration.GetPrunableNodes());
+
       // Compute the maximum number of live registers across safepoints.
       // Notice that we do not count globally blocked registers, such as the stack pointer.
       if (safepoints.size() > 0) {
@@ -700,10 +714,10 @@ void RegisterAllocatorGraphColor::AllocateRegisters() {
       .Resolve(max_safepoint_live_core_regs_,
                max_safepoint_live_fp_regs_,
                reserved_art_method_slots_ + reserved_out_slots_,
-               int_spill_slot_counter_,
-               long_spill_slot_counter_,
-               float_spill_slot_counter_,
-               double_spill_slot_counter_,
+               num_int_spill_slots_,
+               num_long_spill_slots_,
+               num_float_spill_slots_,
+               num_double_spill_slots_,
                catch_phi_spill_slot_counter_,
                temp_intervals_);
 
@@ -743,10 +757,10 @@ bool RegisterAllocatorGraphColor::Validate(bool log_fatal_on_failure) {
     }
   }
 
-  size_t spill_slots = int_spill_slot_counter_
-                     + long_spill_slot_counter_
-                     + float_spill_slot_counter_
-                     + double_spill_slot_counter_
+  size_t spill_slots = num_int_spill_slots_
+                     + num_long_spill_slots_
+                     + num_float_spill_slots_
+                     + num_double_spill_slots_
                      + catch_phi_spill_slot_counter_;
   bool ok = ValidateIntervals(intervals,
                               spill_slots,
@@ -1910,7 +1924,7 @@ bool ColoringIteration::ColorInterferenceGraph() {
       // be colored, and that we should split.
     } else {
       // Spill.
-      register_allocator_->AllocateSpillSlotFor(interval);
+      node->SetNeedsSpillSlot();
     }
   }
 
@@ -1936,52 +1950,156 @@ size_t RegisterAllocatorGraphColor::ComputeMaxSafepointLiveRegisters(
   return max_safepoint_live_regs;
 }
 
-void RegisterAllocatorGraphColor::AllocateSpillSlotFor(LiveInterval* interval) {
-  LiveInterval* parent = interval->GetParent();
-  HInstruction* defined_by = parent->GetDefinedBy();
-  if (parent->HasSpillSlot()) {
-    // We already have a spill slot for this value that we can reuse.
-  } else if (defined_by->IsParameterValue()) {
-    // Parameters already have a stack slot.
-    parent->SetSpillSlot(codegen_->GetStackSlotOfParameter(defined_by->AsParameterValue()));
-  } else if (defined_by->IsCurrentMethod()) {
-    // The current method is always at spill slot 0.
-    parent->SetSpillSlot(0);
-  } else if (defined_by->IsConstant()) {
-    // Constants don't need a spill slot.
-  } else {
-    // Allocate a spill slot based on type.
-    size_t* spill_slot_counter;
-    switch (interval->GetType()) {
-      case Primitive::kPrimDouble:
-        spill_slot_counter = &double_spill_slot_counter_;
-        break;
-      case Primitive::kPrimLong:
-        spill_slot_counter = &long_spill_slot_counter_;
-        break;
-      case Primitive::kPrimFloat:
-        spill_slot_counter = &float_spill_slot_counter_;
-        break;
-      case Primitive::kPrimNot:
-      case Primitive::kPrimInt:
-      case Primitive::kPrimChar:
-      case Primitive::kPrimByte:
-      case Primitive::kPrimBoolean:
-      case Primitive::kPrimShort:
-        spill_slot_counter = &int_spill_slot_counter_;
-        break;
-      case Primitive::kPrimVoid:
-        LOG(FATAL) << "Unexpected type for interval " << interval->GetType();
-        UNREACHABLE();
+void RegisterAllocatorGraphColor::AllocateSpillSlots(const ArenaVector<InterferenceNode*>& nodes) {
+  // The register allocation resolver will organize the stack based on value type,
+  // so we assign stack slots for each value type separately.
+  ArenaVector<LiveInterval*> double_intervals(allocator_->Adapter(kArenaAllocRegisterAllocator));
+  ArenaVector<LiveInterval*> long_intervals(allocator_->Adapter(kArenaAllocRegisterAllocator));
+  ArenaVector<LiveInterval*> float_intervals(allocator_->Adapter(kArenaAllocRegisterAllocator));
+  ArenaVector<LiveInterval*> int_intervals(allocator_->Adapter(kArenaAllocRegisterAllocator));
+
+  // The set of parent intervals already handled.
+  ArenaSet<LiveInterval*> seen(allocator_->Adapter(kArenaAllocRegisterAllocator));
+
+  // Find nodes that need spill slots.
+  for (InterferenceNode* node : nodes) {
+    if (!node->NeedsSpillSlot()) {
+      continue;
     }
-    parent->SetSpillSlot(*spill_slot_counter);
-    *spill_slot_counter += parent->NeedsTwoSpillSlots() ? 2 : 1;
-    // TODO: Could color stack slots if we wanted to, even if
-    // it's just a trivial coloring. See the linear scan implementation,
-    // which simply reuses spill slots for values whose live intervals
-    // have already ended.
+    LiveInterval* parent = node->GetInterval()->GetParent();
+    if (seen.find(parent) != seen.end()) {
+      // We've already handled this interval.
+      // This can happen if multiple siblings of the same interval request a stack slot.
+      continue;
+    }
+    seen.insert(parent);
+
+    HInstruction* defined_by = parent->GetDefinedBy();
+    if (parent->HasSpillSlot()) {
+      // We already have a spill slot for this value that we can reuse.
+    } else if (defined_by->IsParameterValue()) {
+      // Parameters already have a stack slot.
+      parent->SetSpillSlot(codegen_->GetStackSlotOfParameter(defined_by->AsParameterValue()));
+    } else if (defined_by->IsCurrentMethod()) {
+      // The current method is always at stack slot 0.
+      parent->SetSpillSlot(0);
+    } else if (defined_by->IsConstant()) {
+      // Constants don't need a spill slot.
+    } else {
+      // We need to find a spill slot for this interval. Place it in the correct
+      // worklist to be processed later.
+      switch (node->GetInterval()->GetType()) {
+        case Primitive::kPrimDouble:
+          double_intervals.push_back(parent);
+          break;
+        case Primitive::kPrimLong:
+          long_intervals.push_back(parent);
+          break;
+        case Primitive::kPrimFloat:
+          float_intervals.push_back(parent);
+          break;
+        case Primitive::kPrimNot:
+        case Primitive::kPrimInt:
+        case Primitive::kPrimChar:
+        case Primitive::kPrimByte:
+        case Primitive::kPrimBoolean:
+        case Primitive::kPrimShort:
+          int_intervals.push_back(parent);
+          break;
+        case Primitive::kPrimVoid:
+          LOG(FATAL) << "Unexpected type for interval " << node->GetInterval()->GetType();
+          UNREACHABLE();
+      }
+    }
+  }
+
+  // Color spill slots for each value type.
+  ColorSpillSlots(&double_intervals, &num_double_spill_slots_);
+  ColorSpillSlots(&long_intervals, &num_long_spill_slots_);
+  ColorSpillSlots(&float_intervals, &num_float_spill_slots_);
+  ColorSpillSlots(&int_intervals, &num_int_spill_slots_);
+}
+
+void RegisterAllocatorGraphColor::ColorSpillSlots(ArenaVector<LiveInterval*>* intervals,
+                                                  size_t* num_stack_slots_used) {
+  // We cannot use the original interference graph here because spill slots are assigned to
+  // all of the siblings of an interval, whereas an interference node represents only a single
+  // sibling. So, we assign spill slots linear-scan-style by sorting all the interval endpoints
+  // by position, and assigning the lowest spill slot available when we encounter an interval
+  // beginning. We ignore lifetime holes for simplicity.
+  ArenaVector<std::tuple<size_t, bool, LiveInterval*>> interval_endpoints(
+      allocator_->Adapter(kArenaAllocRegisterAllocator));
+
+  for (auto it = intervals->begin(), e = intervals->end(); it != e; ++it) {
+    LiveInterval* parent_interval = *it;
+    DCHECK(parent_interval->IsParent());
+    DCHECK(!parent_interval->HasSpillSlot());
+    size_t start = parent_interval->GetStart();
+    size_t end = parent_interval->GetLastSibling()->GetEnd();
+    DCHECK_LT(start, end);
+    interval_endpoints.push_back(std::make_tuple(start, true, parent_interval));
+    interval_endpoints.push_back(std::make_tuple(end, false, parent_interval));
+  }
+
+  // Sort by position.
+  // We explicitly ignore the third entry of each tuple (the interval pointer) in order
+  // to maintain determinism.
+  std::sort(interval_endpoints.begin(), interval_endpoints.end(),
+            [] (const std::tuple<size_t, bool, LiveInterval*>& lhs,
+                const std::tuple<size_t, bool, LiveInterval*>& rhs) {
+    return std::tie(std::get<0>(lhs), std::get<1>(lhs))
+        < std::tie(std::get<0>(rhs), std::get<1>(rhs));
+  });
+
+  ArenaBitVector taken(allocator_, 0, true);
+  for (auto it = interval_endpoints.begin(), end = interval_endpoints.end(); it != end; ++it) {
+    // Extract information from the current tuple.
+    LiveInterval* parent_interval;
+    bool is_interval_beginning;
+    size_t position;
+    std::tie(position, is_interval_beginning, parent_interval) = *it;
+
+    bool needs_two_slots = parent_interval->NeedsTwoSpillSlots();
+
+    if (is_interval_beginning) {
+      DCHECK(!parent_interval->HasSpillSlot());
+      DCHECK_EQ(position, parent_interval->GetStart());
+
+      // Find a free stack slot.
+      size_t slot = 0;
+      for (; taken.IsBitSet(slot) || (needs_two_slots && taken.IsBitSet(slot + 1)); ++slot) {
+        // Skip taken slots.
+      }
+      parent_interval->SetSpillSlot(slot);
+
+      *num_stack_slots_used = std::max(*num_stack_slots_used,
                                       needs_two_slots ? slot + 1 : slot + 2);
+      if (needs_two_slots && *num_stack_slots_used % 2 != 0) {
+        // The parallel move resolver requires that there be an even number of spill slots
+        // allocated for pair value types.
+        ++(*num_stack_slots_used);
+      }
+
+      taken.SetBit(slot);
+      if (needs_two_slots) {
+        taken.SetBit(slot + 1);
+      }
+    } else {
+      DCHECK_EQ(position, parent_interval->GetLastSibling()->GetEnd());
+      DCHECK(parent_interval->HasSpillSlot());
+
+      // Free up the stack slot used by this interval.
+      size_t slot = parent_interval->GetSpillSlot();
+      DCHECK(taken.IsBitSet(slot));
+      DCHECK(!needs_two_slots || taken.IsBitSet(slot + 1));
+      taken.ClearBit(slot);
+      if (needs_two_slots) {
+        taken.ClearBit(slot + 1);
+      }
+    }
   }
+  DCHECK_EQ(taken.NumSetBits(), 0u);
 }
 
 }  // namespace art
diff --git a/compiler/optimizing/register_allocator_graph_color.h b/compiler/optimizing/register_allocator_graph_color.h
index 9dddcea685..ed12561d2c 100644
--- a/compiler/optimizing/register_allocator_graph_color.h
+++ b/compiler/optimizing/register_allocator_graph_color.h
@@ -144,9 +144,13 @@ class RegisterAllocatorGraphColor : public RegisterAllocator {
   // based on the outgoing interference edges of safepoint nodes.
   size_t ComputeMaxSafepointLiveRegisters(const ArenaVector<InterferenceNode*>& safepoints);
 
-  // If necessary, add the given interval to the list of spilled intervals,
-  // and make sure it's ready to be spilled to the stack.
-  void AllocateSpillSlotFor(LiveInterval* interval);
+  // Assigns stack slots to a list of intervals, ensuring that interfering intervals are not
+  // assigned the same stack slot.
+  void ColorSpillSlots(ArenaVector<LiveInterval*>* nodes,
+                       size_t* num_stack_slots_used);
+
+  // Provide stack slots to nodes that need them.
+  void AllocateSpillSlots(const ArenaVector<InterferenceNode*>& nodes);
 
   // Whether iterative move coalescing should be performed. Iterative move coalescing
   // improves code quality, but increases compile time.
@@ -170,10 +174,10 @@ class RegisterAllocatorGraphColor : public RegisterAllocator {
   ArenaVector<InterferenceNode*> physical_fp_nodes_;
 
   // Allocated stack slot counters.
-  size_t int_spill_slot_counter_;
-  size_t double_spill_slot_counter_;
-  size_t float_spill_slot_counter_;
-  size_t long_spill_slot_counter_;
+  size_t num_int_spill_slots_;
+  size_t num_double_spill_slots_;
+  size_t num_float_spill_slots_;
+  size_t num_long_spill_slots_;
   size_t catch_phi_spill_slot_counter_;
 
   // Number of stack slots needed for the pointer to the current method.
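
To make the new spill-slot assignment easier to follow, here is a minimal standalone sketch of the same idea: intervals that were marked via SetNeedsSpillSlot() are bucketed by value type, and each bucket is then colored by sorting interval endpoints and handing out the lowest free slot at each interval start. Everything below is an assumption made for illustration and is not ART code: the Interval struct, the ColorSpillSlotsSketch function, and the std::vector<bool> standing in for ArenaBitVector are all simplified stand-ins, and the slot-count bookkeeping is written in the most straightforward way rather than copying the expressions from the change above.

// Minimal sketch (not ART code) of first-fit spill-slot coloring over sorted
// interval endpoints, under the assumptions stated in the paragraph above.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <tuple>
#include <vector>

struct Interval {
  size_t start;          // First position at which the value is live.
  size_t end;            // Position at which the value's last sibling dies.
  bool needs_two_slots;  // True for 64-bit (long/double) values.
  size_t slot;           // Assigned spill slot (filled in below).
};

void ColorSpillSlotsSketch(std::vector<Interval*>* intervals, size_t* num_slots_used) {
  // Record each interval twice: at its start (true) and at its end (false).
  std::vector<std::tuple<size_t, bool, Interval*>> endpoints;
  for (Interval* interval : *intervals) {
    assert(interval->start < interval->end);
    endpoints.emplace_back(interval->start, true, interval);
    endpoints.emplace_back(interval->end, false, interval);
  }

  // Sort by position, then by endpoint kind; the pointer is deliberately
  // ignored so the order never depends on allocation addresses (determinism).
  std::sort(endpoints.begin(), endpoints.end(),
            [](const std::tuple<size_t, bool, Interval*>& lhs,
               const std::tuple<size_t, bool, Interval*>& rhs) {
              return std::tie(std::get<0>(lhs), std::get<1>(lhs)) <
                     std::tie(std::get<0>(rhs), std::get<1>(rhs));
            });

  std::vector<bool> taken;  // taken[i] is true while stack slot i is in use.
  auto is_free = [&taken](size_t s) { return s >= taken.size() || !taken[s]; };

  for (const auto& endpoint : endpoints) {
    Interval* interval = std::get<2>(endpoint);
    bool is_beginning = std::get<1>(endpoint);
    size_t width = interval->needs_two_slots ? 2u : 1u;

    if (is_beginning) {
      // First fit: take the lowest slot (or pair of slots) not currently in use.
      size_t slot = 0;
      while (!is_free(slot) || (width == 2 && !is_free(slot + 1))) {
        ++slot;
      }
      if (taken.size() < slot + width) {
        taken.resize(slot + width, false);
      }
      for (size_t i = 0; i < width; ++i) {
        taken[slot + i] = true;
      }
      interval->slot = slot;
      *num_slots_used = std::max(*num_slots_used, slot + width);
      if (width == 2 && *num_slots_used % 2 != 0) {
        // Keep the count even for pair types, mirroring the parallel move
        // resolver requirement noted in the change above.
        ++(*num_slots_used);
      }
    } else {
      // The value is dead past this point; its slots become reusable.
      for (size_t i = 0; i < width; ++i) {
        taken[interval->slot + i] = false;
      }
    }
  }
}

int main() {
  // a and b overlap, so they get distinct slots; c starts after a has ended
  // and reuses slot 0.
  Interval a{0, 10, false, 0};
  Interval b{2, 8, false, 0};
  Interval c{12, 20, false, 0};
  std::vector<Interval*> ints{&a, &b, &c};
  size_t used = 0;
  ColorSpillSlotsSketch(&ints, &used);
  std::cout << a.slot << " " << b.slot << " " << c.slot << " used=" << used << "\n";
  // Prints: 0 1 0 used=2
  return 0;
}

The point of the sketch is the design choice visible in the diff: instead of bumping a per-type counter for every spilled value, slots are reused once an interval's last sibling ends, which is exactly the reuse strategy the removed TODO in AllocateSpillSlotFor suggested borrowing from the linear scan allocator.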