Diffstat (limited to 'compiler/optimizing')
19 files changed, 1655 insertions, 166 deletions
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index b39a0e43fa..4629c54a17 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -16,6 +16,7 @@ #include "code_generator_arm64.h" +#include "arch/arm64/asm_support_arm64.h" #include "arch/arm64/instruction_set_features_arm64.h" #include "art_method.h" #include "code_generator_utils.h" @@ -25,6 +26,7 @@ #include "gc/accounting/card_table.h" #include "intrinsics.h" #include "intrinsics_arm64.h" +#include "linker/arm64/relative_patcher_arm64.h" #include "mirror/array-inl.h" #include "mirror/class-inl.h" #include "offsets.h" @@ -81,6 +83,26 @@ static constexpr int kCurrentMethodStackOffset = 0; // generates less code/data with a small num_entries. static constexpr uint32_t kPackedSwitchCompareJumpThreshold = 7; +// Reference load (except object array loads) is using LDR Wt, [Xn, #offset] which can handle +// offset < 16KiB. For offsets >= 16KiB, the load shall be emitted as two or more instructions. +// For the Baker read barrier implementation using link-generated thunks we need to split +// the offset explicitly. +constexpr uint32_t kReferenceLoadMinFarOffset = 16 * KB; + +// Flags controlling the use of link-time generated thunks for Baker read barriers. +constexpr bool kBakerReadBarrierLinkTimeThunksEnableForFields = true; +constexpr bool kBakerReadBarrierLinkTimeThunksEnableForGcRoots = true; + +// Some instructions have special requirements for a temporary, for example +// LoadClass/kBssEntry and LoadString/kBssEntry for Baker read barrier require +// temp that's not an R0 (to avoid an extra move) and Baker read barrier field +// loads with large offsets need a fixed register to limit the number of link-time +// thunks we generate. For these and similar cases, we want to reserve a specific +// register that's neither callee-save nor an argument register. We choose x15. +inline Location FixedTempLocation() { + return Location::RegisterLocation(x15.GetCode()); +} + inline Condition ARM64Condition(IfCondition cond) { switch (cond) { case kCondEQ: return eq; @@ -298,23 +320,22 @@ class LoadClassSlowPathARM64 : public SlowPathCodeARM64 { constexpr bool call_saves_everything_except_r0_ip0 = (!kUseReadBarrier || kUseBakerReadBarrier); CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); - // For HLoadClass/kBssEntry/kSaveEverything, make sure we preserve the page address of - // the entry which is in a scratch register. Make sure it's not used for saving/restoring - // registers. Exclude the scratch register also for non-Baker read barrier for simplicity. + InvokeRuntimeCallingConvention calling_convention; + // For HLoadClass/kBssEntry/kSaveEverything, the page address of the entry is in a temp + // register, make sure it's not clobbered by the call or by saving/restoring registers. DCHECK_EQ(instruction_->IsLoadClass(), cls_ == instruction_); bool is_load_class_bss_entry = (cls_ == instruction_) && (cls_->GetLoadKind() == HLoadClass::LoadKind::kBssEntry); - UseScratchRegisterScope temps(arm64_codegen->GetVIXLAssembler()); if (is_load_class_bss_entry) { - // This temp is a scratch register. 
DCHECK(bss_entry_temp_.IsValid()); - temps.Exclude(bss_entry_temp_); + DCHECK(!bss_entry_temp_.Is(calling_convention.GetRegisterAt(0))); + DCHECK( + !UseScratchRegisterScope(arm64_codegen->GetVIXLAssembler()).IsAvailable(bss_entry_temp_)); } __ Bind(GetEntryLabel()); SaveLiveRegisters(codegen, locations); - InvokeRuntimeCallingConvention calling_convention; dex::TypeIndex type_index = cls_->GetTypeIndex(); __ Mov(calling_convention.GetRegisterAt(0).W(), type_index.index_); QuickEntrypointEnum entrypoint = do_clinit_ ? kQuickInitializeStaticStorage @@ -387,14 +408,15 @@ class LoadStringSlowPathARM64 : public SlowPathCodeARM64 { DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(locations->Out().reg())); CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); - // temp_ is a scratch register. Make sure it's not used for saving/restoring registers. - UseScratchRegisterScope temps(arm64_codegen->GetVIXLAssembler()); - temps.Exclude(temp_); + InvokeRuntimeCallingConvention calling_convention; + // Make sure `temp_` is not clobbered by the call or by saving/restoring registers. + DCHECK(temp_.IsValid()); + DCHECK(!temp_.Is(calling_convention.GetRegisterAt(0))); + DCHECK(!UseScratchRegisterScope(arm64_codegen->GetVIXLAssembler()).IsAvailable(temp_)); __ Bind(GetEntryLabel()); SaveLiveRegisters(codegen, locations); - InvokeRuntimeCallingConvention calling_convention; const dex::StringIndex string_index = instruction_->AsLoadString()->GetStringIndex(); __ Mov(calling_convention.GetRegisterAt(0).W(), string_index.index_); arm64_codegen->InvokeRuntime(kQuickResolveString, instruction_, instruction_->GetDexPc(), this); @@ -1416,6 +1438,7 @@ CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph, graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + baker_read_barrier_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(StringReferenceValueComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_class_patches_(TypeReferenceValueComparator(), @@ -2236,7 +2259,8 @@ void LocationsBuilderARM64::HandleBinaryOp(HBinaryOperation* instr) { } } -void LocationsBuilderARM64::HandleFieldGet(HInstruction* instruction) { +void LocationsBuilderARM64::HandleFieldGet(HInstruction* instruction, + const FieldInfo& field_info) { DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet()); bool object_field_get_with_read_barrier = @@ -2250,7 +2274,17 @@ void LocationsBuilderARM64::HandleFieldGet(HInstruction* instruction) { locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. // We need a temporary register for the read barrier marking slow // path in CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier. - locations->AddTemp(Location::RequiresRegister()); + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !Runtime::Current()->UseJitCompilation() && + !field_info.IsVolatile()) { + // If link-time thunks for the Baker read barrier are enabled, for AOT + // non-volatile loads we need a temporary only if the offset is too big. 
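The temp policy above hinges on the same 16KiB threshold used later in CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier, where a far offset is split into an aligned base adjustment (added into the fixed temp) plus a small remainder that still fits the LDR immediate. A minimal standalone sketch of that split, using only the constant defined in this patch:

    #include <cassert>
    #include <cstdint>

    constexpr uint32_t kReferenceLoadMinFarOffset = 16 * 1024;

    struct SplitOffset {
      uint32_t base_adjustment;  // Added to the holder register into the fixed temp (x15).
      uint32_t remainder;        // Small enough to be encoded directly in the LDR.
    };

    // Mirrors the `offset & ~(kReferenceLoadMinFarOffset - 1u)` / `offset &= ...`
    // arithmetic in GenerateFieldLoadWithBakerReadBarrier.
    SplitOffset SplitFarOffset(uint32_t offset) {
      static_assert((kReferenceLoadMinFarOffset & (kReferenceLoadMinFarOffset - 1u)) == 0u,
                    "Expecting a power of 2.");
      return {offset & ~(kReferenceLoadMinFarOffset - 1u),
              offset & (kReferenceLoadMinFarOffset - 1u)};
    }

    int main() {
      SplitOffset s = SplitFarOffset(20000u);  // 20000 = 16384 + 3616.
      assert(s.base_adjustment == 16384u);
      assert(s.remainder == 3616u);
      return 0;
    }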
+ if (field_info.GetFieldOffset().Uint32Value() >= kReferenceLoadMinFarOffset) { + locations->AddTemp(FixedTempLocation()); + } + } else { + locations->AddTemp(Location::RequiresRegister()); + } } locations->SetInAt(0, Location::RequiresRegister()); if (Primitive::IsFloatingPointType(instruction->GetType())) { @@ -2279,7 +2313,8 @@ void InstructionCodeGeneratorARM64::HandleFieldGet(HInstruction* instruction, // Object FieldGet with Baker's read barrier case. // /* HeapReference<Object> */ out = *(base + offset) Register base = RegisterFrom(base_loc, Primitive::kPrimNot); - Register temp = WRegisterFrom(locations->GetTemp(0)); + Location maybe_temp = + (locations->GetTempCount() != 0) ? locations->GetTemp(0) : Location::NoLocation(); // Note that potential implicit null checks are handled in this // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier call. codegen_->GenerateFieldLoadWithBakerReadBarrier( @@ -2287,7 +2322,7 @@ void InstructionCodeGeneratorARM64::HandleFieldGet(HInstruction* instruction, out, base, offset, - temp, + maybe_temp, /* needs_null_check */ true, field_info.IsVolatile()); } else { @@ -2672,7 +2707,21 @@ void LocationsBuilderARM64::VisitArrayGet(HArrayGet* instruction) { locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. // We need a temporary register for the read barrier marking slow // path in CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier. - locations->AddTemp(Location::RequiresRegister()); + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !Runtime::Current()->UseJitCompilation() && + instruction->GetIndex()->IsConstant()) { + // Array loads with constant index are treated as field loads. + // If link-time thunks for the Baker read barrier are enabled, for AOT + // constant index loads we need a temporary only if the offset is too big. + uint32_t offset = CodeGenerator::GetArrayDataOffset(instruction); + uint32_t index = instruction->GetIndex()->AsIntConstant()->GetValue(); + offset += index << Primitive::ComponentSizeShift(Primitive::kPrimNot); + if (offset >= kReferenceLoadMinFarOffset) { + locations->AddTemp(FixedTempLocation()); + } + } else { + locations->AddTemp(Location::RequiresRegister()); + } } locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); @@ -2708,11 +2757,25 @@ void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) { if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { // Object ArrayGet with Baker's read barrier case. - Register temp = WRegisterFrom(locations->GetTemp(0)); // Note that a potential implicit null check is handled in the // CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier call. - codegen_->GenerateArrayLoadWithBakerReadBarrier( - instruction, out, obj.W(), offset, index, temp, /* needs_null_check */ true); + if (index.IsConstant()) { + // Array load with a constant index can be treated as a field load. + offset += Int64ConstantFrom(index) << Primitive::ComponentSizeShift(type); + Location maybe_temp = + (locations->GetTempCount() != 0) ? 
locations->GetTemp(0) : Location::NoLocation(); + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out, + obj.W(), + offset, + maybe_temp, + /* needs_null_check */ true, + /* use_load_acquire */ false); + } else { + Register temp = WRegisterFrom(locations->GetTemp(0)); + codegen_->GenerateArrayLoadWithBakerReadBarrier( + instruction, out, obj.W(), offset, index, temp, /* needs_null_check */ true); + } } else { // General case. MemOperand source = HeapOperand(obj); @@ -2989,6 +3052,11 @@ void InstructionCodeGeneratorARM64::VisitArraySet(HArraySet* instruction) { if (!index.IsConstant()) { __ Add(temp, array, offset); + } else { + // We no longer need the `temp` here so release it as the store below may + // need a scratch register (if the constant index makes the offset too large) + // and the poisoned `source` could be using the other scratch register. + temps.Release(temp); } { // Ensure that between store and MaybeRecordImplicitNullCheck there are no pools emitted. @@ -3742,7 +3810,7 @@ void CodeGeneratorARM64::GenerateNop() { } void LocationsBuilderARM64::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { - HandleFieldGet(instruction); + HandleFieldGet(instruction, instruction->GetFieldInfo()); } void InstructionCodeGeneratorARM64::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { @@ -4544,6 +4612,11 @@ vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeDexCacheArrayPatch( return NewPcRelativePatch(dex_file, element_offset, adrp_label, &pc_relative_dex_cache_patches_); } +vixl::aarch64::Label* CodeGeneratorARM64::NewBakerReadBarrierPatch(uint32_t custom_data) { + baker_read_barrier_patches_.emplace_back(custom_data); + return &baker_read_barrier_patches_.back().label; +} + vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativePatch( const DexFile& dex_file, uint32_t offset_or_index, @@ -4642,7 +4715,8 @@ void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patc pc_relative_string_patches_.size() + boot_image_type_patches_.size() + pc_relative_type_patches_.size() + - type_bss_entry_patches_.size(); + type_bss_entry_patches_.size() + + baker_read_barrier_patches_.size(); linker_patches->reserve(size); for (const PcRelativePatchInfo& info : pc_relative_dex_cache_patches_) { linker_patches->push_back(LinkerPatch::DexCacheArrayPatch(info.label.GetLocation(), @@ -4676,6 +4750,10 @@ void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patc target_type.dex_file, target_type.type_index.index_)); } + for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) { + linker_patches->push_back(LinkerPatch::BakerReadBarrierBranchPatch(info.label.GetLocation(), + info.custom_data)); + } DCHECK_EQ(size, linker_patches->size()); } @@ -4788,8 +4866,7 @@ void LocationsBuilderARM64::VisitLoadClass(HLoadClass* cls) { if (cls->GetLoadKind() == HLoadClass::LoadKind::kBssEntry) { if (!kUseReadBarrier || kUseBakerReadBarrier) { // Rely on the type resolution or initialization and marking to save everything we need. - // Note that IP0 may be clobbered by saving/restoring the live register (only one thanks - // to the custom calling convention) or by marking, so we shall use IP1. 
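For the VisitArrayGet changes above, the need for a fixed temp reduces to the same threshold applied to the folded offset, that is the array data offset plus index times the reference size. A small sketch of that arithmetic (the data offset value below is a placeholder for illustration only; the real value comes from CodeGenerator::GetArrayDataOffset):

    #include <cstdint>
    #include <cstdio>
    #include <initializer_list>

    constexpr uint32_t kReferenceLoadMinFarOffset = 16 * 1024;
    constexpr uint32_t kRefComponentShift = 2;  // 4-byte (compressed) heap references.

    uint32_t FoldedArrayOffset(uint32_t data_offset, uint32_t index) {
      return data_offset + (index << kRefComponentShift);
    }

    int main() {
      const uint32_t data_offset = 12u;  // Hypothetical value, for illustration only.
      for (uint32_t index : {0u, 1000u, 4093u, 8000u}) {
        uint32_t offset = FoldedArrayOffset(data_offset, index);
        std::printf("index %u -> offset %u -> fixed temp needed: %s\n",
                    index, offset, offset >= kReferenceLoadMinFarOffset ? "yes" : "no");
      }
      return 0;
    }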
+ locations->AddTemp(FixedTempLocation()); RegisterSet caller_saves = RegisterSet::Empty(); InvokeRuntimeCallingConvention calling_convention; caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0).GetCode())); @@ -4866,11 +4943,7 @@ void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) NO_THREAD_SA // Add ADRP with its PC-relative Class .bss entry patch. const DexFile& dex_file = cls->GetDexFile(); dex::TypeIndex type_index = cls->GetTypeIndex(); - // We can go to slow path even with non-zero reference and in that case marking - // can clobber IP0, so we need to use IP1 which shall be preserved. - bss_entry_temp = ip1; - UseScratchRegisterScope temps(codegen_->GetVIXLAssembler()); - temps.Exclude(bss_entry_temp); + bss_entry_temp = XRegisterFrom(cls->GetLocations()->GetTemp(0)); bss_entry_adrp_label = codegen_->NewBssEntryTypePatch(dex_file, type_index); codegen_->EmitAdrpPlaceholder(bss_entry_adrp_label, bss_entry_temp); // Add LDR with its PC-relative Class patch. @@ -4977,8 +5050,7 @@ void LocationsBuilderARM64::VisitLoadString(HLoadString* load) { if (load->GetLoadKind() == HLoadString::LoadKind::kBssEntry) { if (!kUseReadBarrier || kUseBakerReadBarrier) { // Rely on the pResolveString and marking to save everything we need. - // Note that IP0 may be clobbered by saving/restoring the live register (only one thanks - // to the custom calling convention) or by marking, so we shall use IP1. + locations->AddTemp(FixedTempLocation()); RegisterSet caller_saves = RegisterSet::Empty(); InvokeRuntimeCallingConvention calling_convention; caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0).GetCode())); @@ -5029,11 +5101,7 @@ void InstructionCodeGeneratorARM64::VisitLoadString(HLoadString* load) NO_THREAD const DexFile& dex_file = load->GetDexFile(); const dex::StringIndex string_index = load->GetStringIndex(); DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); - // We could use IP0 as the marking shall not clobber IP0 if the reference is null and - // that's when we need the slow path. But let's not rely on such details and use IP1. - Register temp = ip1; - UseScratchRegisterScope temps(codegen_->GetVIXLAssembler()); - temps.Exclude(temp); + Register temp = XRegisterFrom(load->GetLocations()->GetTemp(0)); vixl::aarch64::Label* adrp_label = codegen_->NewPcRelativeStringPatch(dex_file, string_index); codegen_->EmitAdrpPlaceholder(adrp_label, temp); // Add LDR with its PC-relative String patch. @@ -5468,7 +5536,7 @@ void InstructionCodeGeneratorARM64::VisitSub(HSub* instruction) { } void LocationsBuilderARM64::VisitStaticFieldGet(HStaticFieldGet* instruction) { - HandleFieldGet(instruction); + HandleFieldGet(instruction, instruction->GetFieldInfo()); } void InstructionCodeGeneratorARM64::VisitStaticFieldGet(HStaticFieldGet* instruction) { @@ -5777,7 +5845,6 @@ void InstructionCodeGeneratorARM64::GenerateReferenceLoadOneRegister( Register out_reg = RegisterFrom(out, type); if (read_barrier_option == kWithReadBarrier) { CHECK(kEmitCompilerReadBarrier); - Register temp_reg = RegisterFrom(maybe_temp, type); if (kUseBakerReadBarrier) { // Load with fast path based Baker's read barrier. 
// /* HeapReference<Object> */ out = *(out + offset) @@ -5785,7 +5852,7 @@ void InstructionCodeGeneratorARM64::GenerateReferenceLoadOneRegister( out, out_reg, offset, - temp_reg, + maybe_temp, /* needs_null_check */ false, /* use_load_acquire */ false); } else { @@ -5793,6 +5860,7 @@ void InstructionCodeGeneratorARM64::GenerateReferenceLoadOneRegister( // Save the value of `out` into `maybe_temp` before overwriting it // in the following move operation, as we will need it for the // read barrier below. + Register temp_reg = RegisterFrom(maybe_temp, type); __ Mov(temp_reg, out_reg); // /* HeapReference<Object> */ out = *(out + offset) __ Ldr(out_reg, HeapOperand(out_reg, offset)); @@ -5820,13 +5888,12 @@ void InstructionCodeGeneratorARM64::GenerateReferenceLoadTwoRegisters( CHECK(kEmitCompilerReadBarrier); if (kUseBakerReadBarrier) { // Load with fast path based Baker's read barrier. - Register temp_reg = RegisterFrom(maybe_temp, type); // /* HeapReference<Object> */ out = *(obj + offset) codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, out, obj_reg, offset, - temp_reg, + maybe_temp, /* needs_null_check */ false, /* use_load_acquire */ false); } else { @@ -5857,52 +5924,97 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad( if (kUseBakerReadBarrier) { // Fast path implementation of art::ReadBarrier::BarrierForRoot when // Baker's read barrier are used. - // - // Note that we do not actually check the value of - // `GetIsGcMarking()` to decide whether to mark the loaded GC - // root or not. Instead, we load into `temp` the read barrier - // mark entry point corresponding to register `root`. If `temp` - // is null, it means that `GetIsGcMarking()` is false, and vice - // versa. - // - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. - // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() - // // Slow path. - // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. - // } - - // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. - Register temp = lr; - SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64( - instruction, root, /* entrypoint */ LocationFrom(temp)); - codegen_->AddSlowPath(slow_path); - - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(root.reg()); - // Loading the entrypoint does not require a load acquire since it is only changed when - // threads are suspended or running a checkpoint. - __ Ldr(temp, MemOperand(tr, entry_point_offset)); + if (kBakerReadBarrierLinkTimeThunksEnableForGcRoots && + !Runtime::Current()->UseJitCompilation()) { + // Note that we do not actually check the value of `GetIsGcMarking()` + // to decide whether to mark the loaded GC root or not. Instead, we + // load into `temp` the read barrier mark introspection entrypoint. + // If `temp` is null, it means that `GetIsGcMarking()` is false, and + // vice versa. + // + // We use link-time generated thunks for the slow path. That thunk + // checks the reference and jumps to the entrypoint if needed. + // + // temp = Thread::Current()->pReadBarrierMarkIntrospection + // lr = &return_address; + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. 
+ // if (temp != nullptr) { + // goto gc_root_thunk<root_reg>(lr) + // } + // return_address: - // /* GcRoot<mirror::Object> */ root = *(obj + offset) - if (fixup_label == nullptr) { - __ Ldr(root_reg, MemOperand(obj, offset)); + UseScratchRegisterScope temps(GetVIXLAssembler()); + DCHECK(temps.IsAvailable(ip0)); + DCHECK(temps.IsAvailable(ip1)); + temps.Exclude(ip0, ip1); + uint32_t custom_data = + linker::Arm64RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg.GetCode()); + vixl::aarch64::Label* cbnz_label = codegen_->NewBakerReadBarrierPatch(custom_data); + + // ip1 = Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection. + DCHECK_EQ(ip0.GetCode(), 16u); + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ip0.GetCode()); + __ Ldr(ip1, MemOperand(tr, entry_point_offset)); + EmissionCheckScope guard(GetVIXLAssembler(), 3 * vixl::aarch64::kInstructionSize); + vixl::aarch64::Label return_address; + __ adr(lr, &return_address); + if (fixup_label != nullptr) { + __ Bind(fixup_label); + } + static_assert(BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET == -8, + "GC root LDR must be 2 instruction (8B) before the return address label."); + __ ldr(root_reg, MemOperand(obj.X(), offset)); + __ Bind(cbnz_label); + __ cbnz(ip1, static_cast<int64_t>(0)); // Placeholder, patched at link-time. + __ Bind(&return_address); } else { - codegen_->EmitLdrOffsetPlaceholder(fixup_label, root_reg, obj); + // Note that we do not actually check the value of + // `GetIsGcMarking()` to decide whether to mark the loaded GC + // root or not. Instead, we load into `temp` the read barrier + // mark entry point corresponding to register `root`. If `temp` + // is null, it means that `GetIsGcMarking()` is false, and vice + // versa. + // + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. + // } + + // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. + Register temp = lr; + SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64( + instruction, root, /* entrypoint */ LocationFrom(temp)); + codegen_->AddSlowPath(slow_path); + + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(root.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. 
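The comments above describe why the generated code never reads GetIsGcMarking() directly: the per-register mark entrypoint slot is null while the GC is not marking, so a single load serves as both the flag and the call target. A compilable sketch of that control flow with stand-in types (FakeThread, Obj and the function names below are illustrative, not ART declarations):

    struct Obj {};

    using MarkFn = Obj* (*)(Obj*);

    struct FakeThread {
      // Null while the GC is not marking; set to the mark routine while it is.
      MarkFn read_barrier_mark_entrypoint = nullptr;
    };

    Obj* LoadGcRoot(FakeThread* self, Obj** root_slot) {
      MarkFn mark = self->read_barrier_mark_entrypoint;  // One load, doubles as the marking flag.
      Obj* root = *root_slot;                            // Original reference load.
      if (mark != nullptr) {                             // <=> Thread::Current()->GetIsGcMarking().
        root = mark(root);                               // Slow path: root = ReadBarrier::Mark(root).
      }
      return root;
    }

    static Obj g_obj;
    static Obj* MarkForTest(Obj* ref) { return ref; }

    int main() {
      FakeThread thread;
      Obj* slot = &g_obj;
      LoadGcRoot(&thread, &slot);                        // Not marking: no indirect call.
      thread.read_barrier_mark_entrypoint = MarkForTest;
      LoadGcRoot(&thread, &slot);                        // Marking: the root goes through the mark routine.
      return 0;
    }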
+ __ Ldr(temp, MemOperand(tr, entry_point_offset)); + + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + if (fixup_label == nullptr) { + __ Ldr(root_reg, MemOperand(obj, offset)); + } else { + codegen_->EmitLdrOffsetPlaceholder(fixup_label, root_reg, obj); + } + static_assert( + sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), + "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " + "have different sizes."); + static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::CompressedReference<mirror::Object> and int32_t " + "have different sizes."); + + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ Cbnz(temp, slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); } - static_assert( - sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), - "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " - "have different sizes."); - static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), - "art::mirror::CompressedReference<mirror::Object> and int32_t " - "have different sizes."); - - // The entrypoint is null when the GC is not marking, this prevents one load compared to - // checking GetIsGcMarking. - __ Cbnz(temp, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); } else { // GC root loaded through a slow path for read barriers other // than Baker's. @@ -5932,13 +6044,80 @@ void CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier(HInstruction* ins Location ref, Register obj, uint32_t offset, - Register temp, + Location maybe_temp, bool needs_null_check, bool use_load_acquire) { DCHECK(kEmitCompilerReadBarrier); DCHECK(kUseBakerReadBarrier); + if (kBakerReadBarrierLinkTimeThunksEnableForFields && + !use_load_acquire && + !Runtime::Current()->UseJitCompilation()) { + // Note that we do not actually check the value of `GetIsGcMarking()` + // to decide whether to mark the loaded GC root or not. Instead, we + // load into `temp` the read barrier mark introspection entrypoint. + // If `temp` is null, it means that `GetIsGcMarking()` is false, and + // vice versa. + // + // We use link-time generated thunks for the slow path. That thunk checks + // the holder and jumps to the entrypoint if needed. If the holder is not + // gray, it creates a fake dependency and returns to the LDR instruction. + // + // temp = Thread::Current()->pReadBarrierMarkIntrospection + // lr = &return_address; + // if (temp != nullptr) { + // goto field_thunk<holder_reg, base_reg>(lr) + // } + // not_gray_return_address: + // // Original reference load. If the offset is too large to fit + // // into LDR, we use an adjusted base register here. 
+ // GcRoot<mirror::Object> root = *(obj+offset); + // gray_return_address: + + DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>)); + Register base = obj; + if (offset >= kReferenceLoadMinFarOffset) { + DCHECK(maybe_temp.IsRegister()); + base = WRegisterFrom(maybe_temp); + static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2."); + __ Add(base, obj, Operand(offset & ~(kReferenceLoadMinFarOffset - 1u))); + offset &= (kReferenceLoadMinFarOffset - 1u); + } + UseScratchRegisterScope temps(GetVIXLAssembler()); + DCHECK(temps.IsAvailable(ip0)); + DCHECK(temps.IsAvailable(ip1)); + temps.Exclude(ip0, ip1); + uint32_t custom_data = linker::Arm64RelativePatcher::EncodeBakerReadBarrierFieldData( + base.GetCode(), + obj.GetCode()); + vixl::aarch64::Label* cbnz_label = NewBakerReadBarrierPatch(custom_data); + + // ip1 = Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection. + DCHECK_EQ(ip0.GetCode(), 16u); + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ip0.GetCode()); + __ Ldr(ip1, MemOperand(tr, entry_point_offset)); + EmissionCheckScope guard(GetVIXLAssembler(), + (kPoisonHeapReferences ? 4u : 3u) * vixl::aarch64::kInstructionSize); + vixl::aarch64::Label return_address; + __ adr(lr, &return_address); + __ Bind(cbnz_label); + __ cbnz(ip1, static_cast<int64_t>(0)); // Placeholder, patched at link-time. + static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4), + "Field LDR must be 1 instruction (4B) before the return address label; " + " 2 instructions (8B) for heap poisoning."); + Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot); + __ ldr(ref_reg, MemOperand(base.X(), offset)); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } + GetAssembler()->MaybeUnpoisonHeapReference(ref_reg); + __ Bind(&return_address); + return; + } + // /* HeapReference<Object> */ ref = *(obj + offset) + Register temp = WRegisterFrom(maybe_temp); Location no_index = Location::NoLocation(); size_t no_scale_factor = 0u; GenerateReferenceLoadWithBakerReadBarrier(instruction, diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 869aad2942..58feea2423 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -351,7 +351,7 @@ class LocationsBuilderARM64 : public HGraphVisitor { private: void HandleBinaryOp(HBinaryOperation* instr); void HandleFieldSet(HInstruction* instruction); - void HandleFieldGet(HInstruction* instruction); + void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); void HandleInvoke(HInvoke* instr); void HandleCondition(HCondition* instruction); void HandleShift(HBinaryOperation* instr); @@ -579,6 +579,10 @@ class CodeGeneratorARM64 : public CodeGenerator { uint32_t element_offset, vixl::aarch64::Label* adrp_label = nullptr); + // Add a new baker read barrier patch and return the label to be bound + // before the CBNZ instruction. 
+ vixl::aarch64::Label* NewBakerReadBarrierPatch(uint32_t custom_data); + vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageStringLiteral( const DexFile& dex_file, dex::StringIndex string_index); @@ -610,7 +614,7 @@ class CodeGeneratorARM64 : public CodeGenerator { Location ref, vixl::aarch64::Register obj, uint32_t offset, - vixl::aarch64::Register temp, + Location maybe_temp, bool needs_null_check, bool use_load_acquire); // Fast path implementation of ReadBarrier::Barrier for a heap @@ -738,6 +742,13 @@ class CodeGeneratorARM64 : public CodeGenerator { vixl::aarch64::Label* pc_insn_label; }; + struct BakerReadBarrierPatchInfo { + explicit BakerReadBarrierPatchInfo(uint32_t data) : label(), custom_data(data) { } + + vixl::aarch64::Label label; + uint32_t custom_data; + }; + vixl::aarch64::Label* NewPcRelativePatch(const DexFile& dex_file, uint32_t offset_or_index, vixl::aarch64::Label* adrp_label, @@ -777,6 +788,8 @@ class CodeGeneratorARM64 : public CodeGenerator { ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; + // Baker read barrier patch info. + ArenaDeque<BakerReadBarrierPatchInfo> baker_read_barrier_patches_; // Patches for string literals in JIT compiled code. StringToLiteralMap jit_string_patches_; diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index cce412b314..b6678b03ef 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -3634,8 +3634,8 @@ void InstructionCodeGeneratorARMVIXL::VisitTypeConversion(HTypeConversion* conve } else { DCHECK(in.IsConstant()); DCHECK(in.GetConstant()->IsLongConstant()); - int32_t value = Int32ConstantFrom(in); - __ Mov(OutputRegister(conversion), value); + int64_t value = in.GetConstant()->AsLongConstant()->GetValue(); + __ Mov(OutputRegister(conversion), static_cast<int32_t>(value)); } break; diff --git a/compiler/optimizing/code_generator_vector_arm.cc b/compiler/optimizing/code_generator_vector_arm.cc index e7f7b3019c..6e82123e56 100644 --- a/compiler/optimizing/code_generator_vector_arm.cc +++ b/compiler/optimizing/code_generator_vector_arm.cc @@ -124,6 +124,14 @@ void InstructionCodeGeneratorARM::VisitVecAdd(HVecAdd* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } +void LocationsBuilderARM::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderARM::VisitVecSub(HVecSub* instruction) { CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); } @@ -148,6 +156,22 @@ void InstructionCodeGeneratorARM::VisitVecDiv(HVecDiv* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } +void LocationsBuilderARM::VisitVecMin(HVecMin* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecMin(HVecMin* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecMax(HVecMax* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecMax(HVecMax* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderARM::VisitVecAnd(HVecAnd* 
instruction) { CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); } diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc index 0923920366..2dfccfff85 100644 --- a/compiler/optimizing/code_generator_vector_arm64.cc +++ b/compiler/optimizing/code_generator_vector_arm64.cc @@ -318,6 +318,47 @@ void InstructionCodeGeneratorARM64::VisitVecAdd(HVecAdd* instruction) { } } +void LocationsBuilderARM64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + instruction->IsRounded() + ? __ Urhadd(dst.V16B(), lhs.V16B(), rhs.V16B()) + : __ Uhadd(dst.V16B(), lhs.V16B(), rhs.V16B()); + } else { + instruction->IsRounded() + ? __ Srhadd(dst.V16B(), lhs.V16B(), rhs.V16B()) + : __ Shadd(dst.V16B(), lhs.V16B(), rhs.V16B()); + } + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + if (instruction->IsUnsigned()) { + instruction->IsRounded() + ? __ Urhadd(dst.V8H(), lhs.V8H(), rhs.V8H()) + : __ Uhadd(dst.V8H(), lhs.V8H(), rhs.V8H()); + } else { + instruction->IsRounded() + ? __ Srhadd(dst.V8H(), lhs.V8H(), rhs.V8H()) + : __ Shadd(dst.V8H(), lhs.V8H(), rhs.V8H()); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderARM64::VisitVecSub(HVecSub* instruction) { CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); } @@ -420,6 +461,22 @@ void InstructionCodeGeneratorARM64::VisitVecDiv(HVecDiv* instruction) { } } +void LocationsBuilderARM64::VisitVecMin(HVecMin* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecMin(HVecMin* instruction) { + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); +} + +void LocationsBuilderARM64::VisitVecMax(HVecMax* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecMax(HVecMax* instruction) { + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); +} + void LocationsBuilderARM64::VisitVecAnd(HVecAnd* instruction) { CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); } diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc index 74fa584e09..990178b31b 100644 --- a/compiler/optimizing/code_generator_vector_arm_vixl.cc +++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc @@ -124,6 +124,14 @@ void InstructionCodeGeneratorARMVIXL::VisitVecAdd(HVecAdd* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } +void LocationsBuilderARMVIXL::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderARMVIXL::VisitVecSub(HVecSub* instruction) { 
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); } @@ -148,6 +156,22 @@ void InstructionCodeGeneratorARMVIXL::VisitVecDiv(HVecDiv* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } +void LocationsBuilderARMVIXL::VisitVecMin(HVecMin* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecMin(HVecMin* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecMax(HVecMax* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecMax(HVecMax* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderARMVIXL::VisitVecAnd(HVecAnd* instruction) { CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); } diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc index 6969abd422..8ea1ca7d90 100644 --- a/compiler/optimizing/code_generator_vector_mips.cc +++ b/compiler/optimizing/code_generator_vector_mips.cc @@ -124,6 +124,14 @@ void InstructionCodeGeneratorMIPS::VisitVecAdd(HVecAdd* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } +void LocationsBuilderMIPS::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderMIPS::VisitVecSub(HVecSub* instruction) { CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); } @@ -148,6 +156,22 @@ void InstructionCodeGeneratorMIPS::VisitVecDiv(HVecDiv* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } +void LocationsBuilderMIPS::VisitVecMin(HVecMin* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecMin(HVecMin* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecMax(HVecMax* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecMax(HVecMax* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderMIPS::VisitVecAnd(HVecAnd* instruction) { CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); } diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc index 87118cefa5..a484bb4774 100644 --- a/compiler/optimizing/code_generator_vector_mips64.cc +++ b/compiler/optimizing/code_generator_vector_mips64.cc @@ -124,6 +124,14 @@ void InstructionCodeGeneratorMIPS64::VisitVecAdd(HVecAdd* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } +void LocationsBuilderMIPS64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderMIPS64::VisitVecSub(HVecSub* instruction) { CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); } @@ -148,6 +156,22 @@ void InstructionCodeGeneratorMIPS64::VisitVecDiv(HVecDiv* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } +void 
LocationsBuilderMIPS64::VisitVecMin(HVecMin* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecMin(HVecMin* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecMax(HVecMax* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecMax(HVecMax* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderMIPS64::VisitVecAnd(HVecAnd* instruction) { CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); } diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc index 8dabb4d08f..a86d060821 100644 --- a/compiler/optimizing/code_generator_vector_x86.cc +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -350,6 +350,35 @@ void InstructionCodeGeneratorX86::VisitVecAdd(HVecAdd* instruction) { } } +void LocationsBuilderX86::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + + DCHECK(instruction->IsRounded()); + DCHECK(instruction->IsUnsigned()); + + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ pavgb(dst, src); + return; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ pavgw(dst, src); + return; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderX86::VisitVecSub(HVecSub* instruction) { CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); } @@ -448,6 +477,22 @@ void InstructionCodeGeneratorX86::VisitVecDiv(HVecDiv* instruction) { } } +void LocationsBuilderX86::VisitVecMin(HVecMin* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecMin(HVecMin* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderX86::VisitVecMax(HVecMax* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecMax(HVecMax* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderX86::VisitVecAnd(HVecAnd* instruction) { CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); } diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc index e95608839b..696735367e 100644 --- a/compiler/optimizing/code_generator_vector_x86_64.cc +++ b/compiler/optimizing/code_generator_vector_x86_64.cc @@ -343,6 +343,31 @@ void InstructionCodeGeneratorX86_64::VisitVecAdd(HVecAdd* instruction) { } } +void LocationsBuilderX86_64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + 
XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ pavgb(dst, src); + return; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ pavgw(dst, src); + return; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderX86_64::VisitVecSub(HVecSub* instruction) { CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); } @@ -441,6 +466,22 @@ void InstructionCodeGeneratorX86_64::VisitVecDiv(HVecDiv* instruction) { } } +void LocationsBuilderX86_64::VisitVecMin(HVecMin* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecMin(HVecMin* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderX86_64::VisitVecMax(HVecMax* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecMax(HVecMax* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderX86_64::VisitVecAnd(HVecAnd* instruction) { CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); } diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index cc3c143b15..1b2b9f80ac 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -509,6 +509,11 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { StartAttributeStream("kind") << deoptimize->GetKind(); } + void VisitVecHalvingAdd(HVecHalvingAdd* hadd) OVERRIDE { + StartAttributeStream("unsigned") << std::boolalpha << hadd->IsUnsigned() << std::noboolalpha; + StartAttributeStream("rounded") << std::boolalpha << hadd->IsRounded() << std::noboolalpha; + } + #if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64) void VisitMultiplyAccumulate(HMultiplyAccumulate* instruction) OVERRIDE { StartAttributeStream("kind") << instruction->GetOpKind(); diff --git a/compiler/optimizing/induction_var_range.cc b/compiler/optimizing/induction_var_range.cc index 1c8674d522..7c833cf70c 100644 --- a/compiler/optimizing/induction_var_range.cc +++ b/compiler/optimizing/induction_var_range.cc @@ -45,18 +45,6 @@ static bool IsSafeDiv(int32_t c1, int32_t c2) { return c2 != 0 && CanLongValueFitIntoInt(static_cast<int64_t>(c1) / static_cast<int64_t>(c2)); } -/** Returns true for 32/64-bit constant instruction. */ -static bool IsIntAndGet(HInstruction* instruction, int64_t* value) { - if (instruction->IsIntConstant()) { - *value = instruction->AsIntConstant()->GetValue(); - return true; - } else if (instruction->IsLongConstant()) { - *value = instruction->AsLongConstant()->GetValue(); - return true; - } - return false; -} - /** Computes a * b for a,b > 0 (at least until first overflow happens). */ static int64_t SafeMul(int64_t a, int64_t b, /*out*/ bool* overflow) { if (a > 0 && b > 0 && a > (std::numeric_limits<int64_t>::max() / b)) { @@ -106,7 +94,7 @@ static bool IsGEZero(HInstruction* instruction) { } } int64_t value = -1; - return IsIntAndGet(instruction, &value) && value >= 0; + return IsInt64AndGet(instruction, &value) && value >= 0; } /** Hunts "under the hood" for a suitable instruction at the hint. 
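The HVecHalvingAdd lowerings added above map the four variants onto UHADD/SHADD (truncating) and URHADD/SRHADD (rounding) on arm64, while the x86/x86-64 path uses PAVGB/PAVGW and therefore asserts IsRounded() and IsUnsigned(). A scalar model of the per-lane arithmetic, for reference:

    #include <cassert>
    #include <cstdint>

    // Scalar model of one packed halving-add lane: truncating (UHADD/SHADD) vs.
    // rounding (URHADD/SRHADD, and PAVGB/PAVGW for the unsigned rounded case).
    template <typename T>
    T HalvingAdd(T a, T b, bool rounded) {
      // Use a 64-bit intermediate so the lane-wide sum cannot overflow.
      int64_t sum = static_cast<int64_t>(a) + static_cast<int64_t>(b) + (rounded ? 1 : 0);
      return static_cast<T>(sum >> 1);
    }

    int main() {
      assert(HalvingAdd<uint8_t>(254, 255, /* rounded */ false) == 254);  // UHADD
      assert(HalvingAdd<uint8_t>(254, 255, /* rounded */ true) == 255);   // URHADD / PAVGB
      assert(HalvingAdd<int16_t>(-3, 2, /* rounded */ false) == -1);      // SHADD
      assert(HalvingAdd<int16_t>(-3, 2, /* rounded */ true) == 0);        // SRHADD
      return 0;
    }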
*/ @@ -149,7 +137,7 @@ static InductionVarRange::Value SimplifyMax(InductionVarRange::Value v, HInstruc int64_t value; if (v.instruction->IsDiv() && v.instruction->InputAt(0)->IsArrayLength() && - IsIntAndGet(v.instruction->InputAt(1), &value) && v.a_constant == value) { + IsInt64AndGet(v.instruction->InputAt(1), &value) && v.a_constant == value) { return InductionVarRange::Value(v.instruction->InputAt(0), 1, v.b_constant); } // If a == 1, the most suitable one suffices as maximum value. @@ -444,7 +432,7 @@ bool InductionVarRange::IsConstant(HInductionVarAnalysis::InductionInfo* info, // any of the three requests (kExact, kAtMost, and KAtLeast). if (info->induction_class == HInductionVarAnalysis::kInvariant && info->operation == HInductionVarAnalysis::kFetch) { - if (IsIntAndGet(info->fetch, value)) { + if (IsInt64AndGet(info->fetch, value)) { return true; } } @@ -635,7 +623,7 @@ InductionVarRange::Value InductionVarRange::GetGeometric(HInductionVarAnalysis:: int64_t f = 0; if (IsConstant(info->op_a, kExact, &a) && CanLongValueFitIntoInt(a) && - IsIntAndGet(info->fetch, &f) && f >= 1) { + IsInt64AndGet(info->fetch, &f) && f >= 1) { // Conservative bounds on a * f^-i + b with f >= 1 can be computed without // trip count. Other forms would require a much more elaborate evaluation. const bool is_min_a = a >= 0 ? is_min : !is_min; @@ -663,7 +651,7 @@ InductionVarRange::Value InductionVarRange::GetFetch(HInstruction* instruction, // Unless at a constant or hint, chase the instruction a bit deeper into the HIR tree, so that // it becomes more likely range analysis will compare the same instructions as terminal nodes. int64_t value; - if (IsIntAndGet(instruction, &value) && CanLongValueFitIntoInt(value)) { + if (IsInt64AndGet(instruction, &value) && CanLongValueFitIntoInt(value)) { // Proper constant reveals best information. return Value(static_cast<int32_t>(value)); } else if (instruction == chase_hint_) { @@ -671,10 +659,10 @@ InductionVarRange::Value InductionVarRange::GetFetch(HInstruction* instruction, return Value(instruction, 1, 0); } else if (instruction->IsAdd()) { // Incorporate suitable constants in the chased value. - if (IsIntAndGet(instruction->InputAt(0), &value) && CanLongValueFitIntoInt(value)) { + if (IsInt64AndGet(instruction->InputAt(0), &value) && CanLongValueFitIntoInt(value)) { return AddValue(Value(static_cast<int32_t>(value)), GetFetch(instruction->InputAt(1), trip, in_body, is_min)); - } else if (IsIntAndGet(instruction->InputAt(1), &value) && CanLongValueFitIntoInt(value)) { + } else if (IsInt64AndGet(instruction->InputAt(1), &value) && CanLongValueFitIntoInt(value)) { return AddValue(GetFetch(instruction->InputAt(0), trip, in_body, is_min), Value(static_cast<int32_t>(value))); } @@ -1074,7 +1062,7 @@ bool InductionVarRange::GenerateLastValueGeometric(HInductionVarAnalysis::Induct // Detect known base and trip count (always taken). 
int64_t f = 0; int64_t m = 0; - if (IsIntAndGet(info->fetch, &f) && f >= 1 && IsConstant(trip->op_a, kExact, &m) && m >= 1) { + if (IsInt64AndGet(info->fetch, &f) && f >= 1 && IsConstant(trip->op_a, kExact, &m) && m >= 1) { HInstruction* opa = nullptr; HInstruction* opb = nullptr; if (GenerateCode(info->op_a, nullptr, graph, block, &opa, false, false) && diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 423fd3c6ae..47bcb5d000 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -2507,9 +2507,11 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { // We use a block to end the scratch scope before the write barrier, thus // freeing the temporary registers so they can be used in `MarkGCCard`. UseScratchRegisterScope temps(masm); + Location temp3_loc; // Used only for Baker read barrier. Register temp3; if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { - temp3 = WRegisterFrom(locations->GetTemp(2)); + temp3_loc = locations->GetTemp(2); + temp3 = WRegisterFrom(temp3_loc); } else { temp3 = temps.AcquireW(); } @@ -2527,7 +2529,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { temp1_loc, src.W(), class_offset, - temp2, + temp3_loc, /* needs_null_check */ false, /* use_load_acquire */ false); // Bail out if the source is not a non primitive array. @@ -2536,7 +2538,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { temp1_loc, temp1, component_offset, - temp2, + temp3_loc, /* needs_null_check */ false, /* use_load_acquire */ false); __ Cbz(temp1, intrinsic_slow_path->GetEntryLabel()); @@ -2553,7 +2555,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { temp1_loc, dest.W(), class_offset, - temp2, + temp3_loc, /* needs_null_check */ false, /* use_load_acquire */ false); @@ -2570,7 +2572,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { temp2_loc, temp1, component_offset, - temp3, + temp3_loc, /* needs_null_check */ false, /* use_load_acquire */ false); __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel()); @@ -2589,7 +2591,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { temp2_loc, src.W(), class_offset, - temp3, + temp3_loc, /* needs_null_check */ false, /* use_load_acquire */ false); // Note: if heap poisoning is on, we are comparing two unpoisoned references here. @@ -2603,7 +2605,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { temp1_loc, temp1, component_offset, - temp2, + temp3_loc, /* needs_null_check */ false, /* use_load_acquire */ false); // /* HeapReference<Class> */ temp1 = temp1->super_class_ @@ -2687,7 +2689,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { temp1_loc, src.W(), class_offset, - temp2, + temp3_loc, /* needs_null_check */ false, /* use_load_acquire */ false); // /* HeapReference<Class> */ temp2 = temp1->component_type_ @@ -2695,7 +2697,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { temp2_loc, temp1, component_offset, - temp3, + temp3_loc, /* needs_null_check */ false, /* use_load_acquire */ false); __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel()); @@ -2755,9 +2757,17 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { // Make sure `tmp` is not IP0, as it is clobbered by // ReadBarrierMarkRegX entry points in // ReadBarrierSystemArrayCopySlowPathARM64. 
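The SystemArrayCopy hunk just below temporarily excludes IP0 from the VIXL scratch pool so that AcquireW() cannot hand it out (the read barrier slow path clobbers it), then puts it back so later macro-instructions still have a scratch register while `tmp` (IP1) stays live. A stand-in sketch of that exclude/acquire/include pattern (ScratchPool below is a hypothetical helper, not the VIXL UseScratchRegisterScope API):

    #include <cassert>
    #include <set>

    // Hypothetical stand-in for a scratch register pool; register ids 16 and 17
    // play the roles of IP0 and IP1.
    class ScratchPool {
     public:
      ScratchPool() : available_{16, 17} {}
      void Exclude(int reg) { available_.erase(reg); }
      void Include(int reg) { available_.insert(reg); }
      int Acquire() {
        assert(!available_.empty());
        int reg = *available_.begin();
        available_.erase(available_.begin());
        return reg;
      }
      bool IsAvailable(int reg) const { return available_.count(reg) != 0; }

     private:
      std::set<int> available_;
    };

    int main() {
      ScratchPool temps;
      temps.Exclude(16);              // Keep IP0 out of reach: the slow path clobbers it.
      int tmp = temps.Acquire();      // Hands out IP1.
      assert(tmp == 17);
      temps.Include(16);              // Return IP0 so macro-instructions can still scratch it,
      assert(temps.IsAvailable(16));  // while `tmp` (IP1) remains reserved for our use.
      return 0;
    }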
+ DCHECK(temps.IsAvailable(ip0)); temps.Exclude(ip0); Register tmp = temps.AcquireW(); DCHECK_NE(LocationFrom(tmp).reg(), IP0); + // Put IP0 back in the pool so that VIXL has at least one + // scratch register available to emit macro-instructions (note + // that IP1 is already used for `tmp`). Indeed some + // macro-instructions used in GenSystemArrayCopyAddresses + // (invoked hereunder) may require a scratch register (for + // instance to emit a load with a large constant offset). + temps.Include(ip0); // /* int32_t */ monitor = src->monitor_ __ Ldr(tmp, HeapOperand(src.W(), monitor_offset)); diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc index 900b00e222..41df56b514 100644 --- a/compiler/optimizing/intrinsics_mips.cc +++ b/compiler/optimizing/intrinsics_mips.cc @@ -2742,6 +2742,397 @@ void IntrinsicCodeGeneratorMIPS::VisitStringGetCharsNoCheck(HInvoke* invoke) { __ Bind(&done); } +static void CreateFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kCallOnMainOnly, + kIntrinsified); + InvokeRuntimeCallingConvention calling_convention; + + locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0))); + locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimDouble)); +} + +static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kCallOnMainOnly, + kIntrinsified); + InvokeRuntimeCallingConvention calling_convention; + + locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0))); + locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1))); + locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimDouble)); +} + +static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorMIPS* codegen, QuickEntrypointEnum entry) { + LocationSummary* locations = invoke->GetLocations(); + FRegister in = locations->InAt(0).AsFpuRegister<FRegister>(); + DCHECK_EQ(in, F12); + FRegister out = locations->Out().AsFpuRegister<FRegister>(); + DCHECK_EQ(out, F0); + + codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc()); +} + +static void GenFPFPToFPCall(HInvoke* invoke, + CodeGeneratorMIPS* codegen, + QuickEntrypointEnum entry) { + LocationSummary* locations = invoke->GetLocations(); + FRegister in0 = locations->InAt(0).AsFpuRegister<FRegister>(); + DCHECK_EQ(in0, F12); + FRegister in1 = locations->InAt(1).AsFpuRegister<FRegister>(); + DCHECK_EQ(in1, F14); + FRegister out = locations->Out().AsFpuRegister<FRegister>(); + DCHECK_EQ(out, F0); + + codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc()); +} + +// static double java.lang.Math.cos(double a) +void IntrinsicLocationsBuilderMIPS::VisitMathCos(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathCos(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickCos); +} + +// static double java.lang.Math.sin(double a) +void IntrinsicLocationsBuilderMIPS::VisitMathSin(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathSin(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickSin); +} + +// static double java.lang.Math.acos(double a) +void IntrinsicLocationsBuilderMIPS::VisitMathAcos(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void 
IntrinsicCodeGeneratorMIPS::VisitMathAcos(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickAcos); +} + +// static double java.lang.Math.asin(double a) +void IntrinsicLocationsBuilderMIPS::VisitMathAsin(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathAsin(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickAsin); +} + +// static double java.lang.Math.atan(double a) +void IntrinsicLocationsBuilderMIPS::VisitMathAtan(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathAtan(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickAtan); +} + +// static double java.lang.Math.atan2(double y, double x) +void IntrinsicLocationsBuilderMIPS::VisitMathAtan2(HInvoke* invoke) { + CreateFPFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathAtan2(HInvoke* invoke) { + GenFPFPToFPCall(invoke, codegen_, kQuickAtan2); +} + +// static double java.lang.Math.cbrt(double a) +void IntrinsicLocationsBuilderMIPS::VisitMathCbrt(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathCbrt(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickCbrt); +} + +// static double java.lang.Math.cosh(double x) +void IntrinsicLocationsBuilderMIPS::VisitMathCosh(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathCosh(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickCosh); +} + +// static double java.lang.Math.exp(double a) +void IntrinsicLocationsBuilderMIPS::VisitMathExp(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathExp(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickExp); +} + +// static double java.lang.Math.expm1(double x) +void IntrinsicLocationsBuilderMIPS::VisitMathExpm1(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathExpm1(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickExpm1); +} + +// static double java.lang.Math.hypot(double x, double y) +void IntrinsicLocationsBuilderMIPS::VisitMathHypot(HInvoke* invoke) { + CreateFPFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathHypot(HInvoke* invoke) { + GenFPFPToFPCall(invoke, codegen_, kQuickHypot); +} + +// static double java.lang.Math.log(double a) +void IntrinsicLocationsBuilderMIPS::VisitMathLog(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathLog(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickLog); +} + +// static double java.lang.Math.log10(double x) +void IntrinsicLocationsBuilderMIPS::VisitMathLog10(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathLog10(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickLog10); +} + +// static double java.lang.Math.nextAfter(double start, double direction) +void IntrinsicLocationsBuilderMIPS::VisitMathNextAfter(HInvoke* invoke) { + CreateFPFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathNextAfter(HInvoke* invoke) { + GenFPFPToFPCall(invoke, codegen_, kQuickNextAfter); +} + +// static double java.lang.Math.sinh(double x) +void IntrinsicLocationsBuilderMIPS::VisitMathSinh(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void 
IntrinsicCodeGeneratorMIPS::VisitMathSinh(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickSinh); +} + +// static double java.lang.Math.tan(double a) +void IntrinsicLocationsBuilderMIPS::VisitMathTan(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathTan(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickTan); +} + +// static double java.lang.Math.tanh(double x) +void IntrinsicLocationsBuilderMIPS::VisitMathTanh(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS::VisitMathTanh(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickTanh); +} + +// static void java.lang.System.arraycopy(Object src, int srcPos, +// Object dest, int destPos, +// int length) +void IntrinsicLocationsBuilderMIPS::VisitSystemArrayCopyChar(HInvoke* invoke) { + HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant(); + HIntConstant* dest_pos = invoke->InputAt(3)->AsIntConstant(); + HIntConstant* length = invoke->InputAt(4)->AsIntConstant(); + + // As long as we are checking, we might as well check to see if the src and dest + // positions are >= 0. + if ((src_pos != nullptr && src_pos->GetValue() < 0) || + (dest_pos != nullptr && dest_pos->GetValue() < 0)) { + // We will have to fail anyways. + return; + } + + // And since we are already checking, check the length too. + if (length != nullptr) { + int32_t len = length->GetValue(); + if (len < 0) { + // Just call as normal. + return; + } + } + + // Okay, it is safe to generate inline code. + LocationSummary* locations = + new (arena_) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified); + // arraycopy(Object src, int srcPos, Object dest, int destPos, int length). + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1))); + locations->SetInAt(2, Location::RequiresRegister()); + locations->SetInAt(3, Location::RegisterOrConstant(invoke->InputAt(3))); + locations->SetInAt(4, Location::RegisterOrConstant(invoke->InputAt(4))); + + locations->AddTemp(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresRegister()); +} + +// Utility routine to verify that "length(input) - pos >= length" +static void EnoughItems(MipsAssembler* assembler, + Register length_input_minus_pos, + Location length, + SlowPathCodeMIPS* slow_path) { + if (length.IsConstant()) { + int32_t length_constant = length.GetConstant()->AsIntConstant()->GetValue(); + + if (IsInt<16>(length_constant)) { + __ Slti(TMP, length_input_minus_pos, length_constant); + __ Bnez(TMP, slow_path->GetEntryLabel()); + } else { + __ LoadConst32(TMP, length_constant); + __ Blt(length_input_minus_pos, TMP, slow_path->GetEntryLabel()); + } + } else { + __ Blt(length_input_minus_pos, length.AsRegister<Register>(), slow_path->GetEntryLabel()); + } +} + +static void CheckPosition(MipsAssembler* assembler, + Location pos, + Register input, + Location length, + SlowPathCodeMIPS* slow_path, + bool length_is_input_length = false) { + // Where is the length in the Array? + const uint32_t length_offset = mirror::Array::LengthOffset().Uint32Value(); + + // Calculate length(input) - pos. + if (pos.IsConstant()) { + int32_t pos_const = pos.GetConstant()->AsIntConstant()->GetValue(); + if (pos_const == 0) { + if (!length_is_input_length) { + // Check that length(input) >= length. 
+ __ LoadFromOffset(kLoadWord, AT, input, length_offset); + EnoughItems(assembler, AT, length, slow_path); + } + } else { + // Check that (length(input) - pos) >= zero. + __ LoadFromOffset(kLoadWord, AT, input, length_offset); + DCHECK_GT(pos_const, 0); + __ Addiu32(AT, AT, -pos_const, TMP); + __ Bltz(AT, slow_path->GetEntryLabel()); + + // Verify that (length(input) - pos) >= length. + EnoughItems(assembler, AT, length, slow_path); + } + } else if (length_is_input_length) { + // The only way the copy can succeed is if pos is zero. + Register pos_reg = pos.AsRegister<Register>(); + __ Bnez(pos_reg, slow_path->GetEntryLabel()); + } else { + // Verify that pos >= 0. + Register pos_reg = pos.AsRegister<Register>(); + __ Bltz(pos_reg, slow_path->GetEntryLabel()); + + // Check that (length(input) - pos) >= zero. + __ LoadFromOffset(kLoadWord, AT, input, length_offset); + __ Subu(AT, AT, pos_reg); + __ Bltz(AT, slow_path->GetEntryLabel()); + + // Verify that (length(input) - pos) >= length. + EnoughItems(assembler, AT, length, slow_path); + } +} + +void IntrinsicCodeGeneratorMIPS::VisitSystemArrayCopyChar(HInvoke* invoke) { + MipsAssembler* assembler = GetAssembler(); + LocationSummary* locations = invoke->GetLocations(); + + Register src = locations->InAt(0).AsRegister<Register>(); + Location src_pos = locations->InAt(1); + Register dest = locations->InAt(2).AsRegister<Register>(); + Location dest_pos = locations->InAt(3); + Location length = locations->InAt(4); + + MipsLabel loop; + + Register dest_base = locations->GetTemp(0).AsRegister<Register>(); + Register src_base = locations->GetTemp(1).AsRegister<Register>(); + Register count = locations->GetTemp(2).AsRegister<Register>(); + + SlowPathCodeMIPS* slow_path = new (GetAllocator()) IntrinsicSlowPathMIPS(invoke); + codegen_->AddSlowPath(slow_path); + + // Bail out if the source and destination are the same (to handle overlap). + __ Beq(src, dest, slow_path->GetEntryLabel()); + + // Bail out if the source is null. + __ Beqz(src, slow_path->GetEntryLabel()); + + // Bail out if the destination is null. + __ Beqz(dest, slow_path->GetEntryLabel()); + + // Load length into register for count. + if (length.IsConstant()) { + __ LoadConst32(count, length.GetConstant()->AsIntConstant()->GetValue()); + } else { + // If the length is negative, bail out. + // We have already checked in the LocationsBuilder for the constant case. + __ Bltz(length.AsRegister<Register>(), slow_path->GetEntryLabel()); + + __ Move(count, length.AsRegister<Register>()); + } + + // Validity checks: source. + CheckPosition(assembler, src_pos, src, Location::RegisterLocation(count), slow_path); + + // Validity checks: dest. + CheckPosition(assembler, dest_pos, dest, Location::RegisterLocation(count), slow_path); + + // If count is zero, we're done. + __ Beqz(count, slow_path->GetExitLabel()); + + // Okay, everything checks out. Finally time to do the copy. + // Check assumption that sizeof(Char) is 2 (used in scaling below). + const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar); + DCHECK_EQ(char_size, 2u); + + const size_t char_shift = Primitive::ComponentSizeShift(Primitive::kPrimChar); + + const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value(); + + // Calculate source and destination addresses. 
+ if (src_pos.IsConstant()) { + int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue(); + + __ Addiu32(src_base, src, data_offset + char_size * src_pos_const, TMP); + } else { + __ Addiu32(src_base, src, data_offset, TMP); + __ ShiftAndAdd(src_base, src_pos.AsRegister<Register>(), src_base, char_shift); + } + if (dest_pos.IsConstant()) { + int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + + __ Addiu32(dest_base, dest, data_offset + char_size * dest_pos_const, TMP); + } else { + __ Addiu32(dest_base, dest, data_offset, TMP); + __ ShiftAndAdd(dest_base, dest_pos.AsRegister<Register>(), dest_base, char_shift); + } + + __ Bind(&loop); + __ Lh(TMP, src_base, 0); + __ Addiu(src_base, src_base, char_size); + __ Addiu(count, count, -1); + __ Sh(TMP, dest_base, 0); + __ Addiu(dest_base, dest_base, char_size); + __ Bnez(count, &loop); + + __ Bind(slow_path->GetExitLabel()); +} + // Unimplemented intrinsics. UNIMPLEMENTED_INTRINSIC(MIPS, MathCeil) @@ -2753,27 +3144,8 @@ UNIMPLEMENTED_INTRINSIC(MIPS, UnsafePutLongVolatile); UNIMPLEMENTED_INTRINSIC(MIPS, UnsafeCASLong) UNIMPLEMENTED_INTRINSIC(MIPS, ReferenceGetReferent) -UNIMPLEMENTED_INTRINSIC(MIPS, SystemArrayCopyChar) UNIMPLEMENTED_INTRINSIC(MIPS, SystemArrayCopy) -UNIMPLEMENTED_INTRINSIC(MIPS, MathCos) -UNIMPLEMENTED_INTRINSIC(MIPS, MathSin) -UNIMPLEMENTED_INTRINSIC(MIPS, MathAcos) -UNIMPLEMENTED_INTRINSIC(MIPS, MathAsin) -UNIMPLEMENTED_INTRINSIC(MIPS, MathAtan) -UNIMPLEMENTED_INTRINSIC(MIPS, MathAtan2) -UNIMPLEMENTED_INTRINSIC(MIPS, MathCbrt) -UNIMPLEMENTED_INTRINSIC(MIPS, MathCosh) -UNIMPLEMENTED_INTRINSIC(MIPS, MathExp) -UNIMPLEMENTED_INTRINSIC(MIPS, MathExpm1) -UNIMPLEMENTED_INTRINSIC(MIPS, MathHypot) -UNIMPLEMENTED_INTRINSIC(MIPS, MathLog) -UNIMPLEMENTED_INTRINSIC(MIPS, MathLog10) -UNIMPLEMENTED_INTRINSIC(MIPS, MathNextAfter) -UNIMPLEMENTED_INTRINSIC(MIPS, MathSinh) -UNIMPLEMENTED_INTRINSIC(MIPS, MathTan) -UNIMPLEMENTED_INTRINSIC(MIPS, MathTanh) - UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOf); UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOfAfter); UNIMPLEMENTED_INTRINSIC(MIPS, StringBufferAppend); diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc index c2518a7861..b57b41f686 100644 --- a/compiler/optimizing/intrinsics_mips64.cc +++ b/compiler/optimizing/intrinsics_mips64.cc @@ -2093,6 +2093,199 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringGetCharsNoCheck(HInvoke* invoke) { __ Bind(&done); } +// static void java.lang.System.arraycopy(Object src, int srcPos, +// Object dest, int destPos, +// int length) +void IntrinsicLocationsBuilderMIPS64::VisitSystemArrayCopyChar(HInvoke* invoke) { + HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant(); + HIntConstant* dest_pos = invoke->InputAt(3)->AsIntConstant(); + HIntConstant* length = invoke->InputAt(4)->AsIntConstant(); + + // As long as we are checking, we might as well check to see if the src and dest + // positions are >= 0. + if ((src_pos != nullptr && src_pos->GetValue() < 0) || + (dest_pos != nullptr && dest_pos->GetValue() < 0)) { + // We will have to fail anyways. + return; + } + + // And since we are already checking, check the length too. + if (length != nullptr) { + int32_t len = length->GetValue(); + if (len < 0) { + // Just call as normal. + return; + } + } + + // Okay, it is safe to generate inline code. 
+ LocationSummary* locations = + new (arena_) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified); + // arraycopy(Object src, int srcPos, Object dest, int destPos, int length). + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1))); + locations->SetInAt(2, Location::RequiresRegister()); + locations->SetInAt(3, Location::RegisterOrConstant(invoke->InputAt(3))); + locations->SetInAt(4, Location::RegisterOrConstant(invoke->InputAt(4))); + + locations->AddTemp(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresRegister()); +} + +// Utility routine to verify that "length(input) - pos >= length" +static void EnoughItems(Mips64Assembler* assembler, + GpuRegister length_input_minus_pos, + Location length, + SlowPathCodeMIPS64* slow_path) { + if (length.IsConstant()) { + int32_t length_constant = length.GetConstant()->AsIntConstant()->GetValue(); + + if (IsInt<16>(length_constant)) { + __ Slti(TMP, length_input_minus_pos, length_constant); + __ Bnezc(TMP, slow_path->GetEntryLabel()); + } else { + __ LoadConst32(TMP, length_constant); + __ Bltc(length_input_minus_pos, TMP, slow_path->GetEntryLabel()); + } + } else { + __ Bltc(length_input_minus_pos, length.AsRegister<GpuRegister>(), slow_path->GetEntryLabel()); + } +} + +static void CheckPosition(Mips64Assembler* assembler, + Location pos, + GpuRegister input, + Location length, + SlowPathCodeMIPS64* slow_path, + bool length_is_input_length = false) { + // Where is the length in the Array? + const uint32_t length_offset = mirror::Array::LengthOffset().Uint32Value(); + + // Calculate length(input) - pos. + if (pos.IsConstant()) { + int32_t pos_const = pos.GetConstant()->AsIntConstant()->GetValue(); + if (pos_const == 0) { + if (!length_is_input_length) { + // Check that length(input) >= length. + __ LoadFromOffset(kLoadWord, AT, input, length_offset); + EnoughItems(assembler, AT, length, slow_path); + } + } else { + // Check that (length(input) - pos) >= zero. + __ LoadFromOffset(kLoadWord, AT, input, length_offset); + DCHECK_GT(pos_const, 0); + __ Addiu32(AT, AT, -pos_const); + __ Bltzc(AT, slow_path->GetEntryLabel()); + + // Verify that (length(input) - pos) >= length. + EnoughItems(assembler, AT, length, slow_path); + } + } else if (length_is_input_length) { + // The only way the copy can succeed is if pos is zero. + GpuRegister pos_reg = pos.AsRegister<GpuRegister>(); + __ Bnezc(pos_reg, slow_path->GetEntryLabel()); + } else { + // Verify that pos >= 0. + GpuRegister pos_reg = pos.AsRegister<GpuRegister>(); + __ Bltzc(pos_reg, slow_path->GetEntryLabel()); + + // Check that (length(input) - pos) >= zero. + __ LoadFromOffset(kLoadWord, AT, input, length_offset); + __ Subu(AT, AT, pos_reg); + __ Bltzc(AT, slow_path->GetEntryLabel()); + + // Verify that (length(input) - pos) >= length. 
+ EnoughItems(assembler, AT, length, slow_path); + } +} + +void IntrinsicCodeGeneratorMIPS64::VisitSystemArrayCopyChar(HInvoke* invoke) { + Mips64Assembler* assembler = GetAssembler(); + LocationSummary* locations = invoke->GetLocations(); + + GpuRegister src = locations->InAt(0).AsRegister<GpuRegister>(); + Location src_pos = locations->InAt(1); + GpuRegister dest = locations->InAt(2).AsRegister<GpuRegister>(); + Location dest_pos = locations->InAt(3); + Location length = locations->InAt(4); + + Mips64Label loop; + + GpuRegister dest_base = locations->GetTemp(0).AsRegister<GpuRegister>(); + GpuRegister src_base = locations->GetTemp(1).AsRegister<GpuRegister>(); + GpuRegister count = locations->GetTemp(2).AsRegister<GpuRegister>(); + + SlowPathCodeMIPS64* slow_path = new (GetAllocator()) IntrinsicSlowPathMIPS64(invoke); + codegen_->AddSlowPath(slow_path); + + // Bail out if the source and destination are the same (to handle overlap). + __ Beqc(src, dest, slow_path->GetEntryLabel()); + + // Bail out if the source is null. + __ Beqzc(src, slow_path->GetEntryLabel()); + + // Bail out if the destination is null. + __ Beqzc(dest, slow_path->GetEntryLabel()); + + // Load length into register for count. + if (length.IsConstant()) { + __ LoadConst32(count, length.GetConstant()->AsIntConstant()->GetValue()); + } else { + // If the length is negative, bail out. + // We have already checked in the LocationsBuilder for the constant case. + __ Bltzc(length.AsRegister<GpuRegister>(), slow_path->GetEntryLabel()); + + __ Move(count, length.AsRegister<GpuRegister>()); + } + + // Validity checks: source. + CheckPosition(assembler, src_pos, src, Location::RegisterLocation(count), slow_path); + + // Validity checks: dest. + CheckPosition(assembler, dest_pos, dest, Location::RegisterLocation(count), slow_path); + + // If count is zero, we're done. + __ Beqzc(count, slow_path->GetExitLabel()); + + // Okay, everything checks out. Finally time to do the copy. + // Check assumption that sizeof(Char) is 2 (used in scaling below). + const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar); + DCHECK_EQ(char_size, 2u); + + const size_t char_shift = Primitive::ComponentSizeShift(Primitive::kPrimChar); + + const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value(); + + // Calculate source and destination addresses. 
+ if (src_pos.IsConstant()) { + int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue(); + + __ Daddiu64(src_base, src, data_offset + char_size * src_pos_const, TMP); + } else { + __ Daddiu64(src_base, src, data_offset, TMP); + __ Dlsa(src_base, src_pos.AsRegister<GpuRegister>(), src_base, char_shift); + } + if (dest_pos.IsConstant()) { + int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + + __ Daddiu64(dest_base, dest, data_offset + char_size * dest_pos_const, TMP); + } else { + __ Daddiu64(dest_base, dest, data_offset, TMP); + __ Dlsa(dest_base, dest_pos.AsRegister<GpuRegister>(), dest_base, char_shift); + } + + __ Bind(&loop); + __ Lh(TMP, src_base, 0); + __ Daddiu(src_base, src_base, char_size); + __ Daddiu(count, count, -1); + __ Sh(TMP, dest_base, 0); + __ Daddiu(dest_base, dest_base, char_size); + __ Bnezc(count, &loop); + + __ Bind(slow_path->GetExitLabel()); +} + static void GenHighestOneBit(LocationSummary* locations, Primitive::Type type, Mips64Assembler* assembler) { @@ -2171,28 +2364,209 @@ void IntrinsicCodeGeneratorMIPS64::VisitLongLowestOneBit(HInvoke* invoke) { GenLowestOneBit(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler()); } +static void CreateFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kCallOnMainOnly, + kIntrinsified); + InvokeRuntimeCallingConvention calling_convention; + + locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0))); + locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimDouble)); +} + +static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kCallOnMainOnly, + kIntrinsified); + InvokeRuntimeCallingConvention calling_convention; + + locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0))); + locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1))); + locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimDouble)); +} + +static void GenFPToFPCall(HInvoke* invoke, + CodeGeneratorMIPS64* codegen, + QuickEntrypointEnum entry) { + LocationSummary* locations = invoke->GetLocations(); + FpuRegister in = locations->InAt(0).AsFpuRegister<FpuRegister>(); + DCHECK_EQ(in, F12); + FpuRegister out = locations->Out().AsFpuRegister<FpuRegister>(); + DCHECK_EQ(out, F0); + + codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc()); +} + +static void GenFPFPToFPCall(HInvoke* invoke, + CodeGeneratorMIPS64* codegen, + QuickEntrypointEnum entry) { + LocationSummary* locations = invoke->GetLocations(); + FpuRegister in0 = locations->InAt(0).AsFpuRegister<FpuRegister>(); + DCHECK_EQ(in0, F12); + FpuRegister in1 = locations->InAt(1).AsFpuRegister<FpuRegister>(); + DCHECK_EQ(in1, F13); + FpuRegister out = locations->Out().AsFpuRegister<FpuRegister>(); + DCHECK_EQ(out, F0); + + codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc()); +} + +// static double java.lang.Math.cos(double a) +void IntrinsicLocationsBuilderMIPS64::VisitMathCos(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathCos(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickCos); +} + +// static double java.lang.Math.sin(double a) +void IntrinsicLocationsBuilderMIPS64::VisitMathSin(HInvoke* invoke) { + 
CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathSin(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickSin); +} + +// static double java.lang.Math.acos(double a) +void IntrinsicLocationsBuilderMIPS64::VisitMathAcos(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathAcos(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickAcos); +} + +// static double java.lang.Math.asin(double a) +void IntrinsicLocationsBuilderMIPS64::VisitMathAsin(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathAsin(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickAsin); +} + +// static double java.lang.Math.atan(double a) +void IntrinsicLocationsBuilderMIPS64::VisitMathAtan(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathAtan(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickAtan); +} + +// static double java.lang.Math.atan2(double y, double x) +void IntrinsicLocationsBuilderMIPS64::VisitMathAtan2(HInvoke* invoke) { + CreateFPFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathAtan2(HInvoke* invoke) { + GenFPFPToFPCall(invoke, codegen_, kQuickAtan2); +} + +// static double java.lang.Math.cbrt(double a) +void IntrinsicLocationsBuilderMIPS64::VisitMathCbrt(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathCbrt(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickCbrt); +} + +// static double java.lang.Math.cosh(double x) +void IntrinsicLocationsBuilderMIPS64::VisitMathCosh(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathCosh(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickCosh); +} + +// static double java.lang.Math.exp(double a) +void IntrinsicLocationsBuilderMIPS64::VisitMathExp(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathExp(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickExp); +} + +// static double java.lang.Math.expm1(double x) +void IntrinsicLocationsBuilderMIPS64::VisitMathExpm1(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathExpm1(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickExpm1); +} + +// static double java.lang.Math.hypot(double x, double y) +void IntrinsicLocationsBuilderMIPS64::VisitMathHypot(HInvoke* invoke) { + CreateFPFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathHypot(HInvoke* invoke) { + GenFPFPToFPCall(invoke, codegen_, kQuickHypot); +} + +// static double java.lang.Math.log(double a) +void IntrinsicLocationsBuilderMIPS64::VisitMathLog(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathLog(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickLog); +} + +// static double java.lang.Math.log10(double x) +void IntrinsicLocationsBuilderMIPS64::VisitMathLog10(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathLog10(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickLog10); +} + +// static double java.lang.Math.nextAfter(double start, double direction) +void IntrinsicLocationsBuilderMIPS64::VisitMathNextAfter(HInvoke* invoke) 
{ + CreateFPFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathNextAfter(HInvoke* invoke) { + GenFPFPToFPCall(invoke, codegen_, kQuickNextAfter); +} + +// static double java.lang.Math.sinh(double x) +void IntrinsicLocationsBuilderMIPS64::VisitMathSinh(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathSinh(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickSinh); +} + +// static double java.lang.Math.tan(double a) +void IntrinsicLocationsBuilderMIPS64::VisitMathTan(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathTan(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickTan); +} + +// static double java.lang.Math.tanh(double x) +void IntrinsicLocationsBuilderMIPS64::VisitMathTanh(HInvoke* invoke) { + CreateFPToFPCallLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitMathTanh(HInvoke* invoke) { + GenFPToFPCall(invoke, codegen_, kQuickTanh); +} + UNIMPLEMENTED_INTRINSIC(MIPS64, ReferenceGetReferent) -UNIMPLEMENTED_INTRINSIC(MIPS64, SystemArrayCopyChar) UNIMPLEMENTED_INTRINSIC(MIPS64, SystemArrayCopy) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathCos) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathSin) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathAcos) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathAsin) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathAtan) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathAtan2) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathCbrt) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathCosh) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathExp) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathExpm1) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathHypot) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathLog) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathLog10) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathNextAfter) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathSinh) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathTan) -UNIMPLEMENTED_INTRINSIC(MIPS64, MathTanh) - UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOf); UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOfAfter); UNIMPLEMENTED_INTRINSIC(MIPS64, StringBufferAppend); diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 4710b32e9c..5a95abdb50 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -63,12 +63,122 @@ static bool IsEarlyExit(HLoopInformation* loop_info) { return false; } +// Detect a sign extension from the given type. Returns the promoted operand on success. +static bool IsSignExtensionAndGet(HInstruction* instruction, + Primitive::Type type, + /*out*/ HInstruction** operand) { + // Accept any already wider constant that would be handled properly by sign + // extension when represented in the *width* of the given narrower data type + // (the fact that char normally zero extends does not matter here). 
+  int64_t value = 0;
+  if (IsInt64AndGet(instruction, &value)) {
+    switch (type) {
+      case Primitive::kPrimByte:
+        if (std::numeric_limits<int8_t>::min() <= value &&
+            std::numeric_limits<int8_t>::max() >= value) {
+          *operand = instruction;
+          return true;
+        }
+        return false;
+      case Primitive::kPrimChar:
+      case Primitive::kPrimShort:
+        if (std::numeric_limits<int16_t>::min() <= value &&
+            std::numeric_limits<int16_t>::max() >= value) {
+          *operand = instruction;
+          return true;
+        }
+        return false;
+      default:
+        return false;
+    }
+  }
+  // An implicit widening conversion of a signed integer to an integral type sign-extends
+  // the two's-complement representation of the integer value to fill the wider format.
+  if (instruction->GetType() == type && (instruction->IsArrayGet() ||
+                                         instruction->IsStaticFieldGet() ||
+                                         instruction->IsInstanceFieldGet())) {
+    switch (type) {
+      case Primitive::kPrimByte:
+      case Primitive::kPrimShort:
+        *operand = instruction;
+        return true;
+      default:
+        return false;
+    }
+  }
+  // TODO: perhaps explicit conversions later too?
+  // (this may return something different from instruction)
+  return false;
+}
+
+// Detect a zero extension from the given type. Returns the promoted operand on success.
+static bool IsZeroExtensionAndGet(HInstruction* instruction,
+                                  Primitive::Type type,
+                                  /*out*/ HInstruction** operand) {
+  // Accept any already wider constant that would be handled properly by zero
+  // extension when represented in the *width* of the given narrower data type
+  // (the fact that byte/short normally sign extend does not matter here).
+  int64_t value = 0;
+  if (IsInt64AndGet(instruction, &value)) {
+    switch (type) {
+      case Primitive::kPrimByte:
+        if (std::numeric_limits<uint8_t>::min() <= value &&
+            std::numeric_limits<uint8_t>::max() >= value) {
+          *operand = instruction;
+          return true;
+        }
+        return false;
+      case Primitive::kPrimChar:
+      case Primitive::kPrimShort:
+        if (std::numeric_limits<uint16_t>::min() <= value &&
+            std::numeric_limits<uint16_t>::max() >= value) {
+          *operand = instruction;
+          return true;
+        }
+        return false;
+      default:
+        return false;
+    }
+  }
+  // An implicit widening conversion of a char to an integral type zero-extends
+  // the representation of the char value to fill the wider format.
+  if (instruction->GetType() == type && (instruction->IsArrayGet() ||
+                                         instruction->IsStaticFieldGet() ||
+                                         instruction->IsInstanceFieldGet())) {
+    if (type == Primitive::kPrimChar) {
+      *operand = instruction;
+      return true;
+    }
+  }
+  // A sign (or zero) extension followed by an explicit removal of just the
+  // higher sign bits is equivalent to a zero extension of the underlying operand.
+  if (instruction->IsAnd()) {
+    int64_t mask = 0;
+    HInstruction* a = instruction->InputAt(0);
+    HInstruction* b = instruction->InputAt(1);
+    // In (a & b) find (mask & b) or (a & mask) with sign or zero extension on the non-mask.
+    if ((IsInt64AndGet(a, /*out*/ &mask) && (IsSignExtensionAndGet(b, type, /*out*/ operand) ||
+                                             IsZeroExtensionAndGet(b, type, /*out*/ operand))) ||
+        (IsInt64AndGet(b, /*out*/ &mask) && (IsSignExtensionAndGet(a, type, /*out*/ operand) ||
+                                             IsZeroExtensionAndGet(a, type, /*out*/ operand)))) {
+      switch ((*operand)->GetType()) {
+        case Primitive::kPrimByte: return mask == std::numeric_limits<uint8_t>::max();
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort: return mask == std::numeric_limits<uint16_t>::max();
+        default: return false;
+      }
+    }
+  }
+  // TODO: perhaps explicit conversions later too?
+ return false; +} + // Test vector restrictions. static bool HasVectorRestrictions(uint64_t restrictions, uint64_t tested) { return (restrictions & tested) != 0; } -// Inserts an instruction. +// Insert an instruction. static HInstruction* Insert(HBasicBlock* block, HInstruction* instruction) { DCHECK(block != nullptr); DCHECK(instruction != nullptr); @@ -713,6 +823,10 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, return true; } } else if (instruction->IsShl() || instruction->IsShr() || instruction->IsUShr()) { + // Recognize vectorization idioms. + if (VectorizeHalvingAddIdiom(node, instruction, generate_code, type, restrictions)) { + return true; + } // Deal with vector restrictions. if ((HasVectorRestrictions(restrictions, kNoShift)) || (instruction->IsShr() && HasVectorRestrictions(restrictions, kNoShr))) { @@ -806,11 +920,11 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric switch (type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: - *restrictions |= kNoMul | kNoDiv | kNoShift | kNoAbs; + *restrictions |= kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd; return TrySetVectorLength(16); case Primitive::kPrimChar: case Primitive::kPrimShort: - *restrictions |= kNoDiv | kNoAbs; + *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd; return TrySetVectorLength(8); case Primitive::kPrimInt: *restrictions |= kNoDiv; @@ -1039,6 +1153,90 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org, #undef GENERATE_VEC // +// Vectorization idioms. +// + +// Method recognizes the following idioms: +// rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b +// regular halving add (a + b) >> 1 for unsigned/signed operands a, b +// Provided that the operands are promoted to a wider form to do the arithmetic and +// then cast back to narrower form, the idioms can be mapped into efficient SIMD +// implementation that operates directly in narrower form (plus one extra bit). +// TODO: current version recognizes implicit byte/short/char widening only; +// explicit widening from int to long could be added later. +bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, + HInstruction* instruction, + bool generate_code, + Primitive::Type type, + uint64_t restrictions) { + // Test for top level arithmetic shift right x >> 1 or logical shift right x >>> 1 + // (note whether the sign bit in higher precision is shifted in has no effect + // on the narrow precision computed by the idiom). + int64_t value = 0; + if ((instruction->IsShr() || + instruction->IsUShr()) && + IsInt64AndGet(instruction->InputAt(1), &value) && value == 1) { + // + // TODO: make following code less sensitive to associativity and commutativity differences. + // + HInstruction* x = instruction->InputAt(0); + // Test for an optional rounding part (x + 1) >> 1. + bool is_rounded = false; + if (x->IsAdd() && IsInt64AndGet(x->InputAt(1), &value) && value == 1) { + x = x->InputAt(0); + is_rounded = true; + } + // Test for a core addition (a + b) >> 1 (possibly rounded), either unsigned or signed. 
+ if (x->IsAdd()) { + HInstruction* a = x->InputAt(0); + HInstruction* b = x->InputAt(1); + HInstruction* r = nullptr; + HInstruction* s = nullptr; + bool is_unsigned = false; + if (IsZeroExtensionAndGet(a, type, &r) && IsZeroExtensionAndGet(b, type, &s)) { + is_unsigned = true; + } else if (IsSignExtensionAndGet(a, type, &r) && IsSignExtensionAndGet(b, type, &s)) { + is_unsigned = false; + } else { + return false; + } + // Deal with vector restrictions. + if ((!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) || + (!is_rounded && HasVectorRestrictions(restrictions, kNoUnroundedHAdd))) { + return false; + } + // Accept recognized halving add for vectorizable operands. Vectorized code uses the + // shorthand idiomatic operation. Sequential code uses the original scalar expressions. + DCHECK(r != nullptr && s != nullptr); + if (VectorizeUse(node, r, generate_code, type, restrictions) && + VectorizeUse(node, s, generate_code, type, restrictions)) { + if (generate_code) { + if (vector_mode_ == kVector) { + vector_map_->Put(instruction, new (global_allocator_) HVecHalvingAdd( + global_allocator_, + vector_map_->Get(r), + vector_map_->Get(s), + type, + vector_length_, + is_unsigned, + is_rounded)); + } else { + VectorizeUse(node, instruction->InputAt(0), generate_code, type, restrictions); + VectorizeUse(node, instruction->InputAt(1), generate_code, type, restrictions); + GenerateVecOp(instruction, + vector_map_->Get(instruction->InputAt(0)), + vector_map_->Get(instruction->InputAt(1)), + type); + } + } + return true; + } + } + } + return false; +} + +// // Helpers. // @@ -1082,7 +1280,10 @@ bool HLoopOptimization::TrySetSimpleLoopHeader(HBasicBlock* block) { HInstruction* s = block->GetFirstInstruction(); if (s != nullptr && s->IsSuspendCheck()) { HInstruction* c = s->GetNext(); - if (c != nullptr && c->IsCondition() && c->GetUses().HasExactlyOneElement()) { + if (c != nullptr && + c->IsCondition() && + c->GetUses().HasExactlyOneElement() && // only used for termination + !c->HasEnvironmentUses()) { // unlikely, but not impossible HInstruction* i = c->GetNext(); if (i != nullptr && i->IsIf() && i->InputAt(0) == c) { iset_->insert(c); diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index d8f50aab28..4a7da86e32 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -62,13 +62,15 @@ class HLoopOptimization : public HOptimization { * Vectorization restrictions (bit mask). */ enum VectorRestrictions { - kNone = 0, // no restrictions - kNoMul = 1, // no multiplication - kNoDiv = 2, // no division - kNoShift = 4, // no shift - kNoShr = 8, // no arithmetic shift right - kNoHiBits = 16, // "wider" operations cannot bring in higher order bits - kNoAbs = 32, // no absolute value + kNone = 0, // no restrictions + kNoMul = 1, // no multiplication + kNoDiv = 2, // no division + kNoShift = 4, // no shift + kNoShr = 8, // no arithmetic shift right + kNoHiBits = 16, // "wider" operations cannot bring in higher order bits + kNoSignedHAdd = 32, // no signed halving add + kNoUnroundedHAdd = 64, // no unrounded halving add + kNoAbs = 128, // no absolute value }; /* @@ -136,6 +138,13 @@ class HLoopOptimization : public HOptimization { Primitive::Type type); void GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, Primitive::Type type); + // Vectorization idioms. 
+ bool VectorizeHalvingAddIdiom(LoopNode* node, + HInstruction* instruction, + bool generate_code, + Primitive::Type type, + uint64_t restrictions); + // Helpers. bool TrySetPhiInduction(HPhi* phi, bool restrict_uses); bool TrySetSimpleLoopHeader(HBasicBlock* block); diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index c109369106..6be237e612 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -1369,9 +1369,12 @@ class HLoopInformationOutwardIterator : public ValueObject { M(VecAbs, VecUnaryOperation) \ M(VecNot, VecUnaryOperation) \ M(VecAdd, VecBinaryOperation) \ + M(VecHalvingAdd, VecBinaryOperation) \ M(VecSub, VecBinaryOperation) \ M(VecMul, VecBinaryOperation) \ M(VecDiv, VecBinaryOperation) \ + M(VecMin, VecBinaryOperation) \ + M(VecMax, VecBinaryOperation) \ M(VecAnd, VecBinaryOperation) \ M(VecAndNot, VecBinaryOperation) \ M(VecOr, VecBinaryOperation) \ @@ -6845,6 +6848,7 @@ class HBlocksInLoopReversePostOrderIterator : public ValueObject { DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopReversePostOrderIterator); }; +// Returns int64_t value of a properly typed constant. inline int64_t Int64FromConstant(HConstant* constant) { if (constant->IsIntConstant()) { return constant->AsIntConstant()->GetValue(); @@ -6856,6 +6860,21 @@ inline int64_t Int64FromConstant(HConstant* constant) { } } +// Returns true iff instruction is an integral constant (and sets value on success). +inline bool IsInt64AndGet(HInstruction* instruction, /*out*/ int64_t* value) { + if (instruction->IsIntConstant()) { + *value = instruction->AsIntConstant()->GetValue(); + return true; + } else if (instruction->IsLongConstant()) { + *value = instruction->AsLongConstant()->GetValue(); + return true; + } else if (instruction->IsNullConstant()) { + *value = 0; + return true; + } + return false; +} + #define INSTRUCTION_TYPE_CHECK(type, super) \ inline bool HInstruction::Is##type() const { return GetKind() == k##type; } \ inline const H##type* HInstruction::As##type() const { \ diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index 0cbbf2a215..bff58d0910 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -338,6 +338,42 @@ class HVecAdd FINAL : public HVecBinaryOperation { DISALLOW_COPY_AND_ASSIGN(HVecAdd); }; +// Performs halving add on every component in the two vectors, viz. +// rounded [ x1, .. , xn ] hradd [ y1, .. , yn ] = [ (x1 + y1 + 1) >> 1, .. , (xn + yn + 1) >> 1 ] +// or [ x1, .. , xn ] hadd [ y1, .. , yn ] = [ (x1 + y1) >> 1, .. , (xn + yn ) >> 1 ] +// for signed operands x, y (sign extension) or unsigned operands x, y (zero extension). 
+class HVecHalvingAdd FINAL : public HVecBinaryOperation { + public: + HVecHalvingAdd(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + bool is_unsigned, + bool is_rounded, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc), + is_unsigned_(is_unsigned), + is_rounded_(is_rounded) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + DCHECK_EQ(right->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + + bool IsUnsigned() const { return is_unsigned_; } + bool IsRounded() const { return is_rounded_; } + + DECLARE_INSTRUCTION(VecHalvingAdd); + + private: + bool is_unsigned_; + bool is_rounded_; + + DISALLOW_COPY_AND_ASSIGN(HVecHalvingAdd); +}; + // Subtracts every component in the two vectors, // viz. [ x1, .. , xn ] - [ y1, .. , yn ] = [ x1 - y1, .. , xn - yn ]. class HVecSub FINAL : public HVecBinaryOperation { @@ -404,6 +440,50 @@ class HVecDiv FINAL : public HVecBinaryOperation { DISALLOW_COPY_AND_ASSIGN(HVecDiv); }; +// Takes minimum of every component in the two vectors, +// viz. MIN( [ x1, .. , xn ] , [ y1, .. , yn ]) = [ min(x1, y1), .. , min(xn, yn) ]. +class HVecMin FINAL : public HVecBinaryOperation { + public: + HVecMin(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + DCHECK_EQ(right->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecMin); + private: + DISALLOW_COPY_AND_ASSIGN(HVecMin); +}; + +// Takes maximum of every component in the two vectors, +// viz. MAX( [ x1, .. , xn ] , [ y1, .. , yn ]) = [ max(x1, y1), .. , max(xn, yn) ]. +class HVecMax FINAL : public HVecBinaryOperation { + public: + HVecMax(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + DCHECK_EQ(right->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecMax); + private: + DISALLOW_COPY_AND_ASSIGN(HVecMax); +}; + // Bitwise-ands every component in the two vectors, // viz. [ x1, .. , xn ] & [ y1, .. , yn ] = [ x1 & y1, .. , xn & yn ]. class HVecAnd FINAL : public HVecBinaryOperation { |
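For reference only, and not part of the patch above: a minimal scalar sketch of the pattern that the new VectorizeHalvingAddIdiom recognizer targets, written in plain C++. The function name RoundedHalvingAddU8 is invented for illustration. The key property is that the addition is performed in a widened type and the result is narrowed back to the element type, so the idiom can be lowered to a single SIMD halving add (carried by the new HVecHalvingAdd node with its is_unsigned/is_rounded flags) without losing the carry bit.

#include <cstddef>
#include <cstdint>

// Rounded unsigned halving add over byte arrays: dst[i] = (a[i] + b[i] + 1) >> 1.
// The sum is computed in int (at most 9 significant bits), so the shifted result
// always fits back into uint8_t; this is the shape the loop vectorizer matches.
void RoundedHalvingAddU8(const uint8_t* a, const uint8_t* b, uint8_t* dst, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    int sum = static_cast<int>(a[i]) + static_cast<int>(b[i]) + 1;  // widened add
    dst[i] = static_cast<uint8_t>(sum >> 1);                        // narrow back
  }
}

On ARM64 NEON, for example, this loop body corresponds to the unsigned rounding halving add (URHADD on byte lanes); dropping the "+ 1" gives the unrounded variant (UHADD), which is what the is_rounded flag distinguishes.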