Diffstat (limited to 'compiler'): 71 files changed, 3404 insertions, 2102 deletions
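The diff below retires the kBootImageClassTable and kBootImageInternTable load kinds (and the kTypeClassTable/kStringInternTable linker patches behind them) in favor of a single kBootImageRelRo kind: code compiled against the boot image now loads boot-image references through a new .data.bimg.rel.ro section, which is mapped writable for relocation and, per the ElfBuilder comment, made read-only at run time. It also adds HMin/HMax code generation for ARM64, ARM (VIXL), and MIPS. A minimal sketch of what a .data.bimg.rel.ro load amounts to at run time, with illustrative names that are not ART's:

#include <cstdint>

// Hypothetical helper, for illustration only. Compiled code addresses a
// 4-byte .data.bimg.rel.ro entry PC-relatively; once the runtime has
// relocated the entry from a boot image offset to an absolute address,
// a single 32-bit load yields the final reference (the boot image is
// mapped in the low 4GiB, so 32 bits suffice).
inline void* LoadBootImageReference(const uint32_t* rel_ro_entry) {
  return reinterpret_cast<void*>(static_cast<uintptr_t>(*rel_ro_entry));
}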
diff --git a/compiler/Android.bp b/compiler/Android.bp index 40c676c406..c4d538fc88 100644 --- a/compiler/Android.bp +++ b/compiler/Android.bp @@ -256,7 +256,14 @@ art_cc_library { instrumentation: true, profile_file: "art/dex2oat.profdata", benchmarks: ["dex2oat"], - } + }, + target: { + android: { + lto: { + thin: true, + }, + }, + }, } art_cc_library { diff --git a/compiler/common_compiler_test.cc b/compiler/common_compiler_test.cc index a20313374c..d3e3a51f7a 100644 --- a/compiler/common_compiler_test.cc +++ b/compiler/common_compiler_test.cc @@ -21,6 +21,7 @@ #include "art_method-inl.h" #include "base/callee_save_type.h" #include "base/enums.h" +#include "base/utils.h" #include "class_linker.h" #include "compiled_method-inl.h" #include "dex/descriptors_names.h" @@ -36,7 +37,6 @@ #include "oat_quick_method_header.h" #include "scoped_thread_state_change-inl.h" #include "thread-current-inl.h" -#include "utils.h" namespace art { diff --git a/compiler/compiler.cc b/compiler/compiler.cc index 7c7ae71d77..646040fd9d 100644 --- a/compiler/compiler.cc +++ b/compiler/compiler.cc @@ -19,10 +19,10 @@ #include <android-base/logging.h> #include "base/macros.h" +#include "base/utils.h" #include "dex/code_item_accessors-inl.h" #include "driver/compiler_driver.h" #include "optimizing/optimizing_compiler.h" -#include "utils.h" namespace art { diff --git a/compiler/compiler.h b/compiler/compiler.h index a17e2b5875..f2ec3a9fa3 100644 --- a/compiler/compiler.h +++ b/compiler/compiler.h @@ -18,8 +18,8 @@ #define ART_COMPILER_COMPILER_H_ #include "base/mutex.h" +#include "base/os.h" #include "dex/dex_file.h" -#include "os.h" namespace art { diff --git a/compiler/debug/dwarf/dwarf_test.h b/compiler/debug/dwarf/dwarf_test.h index 5405759c1f..9a7c604ca1 100644 --- a/compiler/debug/dwarf/dwarf_test.h +++ b/compiler/debug/dwarf/dwarf_test.h @@ -26,12 +26,12 @@ #include <set> #include <string> +#include "base/os.h" #include "base/unix_file/fd_file.h" #include "common_runtime_test.h" #include "gtest/gtest.h" #include "linker/elf_builder.h" #include "linker/file_output_stream.h" -#include "os.h" namespace art { namespace dwarf { diff --git a/compiler/debug/elf_symtab_writer.h b/compiler/debug/elf_symtab_writer.h index 1310e8dabd..7a8e29191a 100644 --- a/compiler/debug/elf_symtab_writer.h +++ b/compiler/debug/elf_symtab_writer.h @@ -20,12 +20,12 @@ #include <map> #include <unordered_set> +#include "base/utils.h" #include "debug/debug_info.h" #include "debug/method_debug_info.h" #include "dex/dex_file-inl.h" #include "dex/code_item_accessors.h" #include "linker/elf_builder.h" -#include "utils.h" namespace art { namespace debug { diff --git a/compiler/driver/compiled_method_storage.cc b/compiler/driver/compiled_method_storage.cc index 48477abe5b..a26a985ff9 100644 --- a/compiler/driver/compiled_method_storage.cc +++ b/compiler/driver/compiled_method_storage.cc @@ -21,10 +21,10 @@ #include <android-base/logging.h> +#include "base/utils.h" #include "compiled_method.h" #include "linker/linker_patch.h" #include "thread-current-inl.h" -#include "utils.h" #include "utils/dedupe_set-inl.h" #include "utils/swap_space.h" diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h index a3bb4ec34d..8db892bf18 100644 --- a/compiler/driver/compiler_driver.h +++ b/compiler/driver/compiler_driver.h @@ -29,6 +29,8 @@ #include "base/array_ref.h" #include "base/bit_utils.h" #include "base/mutex.h" +#include "base/os.h" +#include "base/quasi_atomic.h" #include "base/safe_map.h" #include "base/timing_logger.h" 
#include "class_reference.h" @@ -39,7 +41,6 @@ #include "dex/dex_to_dex_compiler.h" #include "driver/compiled_method_storage.h" #include "method_reference.h" -#include "os.h" #include "thread_pool.h" #include "utils/atomic_dex_ref_map.h" #include "utils/dex_cache_arrays_layout.h" diff --git a/compiler/driver/compiler_options.h b/compiler/driver/compiler_options.h index 18b0913430..05d8805e81 100644 --- a/compiler/driver/compiler_options.h +++ b/compiler/driver/compiler_options.h @@ -22,10 +22,10 @@ #include <vector> #include "base/macros.h" +#include "base/utils.h" #include "compiler_filter.h" #include "globals.h" #include "optimizing/register_allocator.h" -#include "utils.h" namespace art { diff --git a/compiler/driver/dex_compilation_unit.cc b/compiler/driver/dex_compilation_unit.cc index 2e315b5d12..c90c37d54a 100644 --- a/compiler/driver/dex_compilation_unit.cc +++ b/compiler/driver/dex_compilation_unit.cc @@ -16,10 +16,10 @@ #include "dex_compilation_unit.h" +#include "base/utils.h" #include "dex/code_item_accessors-inl.h" #include "dex/descriptors_names.h" #include "mirror/dex_cache.h" -#include "utils.h" namespace art { diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc index fc44927231..d001cfe4fc 100644 --- a/compiler/jni/quick/jni_compiler.cc +++ b/compiler/jni/quick/jni_compiler.cc @@ -27,6 +27,7 @@ #include "base/enums.h" #include "base/logging.h" // For VLOG. #include "base/macros.h" +#include "base/utils.h" #include "calling_convention.h" #include "class_linker.h" #include "debug/dwarf/debug_frame_opcode_writer.h" @@ -37,7 +38,6 @@ #include "jni_env_ext.h" #include "memory_region.h" #include "thread.h" -#include "utils.h" #include "utils/arm/managed_register_arm.h" #include "utils/arm64/managed_register_arm64.h" #include "utils/assembler.h" diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc index 52a07965b9..b268204b4a 100644 --- a/compiler/linker/arm64/relative_patcher_arm64.cc +++ b/compiler/linker/arm64/relative_patcher_arm64.cc @@ -60,13 +60,12 @@ inline bool IsAdrpPatch(const LinkerPatch& patch) { case LinkerPatch::Type::kCallRelative: case LinkerPatch::Type::kBakerReadBarrierBranch: return false; + case LinkerPatch::Type::kDataBimgRelRo: case LinkerPatch::Type::kMethodRelative: case LinkerPatch::Type::kMethodBssEntry: case LinkerPatch::Type::kTypeRelative: - case LinkerPatch::Type::kTypeClassTable: case LinkerPatch::Type::kTypeBssEntry: case LinkerPatch::Type::kStringRelative: - case LinkerPatch::Type::kStringInternTable: case LinkerPatch::Type::kStringBssEntry: return patch.LiteralOffset() == patch.PcInsnOffset(); } @@ -271,10 +270,9 @@ void Arm64RelativePatcher::PatchPcRelativeReference(std::vector<uint8_t>* code, shift = 0u; // No shift for ADD. } else { // LDR/STR 32-bit or 64-bit with imm12 == 0 (unset). 
- DCHECK(patch.GetType() == LinkerPatch::Type::kMethodBssEntry || - patch.GetType() == LinkerPatch::Type::kTypeClassTable || + DCHECK(patch.GetType() == LinkerPatch::Type::kDataBimgRelRo || + patch.GetType() == LinkerPatch::Type::kMethodBssEntry || patch.GetType() == LinkerPatch::Type::kTypeBssEntry || - patch.GetType() == LinkerPatch::Type::kStringInternTable || patch.GetType() == LinkerPatch::Type::kStringBssEntry) << patch.GetType(); DCHECK_EQ(insn & 0xbfbffc00, 0xb9000000) << std::hex << insn; } diff --git a/compiler/linker/elf_builder.h b/compiler/linker/elf_builder.h index a5f60992ca..3da7a43762 100644 --- a/compiler/linker/elf_builder.h +++ b/compiler/linker/elf_builder.h @@ -529,6 +529,8 @@ class ElfBuilder FINAL { stream_(output), rodata_(this, ".rodata", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0), text_(this, ".text", SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR, nullptr, 0, kPageSize, 0), + data_bimg_rel_ro_( + this, ".data.bimg.rel.ro", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0), bss_(this, ".bss", SHT_NOBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0), dex_(this, ".dex", SHT_NOBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0), dynstr_(this, ".dynstr", SHF_ALLOC, kPageSize), @@ -552,6 +554,7 @@ class ElfBuilder FINAL { loaded_size_(0u), virtual_address_(0) { text_.phdr_flags_ = PF_R | PF_X; + data_bimg_rel_ro_.phdr_flags_ = PF_R | PF_W; // Shall be made read-only at run time. bss_.phdr_flags_ = PF_R | PF_W; dex_.phdr_flags_ = PF_R; dynamic_.phdr_flags_ = PF_R | PF_W; @@ -566,6 +569,7 @@ class ElfBuilder FINAL { BuildIdSection* GetBuildId() { return &build_id_; } Section* GetRoData() { return &rodata_; } Section* GetText() { return &text_; } + Section* GetDataBimgRelRo() { return &data_bimg_rel_ro_; } Section* GetBss() { return &bss_; } Section* GetDex() { return &dex_; } StringSection* GetStrTab() { return &strtab_; } @@ -694,6 +698,7 @@ class ElfBuilder FINAL { void PrepareDynamicSection(const std::string& elf_file_path, Elf_Word rodata_size, Elf_Word text_size, + Elf_Word data_bimg_rel_ro_size, Elf_Word bss_size, Elf_Word bss_methods_offset, Elf_Word bss_roots_offset, @@ -707,6 +712,9 @@ class ElfBuilder FINAL { // Allocate all pre-dynamic sections. 
rodata_.AllocateVirtualMemory(rodata_size); text_.AllocateVirtualMemory(text_size); + if (data_bimg_rel_ro_size != 0) { + data_bimg_rel_ro_.AllocateVirtualMemory(data_bimg_rel_ro_size); + } if (bss_size != 0) { bss_.AllocateVirtualMemory(bss_size); } @@ -735,6 +743,24 @@ class ElfBuilder FINAL { Elf_Word oatlastword_address = rodata_.GetAddress() + rodata_size - 4; dynsym_.Add(oatlastword, &rodata_, oatlastword_address, 4, STB_GLOBAL, STT_OBJECT); } + if (data_bimg_rel_ro_size != 0u) { + Elf_Word oatdatabimgrelro = dynstr_.Add("oatdatabimgrelro"); + dynsym_.Add(oatdatabimgrelro, + &data_bimg_rel_ro_, + data_bimg_rel_ro_.GetAddress(), + data_bimg_rel_ro_size, + STB_GLOBAL, + STT_OBJECT); + Elf_Word oatdatabimgrelrolastword = dynstr_.Add("oatdatabimgrelrolastword"); + Elf_Word oatdatabimgrelrolastword_address = + data_bimg_rel_ro_.GetAddress() + data_bimg_rel_ro_size - 4; + dynsym_.Add(oatdatabimgrelrolastword, + &data_bimg_rel_ro_, + oatdatabimgrelrolastword_address, + 4, + STB_GLOBAL, + STT_OBJECT); + } DCHECK_LE(bss_roots_offset, bss_size); if (bss_size != 0u) { Elf_Word oatbss = dynstr_.Add("oatbss"); @@ -1010,6 +1036,7 @@ class ElfBuilder FINAL { Section rodata_; Section text_; + Section data_bimg_rel_ro_; Section bss_; Section dex_; CachedStringSection dynstr_; diff --git a/compiler/linker/file_output_stream.h b/compiler/linker/file_output_stream.h index 28296a47fd..deb051fca4 100644 --- a/compiler/linker/file_output_stream.h +++ b/compiler/linker/file_output_stream.h @@ -17,9 +17,9 @@ #ifndef ART_COMPILER_LINKER_FILE_OUTPUT_STREAM_H_ #define ART_COMPILER_LINKER_FILE_OUTPUT_STREAM_H_ -#include "output_stream.h" +#include "base/os.h" -#include "os.h" +#include "output_stream.h" namespace art { namespace linker { diff --git a/compiler/linker/linker_patch.h b/compiler/linker/linker_patch.h index 6f4e7746a6..36051d2726 100644 --- a/compiler/linker/linker_patch.h +++ b/compiler/linker/linker_patch.h @@ -41,19 +41,27 @@ class LinkerPatch { // choose to squeeze the Type into fewer than 8 bits, we'll have to declare // patch_type_ as an uintN_t and do explicit static_cast<>s. enum class Type : uint8_t { + kDataBimgRelRo, // NOTE: Actual patching is instruction_set-dependent. kMethodRelative, // NOTE: Actual patching is instruction_set-dependent. kMethodBssEntry, // NOTE: Actual patching is instruction_set-dependent. kCall, kCallRelative, // NOTE: Actual patching is instruction_set-dependent. kTypeRelative, // NOTE: Actual patching is instruction_set-dependent. - kTypeClassTable, // NOTE: Actual patching is instruction_set-dependent. kTypeBssEntry, // NOTE: Actual patching is instruction_set-dependent. kStringRelative, // NOTE: Actual patching is instruction_set-dependent. - kStringInternTable, // NOTE: Actual patching is instruction_set-dependent. kStringBssEntry, // NOTE: Actual patching is instruction_set-dependent. kBakerReadBarrierBranch, // NOTE: Actual patching is instruction_set-dependent. 
}; + static LinkerPatch DataBimgRelRoPatch(size_t literal_offset, + uint32_t pc_insn_offset, + uint32_t boot_image_offset) { + LinkerPatch patch(literal_offset, Type::kDataBimgRelRo, /* target_dex_file */ nullptr); + patch.boot_image_offset_ = boot_image_offset; + patch.pc_insn_offset_ = pc_insn_offset; + return patch; + } + static LinkerPatch RelativeMethodPatch(size_t literal_offset, const DexFile* target_dex_file, uint32_t pc_insn_offset, @@ -100,16 +108,6 @@ class LinkerPatch { return patch; } - static LinkerPatch TypeClassTablePatch(size_t literal_offset, - const DexFile* target_dex_file, - uint32_t pc_insn_offset, - uint32_t target_type_idx) { - LinkerPatch patch(literal_offset, Type::kTypeClassTable, target_dex_file); - patch.type_idx_ = target_type_idx; - patch.pc_insn_offset_ = pc_insn_offset; - return patch; - } - static LinkerPatch TypeBssEntryPatch(size_t literal_offset, const DexFile* target_dex_file, uint32_t pc_insn_offset, @@ -130,16 +128,6 @@ class LinkerPatch { return patch; } - static LinkerPatch StringInternTablePatch(size_t literal_offset, - const DexFile* target_dex_file, - uint32_t pc_insn_offset, - uint32_t target_string_idx) { - LinkerPatch patch(literal_offset, Type::kStringInternTable, target_dex_file); - patch.string_idx_ = target_string_idx; - patch.pc_insn_offset_ = pc_insn_offset; - return patch; - } - static LinkerPatch StringBssEntryPatch(size_t literal_offset, const DexFile* target_dex_file, uint32_t pc_insn_offset, @@ -172,14 +160,13 @@ class LinkerPatch { bool IsPcRelative() const { switch (GetType()) { + case Type::kDataBimgRelRo: case Type::kMethodRelative: case Type::kMethodBssEntry: case Type::kCallRelative: case Type::kTypeRelative: - case Type::kTypeClassTable: case Type::kTypeBssEntry: case Type::kStringRelative: - case Type::kStringInternTable: case Type::kStringBssEntry: case Type::kBakerReadBarrierBranch: return true; @@ -188,6 +175,11 @@ class LinkerPatch { } } + uint32_t BootImageOffset() const { + DCHECK(patch_type_ == Type::kDataBimgRelRo); + return boot_image_offset_; + } + MethodReference TargetMethod() const { DCHECK(patch_type_ == Type::kMethodRelative || patch_type_ == Type::kMethodBssEntry || @@ -198,40 +190,35 @@ class LinkerPatch { const DexFile* TargetTypeDexFile() const { DCHECK(patch_type_ == Type::kTypeRelative || - patch_type_ == Type::kTypeClassTable || patch_type_ == Type::kTypeBssEntry); return target_dex_file_; } dex::TypeIndex TargetTypeIndex() const { DCHECK(patch_type_ == Type::kTypeRelative || - patch_type_ == Type::kTypeClassTable || patch_type_ == Type::kTypeBssEntry); return dex::TypeIndex(type_idx_); } const DexFile* TargetStringDexFile() const { DCHECK(patch_type_ == Type::kStringRelative || - patch_type_ == Type::kStringInternTable || patch_type_ == Type::kStringBssEntry); return target_dex_file_; } dex::StringIndex TargetStringIndex() const { DCHECK(patch_type_ == Type::kStringRelative || - patch_type_ == Type::kStringInternTable || patch_type_ == Type::kStringBssEntry); return dex::StringIndex(string_idx_); } uint32_t PcInsnOffset() const { - DCHECK(patch_type_ == Type::kMethodRelative || + DCHECK(patch_type_ == Type::kDataBimgRelRo || + patch_type_ == Type::kMethodRelative || patch_type_ == Type::kMethodBssEntry || patch_type_ == Type::kTypeRelative || - patch_type_ == Type::kTypeClassTable || patch_type_ == Type::kTypeBssEntry || patch_type_ == Type::kStringRelative || - patch_type_ == Type::kStringInternTable || patch_type_ == Type::kStringBssEntry); return pc_insn_offset_; } @@ -263,10 +250,11 @@ class 
LinkerPatch { uint32_t literal_offset_ : 24; // Method code size up to 16MiB. Type patch_type_ : 8; union { - uint32_t cmp1_; // Used for relational operators. - uint32_t method_idx_; // Method index for Call/Method patches. - uint32_t type_idx_; // Type index for Type patches. - uint32_t string_idx_; // String index for String patches. + uint32_t cmp1_; // Used for relational operators. + uint32_t boot_image_offset_; // Data to write to the .data.bimg.rel.ro entry. + uint32_t method_idx_; // Method index for Call/Method patches. + uint32_t type_idx_; // Type index for Type patches. + uint32_t string_idx_; // String index for String patches. uint32_t baker_custom_value1_; static_assert(sizeof(method_idx_) == sizeof(cmp1_), "needed by relational operators"); static_assert(sizeof(type_idx_) == sizeof(cmp1_), "needed by relational operators"); diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index ff59173c8b..0fcc9c6d05 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -51,6 +51,8 @@ #include "dex/verified_method.h" #include "driver/compiler_driver.h" #include "graph_visualizer.h" +#include "image.h" +#include "gc/space/image_space.h" #include "intern_table.h" #include "intrinsics.h" #include "mirror/array-inl.h" @@ -722,6 +724,47 @@ void CodeGenerator::GenerateLoadClassRuntimeCall(HLoadClass* cls) { } } +static uint32_t GetBootImageOffsetImpl(const void* object, ImageHeader::ImageSections section) { + Runtime* runtime = Runtime::Current(); + DCHECK(runtime->IsAotCompiler()); + const std::vector<gc::space::ImageSpace*>& boot_image_spaces = + runtime->GetHeap()->GetBootImageSpaces(); + // Check that the `object` is in the expected section of one of the boot image files. + DCHECK(std::any_of(boot_image_spaces.begin(), + boot_image_spaces.end(), + [object, section](gc::space::ImageSpace* space) { + uintptr_t begin = reinterpret_cast<uintptr_t>(space->Begin()); + uintptr_t offset = reinterpret_cast<uintptr_t>(object) - begin; + return space->GetImageHeader().GetImageSection(section).Contains(offset); + })); + uintptr_t begin = reinterpret_cast<uintptr_t>(boot_image_spaces.front()->Begin()); + uintptr_t offset = reinterpret_cast<uintptr_t>(object) - begin; + return dchecked_integral_cast<uint32_t>(offset); +} + +// NO_THREAD_SAFETY_ANALYSIS: Avoid taking the mutator lock, boot image classes are non-moveable. +uint32_t CodeGenerator::GetBootImageOffset(HLoadClass* load_class) NO_THREAD_SAFETY_ANALYSIS { + DCHECK_EQ(load_class->GetLoadKind(), HLoadClass::LoadKind::kBootImageRelRo); + ObjPtr<mirror::Class> klass = load_class->GetClass().Get(); + DCHECK(klass != nullptr); + return GetBootImageOffsetImpl(klass.Ptr(), ImageHeader::kSectionObjects); +} + +// NO_THREAD_SAFETY_ANALYSIS: Avoid taking the mutator lock, boot image strings are non-moveable. 
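// (Editorial note, not part of the commit: as with classes above, this only
// reads the object's address; boot image objects live in non-moving space,
// so skipping the mutator lock cannot race with a moving collector.)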
+uint32_t CodeGenerator::GetBootImageOffset(HLoadString* load_string) NO_THREAD_SAFETY_ANALYSIS { + DCHECK_EQ(load_string->GetLoadKind(), HLoadString::LoadKind::kBootImageRelRo); + ObjPtr<mirror::String> string = load_string->GetString().Get(); + DCHECK(string != nullptr); + return GetBootImageOffsetImpl(string.Ptr(), ImageHeader::kSectionObjects); +} + +uint32_t CodeGenerator::GetBootImageOffset(HInvokeStaticOrDirect* invoke) { + DCHECK_EQ(invoke->GetMethodLoadKind(), HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo); + ArtMethod* method = invoke->GetResolvedMethod(); + DCHECK(method != nullptr); + return GetBootImageOffsetImpl(method, ImageHeader::kSectionArtMethods); +} + void CodeGenerator::BlockIfInRegister(Location location, bool is_out) const { // The DCHECKS below check that a register is not specified twice in // the summary. The out location can overlap with an input, so we need diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index 60de722285..7031483a77 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -556,6 +556,10 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { Location runtime_return_location); void GenerateLoadClassRuntimeCall(HLoadClass* cls); + uint32_t GetBootImageOffset(HLoadClass* load_class); + uint32_t GetBootImageOffset(HLoadString* load_string); + uint32_t GetBootImageOffset(HInvokeStaticOrDirect* invoke); + static void CreateSystemArrayCopyLocationSummary(HInvoke* invoke); void SetDisassemblyInformation(DisassemblyInformation* info) { disasm_info_ = info; } diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 1f9c5546e2..a024df8537 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -78,6 +78,7 @@ using helpers::OutputFPRegister; using helpers::OutputRegister; using helpers::QRegisterFrom; using helpers::RegisterFrom; +using helpers::SRegisterFrom; using helpers::StackOperandFrom; using helpers::VIXLRegCodeFromART; using helpers::WRegisterFrom; @@ -4459,12 +4460,23 @@ void CodeGeneratorARM64::GenerateStaticOrDirectCall( // Load method address from literal pool. __ Ldr(XRegisterFrom(temp), DeduplicateUint64Literal(invoke->GetMethodAddress())); break; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo: { + // Add ADRP with its PC-relative .data.bimg.rel.ro patch. + uint32_t boot_image_offset = GetBootImageOffset(invoke); + vixl::aarch64::Label* adrp_label = NewBootImageRelRoPatch(boot_image_offset); + EmitAdrpPlaceholder(adrp_label, XRegisterFrom(temp)); + // Add LDR with its PC-relative .data.bimg.rel.ro patch. + vixl::aarch64::Label* ldr_label = NewBootImageRelRoPatch(boot_image_offset, adrp_label); + // Note: Boot image is in the low 4GiB and the entry is 32-bit, so emit a 32-bit load. + EmitLdrOffsetPlaceholder(ldr_label, WRegisterFrom(temp), XRegisterFrom(temp)); + break; + } case HInvokeStaticOrDirect::MethodLoadKind::kBssEntry: { - // Add ADRP with its PC-relative DexCache access patch. + // Add ADRP with its PC-relative .bss entry patch. MethodReference target_method(&GetGraph()->GetDexFile(), invoke->GetDexMethodIndex()); vixl::aarch64::Label* adrp_label = NewMethodBssEntryPatch(target_method); EmitAdrpPlaceholder(adrp_label, XRegisterFrom(temp)); - // Add LDR with its PC-relative DexCache access patch. + // Add LDR with its PC-relative .bss entry patch. 
vixl::aarch64::Label* ldr_label = NewMethodBssEntryPatch(target_method, adrp_label); EmitLdrOffsetPlaceholder(ldr_label, XRegisterFrom(temp), XRegisterFrom(temp)); @@ -4559,6 +4571,13 @@ void InstructionCodeGeneratorARM64::VisitInvokePolymorphic(HInvokePolymorphic* i codegen_->MaybeGenerateMarkingRegisterCheck(/* code */ __LINE__); } +vixl::aarch64::Label* CodeGeneratorARM64::NewBootImageRelRoPatch( + uint32_t boot_image_offset, + vixl::aarch64::Label* adrp_label) { + return NewPcRelativePatch( + /* dex_file */ nullptr, boot_image_offset, adrp_label, &boot_image_method_patches_); +} + vixl::aarch64::Label* CodeGeneratorARM64::NewBootImageMethodPatch( MethodReference target_method, vixl::aarch64::Label* adrp_label) { @@ -4681,6 +4700,14 @@ inline void CodeGeneratorARM64::EmitPcRelativeLinkerPatches( } } +linker::LinkerPatch DataBimgRelRoPatchAdapter(size_t literal_offset, + const DexFile* target_dex_file, + uint32_t pc_insn_offset, + uint32_t boot_image_offset) { + DCHECK(target_dex_file == nullptr); // Unused for DataBimgRelRoPatch(), should be null. + return linker::LinkerPatch::DataBimgRelRoPatch(literal_offset, pc_insn_offset, boot_image_offset); +} + void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) { DCHECK(linker_patches->empty()); size_t size = @@ -4700,11 +4727,10 @@ void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* lin EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>( boot_image_string_patches_, linker_patches); } else { - DCHECK(boot_image_method_patches_.empty()); - EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>( - boot_image_type_patches_, linker_patches); - EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>( - boot_image_string_patches_, linker_patches); + EmitPcRelativeLinkerPatches<DataBimgRelRoPatchAdapter>( + boot_image_method_patches_, linker_patches); + DCHECK(boot_image_type_patches_.empty()); + DCHECK(boot_image_string_patches_.empty()); } EmitPcRelativeLinkerPatches<linker::LinkerPatch::MethodBssEntryPatch>( method_bss_entry_patches_, linker_patches); @@ -4779,7 +4805,7 @@ HLoadClass::LoadKind CodeGeneratorARM64::GetSupportedLoadClassKind( case HLoadClass::LoadKind::kReferrersClass: break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: - case HLoadClass::LoadKind::kBootImageClassTable: + case HLoadClass::LoadKind::kBootImageRelRo: case HLoadClass::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; @@ -4888,23 +4914,16 @@ void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) NO_THREAD_SA __ Ldr(out.W(), codegen_->DeduplicateBootImageAddressLiteral(address)); break; } - case HLoadClass::LoadKind::kBootImageClassTable: { + case HLoadClass::LoadKind::kBootImageRelRo: { DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); - // Add ADRP with its PC-relative type patch. - const DexFile& dex_file = cls->GetDexFile(); - dex::TypeIndex type_index = cls->GetTypeIndex(); - vixl::aarch64::Label* adrp_label = codegen_->NewBootImageTypePatch(dex_file, type_index); + uint32_t boot_image_offset = codegen_->GetBootImageOffset(cls); + // Add ADRP with its PC-relative .data.bimg.rel.ro patch. + vixl::aarch64::Label* adrp_label = codegen_->NewBootImageRelRoPatch(boot_image_offset); codegen_->EmitAdrpPlaceholder(adrp_label, out.X()); - // Add LDR with its PC-relative type patch. + // Add LDR with its PC-relative .data.bimg.rel.ro patch. 
vixl::aarch64::Label* ldr_label = - codegen_->NewBootImageTypePatch(dex_file, type_index, adrp_label); + codegen_->NewBootImageRelRoPatch(boot_image_offset, adrp_label); codegen_->EmitLdrOffsetPlaceholder(ldr_label, out.W(), out.X()); - // Extract the reference from the slot data, i.e. clear the hash bits. - int32_t masked_hash = ClassTable::TableSlot::MaskHash( - ComputeModifiedUtf8Hash(dex_file.StringByTypeIdx(type_index))); - if (masked_hash != 0) { - __ Sub(out.W(), out.W(), Operand(masked_hash)); - } break; } case HLoadClass::LoadKind::kBssEntry: { @@ -4914,7 +4933,7 @@ void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) NO_THREAD_SA vixl::aarch64::Register temp = XRegisterFrom(out_loc); vixl::aarch64::Label* adrp_label = codegen_->NewBssEntryTypePatch(dex_file, type_index); codegen_->EmitAdrpPlaceholder(adrp_label, temp); - // Add LDR with its PC-relative Class patch. + // Add LDR with its PC-relative Class .bss entry patch. vixl::aarch64::Label* ldr_label = codegen_->NewBssEntryTypePatch(dex_file, type_index, adrp_label); // /* GcRoot<mirror::Class> */ out = *(base_address + offset) /* PC-relative */ @@ -4989,7 +5008,7 @@ HLoadString::LoadKind CodeGeneratorARM64::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { switch (desired_string_load_kind) { case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - case HLoadString::LoadKind::kBootImageInternTable: + case HLoadString::LoadKind::kBootImageRelRo: case HLoadString::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; @@ -5055,16 +5074,15 @@ void InstructionCodeGeneratorARM64::VisitLoadString(HLoadString* load) NO_THREAD __ Ldr(out.W(), codegen_->DeduplicateBootImageAddressLiteral(address)); return; } - case HLoadString::LoadKind::kBootImageInternTable: { + case HLoadString::LoadKind::kBootImageRelRo: { DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); - // Add ADRP with its PC-relative String patch. - const DexFile& dex_file = load->GetDexFile(); - const dex::StringIndex string_index = load->GetStringIndex(); - vixl::aarch64::Label* adrp_label = codegen_->NewBootImageStringPatch(dex_file, string_index); + // Add ADRP with its PC-relative .data.bimg.rel.ro patch. + uint32_t boot_image_offset = codegen_->GetBootImageOffset(load); + vixl::aarch64::Label* adrp_label = codegen_->NewBootImageRelRoPatch(boot_image_offset); codegen_->EmitAdrpPlaceholder(adrp_label, out.X()); - // Add LDR with its PC-relative String patch. + // Add LDR with its PC-relative .data.bimg.rel.ro patch. vixl::aarch64::Label* ldr_label = - codegen_->NewBootImageStringPatch(dex_file, string_index, adrp_label); + codegen_->NewBootImageRelRoPatch(boot_image_offset, adrp_label); codegen_->EmitLdrOffsetPlaceholder(ldr_label, out.W(), out.X()); return; } @@ -5076,7 +5094,7 @@ void InstructionCodeGeneratorARM64::VisitLoadString(HLoadString* load) NO_THREAD Register temp = XRegisterFrom(out_loc); vixl::aarch64::Label* adrp_label = codegen_->NewStringBssEntryPatch(dex_file, string_index); codegen_->EmitAdrpPlaceholder(adrp_label, temp); - // Add LDR with its .bss entry String patch. + // Add LDR with its PC-relative String .bss entry patch. vixl::aarch64::Label* ldr_label = codegen_->NewStringBssEntryPatch(dex_file, string_index, adrp_label); // /* GcRoot<mirror::String> */ out = *(base_address + offset) /* PC-relative */ @@ -5462,6 +5480,113 @@ void InstructionCodeGeneratorARM64::VisitRem(HRem* rem) { } } +// TODO: integrate with HandleBinaryOp? 
+static void CreateMinMaxLocations(ArenaAllocator* allocator, HBinaryOperation* minmax) { + LocationSummary* locations = new (allocator) LocationSummary(minmax); + switch (minmax->GetResultType()) { + case DataType::Type::kInt32: + case DataType::Type::kInt64: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + break; + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + break; + default: + LOG(FATAL) << "Unexpected type for HMinMax " << minmax->GetResultType(); + } +} + +void InstructionCodeGeneratorARM64::GenerateMinMaxInt(LocationSummary* locations, + bool is_min, + DataType::Type type) { + Location op1 = locations->InAt(0); + Location op2 = locations->InAt(1); + Location out = locations->Out(); + + Register op1_reg; + Register op2_reg; + Register out_reg; + if (type == DataType::Type::kInt64) { + op1_reg = XRegisterFrom(op1); + op2_reg = XRegisterFrom(op2); + out_reg = XRegisterFrom(out); + } else { + DCHECK_EQ(type, DataType::Type::kInt32); + op1_reg = WRegisterFrom(op1); + op2_reg = WRegisterFrom(op2); + out_reg = WRegisterFrom(out); + } + + __ Cmp(op1_reg, op2_reg); + __ Csel(out_reg, op1_reg, op2_reg, is_min ? lt : gt); +} + +void InstructionCodeGeneratorARM64::GenerateMinMaxFP(LocationSummary* locations, + bool is_min, + DataType::Type type) { + Location op1 = locations->InAt(0); + Location op2 = locations->InAt(1); + Location out = locations->Out(); + + FPRegister op1_reg; + FPRegister op2_reg; + FPRegister out_reg; + if (type == DataType::Type::kFloat64) { + op1_reg = DRegisterFrom(op1); + op2_reg = DRegisterFrom(op2); + out_reg = DRegisterFrom(out); + } else { + DCHECK_EQ(type, DataType::Type::kFloat32); + op1_reg = SRegisterFrom(op1); + op2_reg = SRegisterFrom(op2); + out_reg = SRegisterFrom(out); + } + + if (is_min) { + __ Fmin(out_reg, op1_reg, op2_reg); + } else { + __ Fmax(out_reg, op1_reg, op2_reg); + } +} + +// TODO: integrate with HandleBinaryOp? 
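GenerateMinMaxInt above lowers integer min/max to one Cmp plus one Csel, while the FP path maps straight onto Fmin/Fmax, whose NaN and signed-zero behavior lines up with Java's Math.min/max. Before the dispatcher that follows, a branch-free C++ rendering of the integer selection, for illustration only (not ART code):

#include <cstdint>

// Csel out, a, b, lt  ==>  out = (a < b) ? a : b, using the flags from Cmp.
inline int64_t CselMinMax(int64_t a, int64_t b, bool is_min) {
  const bool take_a = is_min ? (a < b) : (a > b);  // lt for min, gt for max
  return take_a ? a : b;  // a single conditional select, no branch
}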
+void InstructionCodeGeneratorARM64::GenerateMinMax(HBinaryOperation* minmax, bool is_min) { + DataType::Type type = minmax->GetResultType(); + switch (type) { + case DataType::Type::kInt32: + case DataType::Type::kInt64: + GenerateMinMaxInt(minmax->GetLocations(), is_min, type); + break; + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + GenerateMinMaxFP(minmax->GetLocations(), is_min, type); + break; + default: + LOG(FATAL) << "Unexpected type for HMinMax " << type; + } +} + +void LocationsBuilderARM64::VisitMin(HMin* min) { + CreateMinMaxLocations(GetGraph()->GetAllocator(), min); +} + +void InstructionCodeGeneratorARM64::VisitMin(HMin* min) { + GenerateMinMax(min, /*is_min*/ true); +} + +void LocationsBuilderARM64::VisitMax(HMax* max) { + CreateMinMaxLocations(GetGraph()->GetAllocator(), max); +} + +void InstructionCodeGeneratorARM64::VisitMax(HMax* max) { + GenerateMinMax(max, /*is_min*/ false); +} + void LocationsBuilderARM64::VisitAbs(HAbs* abs) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(abs); switch (abs->GetResultType()) { diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index e34f799d15..b59ccd9034 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -273,6 +273,10 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); void HandleCondition(HCondition* instruction); + void GenerateMinMaxInt(LocationSummary* locations, bool is_min, DataType::Type type); + void GenerateMinMaxFP(LocationSummary* locations, bool is_min, DataType::Type type); + void GenerateMinMax(HBinaryOperation* minmax, bool is_min); + // Generate a heap reference load using one register `out`: // // out <- *(out + offset) @@ -561,7 +565,14 @@ class CodeGeneratorARM64 : public CodeGenerator { UNIMPLEMENTED(FATAL); } - // Add a new PC-relative method patch for an instruction and return the label + // Add a new boot image relocation patch for an instruction and return the label + // to be bound before the instruction. The instruction will be either the + // ADRP (pass `adrp_label = null`) or the LDR (pass `adrp_label` pointing + // to the associated ADRP patch label). + vixl::aarch64::Label* NewBootImageRelRoPatch(uint32_t boot_image_offset, + vixl::aarch64::Label* adrp_label = nullptr); + + // Add a new boot image method patch for an instruction and return the label // to be bound before the instruction. The instruction will be either the // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing // to the associated ADRP patch label). @@ -575,7 +586,7 @@ class CodeGeneratorARM64 : public CodeGenerator { vixl::aarch64::Label* NewMethodBssEntryPatch(MethodReference target_method, vixl::aarch64::Label* adrp_label = nullptr); - // Add a new PC-relative type patch for an instruction and return the label + // Add a new boot image type patch for an instruction and return the label // to be bound before the instruction. The instruction will be either the // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing // to the associated ADRP patch label). 
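Each of these factories hands out two labels per access, one bound at the ADRP and one at the ADD/LDR, with the second patch carrying `adrp_label` so the patcher can split the target address across the instruction pair. Roughly, for a 32-bit LDR from a .data.bimg.rel.ro entry (a sketch with illustrative names, not ART's patcher):

#include <cstdint>

// ADRP materializes the 4KiB page of the target relative to the PC's page;
// the LDR's scaled imm12 field supplies the offset within that page.
struct AdrpLdrSplit {
  int64_t page_delta;  // (target & ~0xFFF) - (pc & ~0xFFF), the ADRP immediate
  uint32_t ldr_imm12;  // (target & 0xFFF) >> 2 for a 4-byte-scaled LDR
};

inline AdrpLdrSplit SplitForAdrpLdr(uint64_t pc, uint64_t target) {
  return {static_cast<int64_t>((target & ~UINT64_C(0xFFF)) -
                               (pc & ~UINT64_C(0xFFF))),
          static_cast<uint32_t>((target & UINT64_C(0xFFF)) >> 2)};
}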
@@ -591,7 +602,7 @@ class CodeGeneratorARM64 : public CodeGenerator { dex::TypeIndex type_index, vixl::aarch64::Label* adrp_label = nullptr); - // Add a new PC-relative string patch for an instruction and return the label + // Add a new boot image string patch for an instruction and return the label // to be bound before the instruction. The instruction will be either the // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing // to the associated ADRP patch label). @@ -820,7 +831,8 @@ class CodeGeneratorARM64 : public CodeGenerator { Uint32ToLiteralMap uint32_literals_; // Deduplication map for 64-bit literals, used for non-patchable method address or method code. Uint64ToLiteralMap uint64_literals_; - // PC-relative method patch info for kBootImageLinkTimePcRelative. + // PC-relative method patch info for kBootImageLinkTimePcRelative/BootImageRelRo. + // Also used for type/string patches for kBootImageRelRo (same linker patch as for methods). ArenaDeque<PcRelativePatchInfo> boot_image_method_patches_; // PC-relative method patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> method_bss_entry_patches_; @@ -828,7 +840,7 @@ class CodeGeneratorARM64 : public CodeGenerator { ArenaDeque<PcRelativePatchInfo> boot_image_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; - // PC-relative String patch info; type depends on configuration (intern table or boot image PIC). + // PC-relative String patch info for kBootImageLinkTimePcRelative. ArenaDeque<PcRelativePatchInfo> boot_image_string_patches_; // PC-relative String patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> string_bss_entry_patches_; diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index 13518adf7d..6ebcc67b49 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -4690,6 +4690,244 @@ void InstructionCodeGeneratorARMVIXL::VisitRem(HRem* rem) { } } +static void CreateMinMaxLocations(ArenaAllocator* allocator, HBinaryOperation* minmax) { + LocationSummary* locations = new (allocator) LocationSummary(minmax); + switch (minmax->GetResultType()) { + case DataType::Type::kInt32: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + break; + case DataType::Type::kInt64: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RequiresRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + case DataType::Type::kFloat32: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + locations->AddTemp(Location::RequiresRegister()); + break; + case DataType::Type::kFloat64: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unexpected type for HMinMax " << minmax->GetResultType(); + } +} + +void InstructionCodeGeneratorARMVIXL::GenerateMinMaxInt(LocationSummary* locations, bool is_min) { + Location op1_loc = locations->InAt(0); + Location op2_loc = locations->InAt(1); + Location out_loc = locations->Out(); + + vixl32::Register op1 = RegisterFrom(op1_loc); + vixl32::Register op2 = 
RegisterFrom(op2_loc); + vixl32::Register out = RegisterFrom(out_loc); + + __ Cmp(op1, op2); + + { + ExactAssemblyScope aas(GetVIXLAssembler(), + 3 * kMaxInstructionSizeInBytes, + CodeBufferCheckScope::kMaximumSize); + + __ ite(is_min ? lt : gt); + __ mov(is_min ? lt : gt, out, op1); + __ mov(is_min ? ge : le, out, op2); + } +} + +void InstructionCodeGeneratorARMVIXL::GenerateMinMaxLong(LocationSummary* locations, bool is_min) { + Location op1_loc = locations->InAt(0); + Location op2_loc = locations->InAt(1); + Location out_loc = locations->Out(); + + // Optimization: don't generate any code if inputs are the same. + if (op1_loc.Equals(op2_loc)) { + DCHECK(out_loc.Equals(op1_loc)); // out_loc is set as SameAsFirstInput() in location builder. + return; + } + + vixl32::Register op1_lo = LowRegisterFrom(op1_loc); + vixl32::Register op1_hi = HighRegisterFrom(op1_loc); + vixl32::Register op2_lo = LowRegisterFrom(op2_loc); + vixl32::Register op2_hi = HighRegisterFrom(op2_loc); + vixl32::Register out_lo = LowRegisterFrom(out_loc); + vixl32::Register out_hi = HighRegisterFrom(out_loc); + UseScratchRegisterScope temps(GetVIXLAssembler()); + const vixl32::Register temp = temps.Acquire(); + + DCHECK(op1_lo.Is(out_lo)); + DCHECK(op1_hi.Is(out_hi)); + + // Compare op1 >= op2, or op1 < op2. + __ Cmp(out_lo, op2_lo); + __ Sbcs(temp, out_hi, op2_hi); + + // Now GE/LT condition code is correct for the long comparison. + { + vixl32::ConditionType cond = is_min ? ge : lt; + ExactAssemblyScope it_scope(GetVIXLAssembler(), + 3 * kMaxInstructionSizeInBytes, + CodeBufferCheckScope::kMaximumSize); + __ itt(cond); + __ mov(cond, out_lo, op2_lo); + __ mov(cond, out_hi, op2_hi); + } +} + +void InstructionCodeGeneratorARMVIXL::GenerateMinMaxFloat(HInstruction* minmax, bool is_min) { + LocationSummary* locations = minmax->GetLocations(); + Location op1_loc = locations->InAt(0); + Location op2_loc = locations->InAt(1); + Location out_loc = locations->Out(); + + // Optimization: don't generate any code if inputs are the same. + if (op1_loc.Equals(op2_loc)) { + DCHECK(out_loc.Equals(op1_loc)); // out_loc is set as SameAsFirstInput() in location builder. + return; + } + + vixl32::SRegister op1 = SRegisterFrom(op1_loc); + vixl32::SRegister op2 = SRegisterFrom(op2_loc); + vixl32::SRegister out = SRegisterFrom(out_loc); + + UseScratchRegisterScope temps(GetVIXLAssembler()); + const vixl32::Register temp1 = temps.Acquire(); + vixl32::Register temp2 = RegisterFrom(locations->GetTemp(0)); + vixl32::Label nan, done; + vixl32::Label* final_label = codegen_->GetFinalLabel(minmax, &done); + + DCHECK(op1.Is(out)); + + __ Vcmp(op1, op2); + __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR); + __ B(vs, &nan, /* far_target */ false); // if un-ordered, go to NaN handling. + + // op1 <> op2 + vixl32::ConditionType cond = is_min ? gt : lt; + { + ExactAssemblyScope it_scope(GetVIXLAssembler(), + 2 * kMaxInstructionSizeInBytes, + CodeBufferCheckScope::kMaximumSize); + __ it(cond); + __ vmov(cond, F32, out, op2); + } + // for <>(not equal), we've done min/max calculation. + __ B(ne, final_label, /* far_target */ false); + + // handle op1 == op2, max(+0.0,-0.0), min(+0.0,-0.0). + __ Vmov(temp1, op1); + __ Vmov(temp2, op2); + if (is_min) { + __ Orr(temp1, temp1, temp2); + } else { + __ And(temp1, temp1, temp2); + } + __ Vmov(out, temp1); + __ B(final_label); + + // handle NaN input. + __ Bind(&nan); + __ Movt(temp1, High16Bits(kNanFloat)); // 0x7FC0xxxx is a NaN. 
+ __ Vmov(out, temp1); + + if (done.IsReferenced()) { + __ Bind(&done); + } +} + +void InstructionCodeGeneratorARMVIXL::GenerateMinMaxDouble(HInstruction* minmax, bool is_min) { + LocationSummary* locations = minmax->GetLocations(); + Location op1_loc = locations->InAt(0); + Location op2_loc = locations->InAt(1); + Location out_loc = locations->Out(); + + // Optimization: don't generate any code if inputs are the same. + if (op1_loc.Equals(op2_loc)) { + DCHECK(out_loc.Equals(op1_loc)); // out_loc is set as SameAsFirstInput() in location builder. + return; + } + + vixl32::DRegister op1 = DRegisterFrom(op1_loc); + vixl32::DRegister op2 = DRegisterFrom(op2_loc); + vixl32::DRegister out = DRegisterFrom(out_loc); + vixl32::Label handle_nan_eq, done; + vixl32::Label* final_label = codegen_->GetFinalLabel(minmax, &done); + + DCHECK(op1.Is(out)); + + __ Vcmp(op1, op2); + __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR); + __ B(vs, &handle_nan_eq, /* far_target */ false); // if un-ordered, go to NaN handling. + + // op1 <> op2 + vixl32::ConditionType cond = is_min ? gt : lt; + { + ExactAssemblyScope it_scope(GetVIXLAssembler(), + 2 * kMaxInstructionSizeInBytes, + CodeBufferCheckScope::kMaximumSize); + __ it(cond); + __ vmov(cond, F64, out, op2); + } + // for <>(not equal), we've done min/max calculation. + __ B(ne, final_label, /* far_target */ false); + + // handle op1 == op2, max(+0.0,-0.0). + if (!is_min) { + __ Vand(F64, out, op1, op2); + __ B(final_label); + } + + // handle op1 == op2, min(+0.0,-0.0), NaN input. + __ Bind(&handle_nan_eq); + __ Vorr(F64, out, op1, op2); // assemble op1/-0.0/NaN. + + if (done.IsReferenced()) { + __ Bind(&done); + } +} + +void InstructionCodeGeneratorARMVIXL::GenerateMinMax(HBinaryOperation* minmax, bool is_min) { + DataType::Type type = minmax->GetResultType(); + switch (type) { + case DataType::Type::kInt32: + GenerateMinMaxInt(minmax->GetLocations(), is_min); + break; + case DataType::Type::kInt64: + GenerateMinMaxLong(minmax->GetLocations(), is_min); + break; + case DataType::Type::kFloat32: + GenerateMinMaxFloat(minmax, is_min); + break; + case DataType::Type::kFloat64: + GenerateMinMaxDouble(minmax, is_min); + break; + default: + LOG(FATAL) << "Unexpected type for HMinMax " << type; + } +} + +void LocationsBuilderARMVIXL::VisitMin(HMin* min) { + CreateMinMaxLocations(GetGraph()->GetAllocator(), min); +} + +void InstructionCodeGeneratorARMVIXL::VisitMin(HMin* min) { + GenerateMinMax(min, /*is_min*/ true); +} + +void LocationsBuilderARMVIXL::VisitMax(HMax* max) { + CreateMinMaxLocations(GetGraph()->GetAllocator(), max); +} + +void InstructionCodeGeneratorARMVIXL::VisitMax(HMax* max) { + GenerateMinMax(max, /*is_min*/ false); +} + void LocationsBuilderARMVIXL::VisitAbs(HAbs* abs) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(abs); switch (abs->GetResultType()) { @@ -7088,7 +7326,7 @@ HLoadClass::LoadKind CodeGeneratorARMVIXL::GetSupportedLoadClassKind( case HLoadClass::LoadKind::kReferrersClass: break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: - case HLoadClass::LoadKind::kBootImageClassTable: + case HLoadClass::LoadKind::kBootImageRelRo: case HLoadClass::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; @@ -7198,18 +7436,12 @@ void InstructionCodeGeneratorARMVIXL::VisitLoadClass(HLoadClass* cls) NO_THREAD_ __ Ldr(out, codegen_->DeduplicateBootImageAddressLiteral(address)); break; } - case HLoadClass::LoadKind::kBootImageClassTable: { + case HLoadClass::LoadKind::kBootImageRelRo: {
DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); CodeGeneratorARMVIXL::PcRelativePatchInfo* labels = - codegen_->NewBootImageTypePatch(cls->GetDexFile(), cls->GetTypeIndex()); + codegen_->NewBootImageRelRoPatch(codegen_->GetBootImageOffset(cls)); codegen_->EmitMovwMovtPlaceholder(labels, out); __ Ldr(out, MemOperand(out, /* offset */ 0)); - // Extract the reference from the slot data, i.e. clear the hash bits. - int32_t masked_hash = ClassTable::TableSlot::MaskHash( - ComputeModifiedUtf8Hash(cls->GetDexFile().StringByTypeIdx(cls->GetTypeIndex()))); - if (masked_hash != 0) { - __ Sub(out, out, Operand(masked_hash)); - } break; } case HLoadClass::LoadKind::kBssEntry: { @@ -7295,7 +7527,7 @@ HLoadString::LoadKind CodeGeneratorARMVIXL::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { switch (desired_string_load_kind) { case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - case HLoadString::LoadKind::kBootImageInternTable: + case HLoadString::LoadKind::kBootImageRelRo: case HLoadString::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; @@ -7359,10 +7591,10 @@ void InstructionCodeGeneratorARMVIXL::VisitLoadString(HLoadString* load) NO_THRE __ Ldr(out, codegen_->DeduplicateBootImageAddressLiteral(address)); return; } - case HLoadString::LoadKind::kBootImageInternTable: { + case HLoadString::LoadKind::kBootImageRelRo: { DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); CodeGeneratorARMVIXL::PcRelativePatchInfo* labels = - codegen_->NewBootImageStringPatch(load->GetDexFile(), load->GetStringIndex()); + codegen_->NewBootImageRelRoPatch(codegen_->GetBootImageOffset(load)); codegen_->EmitMovwMovtPlaceholder(labels, out); __ Ldr(out, MemOperand(out, /* offset */ 0)); return; @@ -8956,6 +9188,14 @@ void CodeGeneratorARMVIXL::GenerateStaticOrDirectCall( case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: __ Mov(RegisterFrom(temp), Operand::From(invoke->GetMethodAddress())); break; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo: { + uint32_t boot_image_offset = GetBootImageOffset(invoke); + PcRelativePatchInfo* labels = NewBootImageRelRoPatch(boot_image_offset); + vixl32::Register temp_reg = RegisterFrom(temp); + EmitMovwMovtPlaceholder(labels, temp_reg); + GetAssembler()->LoadFromOffset(kLoadWord, temp_reg, temp_reg, /* offset*/ 0); + break; + } case HInvokeStaticOrDirect::MethodLoadKind::kBssEntry: { PcRelativePatchInfo* labels = NewMethodBssEntryPatch( MethodReference(&GetGraph()->GetDexFile(), invoke->GetDexMethodIndex())); @@ -9053,6 +9293,13 @@ void CodeGeneratorARMVIXL::GenerateVirtualCall( } } +CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewBootImageRelRoPatch( + uint32_t boot_image_offset) { + return NewPcRelativePatch(/* dex_file */ nullptr, + boot_image_offset, + &boot_image_method_patches_); +} + CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewBootImageMethodPatch( MethodReference target_method) { return NewPcRelativePatch( @@ -9143,6 +9390,14 @@ inline void CodeGeneratorARMVIXL::EmitPcRelativeLinkerPatches( } } +linker::LinkerPatch DataBimgRelRoPatchAdapter(size_t literal_offset, + const DexFile* target_dex_file, + uint32_t pc_insn_offset, + uint32_t boot_image_offset) { + DCHECK(target_dex_file == nullptr); // Unused for DataBimgRelRoPatch(), should be null. 
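// (Editorial note, not part of the commit: the adapter exists because
// EmitPcRelativeLinkerPatches<> calls its factory as Factory(literal_offset,
// target_dex_file, pc_insn_offset, offset_or_index); rel.ro patches have no
// dex file, and offset_or_index carries the boot image offset instead.)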
+ return linker::LinkerPatch::DataBimgRelRoPatch(literal_offset, pc_insn_offset, boot_image_offset); +} + void CodeGeneratorARMVIXL::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) { DCHECK(linker_patches->empty()); size_t size = @@ -9162,11 +9417,10 @@ void CodeGeneratorARMVIXL::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* l EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>( boot_image_string_patches_, linker_patches); } else { - DCHECK(boot_image_method_patches_.empty()); - EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>( - boot_image_type_patches_, linker_patches); - EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>( - boot_image_string_patches_, linker_patches); + EmitPcRelativeLinkerPatches<DataBimgRelRoPatchAdapter>( + boot_image_method_patches_, linker_patches); + DCHECK(boot_image_type_patches_.empty()); + DCHECK(boot_image_string_patches_.empty()); } EmitPcRelativeLinkerPatches<linker::LinkerPatch::MethodBssEntryPatch>( method_bss_entry_patches_, linker_patches); diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h index bbc715c59d..2d8f6a6c78 100644 --- a/compiler/optimizing/code_generator_arm_vixl.h +++ b/compiler/optimizing/code_generator_arm_vixl.h @@ -349,6 +349,12 @@ class InstructionCodeGeneratorARMVIXL : public InstructionCodeGenerator { bool value_can_be_null); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); + void GenerateMinMaxInt(LocationSummary* locations, bool is_min); + void GenerateMinMaxLong(LocationSummary* locations, bool is_min); + void GenerateMinMaxFloat(HInstruction* minmax, bool is_min); + void GenerateMinMaxDouble(HInstruction* minmax, bool is_min); + void GenerateMinMax(HBinaryOperation* minmax, bool is_min); + // Generate a heap reference load using one register `out`: // // out <- *(out + offset) @@ -574,6 +580,7 @@ class CodeGeneratorARMVIXL : public CodeGenerator { vixl::aarch32::Label add_pc_label; }; + PcRelativePatchInfo* NewBootImageRelRoPatch(uint32_t boot_image_offset); PcRelativePatchInfo* NewBootImageMethodPatch(MethodReference target_method); PcRelativePatchInfo* NewMethodBssEntryPatch(MethodReference target_method); PcRelativePatchInfo* NewBootImageTypePatch(const DexFile& dex_file, dex::TypeIndex type_index); @@ -798,7 +805,8 @@ class CodeGeneratorARMVIXL : public CodeGenerator { // Deduplication map for 32-bit literals, used for non-patchable boot image addresses. Uint32ToLiteralMap uint32_literals_; - // PC-relative method patch info for kBootImageLinkTimePcRelative. + // PC-relative method patch info for kBootImageLinkTimePcRelative/kBootImageRelRo. + // Also used for type/string patches for kBootImageRelRo (same linker patch as for methods). ArenaDeque<PcRelativePatchInfo> boot_image_method_patches_; // PC-relative method patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> method_bss_entry_patches_; @@ -806,7 +814,7 @@ class CodeGeneratorARMVIXL : public CodeGenerator { ArenaDeque<PcRelativePatchInfo> boot_image_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; - // PC-relative String patch info; type depends on configuration (intern table or boot image PIC). + // PC-relative String patch info for kBootImageLinkTimePcRelative. ArenaDeque<PcRelativePatchInfo> boot_image_string_patches_; // PC-relative String patch info for kBssEntry. 
ArenaDeque<PcRelativePatchInfo> string_bss_entry_patches_; diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index eb5f72e953..be9ff48a6b 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -1597,6 +1597,14 @@ inline void CodeGeneratorMIPS::EmitPcRelativeLinkerPatches( } } +linker::LinkerPatch DataBimgRelRoPatchAdapter(size_t literal_offset, + const DexFile* target_dex_file, + uint32_t pc_insn_offset, + uint32_t boot_image_offset) { + DCHECK(target_dex_file == nullptr); // Unused for DataBimgRelRoPatch(), should be null. + return linker::LinkerPatch::DataBimgRelRoPatch(literal_offset, pc_insn_offset, boot_image_offset); +} + void CodeGeneratorMIPS::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) { DCHECK(linker_patches->empty()); size_t size = @@ -1615,11 +1623,10 @@ void CodeGeneratorMIPS::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* link EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>( boot_image_string_patches_, linker_patches); } else { - DCHECK(boot_image_method_patches_.empty()); - EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>( - boot_image_type_patches_, linker_patches); - EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>( - boot_image_string_patches_, linker_patches); + EmitPcRelativeLinkerPatches<DataBimgRelRoPatchAdapter>( + boot_image_method_patches_, linker_patches); + DCHECK(boot_image_type_patches_.empty()); + DCHECK(boot_image_string_patches_.empty()); } EmitPcRelativeLinkerPatches<linker::LinkerPatch::MethodBssEntryPatch>( method_bss_entry_patches_, linker_patches); @@ -1630,6 +1637,13 @@ void CodeGeneratorMIPS::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* link DCHECK_EQ(size, linker_patches->size()); } +CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewBootImageRelRoPatch( + uint32_t boot_image_offset, + const PcRelativePatchInfo* info_high) { + return NewPcRelativePatch( + /* dex_file */ nullptr, boot_image_offset, info_high, &boot_image_method_patches_); +} + CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewBootImageMethodPatch( MethodReference target_method, const PcRelativePatchInfo* info_high) { @@ -7725,7 +7739,7 @@ HLoadString::LoadKind CodeGeneratorMIPS::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { switch (desired_string_load_kind) { case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - case HLoadString::LoadKind::kBootImageInternTable: + case HLoadString::LoadKind::kBootImageRelRo: case HLoadString::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; @@ -7748,7 +7762,7 @@ HLoadClass::LoadKind CodeGeneratorMIPS::GetSupportedLoadClassKind( case HLoadClass::LoadKind::kReferrersClass: break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: - case HLoadClass::LoadKind::kBootImageClassTable: + case HLoadClass::LoadKind::kBootImageRelRo: case HLoadClass::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; @@ -7835,6 +7849,15 @@ void CodeGeneratorMIPS::GenerateStaticOrDirectCall( case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: __ LoadConst32(temp.AsRegister<Register>(), invoke->GetMethodAddress()); break; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo: { + uint32_t boot_image_offset = GetBootImageOffset(invoke); + PcRelativePatchInfo* info_high = NewBootImageRelRoPatch(boot_image_offset); + PcRelativePatchInfo* 
info_low = NewBootImageRelRoPatch(boot_image_offset, info_high); + Register temp_reg = temp.AsRegister<Register>(); + EmitPcRelativeAddressPlaceholderHigh(info_high, TMP, base_reg); + __ Lw(temp_reg, TMP, /* placeholder */ 0x5678, &info_low->label); + break; + } case HInvokeStaticOrDirect::MethodLoadKind::kBssEntry: { PcRelativePatchInfo* info_high = NewMethodBssEntryPatch( MethodReference(&GetGraph()->GetDexFile(), invoke->GetDexMethodIndex())); @@ -7956,7 +7979,7 @@ void LocationsBuilderMIPS::VisitLoadClass(HLoadClass* cls) { // We need an extra register for PC-relative literals on R2. case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: case HLoadClass::LoadKind::kBootImageAddress: - case HLoadClass::LoadKind::kBootImageClassTable: + case HLoadClass::LoadKind::kBootImageRelRo: case HLoadClass::LoadKind::kBssEntry: if (isR6) { break; @@ -8008,7 +8031,7 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF // We need an extra register for PC-relative literals on R2. case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: case HLoadClass::LoadKind::kBootImageAddress: - case HLoadClass::LoadKind::kBootImageClassTable: + case HLoadClass::LoadKind::kBootImageRelRo: case HLoadClass::LoadKind::kBssEntry: base_or_current_method_reg = (isR6 || has_irreducible_loops) ? ZERO : locations->InAt(0).AsRegister<Register>(); @@ -8065,22 +8088,17 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF } break; } - case HLoadClass::LoadKind::kBootImageClassTable: { + case HLoadClass::LoadKind::kBootImageRelRo: { DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); + uint32_t boot_image_offset = codegen_->GetBootImageOffset(cls); CodeGeneratorMIPS::PcRelativePatchInfo* info_high = - codegen_->NewBootImageTypePatch(cls->GetDexFile(), cls->GetTypeIndex()); + codegen_->NewBootImageRelRoPatch(boot_image_offset); CodeGeneratorMIPS::PcRelativePatchInfo* info_low = - codegen_->NewBootImageTypePatch(cls->GetDexFile(), cls->GetTypeIndex(), info_high); + codegen_->NewBootImageRelRoPatch(boot_image_offset, info_high); codegen_->EmitPcRelativeAddressPlaceholderHigh(info_high, out, base_or_current_method_reg); __ Lw(out, out, /* placeholder */ 0x5678, &info_low->label); - // Extract the reference from the slot data, i.e. clear the hash bits. - int32_t masked_hash = ClassTable::TableSlot::MaskHash( - ComputeModifiedUtf8Hash(cls->GetDexFile().StringByTypeIdx(cls->GetTypeIndex()))); - if (masked_hash != 0) { - __ Addiu(out, out, -masked_hash); - } break; } case HLoadClass::LoadKind::kBssEntry: { @@ -8171,7 +8189,7 @@ void LocationsBuilderMIPS::VisitLoadString(HLoadString* load) { // We need an extra register for PC-relative literals on R2. case HLoadString::LoadKind::kBootImageAddress: case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - case HLoadString::LoadKind::kBootImageInternTable: + case HLoadString::LoadKind::kBootImageRelRo: case HLoadString::LoadKind::kBssEntry: if (isR6) { break; @@ -8223,7 +8241,7 @@ void InstructionCodeGeneratorMIPS::VisitLoadString(HLoadString* load) NO_THREAD_ // We need an extra register for PC-relative literals on R2. case HLoadString::LoadKind::kBootImageAddress: case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - case HLoadString::LoadKind::kBootImageInternTable: + case HLoadString::LoadKind::kBootImageRelRo: case HLoadString::LoadKind::kBssEntry: base_or_current_method_reg = (isR6 || has_irreducible_loops) ? 
ZERO : locations->InAt(0).AsRegister<Register>(); @@ -8259,12 +8277,13 @@ void InstructionCodeGeneratorMIPS::VisitLoadString(HLoadString* load) NO_THREAD_ } return; } - case HLoadString::LoadKind::kBootImageInternTable: { + case HLoadString::LoadKind::kBootImageRelRo: { DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); + uint32_t boot_image_offset = codegen_->GetBootImageOffset(load); CodeGeneratorMIPS::PcRelativePatchInfo* info_high = - codegen_->NewBootImageStringPatch(load->GetDexFile(), load->GetStringIndex()); + codegen_->NewBootImageRelRoPatch(boot_image_offset); CodeGeneratorMIPS::PcRelativePatchInfo* info_low = - codegen_->NewBootImageStringPatch(load->GetDexFile(), load->GetStringIndex(), info_high); + codegen_->NewBootImageRelRoPatch(boot_image_offset, info_high); codegen_->EmitPcRelativeAddressPlaceholderHigh(info_high, out, base_or_current_method_reg); @@ -8779,6 +8798,390 @@ void InstructionCodeGeneratorMIPS::VisitRem(HRem* instruction) { } } +static void CreateMinMaxLocations(ArenaAllocator* allocator, HBinaryOperation* minmax) { + LocationSummary* locations = new (allocator) LocationSummary(minmax); + switch (minmax->GetResultType()) { + case DataType::Type::kInt32: + case DataType::Type::kInt64: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + break; + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kOutputOverlap); + break; + default: + LOG(FATAL) << "Unexpected type for HMinMax " << minmax->GetResultType(); + } +} + +void InstructionCodeGeneratorMIPS::GenerateMinMaxInt(LocationSummary* locations, + bool is_min, + bool isR6, + DataType::Type type) { + if (isR6) { + // Some architectures, such as ARM and MIPS (prior to r6), have a + // conditional move instruction which only changes the target + // (output) register if the condition is true (MIPS prior to r6 had + // MOVF, MOVT, MOVN, and MOVZ). The SELEQZ and SELNEZ instructions + // always change the target (output) register. If the condition is + // true, the output register gets the contents of the "rs" register; + // otherwise, the output register is set to zero. One consequence + // of this is that to implement something like "rd = c==0 ? rs : rt" + // MIPS64r6 needs to use a pair of SELEQZ/SELNEZ instructions. + // After executing this pair of instructions one of the output + // registers from the pair will necessarily contain zero. Then the + // code ORs the output registers from the SELEQZ/SELNEZ instructions + // to get the final result. + // + // The initial test to see if the output register is the same as the + // first input register is needed to make sure the value in the + // first input register isn't clobbered before we've finished + // computing the output value. The logic in the corresponding else + // clause performs the same task but makes sure the second input + // register isn't clobbered in the event that it's the same register + // as the output register; the else clause also handles the case + // where the output register is distinct from both the first and the + // second input registers. 
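Aside: the comment above is the heart of the R6 integer min/max lowering, so a small model may help. The following C++ sketch mirrors the SLT/SELEQZ/SELNEZ/OR sequence emitted below for the 32-bit case; the helper names are illustrative only, not ART APIs:

    #include <cstdint>

    // SELEQZ rd, rs, rt: rd = (rt == 0) ? rs : 0.
    static uint32_t Seleqz(uint32_t rs, uint32_t rt) { return rt == 0 ? rs : 0; }
    // SELNEZ rd, rs, rt: rd = (rt != 0) ? rs : 0.
    static uint32_t Selnez(uint32_t rs, uint32_t rt) { return rt != 0 ? rs : 0; }

    static int32_t MinR6(int32_t a, int32_t b) {
      uint32_t at = (b < a) ? 1 : 0;  // SLT AT, b, a
      // One select passes its operand through, the other produces zero,
      // so ORing the pair keeps whichever operand survived.
      return static_cast<int32_t>(Seleqz(static_cast<uint32_t>(a), at) |
                                  Selnez(static_cast<uint32_t>(b), at));
    }

Swapping the SELEQZ/SELNEZ roles gives max, exactly as the is_min flag does in the generator.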
+ if (type == DataType::Type::kInt64) { + Register a_lo = locations->InAt(0).AsRegisterPairLow<Register>(); + Register a_hi = locations->InAt(0).AsRegisterPairHigh<Register>(); + Register b_lo = locations->InAt(1).AsRegisterPairLow<Register>(); + Register b_hi = locations->InAt(1).AsRegisterPairHigh<Register>(); + Register out_lo = locations->Out().AsRegisterPairLow<Register>(); + Register out_hi = locations->Out().AsRegisterPairHigh<Register>(); + + MipsLabel compare_done; + + if (a_lo == b_lo) { + if (out_lo != a_lo) { + __ Move(out_lo, a_lo); + __ Move(out_hi, a_hi); + } + } else { + __ Slt(TMP, b_hi, a_hi); + __ Bne(b_hi, a_hi, &compare_done); + + __ Sltu(TMP, b_lo, a_lo); + + __ Bind(&compare_done); + + if (is_min) { + __ Seleqz(AT, a_lo, TMP); + __ Selnez(out_lo, b_lo, TMP); // Safe even if out_lo == a_lo/b_lo + // because at this point we're + // done using a_lo/b_lo. + } else { + __ Selnez(AT, a_lo, TMP); + __ Seleqz(out_lo, b_lo, TMP); // ditto + } + __ Or(out_lo, out_lo, AT); + if (is_min) { + __ Seleqz(AT, a_hi, TMP); + __ Selnez(out_hi, b_hi, TMP); // ditto but for out_hi & a_hi/b_hi + } else { + __ Selnez(AT, a_hi, TMP); + __ Seleqz(out_hi, b_hi, TMP); // ditto but for out_hi & a_hi/b_hi + } + __ Or(out_hi, out_hi, AT); + } + } else { + DCHECK_EQ(type, DataType::Type::kInt32); + Register a = locations->InAt(0).AsRegister<Register>(); + Register b = locations->InAt(1).AsRegister<Register>(); + Register out = locations->Out().AsRegister<Register>(); + + if (a == b) { + if (out != a) { + __ Move(out, a); + } + } else { + __ Slt(AT, b, a); + if (is_min) { + __ Seleqz(TMP, a, AT); + __ Selnez(AT, b, AT); + } else { + __ Selnez(TMP, a, AT); + __ Seleqz(AT, b, AT); + } + __ Or(out, TMP, AT); + } + } + } else { // !isR6 + if (type == DataType::Type::kInt64) { + Register a_lo = locations->InAt(0).AsRegisterPairLow<Register>(); + Register a_hi = locations->InAt(0).AsRegisterPairHigh<Register>(); + Register b_lo = locations->InAt(1).AsRegisterPairLow<Register>(); + Register b_hi = locations->InAt(1).AsRegisterPairHigh<Register>(); + Register out_lo = locations->Out().AsRegisterPairLow<Register>(); + Register out_hi = locations->Out().AsRegisterPairHigh<Register>(); + + MipsLabel compare_done; + + if (a_lo == b_lo) { + if (out_lo != a_lo) { + __ Move(out_lo, a_lo); + __ Move(out_hi, a_hi); + } + } else { + __ Slt(TMP, a_hi, b_hi); + __ Bne(a_hi, b_hi, &compare_done); + + __ Sltu(TMP, a_lo, b_lo); + + __ Bind(&compare_done); + + if (is_min) { + if (out_lo != a_lo) { + __ Movn(out_hi, a_hi, TMP); + __ Movn(out_lo, a_lo, TMP); + } + if (out_lo != b_lo) { + __ Movz(out_hi, b_hi, TMP); + __ Movz(out_lo, b_lo, TMP); + } + } else { + if (out_lo != a_lo) { + __ Movz(out_hi, a_hi, TMP); + __ Movz(out_lo, a_lo, TMP); + } + if (out_lo != b_lo) { + __ Movn(out_hi, b_hi, TMP); + __ Movn(out_lo, b_lo, TMP); + } + } + } + } else { + DCHECK_EQ(type, DataType::Type::kInt32); + Register a = locations->InAt(0).AsRegister<Register>(); + Register b = locations->InAt(1).AsRegister<Register>(); + Register out = locations->Out().AsRegister<Register>(); + + if (a == b) { + if (out != a) { + __ Move(out, a); + } + } else { + __ Slt(AT, a, b); + if (is_min) { + if (out != a) { + __ Movn(out, a, AT); + } + if (out != b) { + __ Movz(out, b, AT); + } + } else { + if (out != a) { + __ Movz(out, a, AT); + } + if (out != b) { + __ Movn(out, b, AT); + } + } + } + } + } +} + +void InstructionCodeGeneratorMIPS::GenerateMinMaxFP(LocationSummary* locations, + bool is_min, + bool isR6, + DataType::Type type) { + FRegister out 
= locations->Out().AsFpuRegister<FRegister>(); + FRegister a = locations->InAt(0).AsFpuRegister<FRegister>(); + FRegister b = locations->InAt(1).AsFpuRegister<FRegister>(); + + if (isR6) { + MipsLabel noNaNs; + MipsLabel done; + FRegister ftmp = ((out != a) && (out != b)) ? out : FTMP; + + // When Java computes min/max it prefers a NaN to a number; the + // behavior of MIPSR6 is to prefer numbers to NaNs, i.e., if one of + // the inputs is a NaN and the other is a valid number, the MIPS + // instruction will return the number; Java wants the NaN value + // returned. This is why there is extra logic preceding the use of + // the MIPS min.fmt/max.fmt instructions. If either a, or b holds a + // NaN, return the NaN, otherwise return the min/max. + if (type == DataType::Type::kFloat64) { + __ CmpUnD(FTMP, a, b); + __ Bc1eqz(FTMP, &noNaNs); + + // One of the inputs is a NaN + __ CmpEqD(ftmp, a, a); + // If a == a then b is the NaN, otherwise a is the NaN. + __ SelD(ftmp, a, b); + + if (ftmp != out) { + __ MovD(out, ftmp); + } + + __ B(&done); + + __ Bind(&noNaNs); + + if (is_min) { + __ MinD(out, a, b); + } else { + __ MaxD(out, a, b); + } + } else { + DCHECK_EQ(type, DataType::Type::kFloat32); + __ CmpUnS(FTMP, a, b); + __ Bc1eqz(FTMP, &noNaNs); + + // One of the inputs is a NaN + __ CmpEqS(ftmp, a, a); + // If a == a then b is the NaN, otherwise a is the NaN. + __ SelS(ftmp, a, b); + + if (ftmp != out) { + __ MovS(out, ftmp); + } + + __ B(&done); + + __ Bind(&noNaNs); + + if (is_min) { + __ MinS(out, a, b); + } else { + __ MaxS(out, a, b); + } + } + + __ Bind(&done); + + } else { // !isR6 + MipsLabel ordered; + MipsLabel compare; + MipsLabel select; + MipsLabel done; + + if (type == DataType::Type::kFloat64) { + __ CunD(a, b); + } else { + DCHECK_EQ(type, DataType::Type::kFloat32); + __ CunS(a, b); + } + __ Bc1f(&ordered); + + // a or b (or both) is a NaN. Return one, which is a NaN. + if (type == DataType::Type::kFloat64) { + __ CeqD(b, b); + } else { + __ CeqS(b, b); + } + __ B(&select); + + __ Bind(&ordered); + + // Neither is a NaN. + // a == b? (-0.0 compares equal with +0.0) + // If equal, handle zeroes, else compare further. + if (type == DataType::Type::kFloat64) { + __ CeqD(a, b); + } else { + __ CeqS(a, b); + } + __ Bc1f(&compare); + + // a == b either bit for bit or one is -0.0 and the other is +0.0. + if (type == DataType::Type::kFloat64) { + __ MoveFromFpuHigh(TMP, a); + __ MoveFromFpuHigh(AT, b); + } else { + __ Mfc1(TMP, a); + __ Mfc1(AT, b); + } + + if (is_min) { + // -0.0 prevails over +0.0. + __ Or(TMP, TMP, AT); + } else { + // +0.0 prevails over -0.0. + __ And(TMP, TMP, AT); + } + + if (type == DataType::Type::kFloat64) { + __ Mfc1(AT, a); + __ Mtc1(AT, out); + __ MoveToFpuHigh(TMP, out); + } else { + __ Mtc1(TMP, out); + } + __ B(&done); + + __ Bind(&compare); + + if (type == DataType::Type::kFloat64) { + if (is_min) { + // return (a <= b) ? a : b; + __ ColeD(a, b); + } else { + // return (a >= b) ? a : b; + __ ColeD(b, a); // b <= a + } + } else { + if (is_min) { + // return (a <= b) ? a : b; + __ ColeS(a, b); + } else { + // return (a >= b) ? 
a : b; + __ ColeS(b, a); // b <= a + } + } + + __ Bind(&select); + + if (type == DataType::Type::kFloat64) { + __ MovtD(out, a); + __ MovfD(out, b); + } else { + __ MovtS(out, a); + __ MovfS(out, b); + } + + __ Bind(&done); + } +} + +void InstructionCodeGeneratorMIPS::GenerateMinMax(HBinaryOperation* minmax, bool is_min) { + bool isR6 = codegen_->GetInstructionSetFeatures().IsR6(); + DataType::Type type = minmax->GetResultType(); + switch (type) { + case DataType::Type::kInt32: + case DataType::Type::kInt64: + GenerateMinMaxInt(minmax->GetLocations(), is_min, isR6, type); + break; + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + GenerateMinMaxFP(minmax->GetLocations(), is_min, isR6, type); + break; + default: + LOG(FATAL) << "Unexpected type for HMinMax " << type; + } +} + +void LocationsBuilderMIPS::VisitMin(HMin* min) { + CreateMinMaxLocations(GetGraph()->GetAllocator(), min); +} + +void InstructionCodeGeneratorMIPS::VisitMin(HMin* min) { + GenerateMinMax(min, /*is_min*/ true); +} + +void LocationsBuilderMIPS::VisitMax(HMax* max) { + CreateMinMaxLocations(GetGraph()->GetAllocator(), max); +} + +void InstructionCodeGeneratorMIPS::VisitMax(HMax* max) { + GenerateMinMax(max, /*is_min*/ false); +} + void LocationsBuilderMIPS::VisitAbs(HAbs* abs) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(abs); switch (abs->GetResultType()) { diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h index d09ab7cce0..d90689695b 100644 --- a/compiler/optimizing/code_generator_mips.h +++ b/compiler/optimizing/code_generator_mips.h @@ -246,6 +246,9 @@ class InstructionCodeGeneratorMIPS : public InstructionCodeGenerator { bool value_can_be_null); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info, uint32_t dex_pc); + void GenerateMinMaxInt(LocationSummary* locations, bool is_min, bool isR6, DataType::Type type); + void GenerateMinMaxFP(LocationSummary* locations, bool is_min, bool isR6, DataType::Type type); + void GenerateMinMax(HBinaryOperation*, bool is_min); void GenerateAbsFP(LocationSummary* locations, DataType::Type type, bool isR2OrNewer, bool isR6); // Generate a heap reference load using one register `out`: @@ -617,6 +620,8 @@ class CodeGeneratorMIPS : public CodeGenerator { DISALLOW_COPY_AND_ASSIGN(PcRelativePatchInfo); }; + PcRelativePatchInfo* NewBootImageRelRoPatch(uint32_t boot_image_offset, + const PcRelativePatchInfo* info_high = nullptr); PcRelativePatchInfo* NewBootImageMethodPatch(MethodReference target_method, const PcRelativePatchInfo* info_high = nullptr); PcRelativePatchInfo* NewMethodBssEntryPatch(MethodReference target_method, @@ -691,7 +696,8 @@ class CodeGeneratorMIPS : public CodeGenerator { // Deduplication map for 32-bit literals, used for non-patchable boot image addresses. Uint32ToLiteralMap uint32_literals_; - // PC-relative method patch info for kBootImageLinkTimePcRelative. + // PC-relative method patch info for kBootImageLinkTimePcRelative/kBootImageRelRo. + // Also used for type/string patches for kBootImageRelRo (same linker patch as for methods). ArenaDeque<PcRelativePatchInfo> boot_image_method_patches_; // PC-relative method patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> method_bss_entry_patches_; @@ -699,7 +705,7 @@ class CodeGeneratorMIPS : public CodeGenerator { ArenaDeque<PcRelativePatchInfo> boot_image_type_patches_; // PC-relative type patch info for kBssEntry. 
ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; - // PC-relative String patch info; type depends on configuration (intern table or boot image PIC). + // PC-relative String patch info for kBootImageLinkTimePcRelative. ArenaDeque<PcRelativePatchInfo> boot_image_string_patches_; // PC-relative String patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> string_bss_entry_patches_; diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index 9593eec455..f8851b4eea 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -1509,6 +1509,14 @@ inline void CodeGeneratorMIPS64::EmitPcRelativeLinkerPatches( } } +linker::LinkerPatch DataBimgRelRoPatchAdapter(size_t literal_offset, + const DexFile* target_dex_file, + uint32_t pc_insn_offset, + uint32_t boot_image_offset) { + DCHECK(target_dex_file == nullptr); // Unused for DataBimgRelRoPatch(), should be null. + return linker::LinkerPatch::DataBimgRelRoPatch(literal_offset, pc_insn_offset, boot_image_offset); +} + void CodeGeneratorMIPS64::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) { DCHECK(linker_patches->empty()); size_t size = @@ -1527,11 +1535,10 @@ void CodeGeneratorMIPS64::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* li EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>( boot_image_string_patches_, linker_patches); } else { - DCHECK(boot_image_method_patches_.empty()); - EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>( - boot_image_type_patches_, linker_patches); - EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>( - boot_image_string_patches_, linker_patches); + EmitPcRelativeLinkerPatches<DataBimgRelRoPatchAdapter>( + boot_image_method_patches_, linker_patches); + DCHECK(boot_image_type_patches_.empty()); + DCHECK(boot_image_string_patches_.empty()); } EmitPcRelativeLinkerPatches<linker::LinkerPatch::MethodBssEntryPatch>( method_bss_entry_patches_, linker_patches); @@ -1542,6 +1549,13 @@ void CodeGeneratorMIPS64::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* li DCHECK_EQ(size, linker_patches->size()); } +CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewBootImageRelRoPatch( + uint32_t boot_image_offset, + const PcRelativePatchInfo* info_high) { + return NewPcRelativePatch( + /* dex_file */ nullptr, boot_image_offset, info_high, &boot_image_method_patches_); +} + CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewBootImageMethodPatch( MethodReference target_method, const PcRelativePatchInfo* info_high) { @@ -5839,7 +5853,7 @@ HLoadString::LoadKind CodeGeneratorMIPS64::GetSupportedLoadStringKind( bool fallback_load = false; switch (desired_string_load_kind) { case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - case HLoadString::LoadKind::kBootImageInternTable: + case HLoadString::LoadKind::kBootImageRelRo: case HLoadString::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; @@ -5866,7 +5880,7 @@ HLoadClass::LoadKind CodeGeneratorMIPS64::GetSupportedLoadClassKind( case HLoadClass::LoadKind::kReferrersClass: break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: - case HLoadClass::LoadKind::kBootImageClassTable: + case HLoadClass::LoadKind::kBootImageRelRo: case HLoadClass::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; @@ -5926,6 +5940,15 @@ void CodeGeneratorMIPS64::GenerateStaticOrDirectCall( kLoadDoubleword, 
DeduplicateUint64Literal(invoke->GetMethodAddress())); break; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo: { + uint32_t boot_image_offset = GetBootImageOffset(invoke); + PcRelativePatchInfo* info_high = NewBootImageRelRoPatch(boot_image_offset); + PcRelativePatchInfo* info_low = NewBootImageRelRoPatch(boot_image_offset, info_high); + EmitPcRelativeAddressPlaceholderHigh(info_high, AT, info_low); + // Note: Boot image is in the low 4GiB and the entry is 32-bit, so emit a 32-bit load. + __ Lwu(temp.AsRegister<GpuRegister>(), AT, /* placeholder */ 0x5678); + break; + } case HInvokeStaticOrDirect::MethodLoadKind::kBssEntry: { PcRelativePatchInfo* info_high = NewMethodBssEntryPatch( MethodReference(&GetGraph()->GetDexFile(), invoke->GetDexMethodIndex())); @@ -6113,20 +6136,15 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S codegen_->DeduplicateBootImageAddressLiteral(address)); break; } - case HLoadClass::LoadKind::kBootImageClassTable: { + case HLoadClass::LoadKind::kBootImageRelRo: { DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); + uint32_t boot_image_offset = codegen_->GetBootImageOffset(cls); CodeGeneratorMIPS64::PcRelativePatchInfo* info_high = - codegen_->NewBootImageTypePatch(cls->GetDexFile(), cls->GetTypeIndex()); + codegen_->NewBootImageRelRoPatch(boot_image_offset); CodeGeneratorMIPS64::PcRelativePatchInfo* info_low = - codegen_->NewBootImageTypePatch(cls->GetDexFile(), cls->GetTypeIndex(), info_high); + codegen_->NewBootImageRelRoPatch(boot_image_offset, info_high); codegen_->EmitPcRelativeAddressPlaceholderHigh(info_high, AT, info_low); __ Lwu(out, AT, /* placeholder */ 0x5678); - // Extract the reference from the slot data, i.e. clear the hash bits. - int32_t masked_hash = ClassTable::TableSlot::MaskHash( - ComputeModifiedUtf8Hash(cls->GetDexFile().StringByTypeIdx(cls->GetTypeIndex()))); - if (masked_hash != 0) { - __ Daddiu(out, out, -masked_hash); - } break; } case HLoadClass::LoadKind::kBssEntry: { @@ -6248,12 +6266,13 @@ void InstructionCodeGeneratorMIPS64::VisitLoadString(HLoadString* load) NO_THREA codegen_->DeduplicateBootImageAddressLiteral(address)); return; } - case HLoadString::LoadKind::kBootImageInternTable: { + case HLoadString::LoadKind::kBootImageRelRo: { DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); + uint32_t boot_image_offset = codegen_->GetBootImageOffset(load); CodeGeneratorMIPS64::PcRelativePatchInfo* info_high = - codegen_->NewBootImageStringPatch(load->GetDexFile(), load->GetStringIndex()); + codegen_->NewBootImageRelRoPatch(boot_image_offset); CodeGeneratorMIPS64::PcRelativePatchInfo* info_low = - codegen_->NewBootImageStringPatch(load->GetDexFile(), load->GetStringIndex(), info_high); + codegen_->NewBootImageRelRoPatch(boot_image_offset, info_high); codegen_->EmitPcRelativeAddressPlaceholderHigh(info_high, AT, info_low); __ Lwu(out, AT, /* placeholder */ 0x5678); return; @@ -6665,6 +6684,182 @@ void InstructionCodeGeneratorMIPS64::VisitRem(HRem* instruction) { } } +static void CreateMinMaxLocations(ArenaAllocator* allocator, HBinaryOperation* minmax) { + LocationSummary* locations = new (allocator) LocationSummary(minmax); + switch (minmax->GetResultType()) { + case DataType::Type::kInt32: + case DataType::Type::kInt64: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + break; + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + 
locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + break; + default: + LOG(FATAL) << "Unexpected type for HMinMax " << minmax->GetResultType(); + } +} + +void InstructionCodeGeneratorMIPS64::GenerateMinMaxInt(LocationSummary* locations, bool is_min) { + GpuRegister lhs = locations->InAt(0).AsRegister<GpuRegister>(); + GpuRegister rhs = locations->InAt(1).AsRegister<GpuRegister>(); + GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + + if (lhs == rhs) { + if (out != lhs) { + __ Move(out, lhs); + } + } else { + // Some architectures, such as ARM and MIPS (prior to r6), have a + // conditional move instruction which only changes the target + // (output) register if the condition is true (MIPS prior to r6 had + // MOVF, MOVT, MOVN, and MOVZ). The SELEQZ and SELNEZ instructions always + // change the target (output) register. If the condition is true, the + // output register gets the contents of the "rs" register; otherwise, + // the output register is set to zero. One consequence of this is + // that to implement something like "rd = c==0 ? rs : rt" MIPS64r6 + // needs to use a pair of SELEQZ/SELNEZ instructions. After + // executing this pair of instructions one of the output registers + // from the pair will necessarily contain zero. Then the code ORs the + // output registers from the SELEQZ/SELNEZ instructions to get the + // final result. + // + // The initial test to see if the output register is the same as the + // first input register is needed to make sure the value in the + // first input register isn't clobbered before we've finished + // computing the output value. The logic in the corresponding else + // clause performs the same task but makes sure the second input + // register isn't clobbered in the event that it's the same register + // as the output register; the else clause also handles the case + // where the output register is distinct from both the first and the + // second input registers. + if (out == lhs) { + __ Slt(AT, rhs, lhs); + if (is_min) { + __ Seleqz(out, lhs, AT); + __ Selnez(AT, rhs, AT); + } else { + __ Selnez(out, lhs, AT); + __ Seleqz(AT, rhs, AT); + } + } else { + __ Slt(AT, lhs, rhs); + if (is_min) { + __ Seleqz(out, rhs, AT); + __ Selnez(AT, lhs, AT); + } else { + __ Selnez(out, rhs, AT); + __ Seleqz(AT, lhs, AT); + } + } + __ Or(out, out, AT); + } +} + +void InstructionCodeGeneratorMIPS64::GenerateMinMaxFP(LocationSummary* locations, + bool is_min, + DataType::Type type) { + FpuRegister a = locations->InAt(0).AsFpuRegister<FpuRegister>(); + FpuRegister b = locations->InAt(1).AsFpuRegister<FpuRegister>(); + FpuRegister out = locations->Out().AsFpuRegister<FpuRegister>(); + + Mips64Label noNaNs; + Mips64Label done; + FpuRegister ftmp = ((out != a) && (out != b)) ? out : FTMP; + + // When Java computes min/max it prefers a NaN to a number; the + // behavior of MIPSR6 is to prefer numbers to NaNs, i.e., if one of + // the inputs is a NaN and the other is a valid number, the MIPS + // instruction will return the number; Java wants the NaN value + // returned. This is why there is extra logic preceding the use of + // the MIPS min.fmt/max.fmt instructions. If either a, or b holds a + // NaN, return the NaN, otherwise return the min/max. 
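Aside: the guard described above can be summarized by a short C++ sketch of the Java contract (not ART code; std::fmin stands in for MIN.D):

    #include <cmath>

    static double JavaMinDouble(double a, double b) {
      if (std::isnan(a) || std::isnan(b)) {  // CMP.UN.D + BC1EQZ in the emitted code.
        return std::isnan(a) ? a : b;        // CMP.EQ.D a, a picks out which input is the NaN.
      }
      // The generated code reaches MIN.D only on this path. One caveat of the
      // sketch: Java defines Math.min(-0.0, +0.0) == -0.0 and the generated code
      // leaves that case to the hardware min/max, while std::fmin is permitted
      // to return either zero.
      return std::fmin(a, b);
    }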
+ if (type == DataType::Type::kFloat64) { + __ CmpUnD(FTMP, a, b); + __ Bc1eqz(FTMP, &noNaNs); + + // One of the inputs is a NaN + __ CmpEqD(ftmp, a, a); + // If a == a then b is the NaN, otherwise a is the NaN. + __ SelD(ftmp, a, b); + + if (ftmp != out) { + __ MovD(out, ftmp); + } + + __ Bc(&done); + + __ Bind(&noNaNs); + + if (is_min) { + __ MinD(out, a, b); + } else { + __ MaxD(out, a, b); + } + } else { + DCHECK_EQ(type, DataType::Type::kFloat32); + __ CmpUnS(FTMP, a, b); + __ Bc1eqz(FTMP, &noNaNs); + + // One of the inputs is a NaN + __ CmpEqS(ftmp, a, a); + // If a == a then b is the NaN, otherwise a is the NaN. + __ SelS(ftmp, a, b); + + if (ftmp != out) { + __ MovS(out, ftmp); + } + + __ Bc(&done); + + __ Bind(&noNaNs); + + if (is_min) { + __ MinS(out, a, b); + } else { + __ MaxS(out, a, b); + } + } + + __ Bind(&done); +} + +void InstructionCodeGeneratorMIPS64::GenerateMinMax(HBinaryOperation* minmax, bool is_min) { + DataType::Type type = minmax->GetResultType(); + switch (type) { + case DataType::Type::kInt32: + case DataType::Type::kInt64: + GenerateMinMaxInt(minmax->GetLocations(), is_min); + break; + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + GenerateMinMaxFP(minmax->GetLocations(), is_min, type); + break; + default: + LOG(FATAL) << "Unexpected type for HMinMax " << type; + } +} + +void LocationsBuilderMIPS64::VisitMin(HMin* min) { + CreateMinMaxLocations(GetGraph()->GetAllocator(), min); +} + +void InstructionCodeGeneratorMIPS64::VisitMin(HMin* min) { + GenerateMinMax(min, /*is_min*/ true); +} + +void LocationsBuilderMIPS64::VisitMax(HMax* max) { + CreateMinMaxLocations(GetGraph()->GetAllocator(), max); +} + +void InstructionCodeGeneratorMIPS64::VisitMax(HMax* max) { + GenerateMinMax(max, /*is_min*/ false); +} + void LocationsBuilderMIPS64::VisitAbs(HAbs* abs) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(abs); switch (abs->GetResultType()) { diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h index ddeb3eb90c..d1da1cec15 100644 --- a/compiler/optimizing/code_generator_mips64.h +++ b/compiler/optimizing/code_generator_mips64.h @@ -242,6 +242,10 @@ class InstructionCodeGeneratorMIPS64 : public InstructionCodeGenerator { bool value_can_be_null); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); + void GenerateMinMaxInt(LocationSummary* locations, bool is_min); + void GenerateMinMaxFP(LocationSummary* locations, bool is_min, DataType::Type type); + void GenerateMinMax(HBinaryOperation* minmax, bool is_min); + // Generate a heap reference load using one register `out`: // // out <- *(out + offset) @@ -586,6 +590,8 @@ class CodeGeneratorMIPS64 : public CodeGenerator { DISALLOW_COPY_AND_ASSIGN(PcRelativePatchInfo); }; + PcRelativePatchInfo* NewBootImageRelRoPatch(uint32_t boot_image_offset, + const PcRelativePatchInfo* info_high = nullptr); PcRelativePatchInfo* NewBootImageMethodPatch(MethodReference target_method, const PcRelativePatchInfo* info_high = nullptr); PcRelativePatchInfo* NewMethodBssEntryPatch(MethodReference target_method, @@ -655,7 +661,8 @@ class CodeGeneratorMIPS64 : public CodeGenerator { // Deduplication map for 64-bit literals, used for non-patchable method address or method code // address. Uint64ToLiteralMap uint64_literals_; - // PC-relative method patch info for kBootImageLinkTimePcRelative. + // PC-relative method patch info for kBootImageLinkTimePcRelative/kBootImageRelRo. 
+ // Also used for type/string patches for kBootImageRelRo (same linker patch as for methods). ArenaDeque<PcRelativePatchInfo> boot_image_method_patches_; // PC-relative method patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> method_bss_entry_patches_; @@ -663,7 +670,7 @@ class CodeGeneratorMIPS64 : public CodeGenerator { ArenaDeque<PcRelativePatchInfo> boot_image_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; - // PC-relative String patch info; type depends on configuration (intern table or boot image PIC). + // PC-relative String patch info for kBootImageLinkTimePcRelative. ArenaDeque<PcRelativePatchInfo> boot_image_string_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> string_bss_entry_patches_; diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 51b96be0f8..4818084a72 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -51,6 +51,9 @@ static constexpr int kC2ConditionMask = 0x400; static constexpr int kFakeReturnRegister = Register(8); +static constexpr int64_t kDoubleNaN = INT64_C(0x7FF8000000000000); +static constexpr int32_t kFloatNaN = INT32_C(0x7FC00000); + // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. #define __ down_cast<X86Assembler*>(codegen->GetAssembler())-> // NOLINT #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kX86PointerSize, x).Int32Value() @@ -3802,6 +3805,211 @@ void InstructionCodeGeneratorX86::VisitRem(HRem* rem) { } } +static void CreateMinMaxLocations(ArenaAllocator* allocator, HBinaryOperation* minmax) { + LocationSummary* locations = new (allocator) LocationSummary(minmax); + switch (minmax->GetResultType()) { + case DataType::Type::kInt32: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RequiresRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + case DataType::Type::kInt64: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RequiresRegister()); + locations->SetOut(Location::SameAsFirstInput()); + // Register to use to perform a long subtract to set cc. + locations->AddTemp(Location::RequiresRegister()); + break; + case DataType::Type::kFloat32: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + locations->AddTemp(Location::RequiresRegister()); + break; + case DataType::Type::kFloat64: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unexpected type for HMinMax " << minmax->GetResultType(); + } +} + +void InstructionCodeGeneratorX86::GenerateMinMaxInt(LocationSummary* locations, + bool is_min, + DataType::Type type) { + Location op1_loc = locations->InAt(0); + Location op2_loc = locations->InAt(1); + + // Shortcut for same input locations. + if (op1_loc.Equals(op2_loc)) { + // Can return immediately, as op1_loc == out_loc. + // Note: if we ever support separate registers, e.g., output into memory, we need to check for + // a copy here. + DCHECK(locations->Out().Equals(op1_loc)); + return; + } + + if (type == DataType::Type::kInt64) { + // Need to perform a subtract to get the sign right. 
+ // op1 is already in the same location as the output. + Location output = locations->Out(); + Register output_lo = output.AsRegisterPairLow<Register>(); + Register output_hi = output.AsRegisterPairHigh<Register>(); + + Register op2_lo = op2_loc.AsRegisterPairLow<Register>(); + Register op2_hi = op2_loc.AsRegisterPairHigh<Register>(); + + // "The comparison is performed by subtracting the second operand from + // the first operand and then setting the status flags in the same + // manner as the SUB instruction." + __ cmpl(output_lo, op2_lo); + + // Now use a temp and the borrow to finish the subtraction of op2_hi. + Register temp = locations->GetTemp(0).AsRegister<Register>(); + __ movl(temp, output_hi); + __ sbbl(temp, op2_hi); + + // Now the condition code is correct. + Condition cond = is_min ? Condition::kGreaterEqual : Condition::kLess; + __ cmovl(cond, output_lo, op2_lo); + __ cmovl(cond, output_hi, op2_hi); + } else { + DCHECK_EQ(type, DataType::Type::kInt32); + Register out = locations->Out().AsRegister<Register>(); + Register op2 = op2_loc.AsRegister<Register>(); + + // (out := op1) + // out <=? op2 + // if out is min jmp done + // out := op2 + // done: + + __ cmpl(out, op2); + Condition cond = is_min ? Condition::kGreater : Condition::kLess; + __ cmovl(cond, out, op2); + } +} + +void InstructionCodeGeneratorX86::GenerateMinMaxFP(LocationSummary* locations, + bool is_min, + DataType::Type type) { + Location op1_loc = locations->InAt(0); + Location op2_loc = locations->InAt(1); + Location out_loc = locations->Out(); + XmmRegister out = out_loc.AsFpuRegister<XmmRegister>(); + + // Shortcut for same input locations. + if (op1_loc.Equals(op2_loc)) { + DCHECK(out_loc.Equals(op1_loc)); + return; + } + + // (out := op1) + // out <=? op2 + // if Nan jmp Nan_label + // if out is min jmp done + // if op2 is min jmp op2_label + // handle -0/+0 + // jmp done + // Nan_label: + // out := NaN + // op2_label: + // out := op2 + // done: + // + // This removes one jmp, but needs to copy one input (op1) to out. + // + // TODO: This is straight from Quick (except literal pool). Make NaN an out-of-line slowpath? + + XmmRegister op2 = op2_loc.AsFpuRegister<XmmRegister>(); + + NearLabel nan, done, op2_label; + if (type == DataType::Type::kFloat64) { + __ ucomisd(out, op2); + } else { + DCHECK_EQ(type, DataType::Type::kFloat32); + __ ucomiss(out, op2); + } + + __ j(Condition::kParityEven, &nan); + + __ j(is_min ? Condition::kAbove : Condition::kBelow, &op2_label); + __ j(is_min ? Condition::kBelow : Condition::kAbove, &done); + + // Handle 0.0/-0.0. + if (is_min) { + if (type == DataType::Type::kFloat64) { + __ orpd(out, op2); + } else { + __ orps(out, op2); + } + } else { + if (type == DataType::Type::kFloat64) { + __ andpd(out, op2); + } else { + __ andps(out, op2); + } + } + __ jmp(&done); + + // NaN handling. + __ Bind(&nan); + if (type == DataType::Type::kFloat64) { + // TODO: Use a constant from the constant table (requires extra input). + __ LoadLongConstant(out, kDoubleNaN); + } else { + Register constant = locations->GetTemp(0).AsRegister<Register>(); + __ movl(constant, Immediate(kFloatNaN)); + __ movd(out, constant); + } + __ jmp(&done); + + // out := op2; + __ Bind(&op2_label); + if (type == DataType::Type::kFloat64) { + __ movsd(out, op2); + } else { + __ movss(out, op2); + } + + // Done. 
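Aside, on two tricks in the code above. For the long min/max: cmpl subtracts the low words, and the movl/sbbl pair then subtracts the high words together with the borrow, so the sign and overflow flags end up describing the full 64-bit difference; that is why a single signed condition can drive both cmovl instructions (note kGreaterEqual rather than kGreater for min: the zero flag after sbbl reflects only the high word, so conditions involving ZF cannot be used). For the 0.0/-0.0 step: +0.0 is the all-zero bit pattern and -0.0 differs only in the sign bit, so once the operands compare equal, ORing the raw bits (orps/orpd) yields -0.0, the correct minimum, while ANDing them (andps/andpd) yields +0.0, the correct maximum.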
+ __ Bind(&done); +} + +void InstructionCodeGeneratorX86::GenerateMinMax(HBinaryOperation* minmax, bool is_min) { + DataType::Type type = minmax->GetResultType(); + switch (type) { + case DataType::Type::kInt32: + case DataType::Type::kInt64: + GenerateMinMaxInt(minmax->GetLocations(), is_min, type); + break; + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + GenerateMinMaxFP(minmax->GetLocations(), is_min, type); + break; + default: + LOG(FATAL) << "Unexpected type for HMinMax " << type; + } +} + +void LocationsBuilderX86::VisitMin(HMin* min) { + CreateMinMaxLocations(GetGraph()->GetAllocator(), min); +} + +void InstructionCodeGeneratorX86::VisitMin(HMin* min) { + GenerateMinMax(min, /*is_min*/ true); +} + +void LocationsBuilderX86::VisitMax(HMax* max) { + CreateMinMaxLocations(GetGraph()->GetAllocator(), max); +} + +void InstructionCodeGeneratorX86::VisitMax(HMax* max) { + GenerateMinMax(max, /*is_min*/ false); +} + void LocationsBuilderX86::VisitAbs(HAbs* abs) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(abs); switch (abs->GetResultType()) { @@ -4624,6 +4832,15 @@ void CodeGeneratorX86::GenerateStaticOrDirectCall( case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: __ movl(temp.AsRegister<Register>(), Immediate(invoke->GetMethodAddress())); break; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo: { + Register base_reg = GetInvokeStaticOrDirectExtraParameter(invoke, + temp.AsRegister<Register>()); + __ movl(temp.AsRegister<Register>(), Address(base_reg, kDummy32BitOffset)); + RecordBootImageRelRoPatch( + invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress(), + GetBootImageOffset(invoke)); + break; + } case HInvokeStaticOrDirect::MethodLoadKind::kBssEntry: { Register base_reg = GetInvokeStaticOrDirectExtraParameter(invoke, temp.AsRegister<Register>()); @@ -4685,6 +4902,13 @@ void CodeGeneratorX86::GenerateVirtualCall( RecordPcInfo(invoke, invoke->GetDexPc(), slow_path); } +void CodeGeneratorX86::RecordBootImageRelRoPatch(HX86ComputeBaseMethodAddress* method_address, + uint32_t boot_image_offset) { + boot_image_method_patches_.emplace_back( + method_address, /* target_dex_file */ nullptr, boot_image_offset); + __ Bind(&boot_image_method_patches_.back().label); +} + void CodeGeneratorX86::RecordBootImageMethodPatch(HInvokeStaticOrDirect* invoke) { DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u); HX86ComputeBaseMethodAddress* method_address = @@ -4754,6 +4978,14 @@ inline void CodeGeneratorX86::EmitPcRelativeLinkerPatches( } } +linker::LinkerPatch DataBimgRelRoPatchAdapter(size_t literal_offset, + const DexFile* target_dex_file, + uint32_t pc_insn_offset, + uint32_t boot_image_offset) { + DCHECK(target_dex_file == nullptr); // Unused for DataBimgRelRoPatch(), should be null. 
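Aside: this DataBimgRelRoPatchAdapter (repeated verbatim in the MIPS, MIPS64, X86 and X86-64 backends in this change) exists only to bridge signatures: the shared EmitPcRelativeLinkerPatches helper hands every patch factory a target_dex_file argument, but LinkerPatch::DataBimgRelRoPatch takes none, so the adapter asserts that the dex file is null and forwards the remaining arguments.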
+ return linker::LinkerPatch::DataBimgRelRoPatch(literal_offset, pc_insn_offset, boot_image_offset); +} + void CodeGeneratorX86::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) { DCHECK(linker_patches->empty()); size_t size = @@ -4772,11 +5004,10 @@ void CodeGeneratorX86::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linke EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>( boot_image_string_patches_, linker_patches); } else { - DCHECK(boot_image_method_patches_.empty()); - EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>( - boot_image_type_patches_, linker_patches); - EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>( - boot_image_string_patches_, linker_patches); + EmitPcRelativeLinkerPatches<DataBimgRelRoPatchAdapter>( + boot_image_method_patches_, linker_patches); + DCHECK(boot_image_type_patches_.empty()); + DCHECK(boot_image_string_patches_.empty()); } EmitPcRelativeLinkerPatches<linker::LinkerPatch::MethodBssEntryPatch>( method_bss_entry_patches_, linker_patches); @@ -6145,7 +6376,7 @@ HLoadClass::LoadKind CodeGeneratorX86::GetSupportedLoadClassKind( case HLoadClass::LoadKind::kReferrersClass: break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: - case HLoadClass::LoadKind::kBootImageClassTable: + case HLoadClass::LoadKind::kBootImageRelRo: case HLoadClass::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; @@ -6183,7 +6414,7 @@ void LocationsBuilderX86::VisitLoadClass(HLoadClass* cls) { if (load_kind == HLoadClass::LoadKind::kReferrersClass || load_kind == HLoadClass::LoadKind::kBootImageLinkTimePcRelative || - load_kind == HLoadClass::LoadKind::kBootImageClassTable || + load_kind == HLoadClass::LoadKind::kBootImageRelRo || load_kind == HLoadClass::LoadKind::kBssEntry) { locations->SetInAt(0, Location::RequiresRegister()); } @@ -6259,17 +6490,12 @@ void InstructionCodeGeneratorX86::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFE __ movl(out, Immediate(address)); break; } - case HLoadClass::LoadKind::kBootImageClassTable: { + case HLoadClass::LoadKind::kBootImageRelRo: { DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); Register method_address = locations->InAt(0).AsRegister<Register>(); __ movl(out, Address(method_address, CodeGeneratorX86::kDummy32BitOffset)); - codegen_->RecordBootImageTypePatch(cls); - // Extract the reference from the slot data, i.e. clear the hash bits. 
- int32_t masked_hash = ClassTable::TableSlot::MaskHash( - ComputeModifiedUtf8Hash(cls->GetDexFile().StringByTypeIdx(cls->GetTypeIndex()))); - if (masked_hash != 0) { - __ subl(out, Immediate(masked_hash)); - } + codegen_->RecordBootImageRelRoPatch(cls->InputAt(0)->AsX86ComputeBaseMethodAddress(), + codegen_->GetBootImageOffset(cls)); break; } case HLoadClass::LoadKind::kBssEntry: { @@ -6349,7 +6575,7 @@ HLoadString::LoadKind CodeGeneratorX86::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { switch (desired_string_load_kind) { case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - case HLoadString::LoadKind::kBootImageInternTable: + case HLoadString::LoadKind::kBootImageRelRo: case HLoadString::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; @@ -6368,7 +6594,7 @@ void LocationsBuilderX86::VisitLoadString(HLoadString* load) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(load, call_kind); HLoadString::LoadKind load_kind = load->GetLoadKind(); if (load_kind == HLoadString::LoadKind::kBootImageLinkTimePcRelative || - load_kind == HLoadString::LoadKind::kBootImageInternTable || + load_kind == HLoadString::LoadKind::kBootImageRelRo || load_kind == HLoadString::LoadKind::kBssEntry) { locations->SetInAt(0, Location::RequiresRegister()); } @@ -6422,11 +6648,12 @@ void InstructionCodeGeneratorX86::VisitLoadString(HLoadString* load) NO_THREAD_S __ movl(out, Immediate(address)); return; } - case HLoadString::LoadKind::kBootImageInternTable: { + case HLoadString::LoadKind::kBootImageRelRo: { DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); Register method_address = locations->InAt(0).AsRegister<Register>(); __ movl(out, Address(method_address, CodeGeneratorX86::kDummy32BitOffset)); - codegen_->RecordBootImageStringPatch(load); + codegen_->RecordBootImageRelRoPatch(load->InputAt(0)->AsX86ComputeBaseMethodAddress(), + codegen_->GetBootImageOffset(load)); return; } case HLoadString::LoadKind::kBssEntry: { diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index 51e5bca00b..9c537a7371 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -225,6 +225,9 @@ class InstructionCodeGeneratorX86 : public InstructionCodeGenerator { void GenerateShlLong(const Location& loc, int shift); void GenerateShrLong(const Location& loc, int shift); void GenerateUShrLong(const Location& loc, int shift); + void GenerateMinMaxInt(LocationSummary* locations, bool is_min, DataType::Type type); + void GenerateMinMaxFP(LocationSummary* locations, bool is_min, DataType::Type type); + void GenerateMinMax(HBinaryOperation* minmax, bool is_min); void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info, @@ -414,6 +417,8 @@ class CodeGeneratorX86 : public CodeGenerator { void GenerateVirtualCall( HInvokeVirtual* invoke, Location temp, SlowPathCode* slow_path = nullptr) OVERRIDE; + void RecordBootImageRelRoPatch(HX86ComputeBaseMethodAddress* method_address, + uint32_t boot_image_offset); void RecordBootImageMethodPatch(HInvokeStaticOrDirect* invoke); void RecordMethodBssEntryPatch(HInvokeStaticOrDirect* invoke); void RecordBootImageTypePatch(HLoadClass* load_class); @@ -631,17 +636,18 @@ class CodeGeneratorX86 : public CodeGenerator { X86Assembler assembler_; const X86InstructionSetFeatures& isa_features_; - // PC-relative method patch info for kBootImageLinkTimePcRelative. 
+ // PC-relative method patch info for kBootImageLinkTimePcRelative/kBootImageRelRo. + // Also used for type/string patches for kBootImageRelRo (same linker patch as for methods). ArenaDeque<X86PcRelativePatchInfo> boot_image_method_patches_; // PC-relative method patch info for kBssEntry. ArenaDeque<X86PcRelativePatchInfo> method_bss_entry_patches_; // PC-relative type patch info for kBootImageLinkTimePcRelative. ArenaDeque<X86PcRelativePatchInfo> boot_image_type_patches_; - // Type patch locations for kBssEntry. + // PC-relative type patch info for kBssEntry. ArenaDeque<X86PcRelativePatchInfo> type_bss_entry_patches_; - // String patch locations; type depends on configuration (intern table or boot image PIC). + // PC-relative String patch info for kBootImageLinkTimePcRelative. ArenaDeque<X86PcRelativePatchInfo> boot_image_string_patches_; - // String patch locations for kBssEntry. + // PC-relative String patch info for kBssEntry. ArenaDeque<X86PcRelativePatchInfo> string_bss_entry_patches_; // Patches for string root accesses in JIT compiled code. diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index 0bb56a2b4a..c378c5b957 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -998,6 +998,13 @@ void CodeGeneratorX86_64::GenerateStaticOrDirectCall( case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: Load64BitValue(temp.AsRegister<CpuRegister>(), invoke->GetMethodAddress()); break; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo: { + // Note: Boot image is in the low 4GiB and the entry is 32-bit, so emit a 32-bit load. + __ movl(temp.AsRegister<CpuRegister>(), + Address::Absolute(kDummy32BitOffset, /* no_rip */ false)); + RecordBootImageRelRoPatch(GetBootImageOffset(invoke)); + break; + } case HInvokeStaticOrDirect::MethodLoadKind::kBssEntry: { __ movq(temp.AsRegister<CpuRegister>(), Address::Absolute(kDummy32BitOffset, /* no_rip */ false)); @@ -1059,6 +1066,11 @@ void CodeGeneratorX86_64::GenerateVirtualCall( RecordPcInfo(invoke, invoke->GetDexPc(), slow_path); } +void CodeGeneratorX86_64::RecordBootImageRelRoPatch(uint32_t boot_image_offset) { + boot_image_method_patches_.emplace_back(/* target_dex_file */ nullptr, boot_image_offset); + __ Bind(&boot_image_method_patches_.back().label); +} + void CodeGeneratorX86_64::RecordBootImageMethodPatch(HInvokeStaticOrDirect* invoke) { boot_image_method_patches_.emplace_back( invoke->GetTargetMethod().dex_file, invoke->GetTargetMethod().index); @@ -1110,6 +1122,14 @@ inline void CodeGeneratorX86_64::EmitPcRelativeLinkerPatches( } } +linker::LinkerPatch DataBimgRelRoPatchAdapter(size_t literal_offset, + const DexFile* target_dex_file, + uint32_t pc_insn_offset, + uint32_t boot_image_offset) { + DCHECK(target_dex_file == nullptr); // Unused for DataBimgRelRoPatch(), should be null. 
+ return linker::LinkerPatch::DataBimgRelRoPatch(literal_offset, pc_insn_offset, boot_image_offset); +} + void CodeGeneratorX86_64::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) { DCHECK(linker_patches->empty()); size_t size = @@ -1128,11 +1148,10 @@ void CodeGeneratorX86_64::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* li EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>( boot_image_string_patches_, linker_patches); } else { - DCHECK(boot_image_method_patches_.empty()); - EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>( - boot_image_type_patches_, linker_patches); - EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>( - boot_image_string_patches_, linker_patches); + EmitPcRelativeLinkerPatches<DataBimgRelRoPatchAdapter>( + boot_image_method_patches_, linker_patches); + DCHECK(boot_image_type_patches_.empty()); + DCHECK(boot_image_string_patches_.empty()); } EmitPcRelativeLinkerPatches<linker::LinkerPatch::MethodBssEntryPatch>( method_bss_entry_patches_, linker_patches); @@ -3821,6 +3840,177 @@ void InstructionCodeGeneratorX86_64::VisitRem(HRem* rem) { } } +static void CreateMinMaxLocations(ArenaAllocator* allocator, HBinaryOperation* minmax) { + LocationSummary* locations = new (allocator) LocationSummary(minmax); + switch (minmax->GetResultType()) { + case DataType::Type::kInt32: + case DataType::Type::kInt64: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RequiresRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + // The following is sub-optimal, but all we can do for now. It would be fine to also accept + // the second input to be the output (we can simply swap inputs). + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unexpected type for HMinMax " << minmax->GetResultType(); + } +} + +void InstructionCodeGeneratorX86_64::GenerateMinMaxInt(LocationSummary* locations, + bool is_min, + DataType::Type type) { + Location op1_loc = locations->InAt(0); + Location op2_loc = locations->InAt(1); + + // Shortcut for same input locations. + if (op1_loc.Equals(op2_loc)) { + // Can return immediately, as op1_loc == out_loc. + // Note: if we ever support separate registers, e.g., output into memory, we need to check for + // a copy here. + DCHECK(locations->Out().Equals(op1_loc)); + return; + } + + CpuRegister out = locations->Out().AsRegister<CpuRegister>(); + CpuRegister op2 = op2_loc.AsRegister<CpuRegister>(); + + // (out := op1) + // out <=? op2 + // if out is min jmp done + // out := op2 + // done: + + if (type == DataType::Type::kInt64) { + __ cmpq(out, op2); + __ cmov(is_min ? Condition::kGreater : Condition::kLess, out, op2, /*is64bit*/ true); + } else { + DCHECK_EQ(type, DataType::Type::kInt32); + __ cmpl(out, op2); + __ cmov(is_min ? Condition::kGreater : Condition::kLess, out, op2, /*is64bit*/ false); + } +} + +void InstructionCodeGeneratorX86_64::GenerateMinMaxFP(LocationSummary* locations, + bool is_min, + DataType::Type type) { + Location op1_loc = locations->InAt(0); + Location op2_loc = locations->InAt(1); + Location out_loc = locations->Out(); + XmmRegister out = out_loc.AsFpuRegister<XmmRegister>(); + + // Shortcut for same input locations. 
+ if (op1_loc.Equals(op2_loc)) { + DCHECK(out_loc.Equals(op1_loc)); + return; + } + + // (out := op1) + // out <=? op2 + // if Nan jmp Nan_label + // if out is min jmp done + // if op2 is min jmp op2_label + // handle -0/+0 + // jmp done + // Nan_label: + // out := NaN + // op2_label: + // out := op2 + // done: + // + // This removes one jmp, but needs to copy one input (op1) to out. + // + // TODO: This is straight from Quick. Make NaN an out-of-line slowpath? + + XmmRegister op2 = op2_loc.AsFpuRegister<XmmRegister>(); + + NearLabel nan, done, op2_label; + if (type == DataType::Type::kFloat64) { + __ ucomisd(out, op2); + } else { + DCHECK_EQ(type, DataType::Type::kFloat32); + __ ucomiss(out, op2); + } + + __ j(Condition::kParityEven, &nan); + + __ j(is_min ? Condition::kAbove : Condition::kBelow, &op2_label); + __ j(is_min ? Condition::kBelow : Condition::kAbove, &done); + + // Handle 0.0/-0.0. + if (is_min) { + if (type == DataType::Type::kFloat64) { + __ orpd(out, op2); + } else { + __ orps(out, op2); + } + } else { + if (type == DataType::Type::kFloat64) { + __ andpd(out, op2); + } else { + __ andps(out, op2); + } + } + __ jmp(&done); + + // NaN handling. + __ Bind(&nan); + if (type == DataType::Type::kFloat64) { + __ movsd(out, codegen_->LiteralInt64Address(INT64_C(0x7FF8000000000000))); + } else { + __ movss(out, codegen_->LiteralInt32Address(INT32_C(0x7FC00000))); + } + __ jmp(&done); + + // out := op2; + __ Bind(&op2_label); + if (type == DataType::Type::kFloat64) { + __ movsd(out, op2); + } else { + __ movss(out, op2); + } + + // Done. + __ Bind(&done); +} + +void InstructionCodeGeneratorX86_64::GenerateMinMax(HBinaryOperation* minmax, bool is_min) { + DataType::Type type = minmax->GetResultType(); + switch (type) { + case DataType::Type::kInt32: + case DataType::Type::kInt64: + GenerateMinMaxInt(minmax->GetLocations(), is_min, type); + break; + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + GenerateMinMaxFP(minmax->GetLocations(), is_min, type); + break; + default: + LOG(FATAL) << "Unexpected type for HMinMax " << type; + } +} + +void LocationsBuilderX86_64::VisitMin(HMin* min) { + CreateMinMaxLocations(GetGraph()->GetAllocator(), min); +} + +void InstructionCodeGeneratorX86_64::VisitMin(HMin* min) { + GenerateMinMax(min, /*is_min*/ true); +} + +void LocationsBuilderX86_64::VisitMax(HMax* max) { + CreateMinMaxLocations(GetGraph()->GetAllocator(), max); +} + +void InstructionCodeGeneratorX86_64::VisitMax(HMax* max) { + GenerateMinMax(max, /*is_min*/ false); +} + void LocationsBuilderX86_64::VisitAbs(HAbs* abs) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(abs); switch (abs->GetResultType()) { @@ -5535,7 +5725,7 @@ HLoadClass::LoadKind CodeGeneratorX86_64::GetSupportedLoadClassKind( case HLoadClass::LoadKind::kReferrersClass: break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: - case HLoadClass::LoadKind::kBootImageClassTable: + case HLoadClass::LoadKind::kBootImageRelRo: case HLoadClass::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; @@ -5643,16 +5833,10 @@ void InstructionCodeGeneratorX86_64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S __ movl(out, Immediate(static_cast<int32_t>(address))); // Zero-extended. 
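Aside: the "Zero-extended" comment above is load-bearing. As with the kBootImageRelRo note earlier in this file, the boot image is mapped in the low 4GiB, so a movl of the 32-bit immediate zero-extends to the correct 64-bit address and avoids the longer movq encoding.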
break; } - case HLoadClass::LoadKind::kBootImageClassTable: { + case HLoadClass::LoadKind::kBootImageRelRo: { DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); __ movl(out, Address::Absolute(CodeGeneratorX86_64::kDummy32BitOffset, /* no_rip */ false)); - codegen_->RecordBootImageTypePatch(cls); - // Extract the reference from the slot data, i.e. clear the hash bits. - int32_t masked_hash = ClassTable::TableSlot::MaskHash( - ComputeModifiedUtf8Hash(cls->GetDexFile().StringByTypeIdx(cls->GetTypeIndex()))); - if (masked_hash != 0) { - __ subl(out, Immediate(masked_hash)); - } + codegen_->RecordBootImageRelRoPatch(codegen_->GetBootImageOffset(cls)); break; } case HLoadClass::LoadKind::kBssEntry: { @@ -5717,7 +5901,7 @@ HLoadString::LoadKind CodeGeneratorX86_64::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { switch (desired_string_load_kind) { case HLoadString::LoadKind::kBootImageLinkTimePcRelative: - case HLoadString::LoadKind::kBootImageInternTable: + case HLoadString::LoadKind::kBootImageRelRo: case HLoadString::LoadKind::kBssEntry: DCHECK(!Runtime::Current()->UseJitCompilation()); break; @@ -5783,10 +5967,10 @@ void InstructionCodeGeneratorX86_64::VisitLoadString(HLoadString* load) NO_THREA __ movl(out, Immediate(static_cast<int32_t>(address))); // Zero-extended. return; } - case HLoadString::LoadKind::kBootImageInternTable: { + case HLoadString::LoadKind::kBootImageRelRo: { DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); __ movl(out, Address::Absolute(CodeGeneratorX86_64::kDummy32BitOffset, /* no_rip */ false)); - codegen_->RecordBootImageStringPatch(load); + codegen_->RecordBootImageRelRoPatch(codegen_->GetBootImageOffset(load)); return; } case HLoadString::LoadKind::kBssEntry: { diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index 1079e94dfc..e8d1efe702 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -222,6 +222,10 @@ class InstructionCodeGeneratorX86_64 : public InstructionCodeGenerator { bool value_can_be_null); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); + void GenerateMinMaxInt(LocationSummary* locations, bool is_min, DataType::Type type); + void GenerateMinMaxFP(LocationSummary* locations, bool is_min, DataType::Type type); + void GenerateMinMax(HBinaryOperation* minmax, bool is_min); + // Generate a heap reference load using one register `out`: // // out <- *(out + offset) @@ -410,6 +414,7 @@ class CodeGeneratorX86_64 : public CodeGenerator { void GenerateVirtualCall( HInvokeVirtual* invoke, Location temp, SlowPathCode* slow_path = nullptr) OVERRIDE; + void RecordBootImageRelRoPatch(uint32_t boot_image_offset); void RecordBootImageMethodPatch(HInvokeStaticOrDirect* invoke); void RecordMethodBssEntryPatch(HInvokeStaticOrDirect* invoke); void RecordBootImageTypePatch(HLoadClass* load_class); @@ -604,17 +609,18 @@ class CodeGeneratorX86_64 : public CodeGenerator { // Used for fixups to the constant area. int constant_area_start_; - // PC-relative method patch info for kBootImageLinkTimePcRelative. + // PC-relative method patch info for kBootImageLinkTimePcRelative/kBootImageRelRo. + // Also used for type/string patches for kBootImageRelRo (same linker patch as for methods). ArenaDeque<PatchInfo<Label>> boot_image_method_patches_; // PC-relative method patch info for kBssEntry. 
ArenaDeque<PatchInfo<Label>> method_bss_entry_patches_; // PC-relative type patch info for kBootImageLinkTimePcRelative. ArenaDeque<PatchInfo<Label>> boot_image_type_patches_; - // Type patch locations for kBssEntry. + // PC-relative type patch info for kBssEntry. ArenaDeque<PatchInfo<Label>> type_bss_entry_patches_; - // String patch locations; type depends on configuration (intern table or boot image PIC). + // PC-relative String patch info for kBootImageLinkTimePcRelative. ArenaDeque<PatchInfo<Label>> boot_image_string_patches_; - // String patch locations for kBssEntry. + // PC-relative String patch info for kBssEntry. ArenaDeque<PatchInfo<Label>> string_bss_entry_patches_; // Patches for string literals in JIT compiled code. diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc index ba4040acad..a0fd5ffcb1 100644 --- a/compiler/optimizing/codegen_test.cc +++ b/compiler/optimizing/codegen_test.cc @@ -18,6 +18,7 @@ #include <memory> #include "base/macros.h" +#include "base/utils.h" #include "builder.h" #include "codegen_test_utils.h" #include "dex/dex_file.h" @@ -26,7 +27,6 @@ #include "nodes.h" #include "optimizing_unit_test.h" #include "register_allocator_linear_scan.h" -#include "utils.h" #include "utils/arm/assembler_arm_vixl.h" #include "utils/arm/managed_register_arm.h" #include "utils/mips/managed_register_mips.h" diff --git a/compiler/optimizing/gvn.cc b/compiler/optimizing/gvn.cc index 71c394ec1f..f05159b735 100644 --- a/compiler/optimizing/gvn.cc +++ b/compiler/optimizing/gvn.cc @@ -17,11 +17,11 @@ #include "gvn.h" #include "base/arena_bit_vector.h" +#include "base/bit_vector-inl.h" #include "base/scoped_arena_allocator.h" #include "base/scoped_arena_containers.h" -#include "base/bit_vector-inl.h" +#include "base/utils.h" #include "side_effects_analysis.h" -#include "utils.h" namespace art { diff --git a/compiler/optimizing/induction_var_range.cc b/compiler/optimizing/induction_var_range.cc index d699d01ecb..0a310ca940 100644 --- a/compiler/optimizing/induction_var_range.cc +++ b/compiler/optimizing/induction_var_range.cc @@ -78,16 +78,10 @@ static bool IsGEZero(HInstruction* instruction) { DCHECK(instruction != nullptr); if (instruction->IsArrayLength()) { return true; - } else if (instruction->IsInvokeStaticOrDirect()) { - switch (instruction->AsInvoke()->GetIntrinsic()) { - case Intrinsics::kMathMinIntInt: - case Intrinsics::kMathMinLongLong: - // Instruction MIN(>=0, >=0) is >= 0. - return IsGEZero(instruction->InputAt(0)) && - IsGEZero(instruction->InputAt(1)); - default: - break; - } + } else if (instruction->IsMin()) { + // Instruction MIN(>=0, >=0) is >= 0. + return IsGEZero(instruction->InputAt(0)) && + IsGEZero(instruction->InputAt(1)); } else if (instruction->IsAbs()) { // Instruction ABS(>=0) is >= 0. // NOTE: ABS(minint) = minint prevents assuming @@ -101,21 +95,14 @@ static bool IsGEZero(HInstruction* instruction) { /** Hunts "under the hood" for a suitable instruction at the hint. */ static bool IsMaxAtHint( HInstruction* instruction, HInstruction* hint, /*out*/HInstruction** suitable) { - if (instruction->IsInvokeStaticOrDirect()) { - switch (instruction->AsInvoke()->GetIntrinsic()) { - case Intrinsics::kMathMinIntInt: - case Intrinsics::kMathMinLongLong: - // For MIN(x, y), return most suitable x or y as maximum. 
- return IsMaxAtHint(instruction->InputAt(0), hint, suitable) || - IsMaxAtHint(instruction->InputAt(1), hint, suitable); - default: - break; - } + if (instruction->IsMin()) { + // For MIN(x, y), return most suitable x or y as maximum. + return IsMaxAtHint(instruction->InputAt(0), hint, suitable) || + IsMaxAtHint(instruction->InputAt(1), hint, suitable); } else { *suitable = instruction; return HuntForDeclaration(instruction) == hint; } - return false; } /** Post-analysis simplification of a minimum value that makes the bound more useful to clients. */ @@ -364,11 +351,11 @@ void InductionVarRange::Replace(HInstruction* instruction, } } -bool InductionVarRange::IsFinite(HLoopInformation* loop, /*out*/ int64_t* tc) const { +bool InductionVarRange::IsFinite(HLoopInformation* loop, /*out*/ int64_t* trip_count) const { HInductionVarAnalysis::InductionInfo *trip = induction_analysis_->LookupInfo(loop, GetLoopControl(loop)); if (trip != nullptr && !IsUnsafeTripCount(trip)) { - IsConstant(trip->op_a, kExact, tc); + IsConstant(trip->op_a, kExact, trip_count); return true; } return false; diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc index eac8d2f593..34837700a2 100644 --- a/compiler/optimizing/instruction_simplifier.cc +++ b/compiler/optimizing/instruction_simplifier.cc @@ -120,6 +120,8 @@ class InstructionSimplifierVisitor : public HGraphDelegateVisitor { void SimplifyReturnThis(HInvoke* invoke); void SimplifyAllocationIntrinsic(HInvoke* invoke); void SimplifyMemBarrier(HInvoke* invoke, MemBarrierKind barrier_kind); + void SimplifyMin(HInvoke* invoke, DataType::Type type); + void SimplifyMax(HInvoke* invoke, DataType::Type type); void SimplifyAbs(HInvoke* invoke, DataType::Type type); CodeGenerator* codegen_; @@ -2407,6 +2409,20 @@ void InstructionSimplifierVisitor::SimplifyMemBarrier(HInvoke* invoke, invoke->GetBlock()->ReplaceAndRemoveInstructionWith(invoke, mem_barrier); } +void InstructionSimplifierVisitor::SimplifyMin(HInvoke* invoke, DataType::Type type) { + DCHECK(invoke->IsInvokeStaticOrDirect()); + HMin* min = new (GetGraph()->GetAllocator()) + HMin(type, invoke->InputAt(0), invoke->InputAt(1), invoke->GetDexPc()); + invoke->GetBlock()->ReplaceAndRemoveInstructionWith(invoke, min); +} + +void InstructionSimplifierVisitor::SimplifyMax(HInvoke* invoke, DataType::Type type) { + DCHECK(invoke->IsInvokeStaticOrDirect()); + HMax* max = new (GetGraph()->GetAllocator()) + HMax(type, invoke->InputAt(0), invoke->InputAt(1), invoke->GetDexPc()); + invoke->GetBlock()->ReplaceAndRemoveInstructionWith(invoke, max); +} + void InstructionSimplifierVisitor::SimplifyAbs(HInvoke* invoke, DataType::Type type) { DCHECK(invoke->IsInvokeStaticOrDirect()); HAbs* abs = new (GetGraph()->GetAllocator()) @@ -2497,6 +2513,30 @@ void InstructionSimplifierVisitor::VisitInvoke(HInvoke* instruction) { case Intrinsics::kVarHandleStoreStoreFence: SimplifyMemBarrier(instruction, MemBarrierKind::kStoreStore); break; + case Intrinsics::kMathMinIntInt: + SimplifyMin(instruction, DataType::Type::kInt32); + break; + case Intrinsics::kMathMinLongLong: + SimplifyMin(instruction, DataType::Type::kInt64); + break; + case Intrinsics::kMathMinFloatFloat: + SimplifyMin(instruction, DataType::Type::kFloat32); + break; + case Intrinsics::kMathMinDoubleDouble: + SimplifyMin(instruction, DataType::Type::kFloat64); + break; + case Intrinsics::kMathMaxIntInt: + SimplifyMax(instruction, DataType::Type::kInt32); + break; + case Intrinsics::kMathMaxLongLong: + SimplifyMax(instruction, 
DataType::Type::kInt64); + break; + case Intrinsics::kMathMaxFloatFloat: + SimplifyMax(instruction, DataType::Type::kFloat32); + break; + case Intrinsics::kMathMaxDoubleDouble: + SimplifyMax(instruction, DataType::Type::kFloat64); + break; case Intrinsics::kMathAbsInt: SimplifyAbs(instruction, DataType::Type::kInt32); break; diff --git a/compiler/optimizing/intrinsics.cc b/compiler/optimizing/intrinsics.cc index acb830e524..f8dc316e45 100644 --- a/compiler/optimizing/intrinsics.cc +++ b/compiler/optimizing/intrinsics.cc @@ -18,6 +18,7 @@ #include "art_field-inl.h" #include "art_method-inl.h" +#include "base/utils.h" #include "class_linker.h" #include "dex/invoke_type.h" #include "driver/compiler_driver.h" @@ -26,7 +27,6 @@ #include "nodes.h" #include "scoped_thread_state_change-inl.h" #include "thread-current-inl.h" -#include "utils.h" namespace art { diff --git a/compiler/optimizing/intrinsics.h b/compiler/optimizing/intrinsics.h index 24aff226ca..1035cbc2c4 100644 --- a/compiler/optimizing/intrinsics.h +++ b/compiler/optimizing/intrinsics.h @@ -266,6 +266,14 @@ void IntrinsicCodeGenerator ## Arch::Visit ## Name(HInvoke* invoke) { \ << " should have been converted to HIR"; \ } #define UNREACHABLE_INTRINSICS(Arch) \ +UNREACHABLE_INTRINSIC(Arch, MathMinIntInt) \ +UNREACHABLE_INTRINSIC(Arch, MathMinLongLong) \ +UNREACHABLE_INTRINSIC(Arch, MathMinFloatFloat) \ +UNREACHABLE_INTRINSIC(Arch, MathMinDoubleDouble) \ +UNREACHABLE_INTRINSIC(Arch, MathMaxIntInt) \ +UNREACHABLE_INTRINSIC(Arch, MathMaxLongLong) \ +UNREACHABLE_INTRINSIC(Arch, MathMaxFloatFloat) \ +UNREACHABLE_INTRINSIC(Arch, MathMaxDoubleDouble) \ UNREACHABLE_INTRINSIC(Arch, MathAbsInt) \ UNREACHABLE_INTRINSIC(Arch, MathAbsLong) \ UNREACHABLE_INTRINSIC(Arch, MathAbsFloat) \ diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 0b04fff927..81c0b50932 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -344,14 +344,6 @@ void IntrinsicCodeGeneratorARM64::VisitShortReverseBytes(HInvoke* invoke) { GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt16, GetVIXLAssembler()); } -static void CreateIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) { - LocationSummary* locations = - new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); -} - static void GenNumberOfLeadingZeros(LocationSummary* locations, DataType::Type type, MacroAssembler* masm) { @@ -529,113 +521,6 @@ void IntrinsicCodeGeneratorARM64::VisitLongLowestOneBit(HInvoke* invoke) { GenLowestOneBit(invoke, DataType::Type::kInt64, GetVIXLAssembler()); } -static void GenMinMaxFP(LocationSummary* locations, - bool is_min, - bool is_double, - MacroAssembler* masm) { - Location op1 = locations->InAt(0); - Location op2 = locations->InAt(1); - Location out = locations->Out(); - - FPRegister op1_reg = is_double ? DRegisterFrom(op1) : SRegisterFrom(op1); - FPRegister op2_reg = is_double ? DRegisterFrom(op2) : SRegisterFrom(op2); - FPRegister out_reg = is_double ? 
DRegisterFrom(out) : SRegisterFrom(out); - if (is_min) { - __ Fmin(out_reg, op1_reg, op2_reg); - } else { - __ Fmax(out_reg, op1_reg, op2_reg); - } -} - -static void CreateFPFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) { - LocationSummary* locations = - new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); - locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); - locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); -} - -void IntrinsicLocationsBuilderARM64::VisitMathMinDoubleDouble(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorARM64::VisitMathMinDoubleDouble(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), /* is_min */ true, /* is_double */ true, GetVIXLAssembler()); -} - -void IntrinsicLocationsBuilderARM64::VisitMathMinFloatFloat(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorARM64::VisitMathMinFloatFloat(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), /* is_min */ true, /* is_double */ false, GetVIXLAssembler()); -} - -void IntrinsicLocationsBuilderARM64::VisitMathMaxDoubleDouble(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorARM64::VisitMathMaxDoubleDouble(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), /* is_min */ false, /* is_double */ true, GetVIXLAssembler()); -} - -void IntrinsicLocationsBuilderARM64::VisitMathMaxFloatFloat(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorARM64::VisitMathMaxFloatFloat(HInvoke* invoke) { - GenMinMaxFP( - invoke->GetLocations(), /* is_min */ false, /* is_double */ false, GetVIXLAssembler()); -} - -static void GenMinMax(LocationSummary* locations, - bool is_min, - bool is_long, - MacroAssembler* masm) { - Location op1 = locations->InAt(0); - Location op2 = locations->InAt(1); - Location out = locations->Out(); - - Register op1_reg = is_long ? XRegisterFrom(op1) : WRegisterFrom(op1); - Register op2_reg = is_long ? XRegisterFrom(op2) : WRegisterFrom(op2); - Register out_reg = is_long ? XRegisterFrom(out) : WRegisterFrom(out); - - __ Cmp(op1_reg, op2_reg); - __ Csel(out_reg, op1_reg, op2_reg, is_min ? 
lt : gt); -} - -void IntrinsicLocationsBuilderARM64::VisitMathMinIntInt(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorARM64::VisitMathMinIntInt(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ false, GetVIXLAssembler()); -} - -void IntrinsicLocationsBuilderARM64::VisitMathMinLongLong(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorARM64::VisitMathMinLongLong(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ true, GetVIXLAssembler()); -} - -void IntrinsicLocationsBuilderARM64::VisitMathMaxIntInt(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorARM64::VisitMathMaxIntInt(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ false, GetVIXLAssembler()); -} - -void IntrinsicLocationsBuilderARM64::VisitMathMaxLongLong(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorARM64::VisitMathMaxLongLong(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ true, GetVIXLAssembler()); -} - static void CreateFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) { LocationSummary* locations = new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc index e351fcc706..e61a0b0809 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.cc +++ b/compiler/optimizing/intrinsics_arm_vixl.cc @@ -432,264 +432,6 @@ void IntrinsicCodeGeneratorARMVIXL::VisitLongNumberOfTrailingZeros(HInvoke* invo GenNumberOfTrailingZeros(invoke, DataType::Type::kInt64, codegen_); } -static void GenMinMaxFloat(HInvoke* invoke, bool is_min, CodeGeneratorARMVIXL* codegen) { - ArmVIXLAssembler* assembler = codegen->GetAssembler(); - Location op1_loc = invoke->GetLocations()->InAt(0); - Location op2_loc = invoke->GetLocations()->InAt(1); - Location out_loc = invoke->GetLocations()->Out(); - - // Optimization: don't generate any code if inputs are the same. - if (op1_loc.Equals(op2_loc)) { - DCHECK(out_loc.Equals(op1_loc)); // out_loc is set as SameAsFirstInput() in location builder. - return; - } - - vixl32::SRegister op1 = SRegisterFrom(op1_loc); - vixl32::SRegister op2 = SRegisterFrom(op2_loc); - vixl32::SRegister out = OutputSRegister(invoke); - UseScratchRegisterScope temps(assembler->GetVIXLAssembler()); - const vixl32::Register temp1 = temps.Acquire(); - vixl32::Register temp2 = RegisterFrom(invoke->GetLocations()->GetTemp(0)); - vixl32::Label nan, done; - vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &done); - - DCHECK(op1.Is(out)); - - __ Vcmp(op1, op2); - __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR); - __ B(vs, &nan, /* far_target */ false); // if un-ordered, go to NaN handling. - - // op1 <> op2 - vixl32::ConditionType cond = is_min ? gt : lt; - { - ExactAssemblyScope it_scope(assembler->GetVIXLAssembler(), - 2 * kMaxInstructionSizeInBytes, - CodeBufferCheckScope::kMaximumSize); - __ it(cond); - __ vmov(cond, F32, out, op2); - } - // for <>(not equal), we've done min/max calculation. - __ B(ne, final_label, /* far_target */ false); - - // handle op1 == op2, max(+0.0,-0.0), min(+0.0,-0.0). 
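
The Vmov/Orr/And sequence that follows resolves the numerically-equal case, including min(+0.0, -0.0) and max(+0.0, -0.0), on raw bit patterns: the two zeros differ only in the sign bit, so OR keeps a set sign bit (yielding -0.0, the correct minimum) and AND clears it (yielding +0.0, the correct maximum). A self-contained sketch of the same trick, illustrative only:

    #include <cstdint>
    #include <cstring>

    // Assumes a == b numerically, so at most the sign bit differs.
    float MinOfEqualFloats(float a, float b) {
      uint32_t ua, ub;
      std::memcpy(&ua, &a, sizeof(ua));
      std::memcpy(&ub, &b, sizeof(ub));
      uint32_t bits = ua | ub;  // OR: min(+0.0f, -0.0f) == -0.0f; AND for max
      float out;
      std::memcpy(&out, &bits, sizeof(out));
      return out;
    }
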
- __ Vmov(temp1, op1); - __ Vmov(temp2, op2); - if (is_min) { - __ Orr(temp1, temp1, temp2); - } else { - __ And(temp1, temp1, temp2); - } - __ Vmov(out, temp1); - __ B(final_label); - - // handle NaN input. - __ Bind(&nan); - __ Movt(temp1, High16Bits(kNanFloat)); // 0x7FC0xxxx is a NaN. - __ Vmov(out, temp1); - - if (done.IsReferenced()) { - __ Bind(&done); - } -} - -static void CreateFPFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) { - LocationSummary* locations = - new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); - locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); - locations->SetOut(Location::SameAsFirstInput()); -} - -void IntrinsicLocationsBuilderARMVIXL::VisitMathMinFloatFloat(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); - invoke->GetLocations()->AddTemp(Location::RequiresRegister()); -} - -void IntrinsicCodeGeneratorARMVIXL::VisitMathMinFloatFloat(HInvoke* invoke) { - GenMinMaxFloat(invoke, /* is_min */ true, codegen_); -} - -void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxFloatFloat(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); - invoke->GetLocations()->AddTemp(Location::RequiresRegister()); -} - -void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxFloatFloat(HInvoke* invoke) { - GenMinMaxFloat(invoke, /* is_min */ false, codegen_); -} - -static void GenMinMaxDouble(HInvoke* invoke, bool is_min, CodeGeneratorARMVIXL* codegen) { - ArmVIXLAssembler* assembler = codegen->GetAssembler(); - Location op1_loc = invoke->GetLocations()->InAt(0); - Location op2_loc = invoke->GetLocations()->InAt(1); - Location out_loc = invoke->GetLocations()->Out(); - - // Optimization: don't generate any code if inputs are the same. - if (op1_loc.Equals(op2_loc)) { - DCHECK(out_loc.Equals(op1_loc)); // out_loc is set as SameAsFirstInput() in. - return; - } - - vixl32::DRegister op1 = DRegisterFrom(op1_loc); - vixl32::DRegister op2 = DRegisterFrom(op2_loc); - vixl32::DRegister out = OutputDRegister(invoke); - vixl32::Label handle_nan_eq, done; - vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &done); - - DCHECK(op1.Is(out)); - - __ Vcmp(op1, op2); - __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR); - __ B(vs, &handle_nan_eq, /* far_target */ false); // if un-ordered, go to NaN handling. - - // op1 <> op2 - vixl32::ConditionType cond = is_min ? gt : lt; - { - ExactAssemblyScope it_scope(assembler->GetVIXLAssembler(), - 2 * kMaxInstructionSizeInBytes, - CodeBufferCheckScope::kMaximumSize); - __ it(cond); - __ vmov(cond, F64, out, op2); - } - // for <>(not equal), we've done min/max calculation. - __ B(ne, final_label, /* far_target */ false); - - // handle op1 == op2, max(+0.0,-0.0). - if (!is_min) { - __ Vand(F64, out, op1, op2); - __ B(final_label); - } - - // handle op1 == op2, min(+0.0,-0.0), NaN input. - __ Bind(&handle_nan_eq); - __ Vorr(F64, out, op1, op2); // assemble op1/-0.0/NaN. 
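
Every backend min/max sequence deleted in this change (ARM above, MIPS and x86 below) encodes the same two Java rules that the new shared HMin/HMax nodes introduced by the instruction simplifier must also preserve: a NaN input wins over any number, and -0.0 orders below +0.0. A hedged C++ reference of those semantics, not ART code:

    #include <cmath>
    #include <cstdint>

    int32_t JavaMin(int32_t a, int32_t b) {
      return a < b ? a : b;  // integral case: plain compare-and-select
    }

    double JavaMin(double a, double b) {
      if (std::isnan(a) || std::isnan(b)) {
        return std::nan("");             // NaN beats any number
      }
      if (a == 0.0 && b == 0.0) {
        return std::signbit(a) ? a : b;  // -0.0 is the smaller zero
      }
      return a < b ? a : b;
    }
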
- - if (done.IsReferenced()) { - __ Bind(&done); - } -} - -void IntrinsicLocationsBuilderARMVIXL::VisitMathMinDoubleDouble(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorARMVIXL::VisitMathMinDoubleDouble(HInvoke* invoke) { - GenMinMaxDouble(invoke, /* is_min */ true , codegen_); -} - -void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxDoubleDouble(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxDoubleDouble(HInvoke* invoke) { - GenMinMaxDouble(invoke, /* is_min */ false, codegen_); -} - -static void GenMinMaxLong(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) { - Location op1_loc = invoke->GetLocations()->InAt(0); - Location op2_loc = invoke->GetLocations()->InAt(1); - Location out_loc = invoke->GetLocations()->Out(); - - // Optimization: don't generate any code if inputs are the same. - if (op1_loc.Equals(op2_loc)) { - DCHECK(out_loc.Equals(op1_loc)); // out_loc is set as SameAsFirstInput() in location builder. - return; - } - - vixl32::Register op1_lo = LowRegisterFrom(op1_loc); - vixl32::Register op1_hi = HighRegisterFrom(op1_loc); - vixl32::Register op2_lo = LowRegisterFrom(op2_loc); - vixl32::Register op2_hi = HighRegisterFrom(op2_loc); - vixl32::Register out_lo = LowRegisterFrom(out_loc); - vixl32::Register out_hi = HighRegisterFrom(out_loc); - UseScratchRegisterScope temps(assembler->GetVIXLAssembler()); - const vixl32::Register temp = temps.Acquire(); - - DCHECK(op1_lo.Is(out_lo)); - DCHECK(op1_hi.Is(out_hi)); - - // Compare op1 >= op2, or op1 < op2. - __ Cmp(out_lo, op2_lo); - __ Sbcs(temp, out_hi, op2_hi); - - // Now GE/LT condition code is correct for the long comparison. - { - vixl32::ConditionType cond = is_min ? ge : lt; - ExactAssemblyScope it_scope(assembler->GetVIXLAssembler(), - 3 * kMaxInstructionSizeInBytes, - CodeBufferCheckScope::kMaximumSize); - __ itt(cond); - __ mov(cond, out_lo, op2_lo); - __ mov(cond, out_hi, op2_hi); - } -} - -static void CreateLongLongToLongLocations(ArenaAllocator* allocator, HInvoke* invoke) { - LocationSummary* locations = - new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); - locations->SetOut(Location::SameAsFirstInput()); -} - -void IntrinsicLocationsBuilderARMVIXL::VisitMathMinLongLong(HInvoke* invoke) { - CreateLongLongToLongLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorARMVIXL::VisitMathMinLongLong(HInvoke* invoke) { - GenMinMaxLong(invoke, /* is_min */ true, GetAssembler()); -} - -void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxLongLong(HInvoke* invoke) { - CreateLongLongToLongLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxLongLong(HInvoke* invoke) { - GenMinMaxLong(invoke, /* is_min */ false, GetAssembler()); -} - -static void GenMinMax(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) { - vixl32::Register op1 = InputRegisterAt(invoke, 0); - vixl32::Register op2 = InputRegisterAt(invoke, 1); - vixl32::Register out = OutputRegister(invoke); - - __ Cmp(op1, op2); - - { - ExactAssemblyScope aas(assembler->GetVIXLAssembler(), - 3 * kMaxInstructionSizeInBytes, - CodeBufferCheckScope::kMaximumSize); - - __ ite(is_min ? lt : gt); - __ mov(is_min ? lt : gt, out, op1); - __ mov(is_min ? 
ge : le, out, op2); - } -} - -static void CreateIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) { - LocationSummary* locations = - new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); -} - -void IntrinsicLocationsBuilderARMVIXL::VisitMathMinIntInt(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorARMVIXL::VisitMathMinIntInt(HInvoke* invoke) { - GenMinMax(invoke, /* is_min */ true, GetAssembler()); -} - -void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxIntInt(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxIntInt(HInvoke* invoke) { - GenMinMax(invoke, /* is_min */ false, GetAssembler()); -} - void IntrinsicLocationsBuilderARMVIXL::VisitMathSqrt(HInvoke* invoke) { CreateFPToFPLocations(allocator_, invoke); } diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc index 6d6ff7576b..bc1292b2b7 100644 --- a/compiler/optimizing/intrinsics_mips.cc +++ b/compiler/optimizing/intrinsics_mips.cc @@ -58,6 +58,10 @@ inline bool IntrinsicCodeGeneratorMIPS::Is32BitFPU() const { return codegen_->GetInstructionSetFeatures().Is32BitFloatingPoint(); } +inline bool IntrinsicCodeGeneratorMIPS::HasMsa() const { + return codegen_->GetInstructionSetFeatures().HasMsa(); +} + #define __ codegen->GetAssembler()-> static void MoveFromReturnRegister(Location trg, @@ -612,6 +616,7 @@ static void CreateFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) { static void GenBitCount(LocationSummary* locations, DataType::Type type, bool isR6, + bool hasMsa, MipsAssembler* assembler) { Register out = locations->Out().AsRegister<Register>(); @@ -637,85 +642,102 @@ static void GenBitCount(LocationSummary* locations, // instructions compared to a loop-based algorithm which required 47 // instructions. 
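
The hunk below does not change this divide-and-conquer bit count; it only re-indents it under a new hasMsa guard and adds a fast path that moves the value into an FPU register and counts it with a single MSA PcntW/PcntD instruction. For reference, the fallback algorithm rendered in C++ with the same constants as the assembly (illustrative):

    #include <cstdint>

    uint32_t PopCount32(uint32_t x) {
      x = x - ((x >> 1) & 0x55555555u);                  // 2-bit subtotals
      x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);  // 4-bit subtotals
      x = (x + (x >> 4)) & 0x0F0F0F0Fu;                  // 8-bit subtotals
      return (x * 0x01010101u) >> 24;                    // sum the four bytes
    }
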
- if (type == DataType::Type::kInt32) { - Register in = locations->InAt(0).AsRegister<Register>(); - - __ Srl(TMP, in, 1); - __ LoadConst32(AT, 0x55555555); - __ And(TMP, TMP, AT); - __ Subu(TMP, in, TMP); - __ LoadConst32(AT, 0x33333333); - __ And(out, TMP, AT); - __ Srl(TMP, TMP, 2); - __ And(TMP, TMP, AT); - __ Addu(TMP, out, TMP); - __ Srl(out, TMP, 4); - __ Addu(out, out, TMP); - __ LoadConst32(AT, 0x0F0F0F0F); - __ And(out, out, AT); - __ LoadConst32(TMP, 0x01010101); - if (isR6) { - __ MulR6(out, out, TMP); + if (hasMsa) { + if (type == DataType::Type::kInt32) { + Register in = locations->InAt(0).AsRegister<Register>(); + __ Mtc1(in, FTMP); + __ PcntW(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP)); + __ Mfc1(out, FTMP); } else { - __ MulR2(out, out, TMP); + DCHECK_EQ(type, DataType::Type::kInt64); + Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>(); + Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>(); + __ Mtc1(in_lo, FTMP); + __ Mthc1(in_hi, FTMP); + __ PcntD(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP)); + __ Mfc1(out, FTMP); } - __ Srl(out, out, 24); } else { - DCHECK_EQ(type, DataType::Type::kInt64); - Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>(); - Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>(); - Register tmp_hi = locations->GetTemp(0).AsRegister<Register>(); - Register out_hi = locations->GetTemp(1).AsRegister<Register>(); - Register tmp_lo = TMP; - Register out_lo = out; + if (type == DataType::Type::kInt32) { + Register in = locations->InAt(0).AsRegister<Register>(); - __ Srl(tmp_lo, in_lo, 1); - __ Srl(tmp_hi, in_hi, 1); + __ Srl(TMP, in, 1); + __ LoadConst32(AT, 0x55555555); + __ And(TMP, TMP, AT); + __ Subu(TMP, in, TMP); + __ LoadConst32(AT, 0x33333333); + __ And(out, TMP, AT); + __ Srl(TMP, TMP, 2); + __ And(TMP, TMP, AT); + __ Addu(TMP, out, TMP); + __ Srl(out, TMP, 4); + __ Addu(out, out, TMP); + __ LoadConst32(AT, 0x0F0F0F0F); + __ And(out, out, AT); + __ LoadConst32(TMP, 0x01010101); + if (isR6) { + __ MulR6(out, out, TMP); + } else { + __ MulR2(out, out, TMP); + } + __ Srl(out, out, 24); + } else { + DCHECK_EQ(type, DataType::Type::kInt64); + Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>(); + Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>(); + Register tmp_hi = locations->GetTemp(0).AsRegister<Register>(); + Register out_hi = locations->GetTemp(1).AsRegister<Register>(); + Register tmp_lo = TMP; + Register out_lo = out; - __ LoadConst32(AT, 0x55555555); + __ Srl(tmp_lo, in_lo, 1); + __ Srl(tmp_hi, in_hi, 1); - __ And(tmp_lo, tmp_lo, AT); - __ Subu(tmp_lo, in_lo, tmp_lo); + __ LoadConst32(AT, 0x55555555); - __ And(tmp_hi, tmp_hi, AT); - __ Subu(tmp_hi, in_hi, tmp_hi); + __ And(tmp_lo, tmp_lo, AT); + __ Subu(tmp_lo, in_lo, tmp_lo); - __ LoadConst32(AT, 0x33333333); + __ And(tmp_hi, tmp_hi, AT); + __ Subu(tmp_hi, in_hi, tmp_hi); - __ And(out_lo, tmp_lo, AT); - __ Srl(tmp_lo, tmp_lo, 2); - __ And(tmp_lo, tmp_lo, AT); - __ Addu(tmp_lo, out_lo, tmp_lo); + __ LoadConst32(AT, 0x33333333); - __ And(out_hi, tmp_hi, AT); - __ Srl(tmp_hi, tmp_hi, 2); - __ And(tmp_hi, tmp_hi, AT); - __ Addu(tmp_hi, out_hi, tmp_hi); + __ And(out_lo, tmp_lo, AT); + __ Srl(tmp_lo, tmp_lo, 2); + __ And(tmp_lo, tmp_lo, AT); + __ Addu(tmp_lo, out_lo, tmp_lo); - // Here we deviate from the original algorithm a bit. 
We've reached - // the stage where the bitfields holding the subtotals are large - // enough to hold the combined subtotals for both the low word, and - // the high word. This means that we can add the subtotals for the - // the high, and low words into a single word, and compute the final - // result for both the high, and low words using fewer instructions. - __ LoadConst32(AT, 0x0F0F0F0F); + __ And(out_hi, tmp_hi, AT); + __ Srl(tmp_hi, tmp_hi, 2); + __ And(tmp_hi, tmp_hi, AT); + __ Addu(tmp_hi, out_hi, tmp_hi); - __ Addu(TMP, tmp_hi, tmp_lo); + // Here we deviate from the original algorithm a bit. We've reached + // the stage where the bitfields holding the subtotals are large + // enough to hold the combined subtotals for both the low word, and + // the high word. This means that we can add the subtotals for the + // the high, and low words into a single word, and compute the final + // result for both the high, and low words using fewer instructions. + __ LoadConst32(AT, 0x0F0F0F0F); - __ Srl(out, TMP, 4); - __ And(out, out, AT); - __ And(TMP, TMP, AT); - __ Addu(out, out, TMP); + __ Addu(TMP, tmp_hi, tmp_lo); - __ LoadConst32(AT, 0x01010101); + __ Srl(out, TMP, 4); + __ And(out, out, AT); + __ And(TMP, TMP, AT); + __ Addu(out, out, TMP); - if (isR6) { - __ MulR6(out, out, AT); - } else { - __ MulR2(out, out, AT); - } + __ LoadConst32(AT, 0x01010101); - __ Srl(out, out, 24); + if (isR6) { + __ MulR6(out, out, AT); + } else { + __ MulR2(out, out, AT); + } + + __ Srl(out, out, 24); + } } } @@ -725,7 +747,7 @@ void IntrinsicLocationsBuilderMIPS::VisitIntegerBitCount(HInvoke* invoke) { } void IntrinsicCodeGeneratorMIPS::VisitIntegerBitCount(HInvoke* invoke) { - GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, IsR6(), GetAssembler()); + GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, IsR6(), HasMsa(), GetAssembler()); } // int java.lang.Long.bitCount(int) @@ -739,459 +761,7 @@ void IntrinsicLocationsBuilderMIPS::VisitLongBitCount(HInvoke* invoke) { } void IntrinsicCodeGeneratorMIPS::VisitLongBitCount(HInvoke* invoke) { - GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, IsR6(), GetAssembler()); -} - -static void GenMinMaxFP(LocationSummary* locations, - bool is_min, - DataType::Type type, - bool is_R6, - MipsAssembler* assembler) { - FRegister out = locations->Out().AsFpuRegister<FRegister>(); - FRegister a = locations->InAt(0).AsFpuRegister<FRegister>(); - FRegister b = locations->InAt(1).AsFpuRegister<FRegister>(); - - if (is_R6) { - MipsLabel noNaNs; - MipsLabel done; - FRegister ftmp = ((out != a) && (out != b)) ? out : FTMP; - - // When Java computes min/max it prefers a NaN to a number; the - // behavior of MIPSR6 is to prefer numbers to NaNs, i.e., if one of - // the inputs is a NaN and the other is a valid number, the MIPS - // instruction will return the number; Java wants the NaN value - // returned. This is why there is extra logic preceding the use of - // the MIPS min.fmt/max.fmt instructions. If either a, or b holds a - // NaN, return the NaN, otherwise return the min/max. - if (type == DataType::Type::kFloat64) { - __ CmpUnD(FTMP, a, b); - __ Bc1eqz(FTMP, &noNaNs); - - // One of the inputs is a NaN - __ CmpEqD(ftmp, a, a); - // If a == a then b is the NaN, otherwise a is the NaN. 
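
A gloss on the select below: CmpEqD(ftmp, a, a) is false exactly when a is NaN, so SelD forwards whichever operand is the NaN before the hardware min/max (which prefers numbers) ever runs. In C++, assuming ordinary IEEE comparisons and leaving the zero-sign case to the earlier bit-pattern sketch (illustrative, not ART code):

    #include <cmath>

    double JavaStyleMinD(double a, double b) {
      if (std::isnan(a) || std::isnan(b)) {  // CmpUnD + Bc1eqz: unordered?
        return (a == a) ? b : a;             // CmpEqD(a, a) + SelD: pick the NaN
      }
      return std::fmin(a, b);                // MinD: R6 min.d on ordered inputs
    }
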
- __ SelD(ftmp, a, b); - - if (ftmp != out) { - __ MovD(out, ftmp); - } - - __ B(&done); - - __ Bind(&noNaNs); - - if (is_min) { - __ MinD(out, a, b); - } else { - __ MaxD(out, a, b); - } - } else { - DCHECK_EQ(type, DataType::Type::kFloat32); - __ CmpUnS(FTMP, a, b); - __ Bc1eqz(FTMP, &noNaNs); - - // One of the inputs is a NaN - __ CmpEqS(ftmp, a, a); - // If a == a then b is the NaN, otherwise a is the NaN. - __ SelS(ftmp, a, b); - - if (ftmp != out) { - __ MovS(out, ftmp); - } - - __ B(&done); - - __ Bind(&noNaNs); - - if (is_min) { - __ MinS(out, a, b); - } else { - __ MaxS(out, a, b); - } - } - - __ Bind(&done); - } else { - MipsLabel ordered; - MipsLabel compare; - MipsLabel select; - MipsLabel done; - - if (type == DataType::Type::kFloat64) { - __ CunD(a, b); - } else { - DCHECK_EQ(type, DataType::Type::kFloat32); - __ CunS(a, b); - } - __ Bc1f(&ordered); - - // a or b (or both) is a NaN. Return one, which is a NaN. - if (type == DataType::Type::kFloat64) { - __ CeqD(b, b); - } else { - __ CeqS(b, b); - } - __ B(&select); - - __ Bind(&ordered); - - // Neither is a NaN. - // a == b? (-0.0 compares equal with +0.0) - // If equal, handle zeroes, else compare further. - if (type == DataType::Type::kFloat64) { - __ CeqD(a, b); - } else { - __ CeqS(a, b); - } - __ Bc1f(&compare); - - // a == b either bit for bit or one is -0.0 and the other is +0.0. - if (type == DataType::Type::kFloat64) { - __ MoveFromFpuHigh(TMP, a); - __ MoveFromFpuHigh(AT, b); - } else { - __ Mfc1(TMP, a); - __ Mfc1(AT, b); - } - - if (is_min) { - // -0.0 prevails over +0.0. - __ Or(TMP, TMP, AT); - } else { - // +0.0 prevails over -0.0. - __ And(TMP, TMP, AT); - } - - if (type == DataType::Type::kFloat64) { - __ Mfc1(AT, a); - __ Mtc1(AT, out); - __ MoveToFpuHigh(TMP, out); - } else { - __ Mtc1(TMP, out); - } - __ B(&done); - - __ Bind(&compare); - - if (type == DataType::Type::kFloat64) { - if (is_min) { - // return (a <= b) ? a : b; - __ ColeD(a, b); - } else { - // return (a >= b) ? a : b; - __ ColeD(b, a); // b <= a - } - } else { - if (is_min) { - // return (a <= b) ? a : b; - __ ColeS(a, b); - } else { - // return (a >= b) ? 
a : b; - __ ColeS(b, a); // b <= a - } - } - - __ Bind(&select); - - if (type == DataType::Type::kFloat64) { - __ MovtD(out, a); - __ MovfD(out, b); - } else { - __ MovtS(out, a); - __ MovfS(out, b); - } - - __ Bind(&done); - } -} - -static void CreateFPFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) { - LocationSummary* locations = - new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); - locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); - locations->SetOut(Location::RequiresFpuRegister(), Location::kOutputOverlap); -} - -// double java.lang.Math.min(double, double) -void IntrinsicLocationsBuilderMIPS::VisitMathMinDoubleDouble(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS::VisitMathMinDoubleDouble(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), - /* is_min */ true, - DataType::Type::kFloat64, - IsR6(), - GetAssembler()); -} - -// float java.lang.Math.min(float, float) -void IntrinsicLocationsBuilderMIPS::VisitMathMinFloatFloat(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS::VisitMathMinFloatFloat(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), - /* is_min */ true, - DataType::Type::kFloat32, - IsR6(), - GetAssembler()); -} - -// double java.lang.Math.max(double, double) -void IntrinsicLocationsBuilderMIPS::VisitMathMaxDoubleDouble(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS::VisitMathMaxDoubleDouble(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), - /* is_min */ false, - DataType::Type::kFloat64, - IsR6(), - GetAssembler()); -} - -// float java.lang.Math.max(float, float) -void IntrinsicLocationsBuilderMIPS::VisitMathMaxFloatFloat(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS::VisitMathMaxFloatFloat(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), - /* is_min */ false, - DataType::Type::kFloat32, - IsR6(), - GetAssembler()); -} - -static void CreateIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) { - LocationSummary* locations = - new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); -} - -static void GenMinMax(LocationSummary* locations, - bool is_min, - DataType::Type type, - bool is_R6, - MipsAssembler* assembler) { - if (is_R6) { - // Some architectures, such as ARM and MIPS (prior to r6), have a - // conditional move instruction which only changes the target - // (output) register if the condition is true (MIPS prior to r6 had - // MOVF, MOVT, MOVN, and MOVZ). The SELEQZ and SELNEZ instructions - // always change the target (output) register. If the condition is - // true the output register gets the contents of the "rs" register; - // otherwise, the output register is set to zero. One consequence - // of this is that to implement something like "rd = c==0 ? rs : rt" - // MIPS64r6 needs to use a pair of SELEQZ/SELNEZ instructions. - // After executing this pair of instructions one of the output - // registers from the pair will necessarily contain zero. Then the - // code ORs the output registers from the SELEQZ/SELNEZ instructions - // to get the final result. 
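
The comment above describes the R6 select idiom in prose; modeled in C++ (illustrative), each select produces either a source value or zero, and since at least one term of the pair is necessarily zero, OR merges them into the final result:

    #include <cstdint>

    int32_t Seleqz(int32_t rs, int32_t cond) { return cond == 0 ? rs : 0; }
    int32_t Selnez(int32_t rs, int32_t cond) { return cond != 0 ? rs : 0; }

    int32_t MinR6(int32_t a, int32_t b) {
      int32_t lt = (b < a) ? 1 : 0;          // Slt(AT, b, a)
      return Seleqz(a, lt) | Selnez(b, lt);  // keep a unless b < a
    }
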
- // - // The initial test to see if the output register is same as the - // first input register is needed to make sure that value in the - // first input register isn't clobbered before we've finished - // computing the output value. The logic in the corresponding else - // clause performs the same task but makes sure the second input - // register isn't clobbered in the event that it's the same register - // as the output register; the else clause also handles the case - // where the output register is distinct from both the first, and the - // second input registers. - if (type == DataType::Type::kInt64) { - Register a_lo = locations->InAt(0).AsRegisterPairLow<Register>(); - Register a_hi = locations->InAt(0).AsRegisterPairHigh<Register>(); - Register b_lo = locations->InAt(1).AsRegisterPairLow<Register>(); - Register b_hi = locations->InAt(1).AsRegisterPairHigh<Register>(); - Register out_lo = locations->Out().AsRegisterPairLow<Register>(); - Register out_hi = locations->Out().AsRegisterPairHigh<Register>(); - - MipsLabel compare_done; - - if (a_lo == b_lo) { - if (out_lo != a_lo) { - __ Move(out_lo, a_lo); - __ Move(out_hi, a_hi); - } - } else { - __ Slt(TMP, b_hi, a_hi); - __ Bne(b_hi, a_hi, &compare_done); - - __ Sltu(TMP, b_lo, a_lo); - - __ Bind(&compare_done); - - if (is_min) { - __ Seleqz(AT, a_lo, TMP); - __ Selnez(out_lo, b_lo, TMP); // Safe even if out_lo == a_lo/b_lo - // because at this point we're - // done using a_lo/b_lo. - } else { - __ Selnez(AT, a_lo, TMP); - __ Seleqz(out_lo, b_lo, TMP); // ditto - } - __ Or(out_lo, out_lo, AT); - if (is_min) { - __ Seleqz(AT, a_hi, TMP); - __ Selnez(out_hi, b_hi, TMP); // ditto but for out_hi & a_hi/b_hi - } else { - __ Selnez(AT, a_hi, TMP); - __ Seleqz(out_hi, b_hi, TMP); // ditto but for out_hi & a_hi/b_hi - } - __ Or(out_hi, out_hi, AT); - } - } else { - DCHECK_EQ(type, DataType::Type::kInt32); - Register a = locations->InAt(0).AsRegister<Register>(); - Register b = locations->InAt(1).AsRegister<Register>(); - Register out = locations->Out().AsRegister<Register>(); - - if (a == b) { - if (out != a) { - __ Move(out, a); - } - } else { - __ Slt(AT, b, a); - if (is_min) { - __ Seleqz(TMP, a, AT); - __ Selnez(AT, b, AT); - } else { - __ Selnez(TMP, a, AT); - __ Seleqz(AT, b, AT); - } - __ Or(out, TMP, AT); - } - } - } else { - if (type == DataType::Type::kInt64) { - Register a_lo = locations->InAt(0).AsRegisterPairLow<Register>(); - Register a_hi = locations->InAt(0).AsRegisterPairHigh<Register>(); - Register b_lo = locations->InAt(1).AsRegisterPairLow<Register>(); - Register b_hi = locations->InAt(1).AsRegisterPairHigh<Register>(); - Register out_lo = locations->Out().AsRegisterPairLow<Register>(); - Register out_hi = locations->Out().AsRegisterPairHigh<Register>(); - - MipsLabel compare_done; - - if (a_lo == b_lo) { - if (out_lo != a_lo) { - __ Move(out_lo, a_lo); - __ Move(out_hi, a_hi); - } - } else { - __ Slt(TMP, a_hi, b_hi); - __ Bne(a_hi, b_hi, &compare_done); - - __ Sltu(TMP, a_lo, b_lo); - - __ Bind(&compare_done); - - if (is_min) { - if (out_lo != a_lo) { - __ Movn(out_hi, a_hi, TMP); - __ Movn(out_lo, a_lo, TMP); - } - if (out_lo != b_lo) { - __ Movz(out_hi, b_hi, TMP); - __ Movz(out_lo, b_lo, TMP); - } - } else { - if (out_lo != a_lo) { - __ Movz(out_hi, a_hi, TMP); - __ Movz(out_lo, a_lo, TMP); - } - if (out_lo != b_lo) { - __ Movn(out_hi, b_hi, TMP); - __ Movn(out_lo, b_lo, TMP); - } - } - } - } else { - DCHECK_EQ(type, DataType::Type::kInt32); - Register a = locations->InAt(0).AsRegister<Register>(); - Register b = 
locations->InAt(1).AsRegister<Register>(); - Register out = locations->Out().AsRegister<Register>(); - - if (a == b) { - if (out != a) { - __ Move(out, a); - } - } else { - __ Slt(AT, a, b); - if (is_min) { - if (out != a) { - __ Movn(out, a, AT); - } - if (out != b) { - __ Movz(out, b, AT); - } - } else { - if (out != a) { - __ Movz(out, a, AT); - } - if (out != b) { - __ Movn(out, b, AT); - } - } - } - } - } -} - -// int java.lang.Math.min(int, int) -void IntrinsicLocationsBuilderMIPS::VisitMathMinIntInt(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS::VisitMathMinIntInt(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), - /* is_min */ true, - DataType::Type::kInt32, - IsR6(), - GetAssembler()); -} - -// long java.lang.Math.min(long, long) -void IntrinsicLocationsBuilderMIPS::VisitMathMinLongLong(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS::VisitMathMinLongLong(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), - /* is_min */ true, - DataType::Type::kInt64, - IsR6(), - GetAssembler()); -} - -// int java.lang.Math.max(int, int) -void IntrinsicLocationsBuilderMIPS::VisitMathMaxIntInt(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS::VisitMathMaxIntInt(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), - /* is_min */ false, - DataType::Type::kInt32, - IsR6(), - GetAssembler()); -} - -// long java.lang.Math.max(long, long) -void IntrinsicLocationsBuilderMIPS::VisitMathMaxLongLong(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS::VisitMathMaxLongLong(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), - /* is_min */ false, - DataType::Type::kInt64, - IsR6(), - GetAssembler()); + GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, IsR6(), HasMsa(), GetAssembler()); } // double java.lang.Math.sqrt(double) diff --git a/compiler/optimizing/intrinsics_mips.h b/compiler/optimizing/intrinsics_mips.h index 13397f11d4..1c1ba40132 100644 --- a/compiler/optimizing/intrinsics_mips.h +++ b/compiler/optimizing/intrinsics_mips.h @@ -71,6 +71,7 @@ class IntrinsicCodeGeneratorMIPS FINAL : public IntrinsicVisitor { bool IsR2OrNewer() const; bool IsR6() const; bool Is32BitFPU() const; + bool HasMsa() const; private: MipsAssembler* GetAssembler(); diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc index 5debd26c5a..f429afde2c 100644 --- a/compiler/optimizing/intrinsics_mips64.cc +++ b/compiler/optimizing/intrinsics_mips64.cc @@ -46,6 +46,10 @@ ArenaAllocator* IntrinsicCodeGeneratorMIPS64::GetAllocator() { return codegen_->GetGraph()->GetAllocator(); } +inline bool IntrinsicCodeGeneratorMIPS64::HasMsa() const { + return codegen_->GetInstructionSetFeatures().HasMsa(); +} + #define __ codegen->GetAssembler()-> static void MoveFromReturnRegister(Location trg, @@ -386,6 +390,7 @@ static void CreateFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) { static void GenBitCount(LocationSummary* locations, const DataType::Type type, + const bool hasMsa, Mips64Assembler* assembler) { GpuRegister out = locations->Out().AsRegister<GpuRegister>(); GpuRegister in = locations->InAt(0).AsRegister<GpuRegister>(); @@ -414,41 +419,52 @@ static void GenBitCount(LocationSummary* locations, // bits are set but the algorithm here attempts to minimize the total // number of instructions executed even when a large number of bits 
// are set. - - if (type == DataType::Type::kInt32) { - __ Srl(TMP, in, 1); - __ LoadConst32(AT, 0x55555555); - __ And(TMP, TMP, AT); - __ Subu(TMP, in, TMP); - __ LoadConst32(AT, 0x33333333); - __ And(out, TMP, AT); - __ Srl(TMP, TMP, 2); - __ And(TMP, TMP, AT); - __ Addu(TMP, out, TMP); - __ Srl(out, TMP, 4); - __ Addu(out, out, TMP); - __ LoadConst32(AT, 0x0F0F0F0F); - __ And(out, out, AT); - __ LoadConst32(TMP, 0x01010101); - __ MulR6(out, out, TMP); - __ Srl(out, out, 24); - } else if (type == DataType::Type::kInt64) { - __ Dsrl(TMP, in, 1); - __ LoadConst64(AT, 0x5555555555555555L); - __ And(TMP, TMP, AT); - __ Dsubu(TMP, in, TMP); - __ LoadConst64(AT, 0x3333333333333333L); - __ And(out, TMP, AT); - __ Dsrl(TMP, TMP, 2); - __ And(TMP, TMP, AT); - __ Daddu(TMP, out, TMP); - __ Dsrl(out, TMP, 4); - __ Daddu(out, out, TMP); - __ LoadConst64(AT, 0x0F0F0F0F0F0F0F0FL); - __ And(out, out, AT); - __ LoadConst64(TMP, 0x0101010101010101L); - __ Dmul(out, out, TMP); - __ Dsrl32(out, out, 24); + if (hasMsa) { + if (type == DataType::Type::kInt32) { + __ Mtc1(in, FTMP); + __ PcntW(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP)); + __ Mfc1(out, FTMP); + } else { + __ Dmtc1(in, FTMP); + __ PcntD(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP)); + __ Dmfc1(out, FTMP); + } + } else { + if (type == DataType::Type::kInt32) { + __ Srl(TMP, in, 1); + __ LoadConst32(AT, 0x55555555); + __ And(TMP, TMP, AT); + __ Subu(TMP, in, TMP); + __ LoadConst32(AT, 0x33333333); + __ And(out, TMP, AT); + __ Srl(TMP, TMP, 2); + __ And(TMP, TMP, AT); + __ Addu(TMP, out, TMP); + __ Srl(out, TMP, 4); + __ Addu(out, out, TMP); + __ LoadConst32(AT, 0x0F0F0F0F); + __ And(out, out, AT); + __ LoadConst32(TMP, 0x01010101); + __ MulR6(out, out, TMP); + __ Srl(out, out, 24); + } else { + __ Dsrl(TMP, in, 1); + __ LoadConst64(AT, 0x5555555555555555L); + __ And(TMP, TMP, AT); + __ Dsubu(TMP, in, TMP); + __ LoadConst64(AT, 0x3333333333333333L); + __ And(out, TMP, AT); + __ Dsrl(TMP, TMP, 2); + __ And(TMP, TMP, AT); + __ Daddu(TMP, out, TMP); + __ Dsrl(out, TMP, 4); + __ Daddu(out, out, TMP); + __ LoadConst64(AT, 0x0F0F0F0F0F0F0F0FL); + __ And(out, out, AT); + __ LoadConst64(TMP, 0x0101010101010101L); + __ Dmul(out, out, TMP); + __ Dsrl32(out, out, 24); + } } } @@ -458,7 +474,7 @@ void IntrinsicLocationsBuilderMIPS64::VisitIntegerBitCount(HInvoke* invoke) { } void IntrinsicCodeGeneratorMIPS64::VisitIntegerBitCount(HInvoke* invoke) { - GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler()); + GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, HasMsa(), GetAssembler()); } // int java.lang.Long.bitCount(long) @@ -467,222 +483,7 @@ void IntrinsicLocationsBuilderMIPS64::VisitLongBitCount(HInvoke* invoke) { } void IntrinsicCodeGeneratorMIPS64::VisitLongBitCount(HInvoke* invoke) { - GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler()); -} - -static void GenMinMaxFP(LocationSummary* locations, - bool is_min, - DataType::Type type, - Mips64Assembler* assembler) { - FpuRegister a = locations->InAt(0).AsFpuRegister<FpuRegister>(); - FpuRegister b = locations->InAt(1).AsFpuRegister<FpuRegister>(); - FpuRegister out = locations->Out().AsFpuRegister<FpuRegister>(); - - Mips64Label noNaNs; - Mips64Label done; - FpuRegister ftmp = ((out != a) && (out != b)) ? 
out : FTMP; - - // When Java computes min/max it prefers a NaN to a number; the - // behavior of MIPSR6 is to prefer numbers to NaNs, i.e., if one of - // the inputs is a NaN and the other is a valid number, the MIPS - // instruction will return the number; Java wants the NaN value - // returned. This is why there is extra logic preceding the use of - // the MIPS min.fmt/max.fmt instructions. If either a, or b holds a - // NaN, return the NaN, otherwise return the min/max. - if (type == DataType::Type::kFloat64) { - __ CmpUnD(FTMP, a, b); - __ Bc1eqz(FTMP, &noNaNs); - - // One of the inputs is a NaN - __ CmpEqD(ftmp, a, a); - // If a == a then b is the NaN, otherwise a is the NaN. - __ SelD(ftmp, a, b); - - if (ftmp != out) { - __ MovD(out, ftmp); - } - - __ Bc(&done); - - __ Bind(&noNaNs); - - if (is_min) { - __ MinD(out, a, b); - } else { - __ MaxD(out, a, b); - } - } else { - DCHECK_EQ(type, DataType::Type::kFloat32); - __ CmpUnS(FTMP, a, b); - __ Bc1eqz(FTMP, &noNaNs); - - // One of the inputs is a NaN - __ CmpEqS(ftmp, a, a); - // If a == a then b is the NaN, otherwise a is the NaN. - __ SelS(ftmp, a, b); - - if (ftmp != out) { - __ MovS(out, ftmp); - } - - __ Bc(&done); - - __ Bind(&noNaNs); - - if (is_min) { - __ MinS(out, a, b); - } else { - __ MaxS(out, a, b); - } - } - - __ Bind(&done); -} - -static void CreateFPFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) { - LocationSummary* locations = - new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); - locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); - locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); -} - -// double java.lang.Math.min(double, double) -void IntrinsicLocationsBuilderMIPS64::VisitMathMinDoubleDouble(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS64::VisitMathMinDoubleDouble(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), /* is_min */ true, DataType::Type::kFloat64, GetAssembler()); -} - -// float java.lang.Math.min(float, float) -void IntrinsicLocationsBuilderMIPS64::VisitMathMinFloatFloat(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS64::VisitMathMinFloatFloat(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), /* is_min */ true, DataType::Type::kFloat32, GetAssembler()); -} - -// double java.lang.Math.max(double, double) -void IntrinsicLocationsBuilderMIPS64::VisitMathMaxDoubleDouble(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS64::VisitMathMaxDoubleDouble(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), /* is_min */ false, DataType::Type::kFloat64, GetAssembler()); -} - -// float java.lang.Math.max(float, float) -void IntrinsicLocationsBuilderMIPS64::VisitMathMaxFloatFloat(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS64::VisitMathMaxFloatFloat(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), /* is_min */ false, DataType::Type::kFloat32, GetAssembler()); -} - -static void GenMinMax(LocationSummary* locations, - bool is_min, - Mips64Assembler* assembler) { - GpuRegister lhs = locations->InAt(0).AsRegister<GpuRegister>(); - GpuRegister rhs = locations->InAt(1).AsRegister<GpuRegister>(); - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); - - if (lhs == rhs) { - if (out != lhs) { - __ Move(out, lhs); - } - } else { - // 
Some architectures, such as ARM and MIPS (prior to r6), have a - // conditional move instruction which only changes the target - // (output) register if the condition is true (MIPS prior to r6 had - // MOVF, MOVT, and MOVZ). The SELEQZ and SELNEZ instructions always - // change the target (output) register. If the condition is true the - // output register gets the contents of the "rs" register; otherwise, - // the output register is set to zero. One consequence of this is - // that to implement something like "rd = c==0 ? rs : rt" MIPS64r6 - // needs to use a pair of SELEQZ/SELNEZ instructions. After - // executing this pair of instructions one of the output registers - // from the pair will necessarily contain zero. Then the code ORs the - // output registers from the SELEQZ/SELNEZ instructions to get the - // final result. - // - // The initial test to see if the output register is same as the - // first input register is needed to make sure that value in the - // first input register isn't clobbered before we've finished - // computing the output value. The logic in the corresponding else - // clause performs the same task but makes sure the second input - // register isn't clobbered in the event that it's the same register - // as the output register; the else clause also handles the case - // where the output register is distinct from both the first, and the - // second input registers. - if (out == lhs) { - __ Slt(AT, rhs, lhs); - if (is_min) { - __ Seleqz(out, lhs, AT); - __ Selnez(AT, rhs, AT); - } else { - __ Selnez(out, lhs, AT); - __ Seleqz(AT, rhs, AT); - } - } else { - __ Slt(AT, lhs, rhs); - if (is_min) { - __ Seleqz(out, rhs, AT); - __ Selnez(AT, lhs, AT); - } else { - __ Selnez(out, rhs, AT); - __ Seleqz(AT, lhs, AT); - } - } - __ Or(out, out, AT); - } -} - -static void CreateIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) { - LocationSummary* locations = - new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); -} - -// int java.lang.Math.min(int, int) -void IntrinsicLocationsBuilderMIPS64::VisitMathMinIntInt(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS64::VisitMathMinIntInt(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ true, GetAssembler()); -} - -// long java.lang.Math.min(long, long) -void IntrinsicLocationsBuilderMIPS64::VisitMathMinLongLong(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS64::VisitMathMinLongLong(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ true, GetAssembler()); -} - -// int java.lang.Math.max(int, int) -void IntrinsicLocationsBuilderMIPS64::VisitMathMaxIntInt(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS64::VisitMathMaxIntInt(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ false, GetAssembler()); -} - -// long java.lang.Math.max(long, long) -void IntrinsicLocationsBuilderMIPS64::VisitMathMaxLongLong(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorMIPS64::VisitMathMaxLongLong(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ false, GetAssembler()); + GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, 
HasMsa(), GetAssembler()); } // double java.lang.Math.sqrt(double) diff --git a/compiler/optimizing/intrinsics_mips64.h b/compiler/optimizing/intrinsics_mips64.h index 6f40d90ddb..748b0b02b2 100644 --- a/compiler/optimizing/intrinsics_mips64.h +++ b/compiler/optimizing/intrinsics_mips64.h @@ -68,6 +68,8 @@ class IntrinsicCodeGeneratorMIPS64 FINAL : public IntrinsicVisitor { #undef INTRINSICS_LIST #undef OPTIMIZING_INTRINSICS + bool HasMsa() const; + private: Mips64Assembler* GetAssembler(); diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc index 0edc061e97..c4f322bf0c 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -40,11 +40,6 @@ namespace art { namespace x86 { -static constexpr int kDoubleNaNHigh = 0x7FF80000; -static constexpr int kDoubleNaNLow = 0x00000000; -static constexpr int64_t kDoubleNaN = INT64_C(0x7FF8000000000000); -static constexpr int32_t kFloatNaN = INT32_C(0x7FC00000); - IntrinsicLocationsBuilderX86::IntrinsicLocationsBuilderX86(CodeGeneratorX86* codegen) : allocator_(codegen->GetGraph()->GetAllocator()), codegen_(codegen) { @@ -333,278 +328,6 @@ void IntrinsicCodeGeneratorX86::VisitShortReverseBytes(HInvoke* invoke) { GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler()); } -static void GenMinMaxFP(HInvoke* invoke, - bool is_min, - bool is_double, - X86Assembler* assembler, - CodeGeneratorX86* codegen) { - LocationSummary* locations = invoke->GetLocations(); - Location op1_loc = locations->InAt(0); - Location op2_loc = locations->InAt(1); - Location out_loc = locations->Out(); - XmmRegister out = out_loc.AsFpuRegister<XmmRegister>(); - - // Shortcut for same input locations. - if (op1_loc.Equals(op2_loc)) { - DCHECK(out_loc.Equals(op1_loc)); - return; - } - - // (out := op1) - // out <=? op2 - // if Nan jmp Nan_label - // if out is min jmp done - // if op2 is min jmp op2_label - // handle -0/+0 - // jmp done - // Nan_label: - // out := NaN - // op2_label: - // out := op2 - // done: - // - // This removes one jmp, but needs to copy one input (op1) to out. - // - // TODO: This is straight from Quick (except literal pool). Make NaN an out-of-line slowpath? - - XmmRegister op2 = op2_loc.AsFpuRegister<XmmRegister>(); - - NearLabel nan, done, op2_label; - if (is_double) { - __ ucomisd(out, op2); - } else { - __ ucomiss(out, op2); - } - - __ j(Condition::kParityEven, &nan); - - __ j(is_min ? Condition::kAbove : Condition::kBelow, &op2_label); - __ j(is_min ? Condition::kBelow : Condition::kAbove, &done); - - // Handle 0.0/-0.0. - if (is_min) { - if (is_double) { - __ orpd(out, op2); - } else { - __ orps(out, op2); - } - } else { - if (is_double) { - __ andpd(out, op2); - } else { - __ andps(out, op2); - } - } - __ jmp(&done); - - // NaN handling. - __ Bind(&nan); - // Do we have a constant area pointer? 
- if (locations->GetInputCount() == 3 && locations->InAt(2).IsValid()) { - HX86ComputeBaseMethodAddress* method_address = - invoke->InputAt(2)->AsX86ComputeBaseMethodAddress(); - DCHECK(locations->InAt(2).IsRegister()); - Register constant_area = locations->InAt(2).AsRegister<Register>(); - if (is_double) { - __ movsd(out, codegen->LiteralInt64Address(kDoubleNaN, method_address, constant_area)); - } else { - __ movss(out, codegen->LiteralInt32Address(kFloatNaN, method_address, constant_area)); - } - } else { - if (is_double) { - __ pushl(Immediate(kDoubleNaNHigh)); - __ pushl(Immediate(kDoubleNaNLow)); - __ movsd(out, Address(ESP, 0)); - __ addl(ESP, Immediate(8)); - } else { - __ pushl(Immediate(kFloatNaN)); - __ movss(out, Address(ESP, 0)); - __ addl(ESP, Immediate(4)); - } - } - __ jmp(&done); - - // out := op2; - __ Bind(&op2_label); - if (is_double) { - __ movsd(out, op2); - } else { - __ movss(out, op2); - } - - // Done. - __ Bind(&done); -} - -static void CreateFPFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) { - LocationSummary* locations = - new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); - locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); - // The following is sub-optimal, but all we can do for now. It would be fine to also accept - // the second input to be the output (we can simply swap inputs). - locations->SetOut(Location::SameAsFirstInput()); - HInvokeStaticOrDirect* static_or_direct = invoke->AsInvokeStaticOrDirect(); - DCHECK(static_or_direct != nullptr); - if (static_or_direct->HasSpecialInput() && - invoke->InputAt(static_or_direct->GetSpecialInputIndex())->IsX86ComputeBaseMethodAddress()) { - locations->SetInAt(2, Location::RequiresRegister()); - } -} - -void IntrinsicLocationsBuilderX86::VisitMathMinDoubleDouble(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86::VisitMathMinDoubleDouble(HInvoke* invoke) { - GenMinMaxFP(invoke, - /* is_min */ true, - /* is_double */ true, - GetAssembler(), - codegen_); -} - -void IntrinsicLocationsBuilderX86::VisitMathMinFloatFloat(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86::VisitMathMinFloatFloat(HInvoke* invoke) { - GenMinMaxFP(invoke, - /* is_min */ true, - /* is_double */ false, - GetAssembler(), - codegen_); -} - -void IntrinsicLocationsBuilderX86::VisitMathMaxDoubleDouble(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86::VisitMathMaxDoubleDouble(HInvoke* invoke) { - GenMinMaxFP(invoke, - /* is_min */ false, - /* is_double */ true, - GetAssembler(), - codegen_); -} - -void IntrinsicLocationsBuilderX86::VisitMathMaxFloatFloat(HInvoke* invoke) { - CreateFPFPToFPLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86::VisitMathMaxFloatFloat(HInvoke* invoke) { - GenMinMaxFP(invoke, - /* is_min */ false, - /* is_double */ false, - GetAssembler(), - codegen_); -} - -static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long, - X86Assembler* assembler) { - Location op1_loc = locations->InAt(0); - Location op2_loc = locations->InAt(1); - - // Shortcut for same input locations. - if (op1_loc.Equals(op2_loc)) { - // Can return immediately, as op1_loc == out_loc. - // Note: if we ever support separate registers, e.g., output into memory, we need to check for - // a copy here. 
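
Earlier in this deleted hunk, the no-constant-area fallback materializes the NaN literal by pushing its raw bits onto the stack and reloading them through memory (pushl + movsd/movss). The portable C++ equivalent of that bit-for-bit materialization, illustrative only:

    #include <cstdint>
    #include <cstring>

    double MakeQuietDoubleNaN() {
      const uint64_t bits = UINT64_C(0x7FF8000000000000);  // the deleted kDoubleNaN
      double nan_value;
      std::memcpy(&nan_value, &bits, sizeof(nan_value));   // pushl/pushl + movsd
      return nan_value;
    }
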
- DCHECK(locations->Out().Equals(op1_loc));
- return;
- }
-
- if (is_long) {
- // Need to perform a subtract to get the sign right.
- // op1 is already in the same location as the output.
- Location output = locations->Out();
- Register output_lo = output.AsRegisterPairLow<Register>();
- Register output_hi = output.AsRegisterPairHigh<Register>();
-
- Register op2_lo = op2_loc.AsRegisterPairLow<Register>();
- Register op2_hi = op2_loc.AsRegisterPairHigh<Register>();
-
- // Spare register to compute the subtraction to set condition code.
- Register temp = locations->GetTemp(0).AsRegister<Register>();
-
- // Subtract off op2_lo.
- __ movl(temp, output_lo);
- __ subl(temp, op2_lo);
-
- // Now use the same temp and the borrow to finish the subtraction of op2_hi.
- __ movl(temp, output_hi);
- __ sbbl(temp, op2_hi);
-
- // Now the condition code is correct.
- Condition cond = is_min ? Condition::kGreaterEqual : Condition::kLess;
- __ cmovl(cond, output_lo, op2_lo);
- __ cmovl(cond, output_hi, op2_hi);
- } else {
- Register out = locations->Out().AsRegister<Register>();
- Register op2 = op2_loc.AsRegister<Register>();
-
- // (out := op1)
- // out <=? op2
- // if out is min jmp done
- // out := op2
- // done:
-
- __ cmpl(out, op2);
- Condition cond = is_min ? Condition::kGreater : Condition::kLess;
- __ cmovl(cond, out, op2);
- }
-}
-
-static void CreateIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
- LocationSummary* locations =
- new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
- locations->SetInAt(0, Location::RequiresRegister());
- locations->SetInAt(1, Location::RequiresRegister());
- locations->SetOut(Location::SameAsFirstInput());
-}
-
-static void CreateLongLongToLongLocations(ArenaAllocator* allocator, HInvoke* invoke) {
- LocationSummary* locations =
- new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
- locations->SetInAt(0, Location::RequiresRegister());
- locations->SetInAt(1, Location::RequiresRegister());
- locations->SetOut(Location::SameAsFirstInput());
- // Register to use to perform a long subtract to set cc.
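// The long path above sets the condition code without keeping the subtraction
// result: subl produces the borrow out of the low words and sbbl folds it into
// the high-word subtraction. A standalone C++ sketch of the same borrow
// propagation; illustrative only, not ART code:
#include <cstdint>
static bool SignedLess64(int32_t a_hi, uint32_t a_lo, int32_t b_hi, uint32_t b_lo) {
  uint32_t borrow = (a_lo < b_lo) ? 1u : 0u;            // subl: borrow out of the low word
  int64_t hi = int64_t{a_hi} - int64_t{b_hi} - borrow;  // sbbl: high words minus borrow
  return hi < 0;  // sign of the high result decides kLess vs. kGreaterEqual
}
// The cmovl pair then moves both halves of op2 when the comparison selects it.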
- locations->AddTemp(Location::RequiresRegister()); -} - -void IntrinsicLocationsBuilderX86::VisitMathMinIntInt(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86::VisitMathMinIntInt(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ false, GetAssembler()); -} - -void IntrinsicLocationsBuilderX86::VisitMathMinLongLong(HInvoke* invoke) { - CreateLongLongToLongLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86::VisitMathMinLongLong(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ true, GetAssembler()); -} - -void IntrinsicLocationsBuilderX86::VisitMathMaxIntInt(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86::VisitMathMaxIntInt(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ false, GetAssembler()); -} - -void IntrinsicLocationsBuilderX86::VisitMathMaxLongLong(HInvoke* invoke) { - CreateLongLongToLongLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86::VisitMathMaxLongLong(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ true, GetAssembler()); -} - static void CreateFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) { LocationSummary* locations = new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index 9d378d6b59..437bc3dd3c 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -236,208 +236,6 @@ void IntrinsicCodeGeneratorX86_64::VisitShortReverseBytes(HInvoke* invoke) { GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler()); } -static void GenMinMaxFP(LocationSummary* locations, - bool is_min, - bool is_double, - X86_64Assembler* assembler, - CodeGeneratorX86_64* codegen) { - Location op1_loc = locations->InAt(0); - Location op2_loc = locations->InAt(1); - Location out_loc = locations->Out(); - XmmRegister out = out_loc.AsFpuRegister<XmmRegister>(); - - // Shortcut for same input locations. - if (op1_loc.Equals(op2_loc)) { - DCHECK(out_loc.Equals(op1_loc)); - return; - } - - // (out := op1) - // out <=? op2 - // if Nan jmp Nan_label - // if out is min jmp done - // if op2 is min jmp op2_label - // handle -0/+0 - // jmp done - // Nan_label: - // out := NaN - // op2_label: - // out := op2 - // done: - // - // This removes one jmp, but needs to copy one input (op1) to out. - // - // TODO: This is straight from Quick. Make NaN an out-of-line slowpath? - - XmmRegister op2 = op2_loc.AsFpuRegister<XmmRegister>(); - - NearLabel nan, done, op2_label; - if (is_double) { - __ ucomisd(out, op2); - } else { - __ ucomiss(out, op2); - } - - __ j(Condition::kParityEven, &nan); - - __ j(is_min ? Condition::kAbove : Condition::kBelow, &op2_label); - __ j(is_min ? Condition::kBelow : Condition::kAbove, &done); - - // Handle 0.0/-0.0. - if (is_min) { - if (is_double) { - __ orpd(out, op2); - } else { - __ orps(out, op2); - } - } else { - if (is_double) { - __ andpd(out, op2); - } else { - __ andps(out, op2); - } - } - __ jmp(&done); - - // NaN handling. 
- __ Bind(&nan); - if (is_double) { - __ movsd(out, codegen->LiteralInt64Address(INT64_C(0x7FF8000000000000))); - } else { - __ movss(out, codegen->LiteralInt32Address(INT32_C(0x7FC00000))); - } - __ jmp(&done); - - // out := op2; - __ Bind(&op2_label); - if (is_double) { - __ movsd(out, op2); - } else { - __ movss(out, op2); - } - - // Done. - __ Bind(&done); -} - -static void CreateFPFPToFP(ArenaAllocator* allocator, HInvoke* invoke) { - LocationSummary* locations = - new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); - locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); - // The following is sub-optimal, but all we can do for now. It would be fine to also accept - // the second input to be the output (we can simply swap inputs). - locations->SetOut(Location::SameAsFirstInput()); -} - -void IntrinsicLocationsBuilderX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) { - CreateFPFPToFP(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) { - GenMinMaxFP( - invoke->GetLocations(), /* is_min */ true, /* is_double */ true, GetAssembler(), codegen_); -} - -void IntrinsicLocationsBuilderX86_64::VisitMathMinFloatFloat(HInvoke* invoke) { - CreateFPFPToFP(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86_64::VisitMathMinFloatFloat(HInvoke* invoke) { - GenMinMaxFP( - invoke->GetLocations(), /* is_min */ true, /* is_double */ false, GetAssembler(), codegen_); -} - -void IntrinsicLocationsBuilderX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) { - CreateFPFPToFP(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) { - GenMinMaxFP( - invoke->GetLocations(), /* is_min */ false, /* is_double */ true, GetAssembler(), codegen_); -} - -void IntrinsicLocationsBuilderX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) { - CreateFPFPToFP(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) { - GenMinMaxFP( - invoke->GetLocations(), /* is_min */ false, /* is_double */ false, GetAssembler(), codegen_); -} - -static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long, - X86_64Assembler* assembler) { - Location op1_loc = locations->InAt(0); - Location op2_loc = locations->InAt(1); - - // Shortcut for same input locations. - if (op1_loc.Equals(op2_loc)) { - // Can return immediately, as op1_loc == out_loc. - // Note: if we ever support separate registers, e.g., output into memory, we need to check for - // a copy here. - DCHECK(locations->Out().Equals(op1_loc)); - return; - } - - CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - CpuRegister op2 = op2_loc.AsRegister<CpuRegister>(); - - // (out := op1) - // out <=? op2 - // if out is min jmp done - // out := op2 - // done: - - if (is_long) { - __ cmpq(out, op2); - } else { - __ cmpl(out, op2); - } - - __ cmov(is_min ? 
Condition::kGreater : Condition::kLess, out, op2, is_long); -} - -static void CreateIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) { - LocationSummary* locations = - new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); - locations->SetOut(Location::SameAsFirstInput()); -} - -void IntrinsicLocationsBuilderX86_64::VisitMathMinIntInt(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86_64::VisitMathMinIntInt(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ false, GetAssembler()); -} - -void IntrinsicLocationsBuilderX86_64::VisitMathMinLongLong(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86_64::VisitMathMinLongLong(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ true, GetAssembler()); -} - -void IntrinsicLocationsBuilderX86_64::VisitMathMaxIntInt(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86_64::VisitMathMaxIntInt(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ false, GetAssembler()); -} - -void IntrinsicLocationsBuilderX86_64::VisitMathMaxLongLong(HInvoke* invoke) { - CreateIntIntToIntLocations(allocator_, invoke); -} - -void IntrinsicCodeGeneratorX86_64::VisitMathMaxLongLong(HInvoke* invoke) { - GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ true, GetAssembler()); -} - static void CreateFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) { LocationSummary* locations = new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 899496328e..d3b081e005 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -334,29 +334,14 @@ static bool IsAddConst(HInstruction* instruction, // Detect reductions of the following forms, // x = x_phi + .. // x = x_phi - .. -// x = max(x_phi, ..) // x = min(x_phi, ..) +// x = max(x_phi, ..) static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) { - if (reduction->IsAdd()) { + if (reduction->IsAdd() || reduction->IsMin() || reduction->IsMax()) { return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) || (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi); } else if (reduction->IsSub()) { return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi); - } else if (reduction->IsInvokeStaticOrDirect()) { - switch (reduction->AsInvokeStaticOrDirect()->GetIntrinsic()) { - case Intrinsics::kMathMinIntInt: - case Intrinsics::kMathMinLongLong: - case Intrinsics::kMathMinFloatFloat: - case Intrinsics::kMathMinDoubleDouble: - case Intrinsics::kMathMaxIntInt: - case Intrinsics::kMathMaxLongLong: - case Intrinsics::kMathMaxFloatFloat: - case Intrinsics::kMathMaxDoubleDouble: - return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) || - (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi); - default: - return false; - } } return false; } @@ -1297,80 +1282,59 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, return true; } } - } else if (instruction->IsInvokeStaticOrDirect()) { - // Accept particular intrinsics. 
- HInvokeStaticOrDirect* invoke = instruction->AsInvokeStaticOrDirect(); - switch (invoke->GetIntrinsic()) { - case Intrinsics::kMathAbsInt: - case Intrinsics::kMathAbsLong: - case Intrinsics::kMathAbsFloat: - case Intrinsics::kMathAbsDouble: { - // Deal with vector restrictions. - HInstruction* opa = instruction->InputAt(0); - HInstruction* r = opa; - bool is_unsigned = false; - if (HasVectorRestrictions(restrictions, kNoAbs)) { - return false; - } else if (HasVectorRestrictions(restrictions, kNoHiBits) && - (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) { - return false; // reject, unless operand is sign-extension narrower - } - // Accept ABS(x) for vectorizable operand. - DCHECK(r != nullptr); - if (generate_code && vector_mode_ != kVector) { // de-idiom - r = opa; - } - if (VectorizeUse(node, r, generate_code, type, restrictions)) { - if (generate_code) { - GenerateVecOp(instruction, - vector_map_->Get(r), - nullptr, - HVecOperation::ToProperType(type, is_unsigned)); - } - return true; - } - return false; + } else if (instruction->IsAbs()) { + // Deal with vector restrictions. + HInstruction* opa = instruction->InputAt(0); + HInstruction* r = opa; + bool is_unsigned = false; + if (HasVectorRestrictions(restrictions, kNoAbs)) { + return false; + } else if (HasVectorRestrictions(restrictions, kNoHiBits) && + (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) { + return false; // reject, unless operand is sign-extension narrower + } + // Accept ABS(x) for vectorizable operand. + DCHECK(r != nullptr); + if (generate_code && vector_mode_ != kVector) { // de-idiom + r = opa; + } + if (VectorizeUse(node, r, generate_code, type, restrictions)) { + if (generate_code) { + GenerateVecOp(instruction, + vector_map_->Get(r), + nullptr, + HVecOperation::ToProperType(type, is_unsigned)); } - case Intrinsics::kMathMinIntInt: - case Intrinsics::kMathMinLongLong: - case Intrinsics::kMathMinFloatFloat: - case Intrinsics::kMathMinDoubleDouble: - case Intrinsics::kMathMaxIntInt: - case Intrinsics::kMathMaxLongLong: - case Intrinsics::kMathMaxFloatFloat: - case Intrinsics::kMathMaxDoubleDouble: { - // Deal with vector restrictions. - HInstruction* opa = instruction->InputAt(0); - HInstruction* opb = instruction->InputAt(1); - HInstruction* r = opa; - HInstruction* s = opb; - bool is_unsigned = false; - if (HasVectorRestrictions(restrictions, kNoMinMax)) { - return false; - } else if (HasVectorRestrictions(restrictions, kNoHiBits) && - !IsNarrowerOperands(opa, opb, type, &r, &s, &is_unsigned)) { - return false; // reject, unless all operands are same-extension narrower - } - // Accept MIN/MAX(x, y) for vectorizable operands. - DCHECK(r != nullptr); - DCHECK(s != nullptr); - if (generate_code && vector_mode_ != kVector) { // de-idiom - r = opa; - s = opb; - } - if (VectorizeUse(node, r, generate_code, type, restrictions) && - VectorizeUse(node, s, generate_code, type, restrictions)) { - if (generate_code) { - GenerateVecOp( - instruction, vector_map_->Get(r), vector_map_->Get(s), type, is_unsigned); - } - return true; - } - return false; + return true; + } + } else if (instruction->IsMin() || instruction->IsMax()) { + // Deal with vector restrictions. 
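// A sketch, in C++ terms, of the loop shapes this branch now vectorizes directly,
// since the instruction simplifier turns the Math.min/Math.max invokes into
// HMin/HMax nodes before this pass runs; std::min stands in for Math.min here:
#include <algorithm>
void MinExamples(const int* a, const int* b, int* out, int n, int* m) {
  for (int i = 0; i < n; ++i) out[i] = std::min(a[i], b[i]);  // element-wise -> HVecMin
  for (int i = 0; i < n; ++i) *m = std::min(*m, a[i]);        // reduction form x = min(x_phi, ..)
}
// The restriction checks below then decide whether the target supports the idiom.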
+ HInstruction* opa = instruction->InputAt(0); + HInstruction* opb = instruction->InputAt(1); + HInstruction* r = opa; + HInstruction* s = opb; + bool is_unsigned = false; + if (HasVectorRestrictions(restrictions, kNoMinMax)) { + return false; + } else if (HasVectorRestrictions(restrictions, kNoHiBits) && + !IsNarrowerOperands(opa, opb, type, &r, &s, &is_unsigned)) { + return false; // reject, unless all operands are same-extension narrower + } + // Accept MIN/MAX(x, y) for vectorizable operands. + DCHECK(r != nullptr); + DCHECK(s != nullptr); + if (generate_code && vector_mode_ != kVector) { // de-idiom + r = opa; + s = opb; + } + if (VectorizeUse(node, r, generate_code, type, restrictions) && + VectorizeUse(node, s, generate_code, type, restrictions)) { + if (generate_code) { + GenerateVecOp( + instruction, vector_map_->Get(r), vector_map_->Get(s), type, is_unsigned); } - default: - return false; - } // switch + return true; + } } return false; } @@ -1811,83 +1775,29 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org, GENERATE_VEC( new (global_allocator_) HVecUShr(global_allocator_, opa, opb, type, vector_length_, dex_pc), new (global_allocator_) HUShr(org_type, opa, opb, dex_pc)); - case HInstruction::kInvokeStaticOrDirect: { - HInvokeStaticOrDirect* invoke = org->AsInvokeStaticOrDirect(); - if (vector_mode_ == kVector) { - switch (invoke->GetIntrinsic()) { - case Intrinsics::kMathAbsInt: - case Intrinsics::kMathAbsLong: - case Intrinsics::kMathAbsFloat: - case Intrinsics::kMathAbsDouble: - DCHECK(opb == nullptr); - vector = new (global_allocator_) - HVecAbs(global_allocator_, opa, type, vector_length_, dex_pc); - break; - case Intrinsics::kMathMinIntInt: - case Intrinsics::kMathMinLongLong: - case Intrinsics::kMathMinFloatFloat: - case Intrinsics::kMathMinDoubleDouble: { - vector = new (global_allocator_) - HVecMin(global_allocator_, - opa, - opb, - HVecOperation::ToProperType(type, is_unsigned), - vector_length_, - dex_pc); - break; - } - case Intrinsics::kMathMaxIntInt: - case Intrinsics::kMathMaxLongLong: - case Intrinsics::kMathMaxFloatFloat: - case Intrinsics::kMathMaxDoubleDouble: { - vector = new (global_allocator_) - HVecMax(global_allocator_, - opa, - opb, - HVecOperation::ToProperType(type, is_unsigned), - vector_length_, - dex_pc); - break; - } - default: - LOG(FATAL) << "Unsupported SIMD intrinsic " << org->GetId(); - UNREACHABLE(); - } // switch invoke - } else { - // In scalar code, simply clone the method invoke, and replace its operands with the - // corresponding new scalar instructions in the loop. The instruction will get an - // environment while being inserted from the instruction map in original program order. - DCHECK(vector_mode_ == kSequential); - size_t num_args = invoke->GetNumberOfArguments(); - HInvokeStaticOrDirect* new_invoke = new (global_allocator_) HInvokeStaticOrDirect( - global_allocator_, - num_args, - invoke->GetType(), - invoke->GetDexPc(), - invoke->GetDexMethodIndex(), - invoke->GetResolvedMethod(), - invoke->GetDispatchInfo(), - invoke->GetInvokeType(), - invoke->GetTargetMethod(), - invoke->GetClinitCheckRequirement()); - HInputsRef inputs = invoke->GetInputs(); - size_t num_inputs = inputs.size(); - DCHECK_LE(num_args, num_inputs); - DCHECK_EQ(num_inputs, new_invoke->GetInputs().size()); // both invokes agree - for (size_t index = 0; index < num_inputs; ++index) { - HInstruction* new_input = index < num_args - ? 
vector_map_->Get(inputs[index])
- : inputs[index]; // beyond arguments: just pass through
- new_invoke->SetArgumentAt(index, new_input);
- }
- new_invoke->SetIntrinsic(invoke->GetIntrinsic(),
- kNeedsEnvironmentOrCache,
- kNoSideEffects,
- kNoThrow);
- vector = new_invoke;
- }
- break;
- }
+ case HInstruction::kMin:
+ GENERATE_VEC(
+ new (global_allocator_) HVecMin(global_allocator_,
+ opa,
+ opb,
+ HVecOperation::ToProperType(type, is_unsigned),
+ vector_length_,
+ dex_pc),
+ new (global_allocator_) HMin(org_type, opa, opb, dex_pc));
+ case HInstruction::kMax:
+ GENERATE_VEC(
+ new (global_allocator_) HVecMax(global_allocator_,
+ opa,
+ opb,
+ HVecOperation::ToProperType(type, is_unsigned),
+ vector_length_,
+ dex_pc),
+ new (global_allocator_) HMax(org_type, opa, opb, dex_pc));
+ case HInstruction::kAbs:
+ DCHECK(opb == nullptr);
+ GENERATE_VEC(
+ new (global_allocator_) HVecAbs(global_allocator_, opa, type, vector_length_, dex_pc),
+ new (global_allocator_) HAbs(org_type, opa, dex_pc));
default: break; } // switch
@@ -1998,9 +1908,7 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
HInstruction* v = instruction->InputAt(1);
HInstruction* a = nullptr;
HInstruction* b = nullptr;
- if (v->IsInvokeStaticOrDirect() &&
- (v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsInt ||
- v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsLong)) {
+ if (v->GetType() == reduction_type && v->IsAbs()) {
HInstruction* x = v->InputAt(0);
if (x->GetType() == reduction_type) {
int64_t c = 0;
@@ -2054,14 +1962,13 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
VectorizeUse(node, r, generate_code, sub_type, restrictions) &&
VectorizeUse(node, s, generate_code, sub_type, restrictions)) {
if (generate_code) {
- reduction_type = HVecOperation::ToProperType(reduction_type, is_unsigned);
if (vector_mode_ == kVector) {
vector_map_->Put(instruction, new (global_allocator_) HVecSADAccumulate(
global_allocator_,
vector_map_->Get(q),
vector_map_->Get(r),
vector_map_->Get(s),
- reduction_type,
+ HVecOperation::ToProperType(reduction_type, is_unsigned),
GetOtherVL(reduction_type, sub_type, vector_length_),
kNoDexPc));
MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
diff --git a/compiler/optimizing/loop_optimization_test.cc b/compiler/optimizing/loop_optimization_test.cc index db8368986c..c21bd65d97 100644 --- a/compiler/optimizing/loop_optimization_test.cc +++ b/compiler/optimizing/loop_optimization_test.cc
@@ -227,11 +227,14 @@ TEST_F(LoopOptimizationTest, SimplifyLoopReoderPredecessors) {
graph_->ClearDominanceInformation();
graph_->BuildDominatorTree();
+ // BuildDominatorTree inserts a block between the loop header and the entry block.
+ EXPECT_EQ(header->GetPredecessors()[0]->GetSinglePredecessor(), entry_block_);
+
// Check that after optimizations in BuildDominatorTree()/SimplifyCFG() phi inputs
// are still mapped correctly to the block predecessors.
for (size_t i = 0, e = phi->InputCount(); i < e; i++) { HInstruction* input = phi->InputAt(i); - ASSERT_TRUE(input->GetBlock()->Dominates(header->GetPredecessors()[i])); + EXPECT_TRUE(input->GetBlock()->Dominates(header->GetPredecessors()[i])); } } diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index f6ba19f22a..d3212cbbc0 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -2891,6 +2891,8 @@ std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::MethodLoadKind return os << "BootImageLinkTimePcRelative"; case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: return os << "DirectAddress"; + case HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo: + return os << "BootImageRelRo"; case HInvokeStaticOrDirect::MethodLoadKind::kBssEntry: return os << "BssEntry"; case HInvokeStaticOrDirect::MethodLoadKind::kRuntimeCall: @@ -2925,7 +2927,7 @@ bool HLoadClass::InstructionDataEquals(const HInstruction* other) const { } switch (GetLoadKind()) { case LoadKind::kBootImageAddress: - case LoadKind::kBootImageClassTable: + case LoadKind::kBootImageRelRo: case LoadKind::kJitTableAddress: { ScopedObjectAccess soa(Thread::Current()); return GetClass().Get() == other_load_class->GetClass().Get(); @@ -2944,8 +2946,8 @@ std::ostream& operator<<(std::ostream& os, HLoadClass::LoadKind rhs) { return os << "BootImageLinkTimePcRelative"; case HLoadClass::LoadKind::kBootImageAddress: return os << "BootImageAddress"; - case HLoadClass::LoadKind::kBootImageClassTable: - return os << "BootImageClassTable"; + case HLoadClass::LoadKind::kBootImageRelRo: + return os << "BootImageRelRo"; case HLoadClass::LoadKind::kBssEntry: return os << "BssEntry"; case HLoadClass::LoadKind::kJitTableAddress: @@ -2968,7 +2970,7 @@ bool HLoadString::InstructionDataEquals(const HInstruction* other) const { } switch (GetLoadKind()) { case LoadKind::kBootImageAddress: - case LoadKind::kBootImageInternTable: + case LoadKind::kBootImageRelRo: case LoadKind::kJitTableAddress: { ScopedObjectAccess soa(Thread::Current()); return GetString().Get() == other_load_string->GetString().Get(); @@ -2984,8 +2986,8 @@ std::ostream& operator<<(std::ostream& os, HLoadString::LoadKind rhs) { return os << "BootImageLinkTimePcRelative"; case HLoadString::LoadKind::kBootImageAddress: return os << "BootImageAddress"; - case HLoadString::LoadKind::kBootImageInternTable: - return os << "BootImageInternTable"; + case HLoadString::LoadKind::kBootImageRelRo: + return os << "BootImageRelRo"; case HLoadString::LoadKind::kBssEntry: return os << "BssEntry"; case HLoadString::LoadKind::kJitTableAddress: diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index 99d80d77c5..a8364e0680 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -26,6 +26,7 @@ #include "base/arena_object.h" #include "base/array_ref.h" #include "base/iteration_range.h" +#include "base/quasi_atomic.h" #include "base/stl_util.h" #include "base/transform_array_ref.h" #include "data_type.h" @@ -1383,7 +1384,9 @@ class HLoopInformationOutwardIterator : public ValueObject { M(LoadException, Instruction) \ M(LoadString, Instruction) \ M(LongConstant, Constant) \ + M(Max, Instruction) \ M(MemoryBarrier, Instruction) \ + M(Min, BinaryOperation) \ M(MonitorOperation, Instruction) \ M(Mul, BinaryOperation) \ M(NativeDebugInfo, Instruction) \ @@ -4428,6 +4431,10 @@ class HInvokeStaticOrDirect FINAL : public HInvoke { // Used for app->boot calls with non-relocatable image and for JIT-compiled calls. 
kDirectAddress, + // Load from an entry in the .data.bimg.rel.ro using a PC-relative load. + // Used for app->boot calls with relocatable image. + kBootImageRelRo, + // Load from an entry in the .bss section using a PC-relative load. // Used for classes outside boot image when .bss is accessible with a PC-relative load. kBssEntry, @@ -4560,6 +4567,7 @@ class HInvokeStaticOrDirect FINAL : public HInvoke { bool HasMethodAddress() const { return GetMethodLoadKind() == MethodLoadKind::kDirectAddress; } bool HasPcRelativeMethodLoadKind() const { return GetMethodLoadKind() == MethodLoadKind::kBootImageLinkTimePcRelative || + GetMethodLoadKind() == MethodLoadKind::kBootImageRelRo || GetMethodLoadKind() == MethodLoadKind::kBssEntry; } bool HasCurrentMethodInput() const { @@ -5016,6 +5024,76 @@ class HRem FINAL : public HBinaryOperation { DEFAULT_COPY_CONSTRUCTOR(Rem); }; +class HMin FINAL : public HBinaryOperation { + public: + HMin(DataType::Type result_type, + HInstruction* left, + HInstruction* right, + uint32_t dex_pc) + : HBinaryOperation(kMin, result_type, left, right, SideEffects::None(), dex_pc) {} + + bool IsCommutative() const OVERRIDE { return true; } + + // Evaluation for integral values. + template <typename T> static T ComputeIntegral(T x, T y) { + return (x <= y) ? x : y; + } + + HConstant* Evaluate(HIntConstant* x, HIntConstant* y) const OVERRIDE { + return GetBlock()->GetGraph()->GetIntConstant( + ComputeIntegral(x->GetValue(), y->GetValue()), GetDexPc()); + } + HConstant* Evaluate(HLongConstant* x, HLongConstant* y) const OVERRIDE { + return GetBlock()->GetGraph()->GetLongConstant( + ComputeIntegral(x->GetValue(), y->GetValue()), GetDexPc()); + } + // TODO: Evaluation for floating-point values. + HConstant* Evaluate(HFloatConstant* x ATTRIBUTE_UNUSED, + HFloatConstant* y ATTRIBUTE_UNUSED) const OVERRIDE { return nullptr; } + HConstant* Evaluate(HDoubleConstant* x ATTRIBUTE_UNUSED, + HDoubleConstant* y ATTRIBUTE_UNUSED) const OVERRIDE { return nullptr; } + + DECLARE_INSTRUCTION(Min); + + protected: + DEFAULT_COPY_CONSTRUCTOR(Min); +}; + +class HMax FINAL : public HBinaryOperation { + public: + HMax(DataType::Type result_type, + HInstruction* left, + HInstruction* right, + uint32_t dex_pc) + : HBinaryOperation(kMax, result_type, left, right, SideEffects::None(), dex_pc) {} + + bool IsCommutative() const OVERRIDE { return true; } + + // Evaluation for integral values. + template <typename T> static T ComputeIntegral(T x, T y) { + return (x >= y) ? x : y; + } + + HConstant* Evaluate(HIntConstant* x, HIntConstant* y) const OVERRIDE { + return GetBlock()->GetGraph()->GetIntConstant( + ComputeIntegral(x->GetValue(), y->GetValue()), GetDexPc()); + } + HConstant* Evaluate(HLongConstant* x, HLongConstant* y) const OVERRIDE { + return GetBlock()->GetGraph()->GetLongConstant( + ComputeIntegral(x->GetValue(), y->GetValue()), GetDexPc()); + } + // TODO: Evaluation for floating-point values. 
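// What the TODO above must respect, per the Java Math.max contract: NaN propagates
// and +0.0 compares greater than -0.0, so the integral rule (x >= y) ? x : y is not
// reusable — it would fold max(-0.0, +0.0) to -0.0 and would never produce NaN.
// A hedged sketch of a possible folding helper, not ART code:
#include <cmath>
#include <limits>
template <typename T> static T ComputeFloatingPointMax(T x, T y) {
  if (std::isnan(x) || std::isnan(y)) {
    return std::numeric_limits<T>::quiet_NaN();  // max(NaN, y) == NaN
  }
  if (x == y) {
    return std::signbit(x) ? y : x;  // max(+0.0, -0.0) == +0.0
  }
  return (x > y) ? x : y;
}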
+ HConstant* Evaluate(HFloatConstant* x ATTRIBUTE_UNUSED, + HFloatConstant* y ATTRIBUTE_UNUSED) const OVERRIDE { return nullptr; } + HConstant* Evaluate(HDoubleConstant* x ATTRIBUTE_UNUSED, + HDoubleConstant* y ATTRIBUTE_UNUSED) const OVERRIDE { return nullptr; } + + DECLARE_INSTRUCTION(Max); + + protected: + DEFAULT_COPY_CONSTRUCTOR(Max); +}; + class HAbs FINAL : public HUnaryOperation { public: HAbs(DataType::Type result_type, HInstruction* input, uint32_t dex_pc = kNoDexPc) @@ -6066,12 +6144,12 @@ class HLoadClass FINAL : public HInstruction { kBootImageLinkTimePcRelative, // Use a known boot image Class* address, embedded in the code by the codegen. - // Used for boot image classes referenced by apps in AOT- and JIT-compiled code. + // Used for boot image classes referenced by apps in JIT- and AOT-compiled code (non-PIC). kBootImageAddress, - // Use a PC-relative load from a boot image ClassTable mmapped into the .bss - // of the oat file. - kBootImageClassTable, + // Load from an entry in the .data.bimg.rel.ro using a PC-relative load. + // Used for boot image classes referenced by apps in AOT-compiled code (PIC). + kBootImageRelRo, // Load from an entry in the .bss section using a PC-relative load. // Used for classes outside boot image when .bss is accessible with a PC-relative load. @@ -6119,6 +6197,12 @@ class HLoadClass FINAL : public HInstruction { return GetPackedField<LoadKindField>(); } + bool HasPcRelativeLoadKind() const { + return GetLoadKind() == LoadKind::kBootImageLinkTimePcRelative || + GetLoadKind() == LoadKind::kBootImageRelRo || + GetLoadKind() == LoadKind::kBssEntry; + } + bool CanBeMoved() const OVERRIDE { return true; } bool InstructionDataEquals(const HInstruction* other) const; @@ -6223,7 +6307,6 @@ class HLoadClass FINAL : public HInstruction { static bool HasTypeReference(LoadKind load_kind) { return load_kind == LoadKind::kReferrersClass || load_kind == LoadKind::kBootImageLinkTimePcRelative || - load_kind == LoadKind::kBootImageClassTable || load_kind == LoadKind::kBssEntry || load_kind == LoadKind::kRuntimeCall; } @@ -6269,7 +6352,7 @@ inline void HLoadClass::AddSpecialInput(HInstruction* special_input) { // including literal pool loads, which are PC-relative too. DCHECK(GetLoadKind() == LoadKind::kBootImageLinkTimePcRelative || GetLoadKind() == LoadKind::kBootImageAddress || - GetLoadKind() == LoadKind::kBootImageClassTable || + GetLoadKind() == LoadKind::kBootImageRelRo || GetLoadKind() == LoadKind::kBssEntry) << GetLoadKind(); DCHECK(special_input_.GetInstruction() == nullptr); special_input_ = HUserRecord<HInstruction*>(special_input); @@ -6285,12 +6368,12 @@ class HLoadString FINAL : public HInstruction { kBootImageLinkTimePcRelative, // Use a known boot image String* address, embedded in the code by the codegen. - // Used for boot image strings referenced by apps in AOT- and JIT-compiled code. + // Used for boot image strings referenced by apps in JIT- and AOT-compiled code (non-PIC). kBootImageAddress, - // Use a PC-relative load from a boot image InternTable mmapped into the .bss - // of the oat file. - kBootImageInternTable, + // Load from an entry in the .data.bimg.rel.ro using a PC-relative load. + // Used for boot image strings referenced by apps in AOT-compiled code (PIC). + kBootImageRelRo, // Load from an entry in the .bss section using a PC-relative load. // Used for strings outside boot image when .bss is accessible with a PC-relative load. 
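// How the new kBootImageRelRo kinds differ from the ClassTable/InternTable kinds
// they replace: the compiled code performs a PC-relative load of a slot that the
// loader has already patched with the boot-image address, so the code stays
// position-independent while the data, once relocated, is mapped read-only.
// A conceptual sketch, not the actual runtime code; the 32-bit slot width is an
// assumption based on boot-image references fitting in 32 bits:
#include <cstdint>
static inline void* LoadBootImageRelRoEntry(const uint32_t* slot) {
  // `slot` lives in .data.bimg.rel.ro: written once during relocation, then
  // read-only — unlike .bss entries, which are resolved and filled lazily.
  return reinterpret_cast<void*>(static_cast<uintptr_t>(*slot));
}
// kBootImageAddress, by contrast, embeds the address directly in the generated
// code, which is why it remains the non-PIC choice among the load kinds above.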
@@ -6325,6 +6408,12 @@ class HLoadString FINAL : public HInstruction { return GetPackedField<LoadKindField>(); } + bool HasPcRelativeLoadKind() const { + return GetLoadKind() == LoadKind::kBootImageLinkTimePcRelative || + GetLoadKind() == LoadKind::kBootImageRelRo || + GetLoadKind() == LoadKind::kBssEntry; + } + const DexFile& GetDexFile() const { return dex_file_; } @@ -6353,7 +6442,7 @@ class HLoadString FINAL : public HInstruction { LoadKind load_kind = GetLoadKind(); if (load_kind == LoadKind::kBootImageLinkTimePcRelative || load_kind == LoadKind::kBootImageAddress || - load_kind == LoadKind::kBootImageInternTable || + load_kind == LoadKind::kBootImageRelRo || load_kind == LoadKind::kJitTableAddress) { return false; } @@ -6431,7 +6520,7 @@ inline void HLoadString::AddSpecialInput(HInstruction* special_input) { // including literal pool loads, which are PC-relative too. DCHECK(GetLoadKind() == LoadKind::kBootImageLinkTimePcRelative || GetLoadKind() == LoadKind::kBootImageAddress || - GetLoadKind() == LoadKind::kBootImageInternTable || + GetLoadKind() == LoadKind::kBootImageRelRo || GetLoadKind() == LoadKind::kBssEntry) << GetLoadKind(); // HLoadString::GetInputRecords() returns an empty array at this point, // so use the GetInputRecords() from the base class to set the input record. diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h index 0023265e50..00194ff1fe 100644 --- a/compiler/optimizing/optimizing_compiler_stats.h +++ b/compiler/optimizing/optimizing_compiler_stats.h @@ -22,9 +22,9 @@ #include <string> #include <type_traits> -#include "atomic.h" +#include "base/atomic.h" +#include "base/globals.h" #include "base/logging.h" // For VLOG_IS_ON. -#include "globals.h" namespace art { diff --git a/compiler/optimizing/pc_relative_fixups_mips.cc b/compiler/optimizing/pc_relative_fixups_mips.cc index 9d5358514e..0102254206 100644 --- a/compiler/optimizing/pc_relative_fixups_mips.cc +++ b/compiler/optimizing/pc_relative_fixups_mips.cc @@ -75,7 +75,7 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { switch (load_kind) { case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: case HLoadClass::LoadKind::kBootImageAddress: - case HLoadClass::LoadKind::kBootImageClassTable: + case HLoadClass::LoadKind::kBootImageRelRo: case HLoadClass::LoadKind::kBssEntry: // Add a base register for PC-relative literals on R2. InitializePCRelativeBasePointer(); @@ -91,7 +91,7 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { switch (load_kind) { case HLoadString::LoadKind::kBootImageLinkTimePcRelative: case HLoadString::LoadKind::kBootImageAddress: - case HLoadString::LoadKind::kBootImageInternTable: + case HLoadString::LoadKind::kBootImageRelRo: case HLoadString::LoadKind::kBssEntry: // Add a base register for PC-relative literals on R2. 
InitializePCRelativeBasePointer(); diff --git a/compiler/optimizing/pc_relative_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc index a3ca6311c2..647336b6b9 100644 --- a/compiler/optimizing/pc_relative_fixups_x86.cc +++ b/compiler/optimizing/pc_relative_fixups_x86.cc @@ -81,20 +81,14 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { } void VisitLoadClass(HLoadClass* load_class) OVERRIDE { - HLoadClass::LoadKind load_kind = load_class->GetLoadKind(); - if (load_kind == HLoadClass::LoadKind::kBootImageLinkTimePcRelative || - load_kind == HLoadClass::LoadKind::kBootImageClassTable || - load_kind == HLoadClass::LoadKind::kBssEntry) { + if (load_class->HasPcRelativeLoadKind()) { HX86ComputeBaseMethodAddress* method_address = GetPCRelativeBasePointer(load_class); load_class->AddSpecialInput(method_address); } } void VisitLoadString(HLoadString* load_string) OVERRIDE { - HLoadString::LoadKind load_kind = load_string->GetLoadKind(); - if (load_kind == HLoadString::LoadKind::kBootImageLinkTimePcRelative || - load_kind == HLoadString::LoadKind::kBootImageInternTable || - load_kind == HLoadString::LoadKind::kBssEntry) { + if (load_string->HasPcRelativeLoadKind()) { HX86ComputeBaseMethodAddress* method_address = GetPCRelativeBasePointer(load_string); load_string->AddSpecialInput(method_address); } @@ -234,12 +228,13 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { switch (invoke->GetIntrinsic()) { case Intrinsics::kMathAbsDouble: case Intrinsics::kMathAbsFloat: - LOG(FATAL) << "Unreachable abs"; - UNREACHABLE(); case Intrinsics::kMathMaxDoubleDouble: case Intrinsics::kMathMaxFloatFloat: case Intrinsics::kMathMinDoubleDouble: case Intrinsics::kMathMinFloatFloat: + LOG(FATAL) << "Unreachable min/max/abs: intrinsics should have been lowered " + "to IR nodes by instruction simplifier"; + UNREACHABLE(); case Intrinsics::kMathRoundFloat: if (!base_added) { DCHECK(invoke_static_or_direct != nullptr); diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc index cdbe48342d..bca538fb17 100644 --- a/compiler/optimizing/scheduler.cc +++ b/compiler/optimizing/scheduler.cc @@ -679,6 +679,8 @@ bool HScheduler::IsSchedulable(const HInstruction* instruction) const { instruction->IsCompare() || instruction->IsCondition() || instruction->IsDiv() || + instruction->IsMin() || + instruction->IsMax() || instruction->IsMul() || instruction->IsOr() || instruction->IsRem() || diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc index 1e49411c72..7dffb2a378 100644 --- a/compiler/optimizing/sharpening.cc +++ b/compiler/optimizing/sharpening.cc @@ -125,8 +125,12 @@ void HSharpening::SharpenInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke, BootImageAOTCanEmbedMethod(callee, compiler_driver)) { method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative; code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod; + } else if (IsInBootImage(callee)) { + // Use PC-relative access to the .data.bimg.rel.ro methods array. + method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo; + code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod; } else { - // Use PC-relative access to the .bss methods arrays. + // Use PC-relative access to the .bss methods array. 
method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kBssEntry;
code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod;
}
@@ -207,7 +211,7 @@ HLoadClass::LoadKind HSharpening::ComputeLoadClassKind(
} else if (is_in_boot_image) {
// AOT app compilation, boot image class.
if (codegen->GetCompilerOptions().GetCompilePic()) {
- desired_load_kind = HLoadClass::LoadKind::kBootImageClassTable;
+ desired_load_kind = HLoadClass::LoadKind::kBootImageRelRo;
} else {
desired_load_kind = HLoadClass::LoadKind::kBootImageAddress;
}
@@ -288,7 +292,7 @@ void HSharpening::ProcessLoadString(
string = class_linker->LookupString(string_index, dex_cache.Get());
if (string != nullptr && runtime->GetHeap()->ObjectIsInBootImageSpace(string)) {
if (codegen->GetCompilerOptions().GetCompilePic()) {
- desired_load_kind = HLoadString::LoadKind::kBootImageInternTable;
+ desired_load_kind = HLoadString::LoadKind::kBootImageRelRo;
} else {
desired_load_kind = HLoadString::LoadKind::kBootImageAddress;
}
diff --git a/compiler/optimizing/superblock_cloner.cc b/compiler/optimizing/superblock_cloner.cc index a7c23bef7e..04942f9a4a 100644 --- a/compiler/optimizing/superblock_cloner.cc +++ b/compiler/optimizing/superblock_cloner.cc
@@ -28,6 +28,11 @@ using HInstructionMap = SuperblockCloner::HInstructionMap;
using HBasicBlockSet = SuperblockCloner::HBasicBlockSet;
using HEdgeSet = SuperblockCloner::HEdgeSet;
+// When doing peeling we can choose whether to keep the original loop (made of the original basic blocks)
+// and form the peeled iteration from the copy blocks (preserving the header), or to transfer the original
+// loop blocks to the peeled iteration and create a new loop from the copy blocks. The same applies to unrolling.
+static const bool kPeelUnrollPreserveHeader = true;
+
void HEdge::Dump(std::ostream& stream) const { stream << "(" << from_ << "->" << to_ << ")"; }
@@ -70,20 +75,18 @@ static bool ArePhiInputsTheSame(const HPhi* phi) { return true; }
-// Returns a common predecessor of loop1 and loop2 in the loop tree or nullptr if it is the whole
-// graph.
-static HLoopInformation* FindCommonLoop(HLoopInformation* loop1, HLoopInformation* loop2) {
- if (loop1 != nullptr || loop2 != nullptr) {
- return nullptr;
+// Returns whether two Edge sets are equal (ArenaHashSet doesn't have an "Equal" method).
+static bool EdgeHashSetsEqual(const HEdgeSet* set1, const HEdgeSet* set2) {
+ if (set1->Size() != set2->Size()) {
+ return false;
}
- if (loop1->IsIn(*loop2)) {
- return loop2;
- } else if (loop2->IsIn(*loop1)) {
- return loop1;
+ for (auto e : *set1) {
+ if (set2->Find(e) == set2->end()) {
+ return false;
+ }
}
- HBasicBlock* block = CommonDominator::ForPair(loop1->GetHeader(), loop2->GetHeader());
- return block->GetLoopInformation();
+ return true;
}
// Calls HGraph::OrderLoopHeaderPredecessors for each loop in the graph.
@@ -95,6 +98,21 @@ static void OrderLoopsHeadersPredecessors(HGraph* graph) { } }
+// Performs DFS on the subgraph (specified by 'bb_set') starting from the specified block; while
+// traversing, the function removes basic blocks from the bb_set (instead of traditional DFS
+// 'marking'). So what is left in the 'bb_set' after the traversal is not reachable from the start
+// block.
+static void TraverseSubgraphForConnectivity(HBasicBlock* block, HBasicBlockSet* bb_set) {
+ DCHECK(bb_set->IsBitSet(block->GetBlockId()));
+ bb_set->ClearBit(block->GetBlockId());
+
+ for (HBasicBlock* succ : block->GetSuccessors()) {
+ if (bb_set->IsBitSet(succ->GetBlockId())) {
+ TraverseSubgraphForConnectivity(succ, bb_set);
+ }
+ }
+}
+
// // Helpers for CloneBasicBlock. //
@@ -268,7 +286,6 @@ void SuperblockCloner::FindBackEdgesLocal(HBasicBlock* entry_block, ArenaBitVect }
void SuperblockCloner::RecalculateBackEdgesInfo(ArenaBitVector* outer_loop_bb_set) {
- // TODO: DCHECK that after the transformation the graph is connected.
HBasicBlock* block_entry = nullptr;
if (outer_loop_ == nullptr) {
@@ -424,6 +441,11 @@ void SuperblockCloner::FindAndSetLocalAreaForAdjustments() {
outer_loop_ = nullptr;
break;
}
+ if (outer_loop_ == nullptr) {
+ // We should not use the initial outer_loop_ value 'nullptr' when finding the outermost
+ // common loop.
+ outer_loop_ = loop_exit_loop_info;
+ }
outer_loop_ = FindCommonLoop(outer_loop_, loop_exit_loop_info);
}
@@ -507,6 +529,34 @@ void SuperblockCloner::ResolveDataFlow() { // Debug and logging methods. //
+// Debug function to dump the graph's basic blocks info.
+void DumpBB(HGraph* graph) {
+ for (HBasicBlock* bb : graph->GetBlocks()) {
+ if (bb == nullptr) {
+ continue;
+ }
+ std::cout << bb->GetBlockId();
+ std::cout << " <- ";
+ for (HBasicBlock* pred : bb->GetPredecessors()) {
+ std::cout << pred->GetBlockId() << " ";
+ }
+ std::cout << " -> ";
+ for (HBasicBlock* succ : bb->GetSuccessors()) {
+ std::cout << succ->GetBlockId() << " ";
+ }
+
+ if (bb->GetDominator()) {
+ std::cout << " dom " << bb->GetDominator()->GetBlockId();
+ }
+
+ if (bb->GetLoopInformation()) {
+ std::cout << "\tloop: " << bb->GetLoopInformation()->GetHeader()->GetBlockId();
+ }
+
+ std::cout << std::endl;
+ }
+}
+
void SuperblockCloner::CheckInstructionInputsRemapping(HInstruction* orig_instr) {
DCHECK(!orig_instr->IsPhi());
HInstruction* copy_instr = GetInstrCopy(orig_instr);
@@ -542,6 +592,82 @@ void SuperblockCloner::CheckInstructionInputsRemapping(HInstruction* orig_instr) } }
+bool SuperblockCloner::CheckRemappingInfoIsValid() {
+ for (HEdge edge : *remap_orig_internal_) {
+ if (!IsEdgeValid(edge, graph_) ||
+ !IsInOrigBBSet(edge.GetFrom()) ||
+ !IsInOrigBBSet(edge.GetTo())) {
+ return false;
+ }
+ }
+
+ for (auto edge : *remap_copy_internal_) {
+ if (!IsEdgeValid(edge, graph_) ||
+ !IsInOrigBBSet(edge.GetFrom()) ||
+ !IsInOrigBBSet(edge.GetTo())) {
+ return false;
+ }
+ }
+
+ for (auto edge : *remap_incoming_) {
+ if (!IsEdgeValid(edge, graph_) ||
+ IsInOrigBBSet(edge.GetFrom()) ||
+ !IsInOrigBBSet(edge.GetTo())) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void SuperblockCloner::VerifyGraph() {
+ for (auto it : *hir_map_) {
+ HInstruction* orig_instr = it.first;
+ HInstruction* copy_instr = it.second;
+ if (!orig_instr->IsPhi() && !orig_instr->IsSuspendCheck()) {
+ DCHECK(it.first->GetBlock() != nullptr);
+ }
+ if (!copy_instr->IsPhi() && !copy_instr->IsSuspendCheck()) {
+ DCHECK(it.second->GetBlock() != nullptr);
+ }
+ }
+
+ GraphChecker checker(graph_);
+ checker.Run();
+ if (!checker.IsValid()) {
+ for (const std::string& error : checker.GetErrors()) {
+ std::cout << error << std::endl;
+ }
+ LOG(FATAL) << "GraphChecker failed: superblock cloner\n";
+ }
+}
+
+void DumpBBSet(const ArenaBitVector* set) {
+ for (uint32_t idx : set->Indexes()) {
+ std::cout << idx << "\n";
+ }
+}
+
+void SuperblockCloner::DumpInputSets() {
+ std::cout << graph_->PrettyMethod() << "\n";
+ std::cout << "orig_bb_set:\n";
+ for (uint32_t idx : orig_bb_set_.Indexes()) {
+ std::cout << idx << "\n";
+ }
+ std::cout << "remap_orig_internal:\n";
+ for (HEdge e : *remap_orig_internal_) {
+ std::cout << e << "\n";
+ }
+ std::cout << "remap_copy_internal:\n";
+ for (auto e : *remap_copy_internal_) {
+ std::cout << e << "\n";
+ }
+ std::cout << "remap_incoming:\n";
+ for (auto e : *remap_incoming_) {
+ std::cout << e << "\n";
+ }
+}
+
// // Public methods. //
@@ -569,6 +695,7 @@ void SuperblockCloner::SetSuccessorRemappingInfo(const HEdgeSet* remap_orig_inte
remap_orig_internal_ = remap_orig_internal;
remap_copy_internal_ = remap_copy_internal;
remap_incoming_ = remap_incoming;
+ DCHECK(CheckRemappingInfoIsValid());
}
bool SuperblockCloner::IsSubgraphClonable() const {
@@ -602,6 +729,63 @@ bool SuperblockCloner::IsSubgraphClonable() const { return true; }
+bool SuperblockCloner::IsFastCase() const {
+ // Check that loop unrolling/loop peeling is being conducted.
+ // Check that all the basic blocks belong to the same loop.
+ bool flag = false;
+ HLoopInformation* common_loop_info = nullptr;
+ for (uint32_t idx : orig_bb_set_.Indexes()) {
+ HBasicBlock* block = GetBlockById(idx);
+ HLoopInformation* block_loop_info = block->GetLoopInformation();
+ if (!flag) {
+ common_loop_info = block_loop_info;
+ flag = true;
+ } else {
+ if (block_loop_info != common_loop_info) {
+ return false;
+ }
+ }
+ }
+
+ // Check that orig_bb_set_ corresponds to loop peeling/unrolling.
+ if (common_loop_info == nullptr || !orig_bb_set_.SameBitsSet(&common_loop_info->GetBlocks())) {
+ return false;
+ }
+
+ bool peeling_or_unrolling = false;
+ HEdgeSet remap_orig_internal(graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+ HEdgeSet remap_copy_internal(graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+ HEdgeSet remap_incoming(graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+
+
+ // Check whether remapping info corresponds to loop unrolling.
+ CollectRemappingInfoForPeelUnroll(/* to_unroll */ true,
+ common_loop_info,
+ &remap_orig_internal,
+ &remap_copy_internal,
+ &remap_incoming);
+
+ peeling_or_unrolling |= EdgeHashSetsEqual(&remap_orig_internal, remap_orig_internal_) &&
+ EdgeHashSetsEqual(&remap_copy_internal, remap_copy_internal_) &&
+ EdgeHashSetsEqual(&remap_incoming, remap_incoming_);
+
+ remap_orig_internal.Clear();
+ remap_copy_internal.Clear();
+ remap_incoming.Clear();
+
+ // Check whether remapping info corresponds to loop peeling.
+ CollectRemappingInfoForPeelUnroll(/* to_unroll */ false,
+ common_loop_info,
+ &remap_orig_internal,
+ &remap_copy_internal,
+ &remap_incoming);
+
+ peeling_or_unrolling |= EdgeHashSetsEqual(&remap_orig_internal, remap_orig_internal_) &&
+ EdgeHashSetsEqual(&remap_copy_internal, remap_copy_internal_) &&
+ EdgeHashSetsEqual(&remap_incoming, remap_incoming_);
+
+ return peeling_or_unrolling;
+}
+
void SuperblockCloner::Run() {
DCHECK(bb_map_ != nullptr);
DCHECK(hir_map_ != nullptr);
@@ -609,6 +793,11 @@
remap_copy_internal_ != nullptr && remap_incoming_ != nullptr);
DCHECK(IsSubgraphClonable());
+ DCHECK(IsFastCase());
+
+ if (kSuperblockClonerLogging) {
+ DumpInputSets();
+ }
// Find an area in the graph for which control flow information should be adjusted.
FindAndSetLocalAreaForAdjustments();
@@ -618,6 +807,19 @@
// Connect the blocks together/remap successors and fix phis which are directly affected by the
// remapping.
RemapEdgesSuccessors();
+
+ // Check that the subgraph is connected.
+ if (kIsDebugBuild) {
+ HBasicBlockSet work_set(arena_, orig_bb_set_.GetSizeOf(), true, kArenaAllocSuperblockCloner);
+
+ // Add original and copy blocks of the subgraph to the work set.
+ for (auto iter : *bb_map_) {
+ work_set.SetBit(iter.first->GetBlockId()); // Original block.
+ work_set.SetBit(iter.second->GetBlockId()); // Copy block.
+ }
+ CHECK(IsSubgraphConnected(&work_set, graph_));
+ }
+
// Recalculate dominance and backedge information which is required by the next stage.
AdjustControlFlowInfo();
// Fix data flow of the graph.
@@ -650,6 +852,10 @@ void SuperblockCloner::CleanUp() { } } }
+
+ if (kSuperblockClonerVerify) {
+ VerifyGraph();
+ }
}
HBasicBlock* SuperblockCloner::CloneBasicBlock(const HBasicBlock* orig_block) {
@@ -701,4 +907,125 @@ void SuperblockCloner::CloneBasicBlocks() { } }
+//
+// Stand-alone methods.
+//
+
+void CollectRemappingInfoForPeelUnroll(bool to_unroll,
+ HLoopInformation* loop_info,
+ HEdgeSet* remap_orig_internal,
+ HEdgeSet* remap_copy_internal,
+ HEdgeSet* remap_incoming) {
+ DCHECK(loop_info != nullptr);
+ HBasicBlock* loop_header = loop_info->GetHeader();
+ // Set up the remap_orig_internal and remap_copy_internal edge sets; which back edges go
+ // where depends on the transformation kind and on kPeelUnrollPreserveHeader.
+ for (HBasicBlock* back_edge_block : loop_info->GetBackEdges()) {
+ HEdge e = HEdge(back_edge_block, loop_header);
+ if (to_unroll) {
+ remap_orig_internal->Insert(e);
+ remap_copy_internal->Insert(e);
+ } else {
+ if (kPeelUnrollPreserveHeader) {
+ remap_copy_internal->Insert(e);
+ } else {
+ remap_orig_internal->Insert(e);
+ }
+ }
+ }
+
+ // Set up remap_incoming edges set.
+ if (to_unroll != kPeelUnrollPreserveHeader) {
+ remap_incoming->Insert(HEdge(loop_info->GetPreHeader(), loop_header));
+ }
+}
+
+bool IsSubgraphConnected(SuperblockCloner::HBasicBlockSet* work_set, HGraph* graph) {
+ ArenaVector<HBasicBlock*> entry_blocks(
+ graph->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+
+ // Find subgraph entry blocks.
+ for (uint32_t orig_block_id : work_set->Indexes()) {
+ HBasicBlock* block = graph->GetBlocks()[orig_block_id];
+ for (HBasicBlock* pred : block->GetPredecessors()) {
+ if (!work_set->IsBitSet(pred->GetBlockId())) {
+ entry_blocks.push_back(block);
+ break;
+ }
+ }
+ }
+
+ for (HBasicBlock* entry_block : entry_blocks) {
+ if (work_set->IsBitSet(entry_block->GetBlockId())) {
+ TraverseSubgraphForConnectivity(entry_block, work_set);
+ }
+ }
+
+ // Return whether every block was visited; any bits still set mark unreachable blocks.
+ return work_set->NumSetBits() == 0;
+}
+
+HLoopInformation* FindCommonLoop(HLoopInformation* loop1, HLoopInformation* loop2) {
+ if (loop1 == nullptr || loop2 == nullptr) {
+ return nullptr;
+ }
+
+ if (loop1->IsIn(*loop2)) {
+ return loop2;
+ }
+
+ HLoopInformation* current = loop1;
+ while (current != nullptr && !loop2->IsIn(*current)) {
+ current = current->GetPreHeader()->GetLoopInformation();
+ }
+
+ return current;
+}
+
+bool PeelUnrollHelper::IsLoopClonable(HLoopInformation* loop_info) {
+ PeelUnrollHelper helper(loop_info, nullptr, nullptr);
+ return helper.IsLoopClonable();
+}
+
+HBasicBlock* PeelUnrollHelper::DoPeelUnrollImpl(bool to_unroll) {
+ // For now do peeling/unrolling only for natural loops.
+ DCHECK(!loop_info_->IsIrreducible());
+
+ HBasicBlock* loop_header = loop_info_->GetHeader();
+ HGraph* graph = loop_header->GetGraph();
+ ArenaAllocator allocator(graph->GetAllocator()->GetArenaPool());
+
+ HEdgeSet remap_orig_internal(graph->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+ HEdgeSet remap_copy_internal(graph->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+ HEdgeSet remap_incoming(graph->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+
+ CollectRemappingInfoForPeelUnroll(to_unroll,
+ loop_info_,
+ &remap_orig_internal,
+ &remap_copy_internal,
+ &remap_incoming);
+
+ cloner_.SetSuccessorRemappingInfo(&remap_orig_internal, &remap_copy_internal, &remap_incoming);
+ cloner_.Run();
+ cloner_.CleanUp();
+
+ return kPeelUnrollPreserveHeader ? loop_header : cloner_.GetBlockCopy(loop_header);
+}
+
+PeelUnrollSimpleHelper::PeelUnrollSimpleHelper(HLoopInformation* info)
+ : bb_map_(std::less<HBasicBlock*>(),
+ info->GetHeader()->GetGraph()->GetAllocator()->Adapter(kArenaAllocSuperblockCloner)),
+ hir_map_(std::less<HInstruction*>(),
+ info->GetHeader()->GetGraph()->GetAllocator()->Adapter(kArenaAllocSuperblockCloner)),
+ helper_(info, &bb_map_, &hir_map_) {}
+
} // namespace art
+
+namespace std {
+
+ostream& operator<<(ostream& os, const art::HEdge& e) {
+ e.Dump(os);
+ return os;
+}
+
+} // namespace std
diff --git a/compiler/optimizing/superblock_cloner.h b/compiler/optimizing/superblock_cloner.h index 23de692673..19c9dd471c 100644 --- a/compiler/optimizing/superblock_cloner.h +++ b/compiler/optimizing/superblock_cloner.h
@@ -152,6 +152,15 @@ class SuperblockCloner : public ValueObject {
// TODO: Start from small range of graph patterns then extend it.
bool IsSubgraphClonable() const;
+ // Returns whether the selected subgraph satisfies the criteria for fast data flow resolution,
+ // i.e. when the iterative DF algorithm is not required and dominators/instruction inputs can be
+ // trivially adjusted.
+ //
+ // TODO: formally describe the criteria.
+ //
+ // Loop peeling and unrolling satisfy the criteria.
+ bool IsFastCase() const;
+
// Runs the copy algorithm according to the description.
void Run();
@@ -202,11 +211,17 @@ class SuperblockCloner : public ValueObject {
return IsInOrigBBSet(block->GetBlockId());
}
+ // Returns the area (the outermost loop) in the graph for which control flow (back edges, loops,
+ // dominators) needs to be adjusted.
+ HLoopInformation* GetRegionToBeAdjusted() const {
+ return outer_loop_;
+ }
+
private:
// Fills the 'exits' vector with the subgraph exits.
void SearchForSubgraphExits(ArenaVector<HBasicBlock*>* exits);
- // Finds and records information about the area in the graph for which control-flow (back edges,
+ // Finds and records information about the area in the graph for which control flow (back edges,
// loops, dominators) needs to be adjusted.
void FindAndSetLocalAreaForAdjustments();
@@ -217,7 +232,7 @@ class SuperblockCloner : public ValueObject {
// phis' nor instructions' inputs values are resolved.
void RemapEdgesSuccessors();
- // Adjusts control-flow (back edges, loops, dominators) for the local area defined by
+ // Adjusts control flow (back edges, loops, dominators) for the local area defined by
// FindAndSetLocalAreaForAdjustments.
void AdjustControlFlowInfo();
@@ -272,6 +287,9 @@ class SuperblockCloner : public ValueObject { // Debug and logging methods.
//
void CheckInstructionInputsRemapping(HInstruction* orig_instr);
+ bool CheckRemappingInfoIsValid();
+ void VerifyGraph();
+ void DumpInputSets();
HBasicBlock* GetBlockById(uint32_t block_id) const {
DCHECK(block_id < graph_->GetBlocks().size());
@@ -295,15 +313,94 @@
HBasicBlockMap* bb_map_;
// Correspondence map for instructions: (original HInstruction, copy HInstruction).
HInstructionMap* hir_map_;
- // Area in the graph for which control-flow (back edges, loops, dominators) needs to be adjusted.
+ // Area in the graph for which control flow (back edges, loops, dominators) needs to be adjusted.
HLoopInformation* outer_loop_;
HBasicBlockSet outer_loop_bb_set_;
ART_FRIEND_TEST(SuperblockClonerTest, AdjustControlFlowInfo);
+ ART_FRIEND_TEST(SuperblockClonerTest, IsGraphConnected);
DISALLOW_COPY_AND_ASSIGN(SuperblockCloner);
};
+// Helper class to perform loop peeling/unrolling.
+//
+// This helper should be used when a correspondence map between the original and copied
+// basic blocks/instructions is required.
+class PeelUnrollHelper : public ValueObject {
+ public:
+ explicit PeelUnrollHelper(HLoopInformation* info,
+ SuperblockCloner::HBasicBlockMap* bb_map,
+ SuperblockCloner::HInstructionMap* hir_map) :
+ loop_info_(info),
+ cloner_(info->GetHeader()->GetGraph(), &info->GetBlocks(), bb_map, hir_map) {
+ // For now do peeling/unrolling only for natural loops.
+ DCHECK(!info->IsIrreducible());
+ }
+
+ // Returns whether the loop can be peeled/unrolled (static function).
+ static bool IsLoopClonable(HLoopInformation* loop_info);
+
+ // Returns whether the loop can be peeled/unrolled.
+ bool IsLoopClonable() const { return cloner_.IsSubgraphClonable(); }
+
+ HBasicBlock* DoPeeling() { return DoPeelUnrollImpl(/* to_unroll */ false); }
+ HBasicBlock* DoUnrolling() { return DoPeelUnrollImpl(/* to_unroll */ true); }
+ HLoopInformation* GetRegionToBeAdjusted() const { return cloner_.GetRegionToBeAdjusted(); }
+
+ protected:
+ // Applies loop peeling/unrolling for the loop specified by 'loop_info'.
+ //
+ // Depending on 'to_unroll', either unrolls the loop by a factor of two or peels one iteration
+ // from it.
+ HBasicBlock* DoPeelUnrollImpl(bool to_unroll);
+
+ private:
+ HLoopInformation* loop_info_;
+ SuperblockCloner cloner_;
+
+ DISALLOW_COPY_AND_ASSIGN(PeelUnrollHelper);
+};
+
+// Helper class to perform loop peeling/unrolling.
+//
+// This helper should be used when there is no need to get correspondence information between
+// original and copied basic blocks/instructions.
+class PeelUnrollSimpleHelper : public ValueObject {
+ public:
+ explicit PeelUnrollSimpleHelper(HLoopInformation* info);
+ bool IsLoopClonable() const { return helper_.IsLoopClonable(); }
+ HBasicBlock* DoPeeling() { return helper_.DoPeeling(); }
+ HBasicBlock* DoUnrolling() { return helper_.DoUnrolling(); }
+ HLoopInformation* GetRegionToBeAdjusted() const { return helper_.GetRegionToBeAdjusted(); }
+
+ private:
+ SuperblockCloner::HBasicBlockMap bb_map_;
+ SuperblockCloner::HInstructionMap hir_map_;
+ PeelUnrollHelper helper_;
+
+ DISALLOW_COPY_AND_ASSIGN(PeelUnrollSimpleHelper);
+};
+
+// Collects edge remapping info for loop peeling/unrolling for the loop specified by loop info.
+void CollectRemappingInfoForPeelUnroll(bool to_unroll,
+                                       HLoopInformation* loop_info,
+                                       SuperblockCloner::HEdgeSet* remap_orig_internal,
+                                       SuperblockCloner::HEdgeSet* remap_copy_internal,
+                                       SuperblockCloner::HEdgeSet* remap_incoming);
+
+// Returns whether all blocks from 'work_set' are reachable from the rest of the graph.
+//
+// More formally, returns whether there exists a set 'outer_entries' of basic blocks such that:
+// - no block from 'outer_entries' is in 'work_set'.
+// - each block from 'work_set' is reachable from at least one block in 'outer_entries'.
+//
+// After the function returns, 'work_set' contains only those blocks from the original 'work_set'
+// which are unreachable from the rest of the graph.
+bool IsSubgraphConnected(SuperblockCloner::HBasicBlockSet* work_set, HGraph* graph);
+
+// Returns the closest common ancestor of loop1 and loop2 in the loop tree, or nullptr if it is
+// the whole graph.
+HLoopInformation* FindCommonLoop(HLoopInformation* loop1, HLoopInformation* loop2);
 }  // namespace art
 
 namespace std {
 
@@ -312,11 +409,12 @@ template <> struct hash<art::HEdge> {
   size_t operator()(art::HEdge const& x) const noexcept {
     // Use Cantor pairing function as the hash function.
-    uint32_t a = x.GetFrom();
-    uint32_t b = x.GetTo();
+    size_t a = x.GetFrom();
+    size_t b = x.GetTo();
     return (a + b) * (a + b + 1) / 2 + b;
   }
 };
+ostream& operator<<(ostream& os, const art::HEdge& e);
 
 }  // namespace std
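The hash above is the Cantor pairing function: it maps each pair of non-negative integers to a distinct integer (modulo size_t wrap-around), and it is order-sensitive, so the edges (from, to) and (to, from) hash differently. A standalone check of the formula, independent of ART:

#include <cassert>
#include <cstddef>

// Cantor pairing, as used by std::hash<art::HEdge> above.
size_t CantorPair(size_t a, size_t b) {
  return (a + b) * (a + b + 1) / 2 + b;
}

int main() {
  assert(CantorPair(2, 3) == 18);  // (2 + 3) * 6 / 2 + 3.
  assert(CantorPair(3, 2) == 17);  // Order matters: edge (3, 2) != edge (2, 3).
  return 0;
}

diff --git a/compiler/optimizing/superblock_cloner_test.cc b/compiler/optimizing/superblock_cloner_test.cc
index f1b7bffdf5..df2e517aff 100644
--- a/compiler/optimizing/superblock_cloner_test.cc
+++ b/compiler/optimizing/superblock_cloner_test.cc
@@ -25,52 +25,65 @@ namespace art {
 using HBasicBlockMap = SuperblockCloner::HBasicBlockMap;
 using HInstructionMap = SuperblockCloner::HInstructionMap;
+using HBasicBlockSet = SuperblockCloner::HBasicBlockSet;
+using HEdgeSet = SuperblockCloner::HEdgeSet;
 
 // This class provides methods and helpers for testing various cloning and copying routines:
 // individual instruction cloning and cloning of the more coarse-grain structures.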
class SuperblockClonerTest : public OptimizingUnitTest { public: - SuperblockClonerTest() - : graph_(CreateGraph()), entry_block_(nullptr), exit_block_(nullptr), parameter_(nullptr) {} + SuperblockClonerTest() : graph_(CreateGraph()), + entry_block_(nullptr), + return_block_(nullptr), + exit_block_(nullptr), + parameter_(nullptr) {} - void CreateBasicLoopControlFlow(/* out */ HBasicBlock** header_p, - /* out */ HBasicBlock** body_p) { + void InitGraph() { entry_block_ = new (GetAllocator()) HBasicBlock(graph_); graph_->AddBlock(entry_block_); graph_->SetEntryBlock(entry_block_); + return_block_ = new (GetAllocator()) HBasicBlock(graph_); + graph_->AddBlock(return_block_); + + exit_block_ = new (GetAllocator()) HBasicBlock(graph_); + graph_->AddBlock(exit_block_); + graph_->SetExitBlock(exit_block_); + + entry_block_->AddSuccessor(return_block_); + return_block_->AddSuccessor(exit_block_); + + parameter_ = new (GetAllocator()) HParameterValue(graph_->GetDexFile(), + dex::TypeIndex(0), + 0, + DataType::Type::kInt32); + entry_block_->AddInstruction(parameter_); + return_block_->AddInstruction(new (GetAllocator()) HReturnVoid()); + exit_block_->AddInstruction(new (GetAllocator()) HExit()); + } + + void CreateBasicLoopControlFlow(HBasicBlock* position, + HBasicBlock* successor, + /* out */ HBasicBlock** header_p, + /* out */ HBasicBlock** body_p) { HBasicBlock* loop_preheader = new (GetAllocator()) HBasicBlock(graph_); HBasicBlock* loop_header = new (GetAllocator()) HBasicBlock(graph_); HBasicBlock* loop_body = new (GetAllocator()) HBasicBlock(graph_); - HBasicBlock* loop_exit = new (GetAllocator()) HBasicBlock(graph_); graph_->AddBlock(loop_preheader); graph_->AddBlock(loop_header); graph_->AddBlock(loop_body); - graph_->AddBlock(loop_exit); - exit_block_ = new (GetAllocator()) HBasicBlock(graph_); - graph_->AddBlock(exit_block_); - graph_->SetExitBlock(exit_block_); + position->ReplaceSuccessor(successor, loop_preheader); - entry_block_->AddSuccessor(loop_preheader); loop_preheader->AddSuccessor(loop_header); // Loop exit first to have a proper exit condition/target for HIf. - loop_header->AddSuccessor(loop_exit); + loop_header->AddSuccessor(successor); loop_header->AddSuccessor(loop_body); loop_body->AddSuccessor(loop_header); - loop_exit->AddSuccessor(exit_block_); *header_p = loop_header; *body_p = loop_body; - - parameter_ = new (GetAllocator()) HParameterValue(graph_->GetDexFile(), - dex::TypeIndex(0), - 0, - DataType::Type::kInt32); - entry_block_->AddInstruction(parameter_); - loop_exit->AddInstruction(new (GetAllocator()) HReturnVoid()); - exit_block_->AddInstruction(new (GetAllocator()) HExit()); } void CreateBasicLoopDataFlow(HBasicBlock* loop_header, HBasicBlock* loop_body) { @@ -84,11 +97,12 @@ class SuperblockClonerTest : public OptimizingUnitTest { // Header block. HPhi* phi = new (GetAllocator()) HPhi(GetAllocator(), 0, 0, DataType::Type::kInt32); HInstruction* suspend_check = new (GetAllocator()) HSuspendCheck(); + HInstruction* loop_check = new (GetAllocator()) HGreaterThanOrEqual(phi, const_128); loop_header->AddPhi(phi); loop_header->AddInstruction(suspend_check); - loop_header->AddInstruction(new (GetAllocator()) HGreaterThanOrEqual(phi, const_128)); - loop_header->AddInstruction(new (GetAllocator()) HIf(parameter_)); + loop_header->AddInstruction(loop_check); + loop_header->AddInstruction(new (GetAllocator()) HIf(loop_check)); // Loop body block. 
     HInstruction* null_check = new (GetAllocator()) HNullCheck(parameter_, dex_pc);
@@ -97,8 +111,8 @@ class SuperblockClonerTest : public OptimizingUnitTest {
     HInstruction* array_get =
         new (GetAllocator()) HArrayGet(null_check, bounds_check, DataType::Type::kInt32, dex_pc);
     HInstruction* add = new (GetAllocator()) HAdd(DataType::Type::kInt32, array_get, const_1);
-    HInstruction* array_set =
-        new (GetAllocator()) HArraySet(null_check, bounds_check, add, DataType::Type::kInt32, dex_pc);
+    HInstruction* array_set = new (GetAllocator()) HArraySet(
+        null_check, bounds_check, add, DataType::Type::kInt32, dex_pc);
     HInstruction* induction_inc = new (GetAllocator()) HAdd(DataType::Type::kInt32, phi, const_1);
 
     loop_body->AddInstruction(null_check);
@@ -153,6 +167,7 @@ class SuperblockClonerTest : public OptimizingUnitTest {
 
   HGraph* graph_;
   HBasicBlock* entry_block_;
+  HBasicBlock* return_block_;
   HBasicBlock* exit_block_;
   HInstruction* parameter_;
 
@@ -162,10 +177,11 @@ TEST_F(SuperblockClonerTest, IndividualInstrCloner) {
   HBasicBlock* header = nullptr;
   HBasicBlock* loop_body = nullptr;
 
-  CreateBasicLoopControlFlow(&header, &loop_body);
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
   CreateBasicLoopDataFlow(header, loop_body);
   graph_->BuildDominatorTree();
-  ASSERT_TRUE(CheckGraph());
+  EXPECT_TRUE(CheckGraph());
 
   HSuspendCheck* old_suspend_check = header->GetLoopInformation()->GetSuspendCheck();
   CloneAndReplaceInstructionVisitor visitor(graph_);
@@ -193,7 +209,8 @@ TEST_F(SuperblockClonerTest, CloneBasicBlocks) {
   HBasicBlock* loop_body = nullptr;
   ArenaAllocator* arena = graph_->GetAllocator();
 
-  CreateBasicLoopControlFlow(&header, &loop_body);
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
   CreateBasicLoopDataFlow(header, loop_body);
   graph_->BuildDominatorTree();
   ASSERT_TRUE(CheckGraph());
@@ -272,7 +289,8 @@ TEST_F(SuperblockClonerTest, AdjustControlFlowInfo) {
   HBasicBlock* loop_body = nullptr;
   ArenaAllocator* arena = graph_->GetAllocator();
 
-  CreateBasicLoopControlFlow(&header, &loop_body);
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
   CreateBasicLoopDataFlow(header, loop_body);
   graph_->BuildDominatorTree();
   ASSERT_TRUE(CheckGraph());
@@ -303,4 +321,487 @@
   EXPECT_TRUE(loop_info->IsBackEdge(*loop_body));
 }
 
+// Tests the IsSubgraphConnected function for the negative case.
+TEST_F(SuperblockClonerTest, IsGraphConnected) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+  ArenaAllocator* arena = graph_->GetAllocator();
+
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* unreachable_block = new (GetAllocator()) HBasicBlock(graph_);
+  graph_->AddBlock(unreachable_block);
+
+  HBasicBlockSet bb_set(
+      arena, graph_->GetBlocks().size(), false, kArenaAllocSuperblockCloner);
+  bb_set.SetBit(header->GetBlockId());
+  bb_set.SetBit(loop_body->GetBlockId());
+  bb_set.SetBit(unreachable_block->GetBlockId());
+
+  EXPECT_FALSE(IsSubgraphConnected(&bb_set, graph_));
+  EXPECT_EQ(bb_set.NumSetBits(), 1u);
+  EXPECT_TRUE(bb_set.IsBitSet(unreachable_block->GetBlockId()));
+}
+
+// Tests SuperblockCloner for the loop peeling case.
+//
+// Control Flow of the example (ignoring critical edge splitting).
+//
+//                Before                      After
+//
+//                  |B|                        |B|
+//                   |                          |
+//                   v                          v
+//                  |1|                        |1|
+//                   |                          |
+//                   v                          v
+//                  |2|<-\                (6) |2A|
+//                 /  \  /                   /  \
+//                v    v/                   /    v
+//              |4|   |3|                  /    |3A| (7)
+//               |                        /      /
+//               v                       |      v
+//              |E|                       \    |2|<-\
+//                                         \   / \  /
+//                                          v v   v/
+//                                         |4|    |3|
+//                                          |
+//                                          v
+//                                         |E|
+TEST_F(SuperblockClonerTest, LoopPeeling) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  graph_->BuildDominatorTree();
+  EXPECT_TRUE(CheckGraph());
+
+  HBasicBlockMap bb_map(
+      std::less<HBasicBlock*>(), graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+  HInstructionMap hir_map(
+      std::less<HInstruction*>(), graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+
+  HLoopInformation* loop_info = header->GetLoopInformation();
+  PeelUnrollHelper helper(loop_info, &bb_map, &hir_map);
+  EXPECT_TRUE(helper.IsLoopClonable());
+  HBasicBlock* new_header = helper.DoPeeling();
+  HLoopInformation* new_loop_info = new_header->GetLoopInformation();
+
+  EXPECT_TRUE(CheckGraph());
+
+  // Check loop body successors.
+  EXPECT_EQ(loop_body->GetSingleSuccessor(), header);
+  EXPECT_EQ(bb_map.Get(loop_body)->GetSingleSuccessor(), header);
+
+  // Check loop structure.
+  EXPECT_EQ(header, new_header);
+  EXPECT_EQ(new_loop_info->GetHeader(), header);
+  EXPECT_EQ(new_loop_info->GetBackEdges().size(), 1u);
+  EXPECT_EQ(new_loop_info->GetBackEdges()[0], loop_body);
+}
+
+// Tests SuperblockCloner for the loop unrolling case.
+//
+// Control Flow of the example (ignoring critical edge splitting).
+//
+//                Before                      After
+//
+//                  |B|                        |B|
+//                   |                          |
+//                   v                          v
+//                  |1|                        |1|
+//                   |                          |
+//                   v                          v
+//                  |2|<-\                (6) |2A|<-\
+//                 /  \  /                   /  \   \
+//                v    v/                   /    v   \
+//              |4|   |3|                  / (7)|3A|  |
+//               |                        /      /    |
+//               v                       |      v     |
+//              |E|                       \    |2|     \
+//                                         \   / \     /
+//                                          v v   v   /
+//                                         |4|    |3|
+//                                          |
+//                                          v
+//                                         |E|
+TEST_F(SuperblockClonerTest, LoopUnrolling) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  graph_->BuildDominatorTree();
+  EXPECT_TRUE(CheckGraph());
+
+  HBasicBlockMap bb_map(
+      std::less<HBasicBlock*>(), graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+  HInstructionMap hir_map(
+      std::less<HInstruction*>(), graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+
+  HLoopInformation* loop_info = header->GetLoopInformation();
+  PeelUnrollHelper helper(loop_info, &bb_map, &hir_map);
+  EXPECT_TRUE(helper.IsLoopClonable());
+  HBasicBlock* new_header = helper.DoUnrolling();
+
+  EXPECT_TRUE(CheckGraph());
+
+  // Check loop body successors.
+  EXPECT_EQ(loop_body->GetSingleSuccessor(), bb_map.Get(header));
+  EXPECT_EQ(bb_map.Get(loop_body)->GetSingleSuccessor(), header);
+
+  // Check loop structure.
+  EXPECT_EQ(header, new_header);
+  EXPECT_EQ(loop_info, new_header->GetLoopInformation());
+  EXPECT_EQ(loop_info->GetHeader(), new_header);
+  EXPECT_EQ(loop_info->GetBackEdges().size(), 1u);
+  EXPECT_EQ(loop_info->GetBackEdges()[0], bb_map.Get(loop_body));
+}
+
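Taken together, the two tests pin down which edges each transformation remaps. A compact, self-contained sketch of that difference — plain block ids and std::set stand in for ART's HBasicBlock* and HEdgeSet, so this is illustrative only, not the patch's actual CollectRemappingInfoForPeelUnroll:

#include <cstdint>
#include <set>
#include <utility>

using BlockId = uint32_t;
using Edge = std::pair<BlockId, BlockId>;  // (from, to).
using EdgeSet = std::set<Edge>;

// For a loop with preheader P, header H and a single back-edge block B, this
// mirrors the edge remapping the two tests above observe.
void CollectRemapSketch(bool to_unroll, BlockId p, BlockId h, BlockId b,
                        EdgeSet* orig_internal, EdgeSet* copy_internal,
                        EdgeSet* incoming) {
  if (to_unroll) {
    // Unrolling: B jumps to copy(H), and copy(B) jumps back to H, so the
    // copy becomes the second half of the doubled loop body.
    orig_internal->insert({b, h});
    copy_internal->insert({b, h});
  } else {
    // Peeling: the copy runs once before the loop, so P enters copy(H) and
    // copy(B) falls through into the original header H.
    copy_internal->insert({b, h});
    incoming->insert({p, h});
  }
}

+// Checks that loop peeling works fine for a loop with multiple back edges. Tests that after
+// the transformation the loop has a single preheader.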
+TEST_F(SuperblockClonerTest, LoopPeelingMultipleBackEdges) { + HBasicBlock* header = nullptr; + HBasicBlock* loop_body = nullptr; + + InitGraph(); + CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body); + CreateBasicLoopDataFlow(header, loop_body); + + // Transform a basic loop to have multiple back edges. + HBasicBlock* latch = header->GetSuccessors()[1]; + HBasicBlock* if_block = new (GetAllocator()) HBasicBlock(graph_); + HBasicBlock* temp1 = new (GetAllocator()) HBasicBlock(graph_); + graph_->AddBlock(if_block); + graph_->AddBlock(temp1); + header->ReplaceSuccessor(latch, if_block); + if_block->AddSuccessor(latch); + if_block->AddSuccessor(temp1); + temp1->AddSuccessor(header); + + if_block->AddInstruction(new (GetAllocator()) HIf(parameter_)); + + HInstructionIterator it(header->GetPhis()); + DCHECK(!it.Done()); + HPhi* loop_phi = it.Current()->AsPhi(); + HInstruction* temp_add = new (GetAllocator()) HAdd(DataType::Type::kInt32, + loop_phi, + graph_->GetIntConstant(2)); + temp1->AddInstruction(temp_add); + temp1->AddInstruction(new (GetAllocator()) HGoto()); + loop_phi->AddInput(temp_add); + + graph_->BuildDominatorTree(); + EXPECT_TRUE(CheckGraph()); + + HLoopInformation* loop_info = header->GetLoopInformation(); + PeelUnrollSimpleHelper helper(loop_info); + HBasicBlock* new_header = helper.DoPeeling(); + EXPECT_EQ(header, new_header); + + EXPECT_TRUE(CheckGraph()); + EXPECT_EQ(header->GetPredecessors().size(), 3u); +} + +static void CheckLoopStructureForLoopPeelingNested(HBasicBlock* loop1_header, + HBasicBlock* loop2_header, + HBasicBlock* loop3_header) { + EXPECT_EQ(loop1_header->GetLoopInformation()->GetHeader(), loop1_header); + EXPECT_EQ(loop2_header->GetLoopInformation()->GetHeader(), loop2_header); + EXPECT_EQ(loop3_header->GetLoopInformation()->GetHeader(), loop3_header); + EXPECT_EQ(loop1_header->GetLoopInformation()->GetPreHeader()->GetLoopInformation(), nullptr); + EXPECT_EQ(loop2_header->GetLoopInformation()->GetPreHeader()->GetLoopInformation(), nullptr); + EXPECT_EQ(loop3_header->GetLoopInformation()->GetPreHeader()->GetLoopInformation()->GetHeader(), + loop2_header); +} + +TEST_F(SuperblockClonerTest, LoopPeelingNested) { + HBasicBlock* header = nullptr; + HBasicBlock* loop_body = nullptr; + + InitGraph(); + + // Create the following nested structure of loops + // Headers: 1 2 3 + // [ ], [ [ ] ] + CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body); + CreateBasicLoopDataFlow(header, loop_body); + HBasicBlock* loop1_header = header; + + CreateBasicLoopControlFlow(header, return_block_, &header, &loop_body); + CreateBasicLoopDataFlow(header, loop_body); + HBasicBlock* loop2_header = header; + + CreateBasicLoopControlFlow(header, header->GetSuccessors()[1], &header, &loop_body); + CreateBasicLoopDataFlow(header, loop_body); + HBasicBlock* loop3_header = header; + + graph_->BuildDominatorTree(); + EXPECT_TRUE(CheckGraph()); + + HLoopInformation* loop2_info_before = loop2_header->GetLoopInformation(); + HLoopInformation* loop3_info_before = loop3_header->GetLoopInformation(); + + // Check nested loops structure. + CheckLoopStructureForLoopPeelingNested(loop1_header, loop2_header, loop3_header); + PeelUnrollSimpleHelper helper(loop1_header->GetLoopInformation()); + helper.DoPeeling(); + // Check that nested loops structure has not changed after the transformation. + CheckLoopStructureForLoopPeelingNested(loop1_header, loop2_header, loop3_header); + + // Test that the loop info is preserved. 
+  EXPECT_EQ(loop2_info_before, loop2_header->GetLoopInformation());
+  EXPECT_EQ(loop3_info_before, loop3_header->GetLoopInformation());
+
+  EXPECT_EQ(loop3_info_before->GetPreHeader()->GetLoopInformation(), loop2_info_before);
+  EXPECT_EQ(loop2_info_before->GetPreHeader()->GetLoopInformation(), nullptr);
+
+  EXPECT_EQ(helper.GetRegionToBeAdjusted(), nullptr);
+
+  EXPECT_TRUE(CheckGraph());
+}
+
+// Checks that the loop population is correctly propagated after an inner loop is peeled.
+TEST_F(SuperblockClonerTest, OuterLoopPopulationAfterInnerPeeled) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+
+  InitGraph();
+
+  // Create the following nested structure of loops
+  // Headers:  1 2 3        4
+  //           [ [ [ ] ] ], [ ]
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop1_header = header;
+
+  CreateBasicLoopControlFlow(header, header->GetSuccessors()[1], &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop2_header = header;
+
+  CreateBasicLoopControlFlow(header, header->GetSuccessors()[1], &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop3_header = header;
+
+  CreateBasicLoopControlFlow(loop1_header, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop4_header = header;
+
+  graph_->BuildDominatorTree();
+  EXPECT_TRUE(CheckGraph());
+
+  PeelUnrollSimpleHelper helper(loop3_header->GetLoopInformation());
+  helper.DoPeeling();
+  HLoopInformation* loop1 = loop1_header->GetLoopInformation();
+  HLoopInformation* loop2 = loop2_header->GetLoopInformation();
+  HLoopInformation* loop3 = loop3_header->GetLoopInformation();
+  HLoopInformation* loop4 = loop4_header->GetLoopInformation();
+
+  EXPECT_TRUE(loop1->Contains(*loop2_header));
+  EXPECT_TRUE(loop1->Contains(*loop3_header));
+  EXPECT_TRUE(loop1->Contains(*loop3_header->GetLoopInformation()->GetPreHeader()));
+
+  // Check that the loop4 info has not been touched after the local run of AnalyzeLoops.
+  EXPECT_EQ(loop4, loop4_header->GetLoopInformation());
+
+  EXPECT_TRUE(loop1->IsIn(*loop1));
+  EXPECT_TRUE(loop2->IsIn(*loop1));
+  EXPECT_TRUE(loop3->IsIn(*loop1));
+  EXPECT_TRUE(loop3->IsIn(*loop2));
+  EXPECT_TRUE(!loop4->IsIn(*loop1));
+
+  EXPECT_EQ(loop4->GetPreHeader()->GetLoopInformation(), nullptr);
+
+  EXPECT_EQ(helper.GetRegionToBeAdjusted(), loop2);
+
+  EXPECT_TRUE(CheckGraph());
+}
+
+// Checks the case when an inner loop has an exit not to its immediate outer loop but to some
+// other loop in the hierarchy. Loop population information must be valid after loop peeling.
+TEST_F(SuperblockClonerTest, NestedCaseExitToOutermost) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+
+  InitGraph();
+
+  // Create the following nested structure of loops then peel loop3.
+  // Headers:  1 2 3
+  //           [ [ [ ] ] ]
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop1_header = header;
+  HBasicBlock* loop_body1 = loop_body;
+
+  CreateBasicLoopControlFlow(header, header->GetSuccessors()[1], &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+
+  CreateBasicLoopControlFlow(header, header->GetSuccessors()[1], &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop3_header = header;
+  HBasicBlock* loop_body3 = loop_body;
+
+  // Change loop3: insert an exit which leads to loop1.
+  HBasicBlock* loop3_extra_if_block = new (GetAllocator()) HBasicBlock(graph_);
+  graph_->AddBlock(loop3_extra_if_block);
+  loop3_extra_if_block->AddInstruction(new (GetAllocator()) HIf(parameter_));
+
+  loop3_header->ReplaceSuccessor(loop_body3, loop3_extra_if_block);
+  loop3_extra_if_block->AddSuccessor(loop_body1);  // Long exit.
+  loop3_extra_if_block->AddSuccessor(loop_body3);
+
+  graph_->BuildDominatorTree();
+  EXPECT_TRUE(CheckGraph());
+
+  HBasicBlock* loop3_long_exit = loop3_extra_if_block->GetSuccessors()[0];
+  EXPECT_TRUE(loop1_header->GetLoopInformation()->Contains(*loop3_long_exit));
+
+  PeelUnrollSimpleHelper helper(loop3_header->GetLoopInformation());
+  helper.DoPeeling();
+
+  HLoopInformation* loop1 = loop1_header->GetLoopInformation();
+  // Check that after the transformation the local area for CF adjustments has been chosen
+  // correctly and the loop population has been updated.
+  loop3_long_exit = loop3_extra_if_block->GetSuccessors()[0];
+  EXPECT_TRUE(loop1->Contains(*loop3_long_exit));
+
+  EXPECT_EQ(helper.GetRegionToBeAdjusted(), loop1);
+
+  EXPECT_TRUE(loop1->Contains(*loop3_header));
+  EXPECT_TRUE(loop1->Contains(*loop3_header->GetLoopInformation()->GetPreHeader()));
+
+  EXPECT_TRUE(CheckGraph());
+}
+
+TEST_F(SuperblockClonerTest, FastCaseCheck) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+  ArenaAllocator* arena = graph_->GetAllocator();
+
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  graph_->BuildDominatorTree();
+
+  HLoopInformation* loop_info = header->GetLoopInformation();
+
+  ArenaBitVector orig_bb_set(
+      arena, graph_->GetBlocks().size(), false, kArenaAllocSuperblockCloner);
+  orig_bb_set.Union(&loop_info->GetBlocks());
+
+  HEdgeSet remap_orig_internal(graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+  HEdgeSet remap_copy_internal(graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+  HEdgeSet remap_incoming(graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+
+  CollectRemappingInfoForPeelUnroll(/* to_unroll */ true,
+                                    loop_info,
+                                    &remap_orig_internal,
+                                    &remap_copy_internal,
+                                    &remap_incoming);
+
+  // Insert some extra nodes and edges.
+  HBasicBlock* preheader = loop_info->GetPreHeader();
+  orig_bb_set.SetBit(preheader->GetBlockId());
+
+  // Adjust incoming edges.
+  remap_incoming.Clear();
+  remap_incoming.Insert(HEdge(preheader->GetSinglePredecessor(), preheader));
+
+  HBasicBlockMap bb_map(std::less<HBasicBlock*>(), arena->Adapter(kArenaAllocSuperblockCloner));
+  HInstructionMap hir_map(std::less<HInstruction*>(), arena->Adapter(kArenaAllocSuperblockCloner));
+
+  SuperblockCloner cloner(graph_,
+                          &orig_bb_set,
+                          &bb_map,
+                          &hir_map);
+  cloner.SetSuccessorRemappingInfo(&remap_orig_internal, &remap_copy_internal, &remap_incoming);
+
+  EXPECT_FALSE(cloner.IsFastCase());
+}
+
+// Helper for FindCommonLoop which also checks that FindCommonLoop is symmetric.
+static HLoopInformation* FindCommonLoopCheck(HLoopInformation* loop1, HLoopInformation* loop2) {
+  HLoopInformation* common_loop12 = FindCommonLoop(loop1, loop2);
+  HLoopInformation* common_loop21 = FindCommonLoop(loop2, loop1);
+  EXPECT_EQ(common_loop21, common_loop12);
+  return common_loop12;
+}
+
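The checks in the test below are consistent with a standard closest-common-ancestor walk over the loop tree. A minimal self-contained sketch of such a walk, with a hypothetical Loop node standing in for HLoopInformation — an assumption about the shape of the algorithm, not ART's actual implementation:

#include <cstddef>

// Hypothetical stand-in: each loop knows its parent in the loop tree
// (nullptr for a top-level loop).
struct Loop {
  Loop* parent;
};

static size_t Depth(const Loop* loop) {
  size_t depth = 0;
  for (; loop != nullptr; loop = loop->parent) {
    ++depth;
  }
  return depth;
}

// Equalize depths, then climb both paths until they meet. A nullptr result
// means the only common "ancestor" is the whole graph.
Loop* FindCommonLoopSketch(Loop* loop1, Loop* loop2) {
  size_t d1 = Depth(loop1);
  size_t d2 = Depth(loop2);
  while (d1 > d2) { loop1 = loop1->parent; --d1; }
  while (d2 > d1) { loop2 = loop2->parent; --d2; }
  while (loop1 != loop2) {
    loop1 = loop1->parent;
    loop2 = loop2->parent;
  }
  return loop1;
}

+// Tests the FindCommonLoop function on a loop nest.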
+TEST_F(SuperblockClonerTest, FindCommonLoop) { + HBasicBlock* header = nullptr; + HBasicBlock* loop_body = nullptr; + + InitGraph(); + + // Create the following nested structure of loops + // Headers: 1 2 3 4 5 + // [ [ [ ] ], [ ] ], [ ] + CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body); + CreateBasicLoopDataFlow(header, loop_body); + HBasicBlock* loop1_header = header; + + CreateBasicLoopControlFlow(header, header->GetSuccessors()[1], &header, &loop_body); + CreateBasicLoopDataFlow(header, loop_body); + HBasicBlock* loop2_header = header; + + CreateBasicLoopControlFlow(header, header->GetSuccessors()[1], &header, &loop_body); + CreateBasicLoopDataFlow(header, loop_body); + HBasicBlock* loop3_header = header; + + CreateBasicLoopControlFlow(loop2_header, loop2_header->GetSuccessors()[0], &header, &loop_body); + CreateBasicLoopDataFlow(header, loop_body); + HBasicBlock* loop4_header = header; + + CreateBasicLoopControlFlow(loop1_header, return_block_, &header, &loop_body); + CreateBasicLoopDataFlow(header, loop_body); + HBasicBlock* loop5_header = header; + + graph_->BuildDominatorTree(); + EXPECT_TRUE(CheckGraph()); + + HLoopInformation* loop1 = loop1_header->GetLoopInformation(); + HLoopInformation* loop2 = loop2_header->GetLoopInformation(); + HLoopInformation* loop3 = loop3_header->GetLoopInformation(); + HLoopInformation* loop4 = loop4_header->GetLoopInformation(); + HLoopInformation* loop5 = loop5_header->GetLoopInformation(); + + EXPECT_TRUE(loop1->IsIn(*loop1)); + EXPECT_TRUE(loop2->IsIn(*loop1)); + EXPECT_TRUE(loop3->IsIn(*loop1)); + EXPECT_TRUE(loop3->IsIn(*loop2)); + EXPECT_TRUE(loop4->IsIn(*loop1)); + + EXPECT_FALSE(loop5->IsIn(*loop1)); + EXPECT_FALSE(loop4->IsIn(*loop2)); + EXPECT_FALSE(loop4->IsIn(*loop3)); + + EXPECT_EQ(loop1->GetPreHeader()->GetLoopInformation(), nullptr); + EXPECT_EQ(loop4->GetPreHeader()->GetLoopInformation(), loop1); + + EXPECT_EQ(FindCommonLoopCheck(nullptr, nullptr), nullptr); + EXPECT_EQ(FindCommonLoopCheck(loop2, nullptr), nullptr); + + EXPECT_EQ(FindCommonLoopCheck(loop1, loop1), loop1); + EXPECT_EQ(FindCommonLoopCheck(loop1, loop2), loop1); + EXPECT_EQ(FindCommonLoopCheck(loop1, loop3), loop1); + EXPECT_EQ(FindCommonLoopCheck(loop1, loop4), loop1); + EXPECT_EQ(FindCommonLoopCheck(loop1, loop5), nullptr); + + EXPECT_EQ(FindCommonLoopCheck(loop2, loop3), loop2); + EXPECT_EQ(FindCommonLoopCheck(loop2, loop4), loop1); + EXPECT_EQ(FindCommonLoopCheck(loop2, loop5), nullptr); + + EXPECT_EQ(FindCommonLoopCheck(loop3, loop4), loop1); + EXPECT_EQ(FindCommonLoopCheck(loop3, loop5), nullptr); + + EXPECT_EQ(FindCommonLoopCheck(loop4, loop5), nullptr); + + EXPECT_EQ(FindCommonLoopCheck(loop5, loop5), loop5); +} + } // namespace art diff --git a/compiler/utils/assembler_test_base.h b/compiler/utils/assembler_test_base.h index 1482210ac4..778a01566c 100644 --- a/compiler/utils/assembler_test_base.h +++ b/compiler/utils/assembler_test_base.h @@ -25,9 +25,9 @@ #include "android-base/strings.h" +#include "base/utils.h" #include "common_runtime_test.h" // For ScratchFile #include "exec_utils.h" -#include "utils.h" namespace art { diff --git a/compiler/utils/atomic_dex_ref_map.h b/compiler/utils/atomic_dex_ref_map.h index cabd79e9f1..fc2437999e 100644 --- a/compiler/utils/atomic_dex_ref_map.h +++ b/compiler/utils/atomic_dex_ref_map.h @@ -17,7 +17,7 @@ #ifndef ART_COMPILER_UTILS_ATOMIC_DEX_REF_MAP_H_ #define ART_COMPILER_UTILS_ATOMIC_DEX_REF_MAP_H_ -#include "atomic.h" +#include "base/atomic.h" #include "base/dchecked_vector.h" #include 
"base/safe_map.h" #include "dex/dex_file_reference.h" diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc index 2218ef9af2..b2ad490a57 100644 --- a/compiler/utils/mips/assembler_mips.cc +++ b/compiler/utils/mips/assembler_mips.cc @@ -2793,6 +2793,26 @@ void MipsAssembler::Hadd_uD(VectorRegister wd, VectorRegister ws, VectorRegister DsFsmInstr(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x15)).FprOuts(wd).FprIns(ws, wt); } +void MipsAssembler::PcntB(VectorRegister wd, VectorRegister ws) { + CHECK(HasMsa()); + DsFsmInstr(EmitMsa2R(0xc1, 0x0, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws); +} + +void MipsAssembler::PcntH(VectorRegister wd, VectorRegister ws) { + CHECK(HasMsa()); + DsFsmInstr(EmitMsa2R(0xc1, 0x1, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws); +} + +void MipsAssembler::PcntW(VectorRegister wd, VectorRegister ws) { + CHECK(HasMsa()); + DsFsmInstr(EmitMsa2R(0xc1, 0x2, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws); +} + +void MipsAssembler::PcntD(VectorRegister wd, VectorRegister ws) { + CHECK(HasMsa()); + DsFsmInstr(EmitMsa2R(0xc1, 0x3, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws); +} + void MipsAssembler::ReplicateFPToVectorRegister(VectorRegister dst, FRegister src, bool is_double) { diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h index 7de8e2e366..c6ce62b4f4 100644 --- a/compiler/utils/mips/assembler_mips.h +++ b/compiler/utils/mips/assembler_mips.h @@ -756,6 +756,11 @@ class MipsAssembler FINAL : public Assembler, public JNIMacroAssembler<PointerSi void Hadd_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt); void Hadd_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt); + void PcntB(VectorRegister wd, VectorRegister ws); + void PcntH(VectorRegister wd, VectorRegister ws); + void PcntW(VectorRegister wd, VectorRegister ws); + void PcntD(VectorRegister wd, VectorRegister ws); + // Helper for replicating floating point value in all destination elements. 
void ReplicateFPToVectorRegister(VectorRegister dst, FRegister src, bool is_double); diff --git a/compiler/utils/mips/assembler_mips32r6_test.cc b/compiler/utils/mips/assembler_mips32r6_test.cc index 937ee25bcb..691c33f3e7 100644 --- a/compiler/utils/mips/assembler_mips32r6_test.cc +++ b/compiler/utils/mips/assembler_mips32r6_test.cc @@ -2277,6 +2277,22 @@ TEST_F(AssemblerMIPS32r6Test, FillW) { DriverStr(RepeatVR(&mips::MipsAssembler::FillW, "fill.w ${reg1}, ${reg2}"), "fill.w"); } +TEST_F(AssemblerMIPS32r6Test, PcntB) { + DriverStr(RepeatVV(&mips::MipsAssembler::PcntB, "pcnt.b ${reg1}, ${reg2}"), "pcnt.b"); +} + +TEST_F(AssemblerMIPS32r6Test, PcntH) { + DriverStr(RepeatVV(&mips::MipsAssembler::PcntH, "pcnt.h ${reg1}, ${reg2}"), "pcnt.h"); +} + +TEST_F(AssemblerMIPS32r6Test, PcntW) { + DriverStr(RepeatVV(&mips::MipsAssembler::PcntW, "pcnt.w ${reg1}, ${reg2}"), "pcnt.w"); +} + +TEST_F(AssemblerMIPS32r6Test, PcntD) { + DriverStr(RepeatVV(&mips::MipsAssembler::PcntD, "pcnt.d ${reg1}, ${reg2}"), "pcnt.d"); +} + TEST_F(AssemblerMIPS32r6Test, LdiB) { DriverStr(RepeatVIb(&mips::MipsAssembler::LdiB, -8, "ldi.b ${reg}, {imm}"), "ldi.b"); } diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc index e1b0e75108..5a817fa960 100644 --- a/compiler/utils/mips64/assembler_mips64.cc +++ b/compiler/utils/mips64/assembler_mips64.cc @@ -2279,6 +2279,26 @@ void Mips64Assembler::Hadd_uD(VectorRegister wd, VectorRegister ws, VectorRegist EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x15); } +void Mips64Assembler::PcntB(VectorRegister wd, VectorRegister ws) { + CHECK(HasMsa()); + EmitMsa2R(0xc1, 0x0, ws, wd, 0x1e); +} + +void Mips64Assembler::PcntH(VectorRegister wd, VectorRegister ws) { + CHECK(HasMsa()); + EmitMsa2R(0xc1, 0x1, ws, wd, 0x1e); +} + +void Mips64Assembler::PcntW(VectorRegister wd, VectorRegister ws) { + CHECK(HasMsa()); + EmitMsa2R(0xc1, 0x2, ws, wd, 0x1e); +} + +void Mips64Assembler::PcntD(VectorRegister wd, VectorRegister ws) { + CHECK(HasMsa()); + EmitMsa2R(0xc1, 0x3, ws, wd, 0x1e); +} + void Mips64Assembler::ReplicateFPToVectorRegister(VectorRegister dst, FpuRegister src, bool is_double) { diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h index 7a61f39e64..542dbafc87 100644 --- a/compiler/utils/mips64/assembler_mips64.h +++ b/compiler/utils/mips64/assembler_mips64.h @@ -863,6 +863,11 @@ class Mips64Assembler FINAL : public Assembler, public JNIMacroAssembler<Pointer void Hadd_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt); void Hadd_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt); + void PcntB(VectorRegister wd, VectorRegister ws); + void PcntH(VectorRegister wd, VectorRegister ws); + void PcntW(VectorRegister wd, VectorRegister ws); + void PcntD(VectorRegister wd, VectorRegister ws); + // Helper for replicating floating point value in all destination elements. 
void ReplicateFPToVectorRegister(VectorRegister dst, FpuRegister src, bool is_double); diff --git a/compiler/utils/mips64/assembler_mips64_test.cc b/compiler/utils/mips64/assembler_mips64_test.cc index b0e1d91c3f..fb5f12be93 100644 --- a/compiler/utils/mips64/assembler_mips64_test.cc +++ b/compiler/utils/mips64/assembler_mips64_test.cc @@ -3529,6 +3529,22 @@ TEST_F(AssemblerMIPS64Test, FillD) { DriverStr(RepeatVR(&mips64::Mips64Assembler::FillD, "fill.d ${reg1}, ${reg2}"), "fill.d"); } +TEST_F(AssemblerMIPS64Test, PcntB) { + DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntB, "pcnt.b ${reg1}, ${reg2}"), "pcnt.b"); +} + +TEST_F(AssemblerMIPS64Test, PcntH) { + DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntH, "pcnt.h ${reg1}, ${reg2}"), "pcnt.h"); +} + +TEST_F(AssemblerMIPS64Test, PcntW) { + DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntW, "pcnt.w ${reg1}, ${reg2}"), "pcnt.w"); +} + +TEST_F(AssemblerMIPS64Test, PcntD) { + DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntD, "pcnt.d ${reg1}, ${reg2}"), "pcnt.d"); +} + TEST_F(AssemblerMIPS64Test, LdiB) { DriverStr(RepeatVIb(&mips64::Mips64Assembler::LdiB, -8, "ldi.b ${reg}, {imm}"), "ldi.b"); } diff --git a/compiler/utils/swap_space_test.cc b/compiler/utils/swap_space_test.cc index f4bca59cb3..1650080e66 100644 --- a/compiler/utils/swap_space_test.cc +++ b/compiler/utils/swap_space_test.cc @@ -24,9 +24,9 @@ #include "gtest/gtest.h" +#include "base/os.h" #include "base/unix_file/fd_file.h" #include "common_runtime_test.h" -#include "os.h" namespace art { diff --git a/compiler/utils/test_dex_file_builder_test.cc b/compiler/utils/test_dex_file_builder_test.cc index 736a17edac..788afd8e1a 100644 --- a/compiler/utils/test_dex_file_builder_test.cc +++ b/compiler/utils/test_dex_file_builder_test.cc @@ -16,9 +16,9 @@ #include "test_dex_file_builder.h" +#include "base/utils.h" #include "dex/dex_file-inl.h" #include "gtest/gtest.h" -#include "utils.h" namespace art { diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc index ea160c8993..42c2541421 100644 --- a/compiler/utils/x86/assembler_x86.cc +++ b/compiler/utils/x86/assembler_x86.cc @@ -913,6 +913,78 @@ void X86Assembler::psubq(XmmRegister dst, XmmRegister src) { } +void X86Assembler::paddusb(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitUint8(0x0F); + EmitUint8(0xDC); + EmitXmmRegisterOperand(dst, src); +} + + +void X86Assembler::paddsb(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitUint8(0x0F); + EmitUint8(0xEC); + EmitXmmRegisterOperand(dst, src); +} + + +void X86Assembler::paddusw(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitUint8(0x0F); + EmitUint8(0xDD); + EmitXmmRegisterOperand(dst, src); +} + + +void X86Assembler::paddsw(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitUint8(0x0F); + EmitUint8(0xED); + EmitXmmRegisterOperand(dst, src); +} + + +void X86Assembler::psubusb(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitUint8(0x0F); + EmitUint8(0xD8); + EmitXmmRegisterOperand(dst, src); +} + + +void X86Assembler::psubsb(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitUint8(0x0F); + EmitUint8(0xE8); + EmitXmmRegisterOperand(dst, src); +} + + +void 
X86Assembler::psubusw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xD9);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::psubsw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xE9);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
 void X86Assembler::cvtsi2ss(XmmRegister dst, Register src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index a085677083..22eaedce61 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -449,6 +449,15 @@ class X86Assembler FINAL : public Assembler {
   void paddq(XmmRegister dst, XmmRegister src);
   void psubq(XmmRegister dst, XmmRegister src);
 
+  void paddusb(XmmRegister dst, XmmRegister src);
+  void paddsb(XmmRegister dst, XmmRegister src);
+  void paddusw(XmmRegister dst, XmmRegister src);
+  void paddsw(XmmRegister dst, XmmRegister src);
+  void psubusb(XmmRegister dst, XmmRegister src);
+  void psubsb(XmmRegister dst, XmmRegister src);
+  void psubusw(XmmRegister dst, XmmRegister src);
+  void psubsw(XmmRegister dst, XmmRegister src);
+
   void cvtsi2ss(XmmRegister dst, Register src);
   void cvtsi2sd(XmmRegister dst, Register src);
 
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 2fd1b27182..8f72db748b 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -600,6 +600,38 @@ TEST_F(AssemblerX86Test, PSubQ) {
   DriverStr(RepeatFF(&x86::X86Assembler::psubq, "psubq %{reg2}, %{reg1}"), "psubq");
 }
 
+TEST_F(AssemblerX86Test, PAddUSB) {
+  DriverStr(RepeatFF(&x86::X86Assembler::paddusb, "paddusb %{reg2}, %{reg1}"), "paddusb");
+}
+
+TEST_F(AssemblerX86Test, PAddSB) {
+  DriverStr(RepeatFF(&x86::X86Assembler::paddsb, "paddsb %{reg2}, %{reg1}"), "paddsb");
+}
+
+TEST_F(AssemblerX86Test, PAddUSW) {
+  DriverStr(RepeatFF(&x86::X86Assembler::paddusw, "paddusw %{reg2}, %{reg1}"), "paddusw");
+}
+
+TEST_F(AssemblerX86Test, PAddSW) {
+  DriverStr(RepeatFF(&x86::X86Assembler::paddsw, "paddsw %{reg2}, %{reg1}"), "paddsw");
+}
+
+TEST_F(AssemblerX86Test, PSubUSB) {
+  DriverStr(RepeatFF(&x86::X86Assembler::psubusb, "psubusb %{reg2}, %{reg1}"), "psubusb");
+}
+
+TEST_F(AssemblerX86Test, PSubSB) {
+  DriverStr(RepeatFF(&x86::X86Assembler::psubsb, "psubsb %{reg2}, %{reg1}"), "psubsb");
+}
+
+TEST_F(AssemblerX86Test, PSubUSW) {
+  DriverStr(RepeatFF(&x86::X86Assembler::psubusw, "psubusw %{reg2}, %{reg1}"), "psubusw");
+}
+
+TEST_F(AssemblerX86Test, PSubSW) {
+  DriverStr(RepeatFF(&x86::X86Assembler::psubsw, "psubsw %{reg2}, %{reg1}"), "psubsw");
+}
+
 TEST_F(AssemblerX86Test, XorPD) {
   DriverStr(RepeatFF(&x86::X86Assembler::xorpd, "xorpd %{reg2}, %{reg1}"), "xorpd");
 }
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index ff5a357c5e..c6e16e754c 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -1011,6 +1011,86 @@ void X86_64Assembler::psubq(XmmRegister dst, XmmRegister src) {
 }
 
 
+void X86_64Assembler::paddusb(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xDC);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+
+void
X86_64Assembler::paddsb(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitOptionalRex32(dst, src); + EmitUint8(0x0F); + EmitUint8(0xEC); + EmitXmmRegisterOperand(dst.LowBits(), src); +} + + +void X86_64Assembler::paddusw(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitOptionalRex32(dst, src); + EmitUint8(0x0F); + EmitUint8(0xDD); + EmitXmmRegisterOperand(dst.LowBits(), src); +} + + +void X86_64Assembler::paddsw(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitOptionalRex32(dst, src); + EmitUint8(0x0F); + EmitUint8(0xED); + EmitXmmRegisterOperand(dst.LowBits(), src); +} + + +void X86_64Assembler::psubusb(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitOptionalRex32(dst, src); + EmitUint8(0x0F); + EmitUint8(0xD8); + EmitXmmRegisterOperand(dst.LowBits(), src); +} + + +void X86_64Assembler::psubsb(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitOptionalRex32(dst, src); + EmitUint8(0x0F); + EmitUint8(0xE8); + EmitXmmRegisterOperand(dst.LowBits(), src); +} + + +void X86_64Assembler::psubusw(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitOptionalRex32(dst, src); + EmitUint8(0x0F); + EmitUint8(0xD9); + EmitXmmRegisterOperand(dst.LowBits(), src); +} + + +void X86_64Assembler::psubsw(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitOptionalRex32(dst, src); + EmitUint8(0x0F); + EmitUint8(0xE9); + EmitXmmRegisterOperand(dst.LowBits(), src); +} + + void X86_64Assembler::cvtsi2ss(XmmRegister dst, CpuRegister src) { cvtsi2ss(dst, src, false); } diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h index 7a5fdb502f..ab761fb1fc 100644 --- a/compiler/utils/x86_64/assembler_x86_64.h +++ b/compiler/utils/x86_64/assembler_x86_64.h @@ -485,6 +485,15 @@ class X86_64Assembler FINAL : public Assembler { void paddq(XmmRegister dst, XmmRegister src); void psubq(XmmRegister dst, XmmRegister src); + void paddusb(XmmRegister dst, XmmRegister src); + void paddsb(XmmRegister dst, XmmRegister src); + void paddusw(XmmRegister dst, XmmRegister src); + void paddsw(XmmRegister dst, XmmRegister src); + void psubusb(XmmRegister dst, XmmRegister src); + void psubsb(XmmRegister dst, XmmRegister src); + void psubusw(XmmRegister dst, XmmRegister src); + void psubsw(XmmRegister dst, XmmRegister src); + void cvtsi2ss(XmmRegister dst, CpuRegister src); // Note: this is the r/m32 version. 
 void cvtsi2ss(XmmRegister dst, CpuRegister src, bool is64bit);
 void cvtsi2ss(XmmRegister dst, const Address& src, bool is64bit);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 6b1e53c35a..104e215cf5 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1282,6 +1282,38 @@ TEST_F(AssemblerX86_64Test, Psubq) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubq, "psubq %{reg2}, %{reg1}"), "psubq");
 }
 
+TEST_F(AssemblerX86_64Test, Paddusb) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddusb, "paddusb %{reg2}, %{reg1}"), "paddusb");
+}
+
+TEST_F(AssemblerX86_64Test, Paddsb) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddsb, "paddsb %{reg2}, %{reg1}"), "paddsb");
+}
+
+TEST_F(AssemblerX86_64Test, Paddusw) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddusw, "paddusw %{reg2}, %{reg1}"), "paddusw");
+}
+
+TEST_F(AssemblerX86_64Test, Paddsw) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddsw, "paddsw %{reg2}, %{reg1}"), "paddsw");
+}
+
+TEST_F(AssemblerX86_64Test, Psubusb) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubusb, "psubusb %{reg2}, %{reg1}"), "psubusb");
+}
+
+TEST_F(AssemblerX86_64Test, Psubsb) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubsb, "psubsb %{reg2}, %{reg1}"), "psubsb");
+}
+
+TEST_F(AssemblerX86_64Test, Psubusw) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubusw, "psubusw %{reg2}, %{reg1}"), "psubusw");
+}
+
+TEST_F(AssemblerX86_64Test, Psubsw) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubsw, "psubsw %{reg2}, %{reg1}"), "psubsw");
+}
+
 TEST_F(AssemblerX86_64Test, Cvtsi2ss) {
   DriverStr(RepeatFr(&x86_64::X86_64Assembler::cvtsi2ss, "cvtsi2ss %{reg2}, %{reg1}"), "cvtsi2ss");
 }
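The tests above drive each new saturating arithmetic instruction through all register pairs. The byte sequence the emitters produce can also be checked by hand; a small standalone sketch for paddusb, whose register-register SSE2 form is 66 0F DC /r (with an optional REX prefix for xmm8-xmm15, handled by EmitOptionalRex32 in the x86-64 emitter above):

#include <cstdint>
#include <cstdio>

// Recomputes the encoding emitted by paddusb above: prefix 0x66, opcode
// 0x0F 0xDC, then a ModRM byte with mod=11 for the register-register form.
// The low 3 bits of each xmm index go into ModRM; higher indices need REX.
int main() {
  const uint8_t dst = 1;  // xmm1.
  const uint8_t src = 2;  // xmm2.
  const uint8_t modrm = 0xC0 | ((dst & 7) << 3) | (src & 7);
  // Expected bytes for "paddusb %xmm2, %xmm1": 66 0F DC CA.
  printf("66 0F DC %02X\n", modrm);
  return 0;
}

The other seven instructions differ only in the second opcode byte — DC/EC/DD/ED for the unsigned/signed byte and word additions, D8/E8/D9/E9 for the subtractions — matching the emitters added in the x86 and x86-64 assemblers above.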