diff options
44 files changed, 2762 insertions, 1246 deletions
diff --git a/Android.mk b/Android.mk index b006bfe129..15e8308ba0 100644 --- a/Android.mk +++ b/Android.mk @@ -67,8 +67,13 @@ ifdef TARGET_2ND_ARCH rm -f $(2ND_TARGET_OUT_INTERMEDIATES)/JAVA_LIBRARIES/*_intermediates/javalib.odex rm -f $(2ND_TARGET_OUT_INTERMEDIATES)/APPS/*_intermediates/*.odex endif +ifneq ($(TMPDIR),) + rm -rf $(TMPDIR)/$(USER)/test-*/dalvik-cache/* + rm -rf $(TMPDIR)/android-data/dalvik-cache/* +else rm -rf /tmp/$(USER)/test-*/dalvik-cache/* rm -rf /tmp/android-data/dalvik-cache/* +endif .PHONY: clean-oat-target clean-oat-target: @@ -309,7 +314,7 @@ else .PHONY: oat-target-$(1) oat-target-$(1): $$(OUT_OAT_FILE) -$$(OUT_OAT_FILE): $(PRODUCT_OUT)/$(1) $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(DEX2OATD) +$$(OUT_OAT_FILE): $(PRODUCT_OUT)/$(1) $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(DEX2OATD_DEPENDENCY) @mkdir -p $$(dir $$@) $(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_XMS) --runtime-arg -Xmx$(DEX2OAT_XMX) \ --boot-image=$(DEFAULT_DEX_PREOPT_BUILT_IMAGE) --dex-file=$(PRODUCT_OUT)/$(1) \ diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk index 6e27190bd3..17c478c6d1 100644 --- a/build/Android.gtest.mk +++ b/build/Android.gtest.mk @@ -91,6 +91,7 @@ RUNTIME_GTEST_COMMON_SRC_FILES := \ runtime/entrypoints/quick/quick_trampoline_entrypoints_test.cc \ runtime/entrypoints_order_test.cc \ runtime/exception_test.cc \ + runtime/gc/accounting/card_table_test.cc \ runtime/gc/accounting/space_bitmap_test.cc \ runtime/gc/heap_test.cc \ runtime/gc/space/dlmalloc_space_base_test.cc \ @@ -113,6 +114,7 @@ RUNTIME_GTEST_COMMON_SRC_FILES := \ runtime/monitor_pool_test.cc \ runtime/monitor_test.cc \ runtime/parsed_options_test.cc \ + runtime/proxy_test.cc \ runtime/reference_table_test.cc \ runtime/thread_pool_test.cc \ runtime/transaction_test.cc \ @@ -123,7 +125,6 @@ RUNTIME_GTEST_COMMON_SRC_FILES := \ COMPILER_GTEST_COMMON_SRC_FILES := \ runtime/jni_internal_test.cc \ - runtime/proxy_test.cc \ runtime/reflection_test.cc \ compiler/dex/global_value_numbering_test.cc \ compiler/dex/local_value_numbering_test.cc \ diff --git a/build/Android.oat.mk b/build/Android.oat.mk index cd6b13aac3..10936a45d6 100644 --- a/build/Android.oat.mk +++ b/build/Android.oat.mk @@ -26,7 +26,7 @@ include art/build/Android.common_path.mk # Use dex2oat debug version for better error reporting # $(1): 2ND_ or undefined, 2ND_ for 32-bit host builds. define create-core-oat-host-rules -$$($(1)HOST_CORE_IMG_OUT): $$(HOST_CORE_DEX_FILES) $$(DEX2OATD) +$$($(1)HOST_CORE_IMG_OUT): $$(HOST_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY) @echo "host dex2oat: $$@ ($$?)" @mkdir -p $$(dir $$@) $$(hide) $$(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_IMAGE_XMS) --runtime-arg -Xmx$(DEX2OAT_IMAGE_XMX) \ @@ -49,7 +49,7 @@ $(eval $(call create-core-oat-host-rules,2ND_)) endif define create-core-oat-target-rules -$$($(1)TARGET_CORE_IMG_OUT): $$($(1)TARGET_CORE_DEX_FILES) $$(DEX2OATD) +$$($(1)TARGET_CORE_IMG_OUT): $$($(1)TARGET_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY) @echo "target dex2oat: $$@ ($$?)" @mkdir -p $$(dir $$@) $$(hide) $$(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_XMS) --runtime-arg -Xmx$(DEX2OAT_XMX) \ diff --git a/compiler/compilers.cc b/compiler/compilers.cc index 250924ad30..5cf846f8db 100644 --- a/compiler/compilers.cc +++ b/compiler/compilers.cc @@ -38,9 +38,6 @@ extern "C" art::CompiledMethod* ArtQuickJniCompileMethod(art::CompilerDriver* dr uint32_t access_flags, uint32_t method_idx, const art::DexFile& dex_file); -// Hack for CFI CIE initialization -extern std::vector<uint8_t>* X86CFIInitialization(bool is_x86_64); - void QuickCompiler::Init() const { ArtInitQuickCompilerContext(GetCompilerDriver()); } @@ -126,17 +123,6 @@ Backend* QuickCompiler::GetCodeGenerator(CompilationUnit* cu, void* compilation_ return mir_to_lir; } -std::vector<uint8_t>* QuickCompiler::GetCallFrameInformationInitialization( - const CompilerDriver& driver) const { - if (driver.GetInstructionSet() == kX86) { - return X86CFIInitialization(false); - } - if (driver.GetInstructionSet() == kX86_64) { - return X86CFIInitialization(true); - } - return nullptr; -} - CompiledMethod* OptimizingCompiler::Compile(const DexFile::CodeItem* code_item, uint32_t access_flags, InvokeType invoke_type, diff --git a/compiler/compilers.h b/compiler/compilers.h index 2c231e1212..151bf6fa6c 100644 --- a/compiler/compilers.h +++ b/compiler/compilers.h @@ -56,17 +56,6 @@ class QuickCompiler : public Compiler { void InitCompilationUnit(CompilationUnit& cu) const OVERRIDE {} - /* - * @brief Generate and return Dwarf CFI initialization, if supported by the - * backend. - * @param driver CompilerDriver for this compile. - * @returns nullptr if not supported by backend or a vector of bytes for CFI DWARF - * information. - * @note This is used for backtrace information in generated code. - */ - std::vector<uint8_t>* GetCallFrameInformationInitialization(const CompilerDriver& driver) const - OVERRIDE; - private: DISALLOW_COPY_AND_ASSIGN(QuickCompiler); }; diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc index 5059c5fc9a..b1339916f0 100644 --- a/compiler/dex/quick/arm/call_arm.cc +++ b/compiler/dex/quick/arm/call_arm.cc @@ -43,8 +43,7 @@ namespace art { * add rARM_PC, r_disp ; This is the branch from which we compute displacement * cbnz r_idx, lp */ -void ArmMir2Lir::GenSparseSwitch(MIR* mir, uint32_t table_offset, - RegLocation rl_src) { +void ArmMir2Lir::GenLargeSparseSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src) { const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset; if (cu_->verbose) { DumpSparseSwitchTable(table); @@ -92,8 +91,7 @@ void ArmMir2Lir::GenSparseSwitch(MIR* mir, uint32_t table_offset, } -void ArmMir2Lir::GenPackedSwitch(MIR* mir, uint32_t table_offset, - RegLocation rl_src) { +void ArmMir2Lir::GenLargePackedSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src) { const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset; if (cu_->verbose) { DumpPackedSwitchTable(table); diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h index 21322a6f15..072acbeaa7 100644 --- a/compiler/dex/quick/arm/codegen_arm.h +++ b/compiler/dex/quick/arm/codegen_arm.h @@ -130,8 +130,8 @@ class ArmMir2Lir FINAL : public Mir2Lir { int first_bit, int second_bit); void GenNegDouble(RegLocation rl_dest, RegLocation rl_src); void GenNegFloat(RegLocation rl_dest, RegLocation rl_src); - void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src); - void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src); + void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src); + void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src); // Required for target - single operation generators. LIR* OpUnconditionalBranch(LIR* target); diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc index 6fa8a4aca5..7c5c4fac40 100644 --- a/compiler/dex/quick/arm64/call_arm64.cc +++ b/compiler/dex/quick/arm64/call_arm64.cc @@ -43,8 +43,7 @@ namespace art { * br r_base * quit: */ -void Arm64Mir2Lir::GenSparseSwitch(MIR* mir, uint32_t table_offset, - RegLocation rl_src) { +void Arm64Mir2Lir::GenLargeSparseSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src) { const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset; if (cu_->verbose) { DumpSparseSwitchTable(table); @@ -96,8 +95,7 @@ void Arm64Mir2Lir::GenSparseSwitch(MIR* mir, uint32_t table_offset, } -void Arm64Mir2Lir::GenPackedSwitch(MIR* mir, uint32_t table_offset, - RegLocation rl_src) { +void Arm64Mir2Lir::GenLargePackedSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src) { const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset; if (cu_->verbose) { DumpPackedSwitchTable(table); diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h index b182cc027e..2cd24c6874 100644 --- a/compiler/dex/quick/arm64/codegen_arm64.h +++ b/compiler/dex/quick/arm64/codegen_arm64.h @@ -196,8 +196,8 @@ class Arm64Mir2Lir FINAL : public Mir2Lir { int first_bit, int second_bit) OVERRIDE; void GenNegDouble(RegLocation rl_dest, RegLocation rl_src) OVERRIDE; void GenNegFloat(RegLocation rl_dest, RegLocation rl_src) OVERRIDE; - void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE; - void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE; + void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE; + void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE; // Required for target - single operation generators. LIR* OpUnconditionalBranch(LIR* target) OVERRIDE; diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc index 0054f34322..f6c77fcea8 100644 --- a/compiler/dex/quick/gen_common.cc +++ b/compiler/dex/quick/gen_common.cc @@ -2008,4 +2008,92 @@ void Mir2Lir::GenConstWide(RegLocation rl_dest, int64_t value) { StoreValueWide(rl_dest, rl_result); } +void Mir2Lir::GenSmallPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) { + const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset; + const uint16_t entries = table[1]; + // Chained cmp-and-branch. + const int32_t* as_int32 = reinterpret_cast<const int32_t*>(&table[2]); + int32_t current_key = as_int32[0]; + const int32_t* targets = &as_int32[1]; + rl_src = LoadValue(rl_src, kCoreReg); + int i = 0; + for (; i < entries; i++, current_key++) { + if (!InexpensiveConstantInt(current_key, Instruction::Code::IF_EQ)) { + // Switch to using a temp and add. + break; + } + BasicBlock* case_block = + mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]); + OpCmpImmBranch(kCondEq, rl_src.reg, current_key, &block_label_list_[case_block->id]); + } + if (i < entries) { + // The rest do not seem to be inexpensive. Try to allocate a temp and use add. + RegStorage key_temp = AllocTypedTemp(false, kCoreReg, false); + if (key_temp.Valid()) { + LoadConstantNoClobber(key_temp, current_key); + for (; i < entries - 1; i++, current_key++) { + BasicBlock* case_block = + mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]); + OpCmpBranch(kCondEq, rl_src.reg, key_temp, &block_label_list_[case_block->id]); + OpRegImm(kOpAdd, key_temp, 1); // Increment key. + } + BasicBlock* case_block = + mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]); + OpCmpBranch(kCondEq, rl_src.reg, key_temp, &block_label_list_[case_block->id]); + } else { + // No free temp, just finish the old loop. + for (; i < entries; i++, current_key++) { + BasicBlock* case_block = + mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]); + OpCmpImmBranch(kCondEq, rl_src.reg, current_key, &block_label_list_[case_block->id]); + } + } + } +} + +void Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) { + const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset; + if (cu_->verbose) { + DumpSparseSwitchTable(table); + } + + const uint16_t entries = table[1]; + if (entries <= kSmallSwitchThreshold) { + GenSmallPackedSwitch(mir, table_offset, rl_src); + } else { + // Use the backend-specific implementation. + GenLargePackedSwitch(mir, table_offset, rl_src); + } +} + +void Mir2Lir::GenSmallSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) { + const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset; + const uint16_t entries = table[1]; + // Chained cmp-and-branch. + const int32_t* keys = reinterpret_cast<const int32_t*>(&table[2]); + const int32_t* targets = &keys[entries]; + rl_src = LoadValue(rl_src, kCoreReg); + for (int i = 0; i < entries; i++) { + int key = keys[i]; + BasicBlock* case_block = + mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]); + OpCmpImmBranch(kCondEq, rl_src.reg, key, &block_label_list_[case_block->id]); + } +} + +void Mir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) { + const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset; + if (cu_->verbose) { + DumpSparseSwitchTable(table); + } + + const uint16_t entries = table[1]; + if (entries <= kSmallSwitchThreshold) { + GenSmallSparseSwitch(mir, table_offset, rl_src); + } else { + // Use the backend-specific implementation. + GenLargeSparseSwitch(mir, table_offset, rl_src); + } +} + } // namespace art diff --git a/compiler/dex/quick/local_optimizations.cc b/compiler/dex/quick/local_optimizations.cc index eec2b32726..e0f4691063 100644 --- a/compiler/dex/quick/local_optimizations.cc +++ b/compiler/dex/quick/local_optimizations.cc @@ -200,7 +200,7 @@ void Mir2Lir::ApplyLoadStoreElimination(LIR* head_lir, LIR* tail_lir) { /* Initialize alias list */ alias_list.clear(); ResourceMask alias_reg_list_mask = kEncodeNone; - if (!this_mem_mask.Intersects(kEncodeLiteral)) { + if (!this_mem_mask.Intersects(kEncodeMem) && !this_mem_mask.Intersects(kEncodeLiteral)) { alias_list.push_back(dest_reg_id); SetupRegMask(&alias_reg_list_mask, dest_reg_id); } @@ -248,7 +248,7 @@ void Mir2Lir::ApplyLoadStoreElimination(LIR* head_lir, LIR* tail_lir) { bool is_check_lir_load = check_flags & IS_LOAD; bool reg_compatible = RegStorage::SameRegType(check_lir->operands[0], native_reg_id); - if (alias_mem_mask.Equals(kEncodeLiteral)) { + if (!alias_mem_mask.Intersects(kEncodeMem) && alias_mem_mask.Equals(kEncodeLiteral)) { DCHECK(check_flags & IS_LOAD); /* Same value && same register type */ if (reg_compatible && (this_lir->target == check_lir->target)) { diff --git a/compiler/dex/quick/mips/call_mips.cc b/compiler/dex/quick/mips/call_mips.cc index 9adddf079d..4577a4c904 100644 --- a/compiler/dex/quick/mips/call_mips.cc +++ b/compiler/dex/quick/mips/call_mips.cc @@ -61,8 +61,7 @@ bool MipsMir2Lir::GenSpecialCase(BasicBlock* bb, MIR* mir, * done: * */ -void MipsMir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset, - RegLocation rl_src) { +void MipsMir2Lir::GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) { const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset; if (cu_->verbose) { DumpSparseSwitchTable(table); @@ -139,8 +138,7 @@ void MipsMir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset, * jr rRA * done: */ -void MipsMir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset, - RegLocation rl_src) { +void MipsMir2Lir::GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) { const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset; if (cu_->verbose) { DumpPackedSwitchTable(table); diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h index bd0c020ce8..43cbde7781 100644 --- a/compiler/dex/quick/mips/codegen_mips.h +++ b/compiler/dex/quick/mips/codegen_mips.h @@ -128,8 +128,8 @@ class MipsMir2Lir FINAL : public Mir2Lir { int first_bit, int second_bit); void GenNegDouble(RegLocation rl_dest, RegLocation rl_src); void GenNegFloat(RegLocation rl_dest, RegLocation rl_src); - void GenPackedSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src); - void GenSparseSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src); + void GenLargePackedSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src); + void GenLargeSparseSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src); bool GenSpecialCase(BasicBlock* bb, MIR* mir, const InlineMethod& special); // Required for target - single operation generators. diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h index 4ed9929338..0e6f36bdeb 100644 --- a/compiler/dex/quick/mir_to_lir.h +++ b/compiler/dex/quick/mir_to_lir.h @@ -229,6 +229,9 @@ class Mir2Lir : public Backend { static constexpr bool kFailOnSizeError = true && kIsDebugBuild; static constexpr bool kReportSizeError = true && kIsDebugBuild; + // TODO: If necessary, this could be made target-dependent. + static constexpr uint16_t kSmallSwitchThreshold = 5; + /* * Auxiliary information describing the location of data embedded in the Dalvik * byte code stream. @@ -1355,8 +1358,19 @@ class Mir2Lir : public Backend { int first_bit, int second_bit) = 0; virtual void GenNegDouble(RegLocation rl_dest, RegLocation rl_src) = 0; virtual void GenNegFloat(RegLocation rl_dest, RegLocation rl_src) = 0; - virtual void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) = 0; - virtual void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) = 0; + + // Create code for switch statements. Will decide between short and long versions below. + void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src); + void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src); + + // Potentially backend-specific versions of switch instructions for shorter switch statements. + // The default implementation will create a chained compare-and-branch. + virtual void GenSmallPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src); + virtual void GenSmallSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src); + // Backend-specific versions of switch instructions for longer switch statements. + virtual void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) = 0; + virtual void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) = 0; + virtual void GenArrayGet(int opt_flags, OpSize size, RegLocation rl_array, RegLocation rl_index, RegLocation rl_dest, int scale) = 0; virtual void GenArrayPut(int opt_flags, OpSize size, RegLocation rl_array, diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc index 15aae9ef1e..f5f86717b4 100644 --- a/compiler/dex/quick/x86/call_x86.cc +++ b/compiler/dex/quick/x86/call_x86.cc @@ -27,8 +27,7 @@ namespace art { * The sparse table in the literal pool is an array of <key,displacement> * pairs. */ -void X86Mir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset, - RegLocation rl_src) { +void X86Mir2Lir::GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) { const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset; if (cu_->verbose) { DumpSparseSwitchTable(table); @@ -61,8 +60,7 @@ void X86Mir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset, * jmp r_start_of_method * done: */ -void X86Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset, - RegLocation rl_src) { +void X86Mir2Lir::GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) { const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset; if (cu_->verbose) { DumpPackedSwitchTable(table); diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h index 266191adcb..1d8deae3c8 100644 --- a/compiler/dex/quick/x86/codegen_x86.h +++ b/compiler/dex/quick/x86/codegen_x86.h @@ -246,8 +246,8 @@ class X86Mir2Lir : public Mir2Lir { int first_bit, int second_bit) OVERRIDE; void GenNegDouble(RegLocation rl_dest, RegLocation rl_src) OVERRIDE; void GenNegFloat(RegLocation rl_dest, RegLocation rl_src) OVERRIDE; - void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE; - void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE; + void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE; + void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE; /** * @brief Implement instanceof a final class with x86 specific code. @@ -355,12 +355,6 @@ class X86Mir2Lir : public Mir2Lir { void InstallLiteralPools() OVERRIDE; /* - * @brief Generate the debug_frame CFI information. - * @returns pointer to vector containing CFE information - */ - static std::vector<uint8_t>* ReturnCommonCallFrameInformation(bool is_x86_64); - - /* * @brief Generate the debug_frame FDE information. * @returns pointer to vector containing CFE information */ diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc index bd2e0f31bb..62a75a598b 100755 --- a/compiler/dex/quick/x86/target_x86.cc +++ b/compiler/dex/quick/x86/target_x86.cc @@ -1437,11 +1437,6 @@ static void AdvanceLoc(std::vector<uint8_t>&buf, uint32_t increment) { } } - -std::vector<uint8_t>* X86CFIInitialization(bool is_x86_64) { - return X86Mir2Lir::ReturnCommonCallFrameInformation(is_x86_64); -} - static void EncodeUnsignedLeb128(std::vector<uint8_t>& buf, uint32_t value) { uint8_t buffer[12]; uint8_t *ptr = EncodeUnsignedLeb128(buffer, value); @@ -1458,84 +1453,6 @@ static void EncodeSignedLeb128(std::vector<uint8_t>& buf, int32_t value) { } } -std::vector<uint8_t>* X86Mir2Lir::ReturnCommonCallFrameInformation(bool is_x86_64) { - std::vector<uint8_t>*cfi_info = new std::vector<uint8_t>; - - // Length (will be filled in later in this routine). - PushWord(*cfi_info, 0); - - // CIE id: always 0. - PushWord(*cfi_info, 0); - - // Version: always 1. - cfi_info->push_back(0x01); - - // Augmentation: 'zR\0' - cfi_info->push_back(0x7a); - cfi_info->push_back(0x52); - cfi_info->push_back(0x0); - - // Code alignment: 1. - EncodeUnsignedLeb128(*cfi_info, 1); - - // Data alignment. - if (is_x86_64) { - EncodeSignedLeb128(*cfi_info, -8); - } else { - EncodeSignedLeb128(*cfi_info, -4); - } - - // Return address register. - if (is_x86_64) { - // R16(RIP) - cfi_info->push_back(0x10); - } else { - // R8(EIP) - cfi_info->push_back(0x08); - } - - // Augmentation length: 1. - cfi_info->push_back(1); - - // Augmentation data: 0x03 ((DW_EH_PE_absptr << 4) | DW_EH_PE_udata4). - cfi_info->push_back(0x03); - - // Initial instructions. - if (is_x86_64) { - // DW_CFA_def_cfa R7(RSP) 8. - cfi_info->push_back(0x0c); - cfi_info->push_back(0x07); - cfi_info->push_back(0x08); - - // DW_CFA_offset R16(RIP) 1 (* -8). - cfi_info->push_back(0x90); - cfi_info->push_back(0x01); - } else { - // DW_CFA_def_cfa R4(ESP) 4. - cfi_info->push_back(0x0c); - cfi_info->push_back(0x04); - cfi_info->push_back(0x04); - - // DW_CFA_offset R8(EIP) 1 (* -4). - cfi_info->push_back(0x88); - cfi_info->push_back(0x01); - } - - // Padding to a multiple of 4 - while ((cfi_info->size() & 3) != 0) { - // DW_CFA_nop is encoded as 0. - cfi_info->push_back(0); - } - - // Set the length of the CIE inside the generated bytes. - uint32_t length = cfi_info->size() - 4; - (*cfi_info)[0] = length; - (*cfi_info)[1] = length >> 8; - (*cfi_info)[2] = length >> 16; - (*cfi_info)[3] = length >> 24; - return cfi_info; -} - static bool ARTRegIDToDWARFRegID(bool is_x86_64, int art_reg_id, int* dwarf_reg_id) { if (is_x86_64) { switch (art_reg_id) { diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc index ed126adfe9..810761f0d8 100644 --- a/compiler/driver/compiler_driver.cc +++ b/compiler/driver/compiler_driver.cc @@ -353,7 +353,6 @@ CompilerDriver::CompilerDriver(const CompilerOptions* compiler_options, compiler_enable_auto_elf_loading_(NULL), compiler_get_method_code_addr_(NULL), support_boot_image_fixup_(instruction_set != kMips), - cfi_info_(nullptr), dedupe_code_("dedupe code"), dedupe_mapping_table_("dedupe mapping table"), dedupe_vmap_table_("dedupe vmap table"), @@ -376,11 +375,6 @@ CompilerDriver::CompilerDriver(const CompilerOptions* compiler_options, CHECK(image_classes_.get() == nullptr); } - // Are we generating CFI information? - if (compiler_options->GetGenerateGDBInformation()) { - cfi_info_.reset(compiler_->GetCallFrameInformationInitialization(*this)); - } - // Read the profile file if one is provided. if (!profile_file.empty()) { profile_present_ = profile_file_.LoadFile(profile_file); @@ -597,7 +591,7 @@ void CompilerDriver::Resolve(jobject class_loader, const std::vector<const DexFi for (size_t i = 0; i != dex_files.size(); ++i) { const DexFile* dex_file = dex_files[i]; CHECK(dex_file != nullptr); - ResolveDexFile(class_loader, *dex_file, thread_pool, timings); + ResolveDexFile(class_loader, *dex_file, dex_files, thread_pool, timings); } } @@ -1357,12 +1351,14 @@ class ParallelCompilationManager { jobject class_loader, CompilerDriver* compiler, const DexFile* dex_file, + const std::vector<const DexFile*>& dex_files, ThreadPool* thread_pool) : index_(0), class_linker_(class_linker), class_loader_(class_loader), compiler_(compiler), dex_file_(dex_file), + dex_files_(dex_files), thread_pool_(thread_pool) {} ClassLinker* GetClassLinker() const { @@ -1384,6 +1380,10 @@ class ParallelCompilationManager { return dex_file_; } + const std::vector<const DexFile*>& GetDexFiles() const { + return dex_files_; + } + void ForAll(size_t begin, size_t end, Callback callback, size_t work_units) { Thread* self = Thread::Current(); self->AssertNoPendingException(); @@ -1441,11 +1441,24 @@ class ParallelCompilationManager { const jobject class_loader_; CompilerDriver* const compiler_; const DexFile* const dex_file_; + const std::vector<const DexFile*>& dex_files_; ThreadPool* const thread_pool_; DISALLOW_COPY_AND_ASSIGN(ParallelCompilationManager); }; +static bool SkipClassCheckClassPath(const char* descriptor, const DexFile& dex_file, + const std::vector<const DexFile*>& classpath) { + DexFile::ClassPathEntry pair = DexFile::FindInClassPath(descriptor, classpath); + CHECK(pair.second != NULL); + if (pair.first != &dex_file) { + LOG(WARNING) << "Skipping class " << descriptor << " from " << dex_file.GetLocation() + << " previously found in " << pair.first->GetLocation(); + return true; + } + return false; +} + // Return true if the class should be skipped during compilation. // // The first case where we skip is for redundant class definitions in @@ -1454,20 +1467,23 @@ class ParallelCompilationManager { // The second case where we skip is when an app bundles classes found // in the boot classpath. Since at runtime we will select the class from // the boot classpath, we ignore the one from the app. +// +// The third case is if the app itself has the class defined in multiple dex files. Then we skip +// it if it is not the first occurrence. static bool SkipClass(ClassLinker* class_linker, jobject class_loader, const DexFile& dex_file, + const std::vector<const DexFile*>& dex_files, const DexFile::ClassDef& class_def) { const char* descriptor = dex_file.GetClassDescriptor(class_def); + if (class_loader == NULL) { - DexFile::ClassPathEntry pair = DexFile::FindInClassPath(descriptor, class_linker->GetBootClassPath()); - CHECK(pair.second != NULL); - if (pair.first != &dex_file) { - LOG(WARNING) << "Skipping class " << descriptor << " from " << dex_file.GetLocation() - << " previously found in " << pair.first->GetLocation(); - return true; - } - return false; + return SkipClassCheckClassPath(descriptor, dex_file, class_linker->GetBootClassPath()); } - return class_linker->IsInBootClassPath(descriptor); + + if (class_linker->IsInBootClassPath(descriptor)) { + return true; + } + + return SkipClassCheckClassPath(descriptor, dex_file, dex_files); } // A fast version of SkipClass above if the class pointer is available @@ -1525,7 +1541,7 @@ static void ResolveClassFieldsAndMethods(const ParallelCompilationManager* manag // definitions, since many of them many never be referenced by // generated code. const DexFile::ClassDef& class_def = dex_file.GetClassDef(class_def_index); - if (!SkipClass(class_linker, jclass_loader, dex_file, class_def)) { + if (!SkipClass(class_linker, jclass_loader, dex_file, manager->GetDexFiles(), class_def)) { ScopedObjectAccess soa(self); StackHandleScope<2> hs(soa.Self()); Handle<mirror::ClassLoader> class_loader( @@ -1632,13 +1648,15 @@ static void ResolveType(const ParallelCompilationManager* manager, size_t type_i } void CompilerDriver::ResolveDexFile(jobject class_loader, const DexFile& dex_file, + const std::vector<const DexFile*>& dex_files, ThreadPool* thread_pool, TimingLogger* timings) { ClassLinker* class_linker = Runtime::Current()->GetClassLinker(); // TODO: we could resolve strings here, although the string table is largely filled with class // and method names. - ParallelCompilationManager context(class_linker, class_loader, this, &dex_file, thread_pool); + ParallelCompilationManager context(class_linker, class_loader, this, &dex_file, dex_files, + thread_pool); if (IsImage()) { // For images we resolve all types, such as array, whereas for applications just those with // classdefs are resolved by ResolveClassFieldsAndMethods. @@ -1655,7 +1673,7 @@ void CompilerDriver::Verify(jobject class_loader, const std::vector<const DexFil for (size_t i = 0; i != dex_files.size(); ++i) { const DexFile* dex_file = dex_files[i]; CHECK(dex_file != NULL); - VerifyDexFile(class_loader, *dex_file, thread_pool, timings); + VerifyDexFile(class_loader, *dex_file, dex_files, thread_pool, timings); } } @@ -1707,10 +1725,12 @@ static void VerifyClass(const ParallelCompilationManager* manager, size_t class_ } void CompilerDriver::VerifyDexFile(jobject class_loader, const DexFile& dex_file, + const std::vector<const DexFile*>& dex_files, ThreadPool* thread_pool, TimingLogger* timings) { TimingLogger::ScopedTiming t("Verify Dex File", timings); ClassLinker* class_linker = Runtime::Current()->GetClassLinker(); - ParallelCompilationManager context(class_linker, class_loader, this, &dex_file, thread_pool); + ParallelCompilationManager context(class_linker, class_loader, this, &dex_file, dex_files, + thread_pool); context.ForAll(0, dex_file.NumClassDefs(), VerifyClass, thread_count_); } @@ -1800,10 +1820,12 @@ static void InitializeClass(const ParallelCompilationManager* manager, size_t cl } void CompilerDriver::InitializeClasses(jobject jni_class_loader, const DexFile& dex_file, + const std::vector<const DexFile*>& dex_files, ThreadPool* thread_pool, TimingLogger* timings) { TimingLogger::ScopedTiming t("InitializeNoClinit", timings); ClassLinker* class_linker = Runtime::Current()->GetClassLinker(); - ParallelCompilationManager context(class_linker, jni_class_loader, this, &dex_file, thread_pool); + ParallelCompilationManager context(class_linker, jni_class_loader, this, &dex_file, dex_files, + thread_pool); size_t thread_count; if (IsImage()) { // TODO: remove this when transactional mode supports multithreading. @@ -1824,7 +1846,7 @@ void CompilerDriver::InitializeClasses(jobject class_loader, for (size_t i = 0; i != dex_files.size(); ++i) { const DexFile* dex_file = dex_files[i]; CHECK(dex_file != NULL); - InitializeClasses(class_loader, *dex_file, thread_pool, timings); + InitializeClasses(class_loader, *dex_file, dex_files, thread_pool, timings); } } @@ -1833,7 +1855,7 @@ void CompilerDriver::Compile(jobject class_loader, const std::vector<const DexFi for (size_t i = 0; i != dex_files.size(); ++i) { const DexFile* dex_file = dex_files[i]; CHECK(dex_file != NULL); - CompileDexFile(class_loader, *dex_file, thread_pool, timings); + CompileDexFile(class_loader, *dex_file, dex_files, thread_pool, timings); } } @@ -1843,7 +1865,7 @@ void CompilerDriver::CompileClass(const ParallelCompilationManager* manager, siz const DexFile& dex_file = *manager->GetDexFile(); const DexFile::ClassDef& class_def = dex_file.GetClassDef(class_def_index); ClassLinker* class_linker = manager->GetClassLinker(); - if (SkipClass(class_linker, jclass_loader, dex_file, class_def)) { + if (SkipClass(class_linker, jclass_loader, dex_file, manager->GetDexFiles(), class_def)) { return; } ClassReference ref(&dex_file, class_def_index); @@ -1912,10 +1934,11 @@ void CompilerDriver::CompileClass(const ParallelCompilationManager* manager, siz } void CompilerDriver::CompileDexFile(jobject class_loader, const DexFile& dex_file, + const std::vector<const DexFile*>& dex_files, ThreadPool* thread_pool, TimingLogger* timings) { TimingLogger::ScopedTiming t("Compile Dex File", timings); ParallelCompilationManager context(Runtime::Current()->GetClassLinker(), class_loader, this, - &dex_file, thread_pool); + &dex_file, dex_files, thread_pool); context.ForAll(0, dex_file.NumClassDefs(), CompilerDriver::CompileClass, thread_count_); } diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h index 6dae398372..e6927d1c8d 100644 --- a/compiler/driver/compiler_driver.h +++ b/compiler/driver/compiler_driver.h @@ -402,10 +402,6 @@ class CompilerDriver { return dump_passes_; } - bool DidIncludeDebugSymbols() const { - return compiler_options_->GetIncludeDebugSymbols(); - } - CumulativeLogger* GetTimingsLogger() const { return timings_logger_; } @@ -599,14 +595,6 @@ class CompilerDriver { std::vector<uint8_t>* DeduplicateGCMap(const std::vector<uint8_t>& code); std::vector<uint8_t>* DeduplicateCFIInfo(const std::vector<uint8_t>* cfi_info); - /* - * @brief return the pointer to the Call Frame Information. - * @return pointer to call frame information for this compilation. - */ - std::vector<uint8_t>* GetCallFrameInformation() const { - return cfi_info_.get(); - } - ProfileFile profile_file_; bool profile_present_; @@ -658,12 +646,14 @@ class CompilerDriver { ThreadPool* thread_pool, TimingLogger* timings) LOCKS_EXCLUDED(Locks::mutator_lock_); void ResolveDexFile(jobject class_loader, const DexFile& dex_file, + const std::vector<const DexFile*>& dex_files, ThreadPool* thread_pool, TimingLogger* timings) LOCKS_EXCLUDED(Locks::mutator_lock_); void Verify(jobject class_loader, const std::vector<const DexFile*>& dex_files, ThreadPool* thread_pool, TimingLogger* timings); void VerifyDexFile(jobject class_loader, const DexFile& dex_file, + const std::vector<const DexFile*>& dex_files, ThreadPool* thread_pool, TimingLogger* timings) LOCKS_EXCLUDED(Locks::mutator_lock_); @@ -671,6 +661,7 @@ class CompilerDriver { ThreadPool* thread_pool, TimingLogger* timings) LOCKS_EXCLUDED(Locks::mutator_lock_); void InitializeClasses(jobject class_loader, const DexFile& dex_file, + const std::vector<const DexFile*>& dex_files, ThreadPool* thread_pool, TimingLogger* timings) LOCKS_EXCLUDED(Locks::mutator_lock_, compiled_classes_lock_); @@ -681,6 +672,7 @@ class CompilerDriver { void Compile(jobject class_loader, const std::vector<const DexFile*>& dex_files, ThreadPool* thread_pool, TimingLogger* timings); void CompileDexFile(jobject class_loader, const DexFile& dex_file, + const std::vector<const DexFile*>& dex_files, ThreadPool* thread_pool, TimingLogger* timings) LOCKS_EXCLUDED(Locks::mutator_lock_); void CompileMethod(const DexFile::CodeItem* code_item, uint32_t access_flags, @@ -766,9 +758,6 @@ class CompilerDriver { bool support_boot_image_fixup_; - // Call Frame Information, which might be generated to help stack tracebacks. - std::unique_ptr<std::vector<uint8_t>> cfi_info_; - // DeDuplication data structures, these own the corresponding byte arrays. class DedupeHashFunc { public: diff --git a/compiler/elf_writer_quick.cc b/compiler/elf_writer_quick.cc index 1fde12ecaf..71f02d3b5d 100644 --- a/compiler/elf_writer_quick.cc +++ b/compiler/elf_writer_quick.cc @@ -24,6 +24,7 @@ #include "elf_utils.h" #include "file_output_stream.h" #include "globals.h" +#include "leb128.h" #include "oat.h" #include "oat_writer.h" #include "utils.h" @@ -38,6 +39,25 @@ static uint8_t MakeStInfo(uint8_t binding, uint8_t type) { return ((binding) << 4) + ((type) & 0xf); } +static void UpdateWord(std::vector<uint8_t>* buf, int offset, int data) { + (*buf)[offset+0] = data; + (*buf)[offset+1] = data >> 8; + (*buf)[offset+2] = data >> 16; + (*buf)[offset+3] = data >> 24; +} + +static void PushWord(std::vector<uint8_t>* buf, int data) { + buf->push_back(data & 0xff); + buf->push_back((data >> 8) & 0xff); + buf->push_back((data >> 16) & 0xff); + buf->push_back((data >> 24) & 0xff); +} + +static void PushHalf(std::vector<uint8_t>* buf, int data) { + buf->push_back(data & 0xff); + buf->push_back((data >> 8) & 0xff); +} + bool ElfWriterQuick::ElfBuilder::Write() { // The basic layout of the elf file. Order may be different in final output. // +-------------------------+ @@ -822,37 +842,131 @@ void ElfWriterQuick::ReservePatchSpace(std::vector<uint8_t>* buffer, bool debug) } } +static void EncodeUnsignedLeb128(uint32_t data, std::vector<uint8_t>* dst) { + size_t encoded_size = UnsignedLeb128Size(data); + size_t cur_index = dst->size(); + dst->resize(dst->size() + encoded_size); + uint8_t* write_pos = &((*dst)[cur_index]); + uint8_t* write_pos_after = EncodeUnsignedLeb128(write_pos, data); + DCHECK_EQ(static_cast<size_t>(write_pos_after - write_pos), encoded_size); +} + +static void EncodeSignedLeb128(int32_t data, std::vector<uint8_t>* dst) { + size_t encoded_size = SignedLeb128Size(data); + size_t cur_index = dst->size(); + dst->resize(dst->size() + encoded_size); + uint8_t* write_pos = &((*dst)[cur_index]); + uint8_t* write_pos_after = EncodeSignedLeb128(write_pos, data); + DCHECK_EQ(static_cast<size_t>(write_pos_after - write_pos), encoded_size); +} + +std::vector<uint8_t>* ConstructCIEFrameX86(bool is_x86_64) { + std::vector<uint8_t>*cfi_info = new std::vector<uint8_t>; + + // Length (will be filled in later in this routine). + PushWord(cfi_info, 0); + + // CIE id: always 0. + PushWord(cfi_info, 0); + + // Version: always 1. + cfi_info->push_back(0x01); + + // Augmentation: 'zR\0' + cfi_info->push_back(0x7a); + cfi_info->push_back(0x52); + cfi_info->push_back(0x0); + + // Code alignment: 1. + EncodeUnsignedLeb128(1, cfi_info); + + // Data alignment. + if (is_x86_64) { + EncodeSignedLeb128(-8, cfi_info); + } else { + EncodeSignedLeb128(-4, cfi_info); + } + + // Return address register. + if (is_x86_64) { + // R16(RIP) + cfi_info->push_back(0x10); + } else { + // R8(EIP) + cfi_info->push_back(0x08); + } + + // Augmentation length: 1. + cfi_info->push_back(1); + + // Augmentation data: 0x03 ((DW_EH_PE_absptr << 4) | DW_EH_PE_udata4). + cfi_info->push_back(0x03); + + // Initial instructions. + if (is_x86_64) { + // DW_CFA_def_cfa R7(RSP) 8. + cfi_info->push_back(0x0c); + cfi_info->push_back(0x07); + cfi_info->push_back(0x08); + + // DW_CFA_offset R16(RIP) 1 (* -8). + cfi_info->push_back(0x90); + cfi_info->push_back(0x01); + } else { + // DW_CFA_def_cfa R4(ESP) 4. + cfi_info->push_back(0x0c); + cfi_info->push_back(0x04); + cfi_info->push_back(0x04); + + // DW_CFA_offset R8(EIP) 1 (* -4). + cfi_info->push_back(0x88); + cfi_info->push_back(0x01); + } + + // Padding to a multiple of 4 + while ((cfi_info->size() & 3) != 0) { + // DW_CFA_nop is encoded as 0. + cfi_info->push_back(0); + } + + // Set the length of the CIE inside the generated bytes. + uint32_t length = cfi_info->size() - 4; + (*cfi_info)[0] = length; + (*cfi_info)[1] = length >> 8; + (*cfi_info)[2] = length >> 16; + (*cfi_info)[3] = length >> 24; + return cfi_info; +} + +std::vector<uint8_t>* ConstructCIEFrame(InstructionSet isa) { + switch (isa) { + case kX86: + return ConstructCIEFrameX86(false); + case kX86_64: + return ConstructCIEFrameX86(true); + + default: + // Not implemented. + return nullptr; + } +} + bool ElfWriterQuick::Write(OatWriter* oat_writer, const std::vector<const DexFile*>& dex_files_unused, const std::string& android_root_unused, bool is_host_unused) { - const bool debug = false; - const bool add_symbols = oat_writer->DidAddSymbols(); + constexpr bool debug = false; const OatHeader& oat_header = oat_writer->GetOatHeader(); Elf32_Word oat_data_size = oat_header.GetExecutableOffset(); uint32_t oat_exec_size = oat_writer->GetSize() - oat_data_size; ElfBuilder builder(oat_writer, elf_file_, compiler_driver_->GetInstructionSet(), 0, - oat_data_size, oat_data_size, oat_exec_size, add_symbols, debug); + oat_data_size, oat_data_size, oat_exec_size, + compiler_driver_->GetCompilerOptions().GetIncludeDebugSymbols(), + debug); - if (add_symbols) { - AddDebugSymbols(builder, oat_writer, debug); - } - - bool generateDebugInformation = compiler_driver_->GetCallFrameInformation() != nullptr; - if (generateDebugInformation) { - ElfRawSectionBuilder debug_info(".debug_info", SHT_PROGBITS, 0, nullptr, 0, 1, 0); - ElfRawSectionBuilder debug_abbrev(".debug_abbrev", SHT_PROGBITS, 0, nullptr, 0, 1, 0); - ElfRawSectionBuilder debug_str(".debug_str", SHT_PROGBITS, 0, nullptr, 0, 1, 0); - ElfRawSectionBuilder eh_frame(".eh_frame", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, 4, 0); - eh_frame.SetBuffer(*compiler_driver_->GetCallFrameInformation()); - - FillInCFIInformation(oat_writer, debug_info.GetBuffer(), - debug_abbrev.GetBuffer(), debug_str.GetBuffer()); - builder.RegisterRawSection(debug_info); - builder.RegisterRawSection(debug_abbrev); - builder.RegisterRawSection(eh_frame); - builder.RegisterRawSection(debug_str); + if (compiler_driver_->GetCompilerOptions().GetIncludeDebugSymbols()) { + WriteDebugSymbols(builder, oat_writer); } if (compiler_driver_->GetCompilerOptions().GetIncludePatchInformation()) { @@ -865,32 +979,62 @@ bool ElfWriterQuick::Write(OatWriter* oat_writer, return builder.Write(); } -void ElfWriterQuick::AddDebugSymbols(ElfBuilder& builder, OatWriter* oat_writer, bool debug) { +void ElfWriterQuick::WriteDebugSymbols(ElfBuilder& builder, OatWriter* oat_writer) { + std::unique_ptr<std::vector<uint8_t>> cfi_info( + ConstructCIEFrame(compiler_driver_->GetInstructionSet())); + + // Iterate over the compiled methods. const std::vector<OatWriter::DebugInfo>& method_info = oat_writer->GetCFIMethodInfo(); ElfSymtabBuilder* symtab = &builder.symtab_builder_; for (auto it = method_info.begin(); it != method_info.end(); ++it) { symtab->AddSymbol(it->method_name_, &builder.text_builder_, it->low_pc_, true, it->high_pc_ - it->low_pc_, STB_GLOBAL, STT_FUNC); - } -} -static void UpdateWord(std::vector<uint8_t>*buf, int offset, int data) { - (*buf)[offset+0] = data; - (*buf)[offset+1] = data >> 8; - (*buf)[offset+2] = data >> 16; - (*buf)[offset+3] = data >> 24; -} + // Include CFI for compiled method, if possible. + if (cfi_info.get() != nullptr) { + DCHECK(it->compiled_method_ != nullptr); + + // Copy in the FDE, if present + const std::vector<uint8_t>* fde = it->compiled_method_->GetCFIInfo(); + if (fde != nullptr) { + // Copy the information into cfi_info and then fix the address in the new copy. + int cur_offset = cfi_info->size(); + cfi_info->insert(cfi_info->end(), fde->begin(), fde->end()); + + // Set the 'CIE_pointer' field to cur_offset+4. + uint32_t CIE_pointer = cur_offset + 4; + uint32_t offset_to_update = cur_offset + sizeof(uint32_t); + (*cfi_info)[offset_to_update+0] = CIE_pointer; + (*cfi_info)[offset_to_update+1] = CIE_pointer >> 8; + (*cfi_info)[offset_to_update+2] = CIE_pointer >> 16; + (*cfi_info)[offset_to_update+3] = CIE_pointer >> 24; + + // Set the 'initial_location' field to address the start of the method. + offset_to_update = cur_offset + 2*sizeof(uint32_t); + const uint32_t quick_code_start = it->low_pc_; + (*cfi_info)[offset_to_update+0] = quick_code_start; + (*cfi_info)[offset_to_update+1] = quick_code_start >> 8; + (*cfi_info)[offset_to_update+2] = quick_code_start >> 16; + (*cfi_info)[offset_to_update+3] = quick_code_start >> 24; + } + } + } -static void PushWord(std::vector<uint8_t>*buf, int data) { - buf->push_back(data & 0xff); - buf->push_back((data >> 8) & 0xff); - buf->push_back((data >> 16) & 0xff); - buf->push_back((data >> 24) & 0xff); -} + if (cfi_info.get() != nullptr) { + // Now lay down the Elf sections. + ElfRawSectionBuilder debug_info(".debug_info", SHT_PROGBITS, 0, nullptr, 0, 1, 0); + ElfRawSectionBuilder debug_abbrev(".debug_abbrev", SHT_PROGBITS, 0, nullptr, 0, 1, 0); + ElfRawSectionBuilder debug_str(".debug_str", SHT_PROGBITS, 0, nullptr, 0, 1, 0); + ElfRawSectionBuilder eh_frame(".eh_frame", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, 4, 0); + eh_frame.SetBuffer(std::move(*cfi_info.get())); -static void PushHalf(std::vector<uint8_t>*buf, int data) { - buf->push_back(data & 0xff); - buf->push_back((data >> 8) & 0xff); + FillInCFIInformation(oat_writer, debug_info.GetBuffer(), debug_abbrev.GetBuffer(), + debug_str.GetBuffer()); + builder.RegisterRawSection(debug_info); + builder.RegisterRawSection(debug_abbrev); + builder.RegisterRawSection(eh_frame); + builder.RegisterRawSection(debug_str); + } } void ElfWriterQuick::FillInCFIInformation(OatWriter* oat_writer, diff --git a/compiler/elf_writer_quick.h b/compiler/elf_writer_quick.h index a0d36df471..8cfe550495 100644 --- a/compiler/elf_writer_quick.h +++ b/compiler/elf_writer_quick.h @@ -48,9 +48,7 @@ class ElfWriterQuick FINAL : public ElfWriter { ~ElfWriterQuick() {} class ElfBuilder; - void AddDebugSymbols(ElfBuilder& builder, - OatWriter* oat_writer, - bool debug); + void WriteDebugSymbols(ElfBuilder& builder, OatWriter* oat_writer); void ReservePatchSpace(std::vector<uint8_t>* buffer, bool debug); class ElfSectionBuilder { @@ -126,7 +124,7 @@ class ElfWriterQuick FINAL : public ElfWriter { : ElfSectionBuilder(sec_name, type, flags, link, info, align, entsize) {} ~ElfRawSectionBuilder() {} std::vector<uint8_t>* GetBuffer() { return &buf_; } - void SetBuffer(std::vector<uint8_t> buf) { buf_ = buf; } + void SetBuffer(std::vector<uint8_t>&& buf) { buf_ = buf; } protected: std::vector<uint8_t> buf_; diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc index 9da59ab7ff..1ba5d3218e 100644 --- a/compiler/oat_writer.cc +++ b/compiler/oat_writer.cc @@ -357,7 +357,6 @@ class OatWriter::InitCodeMethodVisitor : public OatDexMethodVisitor { uint32_t thumb_offset = compiled_method->CodeDelta(); quick_code_offset = offset_ + sizeof(OatQuickMethodHeader) + thumb_offset; - bool force_debug_capture = false; bool deduped = false; // Deduplicate code arrays. @@ -400,47 +399,22 @@ class OatWriter::InitCodeMethodVisitor : public OatDexMethodVisitor { offset_ += code_size; } - uint32_t quick_code_start = quick_code_offset - writer_->oat_header_->GetExecutableOffset(); - std::vector<uint8_t>* cfi_info = writer_->compiler_driver_->GetCallFrameInformation(); - if (cfi_info != nullptr) { - // Copy in the FDE, if present - const std::vector<uint8_t>* fde = compiled_method->GetCFIInfo(); - if (fde != nullptr) { - // Copy the information into cfi_info and then fix the address in the new copy. - int cur_offset = cfi_info->size(); - cfi_info->insert(cfi_info->end(), fde->begin(), fde->end()); - - // Set the 'CIE_pointer' field to cur_offset+4. - uint32_t CIE_pointer = cur_offset + 4; - uint32_t offset_to_update = cur_offset + sizeof(uint32_t); - (*cfi_info)[offset_to_update+0] = CIE_pointer; - (*cfi_info)[offset_to_update+1] = CIE_pointer >> 8; - (*cfi_info)[offset_to_update+2] = CIE_pointer >> 16; - (*cfi_info)[offset_to_update+3] = CIE_pointer >> 24; - - // Set the 'initial_location' field to address the start of the method. - offset_to_update = cur_offset + 2*sizeof(uint32_t); - (*cfi_info)[offset_to_update+0] = quick_code_start; - (*cfi_info)[offset_to_update+1] = quick_code_start >> 8; - (*cfi_info)[offset_to_update+2] = quick_code_start >> 16; - (*cfi_info)[offset_to_update+3] = quick_code_start >> 24; - force_debug_capture = true; - } - } + if (writer_->compiler_driver_->GetCompilerOptions().GetIncludeDebugSymbols()) { + // Record debug information for this function if we are doing that. - - if (writer_->compiler_driver_->DidIncludeDebugSymbols() || force_debug_capture) { - // Record debug information for this function if we are doing that or - // we have CFI and so need it. std::string name = PrettyMethod(it.GetMemberIndex(), *dex_file_, true); if (deduped) { - // TODO We should place the DEDUPED tag on the first instance of a - // deduplicated symbol so that it will show up in a debuggerd crash - // report. + // TODO We should place the DEDUPED tag on the first instance of a deduplicated symbol + // so that it will show up in a debuggerd crash report. name += " [ DEDUPED ]"; } - writer_->method_info_.push_back(DebugInfo(name, quick_code_start, - quick_code_start + code_size)); + + const uint32_t quick_code_start = quick_code_offset - + writer_->oat_header_->GetExecutableOffset(); + writer_->method_info_.push_back(DebugInfo(name, + quick_code_start, + quick_code_start + code_size, + compiled_method)); } } diff --git a/compiler/oat_writer.h b/compiler/oat_writer.h index 945048ecb7..ef5fd6b1c2 100644 --- a/compiler/oat_writer.h +++ b/compiler/oat_writer.h @@ -30,6 +30,7 @@ namespace art { class BitVector; +class CompiledMethod; class OutputStream; // OatHeader variable length with count of D OatDexFiles @@ -97,22 +98,21 @@ class OatWriter { ~OatWriter(); struct DebugInfo { - DebugInfo(const std::string& method_name, uint32_t low_pc, uint32_t high_pc) - : method_name_(method_name), low_pc_(low_pc), high_pc_(high_pc) { + DebugInfo(const std::string& method_name, uint32_t low_pc, uint32_t high_pc, + CompiledMethod* compiled_method) + : method_name_(method_name), low_pc_(low_pc), high_pc_(high_pc), + compiled_method_(compiled_method) { } - std::string method_name_; + std::string method_name_; // Note: this name is a pretty-printed name. uint32_t low_pc_; uint32_t high_pc_; + CompiledMethod* compiled_method_; }; const std::vector<DebugInfo>& GetCFIMethodInfo() const { return method_info_; } - bool DidAddSymbols() const { - return compiler_driver_->DidIncludeDebugSymbols(); - } - private: // The DataAccess classes are helper classes that provide access to members related to // a given map, i.e. GC map, mapping table or vmap table. By abstracting these away diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h index 1d6655c24e..2f814dfaf4 100644 --- a/compiler/utils/x86_64/assembler_x86_64.h +++ b/compiler/utils/x86_64/assembler_x86_64.h @@ -124,8 +124,8 @@ class Operand { if (index.NeedsRex()) { rex_ |= 0x42; // REX.00X0 } - encoding_[1] = (scale << 6) | (static_cast<uint8_t>(index.AsRegister()) << 3) | - static_cast<uint8_t>(base.AsRegister()); + encoding_[1] = (scale << 6) | (static_cast<uint8_t>(index.LowBits()) << 3) | + static_cast<uint8_t>(base.LowBits()); length_ = 2; } diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc index 1f4d727abb..4ed7b2015a 100644 --- a/compiler/utils/x86_64/assembler_x86_64_test.cc +++ b/compiler/utils/x86_64/assembler_x86_64_test.cc @@ -128,9 +128,18 @@ TEST_F(AssemblerX86_64Test, XorqImm) { TEST_F(AssemblerX86_64Test, Movl) { GetAssembler()->movl(x86_64::CpuRegister(x86_64::R8), x86_64::CpuRegister(x86_64::R11)); GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::CpuRegister(x86_64::R11)); + GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::Address( + x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12)); + GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::Address( + x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12)); + GetAssembler()->movl(x86_64::CpuRegister(x86_64::R8), x86_64::Address( + x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12)); const char* expected = "movl %R11d, %R8d\n" - "movl %R11d, %EAX\n"; + "movl %R11d, %EAX\n" + "movl 0xc(%RDI,%RBX,4), %EAX\n" + "movl 0xc(%RDI,%R9,4), %EAX\n" + "movl 0xc(%RDI,%R9,4), %R8d\n"; DriverStr(expected, "movl"); } diff --git a/runtime/Android.mk b/runtime/Android.mk index 8fc5e343e7..302e835e71 100644 --- a/runtime/Android.mk +++ b/runtime/Android.mk @@ -252,6 +252,7 @@ LIBART_SRC_FILES_x86_64 := \ arch/x86_64/context_x86_64.cc \ arch/x86_64/entrypoints_init_x86_64.cc \ arch/x86_64/jni_entrypoints_x86_64.S \ + arch/x86_64/memcmp16_x86_64.S \ arch/x86_64/portable_entrypoints_x86_64.S \ arch/x86_64/quick_entrypoints_x86_64.S \ arch/x86_64/thread_x86_64.cc \ diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S index 4939610e60..86cb16aab5 100644 --- a/runtime/arch/arm/quick_entrypoints_arm.S +++ b/runtime/arch/arm/quick_entrypoints_arm.S @@ -365,8 +365,9 @@ END art_quick_invoke_stub ARM_ENTRY art_quick_do_long_jump vldm r1, {s0-s31} @ load all fprs from argument fprs_ ldr r2, [r0, #60] @ r2 = r15 (PC from gprs_ 60=4*15) + ldr r14, [r0, #56] @ (LR from gprs_ 56=4*14) add r0, r0, #12 @ increment r0 to skip gprs_[0..2] 12=4*3 - ldm r0, {r3-r14} @ load remaining gprs from argument gprs_ + ldm r0, {r3-r13} @ load remaining gprs from argument gprs_ mov r0, #0 @ clear result registers r0 and r1 mov r1, #0 bx r2 @ do long jump diff --git a/runtime/arch/memcmp16.h b/runtime/arch/memcmp16.h index 65d2f92d74..14dc1e3880 100644 --- a/runtime/arch/memcmp16.h +++ b/runtime/arch/memcmp16.h @@ -30,7 +30,7 @@ // // In both cases, MemCmp16 is declared. -#if defined(__aarch64__) || defined(__arm__) || defined(__mips) || defined(__i386__) +#if defined(__aarch64__) || defined(__arm__) || defined(__mips) || defined(__i386__) || defined(__x86_64__) extern "C" uint32_t __memcmp16(const uint16_t* s0, const uint16_t* s1, size_t count); #define MemCmp16 __memcmp16 diff --git a/runtime/arch/x86/memcmp16_x86.S b/runtime/arch/x86/memcmp16_x86.S index 17662fae6b..a315a378ea 100644 --- a/runtime/arch/x86/memcmp16_x86.S +++ b/runtime/arch/x86/memcmp16_x86.S @@ -21,1018 +21,1018 @@ /* int32_t memcmp16_compare(const uint16_t* s0, const uint16_t* s1, size_t count); */ #ifndef L -# define L(label) .L##label +# define L(label) .L##label #endif -#define CFI_PUSH(REG) \ - CFI_ADJUST_CFA_OFFSET(4); \ - CFI_REL_OFFSET(REG, 0) +#define CFI_PUSH(REG) \ + CFI_ADJUST_CFA_OFFSET(4); \ + CFI_REL_OFFSET(REG, 0) -#define CFI_POP(REG) \ - CFI_ADJUST_CFA_OFFSET(-4); \ - CFI_RESTORE(REG) +#define CFI_POP(REG) \ + CFI_ADJUST_CFA_OFFSET(-4); \ + CFI_RESTORE(REG) -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) -#define PARMS 4 -#define BLK1 PARMS -#define BLK2 BLK1+4 -#define LEN BLK2+4 -#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret -#define RETURN RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE +#define PARMS 4 +#define BLK1 PARMS +#define BLK2 BLK1+4 +#define LEN BLK2+4 +#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret +#define RETURN RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE DEFINE_FUNCTION MEMCMP - movl LEN(%esp), %ecx + movl LEN(%esp), %ecx - shl $1, %ecx - jz L(zero) + shl $1, %ecx + jz L(zero) - movl BLK1(%esp), %eax - cmp $48, %ecx - movl BLK2(%esp), %edx - jae L(48bytesormore) + movl BLK1(%esp), %eax + cmp $48, %ecx + movl BLK2(%esp), %edx + jae L(48bytesormore) - PUSH (%ebx) - add %ecx, %edx - add %ecx, %eax - jmp L(less48bytes) + PUSH (%ebx) + add %ecx, %edx + add %ecx, %eax + jmp L(less48bytes) - CFI_POP (%ebx) + CFI_POP (%ebx) - .p2align 4 + .p2align 4 L(zero): - xor %eax, %eax - ret + xor %eax, %eax + ret - .p2align 4 + .p2align 4 L(48bytesormore): - PUSH (%ebx) - PUSH (%esi) - PUSH (%edi) - CFI_REMEMBER_STATE - movdqu (%eax), %xmm3 - movdqu (%edx), %xmm0 - movl %eax, %edi - movl %edx, %esi - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %edx - lea 16(%edi), %edi - - sub $0xffff, %edx - lea 16(%esi), %esi - jnz L(less16bytes) - mov %edi, %edx - and $0xf, %edx - xor %edx, %edi - sub %edx, %esi - add %edx, %ecx - mov %esi, %edx - and $0xf, %edx - jz L(shr_0) - xor %edx, %esi - - cmp $0, %edx - je L(shr_0) - cmp $2, %edx - je L(shr_2) - cmp $4, %edx - je L(shr_4) - cmp $6, %edx - je L(shr_6) - cmp $8, %edx - je L(shr_8) - cmp $10, %edx - je L(shr_10) - cmp $12, %edx - je L(shr_12) - jmp L(shr_14) - - .p2align 4 + PUSH (%ebx) + PUSH (%esi) + PUSH (%edi) + CFI_REMEMBER_STATE + movdqu (%eax), %xmm3 + movdqu (%edx), %xmm0 + movl %eax, %edi + movl %edx, %esi + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + lea 16(%edi), %edi + + sub $0xffff, %edx + lea 16(%esi), %esi + jnz L(less16bytes) + mov %edi, %edx + and $0xf, %edx + xor %edx, %edi + sub %edx, %esi + add %edx, %ecx + mov %esi, %edx + and $0xf, %edx + jz L(shr_0) + xor %edx, %esi + + cmp $0, %edx + je L(shr_0) + cmp $2, %edx + je L(shr_2) + cmp $4, %edx + je L(shr_4) + cmp $6, %edx + je L(shr_6) + cmp $8, %edx + je L(shr_8) + cmp $10, %edx + je L(shr_10) + cmp $12, %edx + je L(shr_12) + jmp L(shr_14) + + .p2align 4 L(shr_0): - cmp $80, %ecx - jae L(shr_0_gobble) - lea -48(%ecx), %ecx - xor %eax, %eax - movaps (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - movaps 16(%esi), %xmm2 - pcmpeqb 16(%edi), %xmm2 - pand %xmm1, %xmm2 - pmovmskb %xmm2, %edx - add $32, %edi - add $32, %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea (%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + jae L(shr_0_gobble) + lea -48(%ecx), %ecx + xor %eax, %eax + movaps (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + movaps 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 + pand %xmm1, %xmm2 + pmovmskb %xmm2, %edx + add $32, %edi + add $32, %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_0_gobble): - lea -48(%ecx), %ecx - movdqa (%esi), %xmm0 - xor %eax, %eax - pcmpeqb (%edi), %xmm0 - sub $32, %ecx - movdqa 16(%esi), %xmm2 - pcmpeqb 16(%edi), %xmm2 + lea -48(%ecx), %ecx + movdqa (%esi), %xmm0 + xor %eax, %eax + pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 L(shr_0_gobble_loop): - pand %xmm0, %xmm2 - sub $32, %ecx - pmovmskb %xmm2, %edx - movdqa %xmm0, %xmm1 - movdqa 32(%esi), %xmm0 - movdqa 48(%esi), %xmm2 - sbb $0xffff, %edx - pcmpeqb 32(%edi), %xmm0 - pcmpeqb 48(%edi), %xmm2 - lea 32(%edi), %edi - lea 32(%esi), %esi - jz L(shr_0_gobble_loop) - - pand %xmm0, %xmm2 - cmp $0, %ecx - jge L(shr_0_gobble_loop_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm2 + sub $32, %ecx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%esi), %xmm0 + movdqa 48(%esi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%edi), %xmm0 + pcmpeqb 48(%edi), %xmm2 + lea 32(%edi), %edi + lea 32(%esi), %esi + jz L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %ecx + jge L(shr_0_gobble_loop_next) + inc %edx + add $32, %ecx L(shr_0_gobble_loop_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm2, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea (%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_2): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_2_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $2,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $2,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 2(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $2,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_2_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $2,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $2,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 - movdqa 32(%esi), %xmm3 - palignr $2,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 + movdqa 32(%esi), %xmm3 + palignr $2,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 L(shr_2_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $2,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $2,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_2_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_2_gobble_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $2,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $2,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_2_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_2_gobble_next) + inc %edx + add $32, %ecx L(shr_2_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 2(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_4): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_4_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $4,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $4,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 4(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $4,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_4_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $4,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $4,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 - movdqa 32(%esi), %xmm3 - palignr $4,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 + movdqa 32(%esi), %xmm3 + palignr $4,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 L(shr_4_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $4,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $4,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_4_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_4_gobble_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $4,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $4,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_4_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_4_gobble_next) + inc %edx + add $32, %ecx L(shr_4_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 4(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_6): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_6_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $6,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $6,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 6(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $6,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_6_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $6,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $6,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 - movdqa 32(%esi), %xmm3 - palignr $6,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 + movdqa 32(%esi), %xmm3 + palignr $6,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 L(shr_6_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $6,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $6,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_6_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_6_gobble_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $6,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $6,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_6_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_6_gobble_next) + inc %edx + add $32, %ecx L(shr_6_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 6(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_8): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_8_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $8,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $8,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 8(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $8,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_8_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $8,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $8,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 - movdqa 32(%esi), %xmm3 - palignr $8,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 + movdqa 32(%esi), %xmm3 + palignr $8,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 L(shr_8_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $8,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $8,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_8_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_8_gobble_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $8,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $8,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_8_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_8_gobble_next) + inc %edx + add $32, %ecx L(shr_8_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 8(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_10): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_10_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $10, (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $10,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 10(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $10,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_10_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $10, (%esi), %xmm0 - pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $10, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 - movdqa 32(%esi), %xmm3 - palignr $10, 16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 + movdqa 32(%esi), %xmm3 + palignr $10, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 L(shr_10_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $10,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $10,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_10_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_10_gobble_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $10,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $10,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_10_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_10_gobble_next) + inc %edx + add $32, %ecx L(shr_10_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 10(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_12): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_12_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $12, (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $12, %xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 12(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $12, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_12_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $12, (%esi), %xmm0 - pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $12, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 - movdqa 32(%esi), %xmm3 - palignr $12, 16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 + movdqa 32(%esi), %xmm3 + palignr $12, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 L(shr_12_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $12,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $12,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_12_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_12_gobble_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $12,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $12,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_12_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_12_gobble_next) + inc %edx + add $32, %ecx L(shr_12_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 12(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_14): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_14_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $14, (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $14, %xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 14(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_14_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $14, (%esi), %xmm0 - pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $14, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 - movdqa 32(%esi), %xmm3 - palignr $14, 16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 + movdqa 32(%esi), %xmm3 + palignr $14, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 L(shr_14_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $14,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $14,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_14_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_14_gobble_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $14,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $14,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_14_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_14_gobble_next) + inc %edx + add $32, %ecx L(shr_14_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 14(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(exit): - pmovmskb %xmm1, %ebx - sub $0xffff, %ebx - jz L(first16bytes) - lea -16(%esi), %esi - lea -16(%edi), %edi - mov %ebx, %edx + pmovmskb %xmm1, %ebx + sub $0xffff, %ebx + jz L(first16bytes) + lea -16(%esi), %esi + lea -16(%edi), %edi + mov %ebx, %edx L(first16bytes): - add %eax, %esi + add %eax, %esi L(less16bytes): - test %dl, %dl - jz L(next_four_words) - test $15, %dl - jz L(second_two_words) - test $3, %dl - jz L(second_word) - movzwl -16(%edi), %eax - movzwl -16(%esi), %ebx - subl %ebx, %eax - RETURN - - .p2align 4 + test %dl, %dl + jz L(next_four_words) + test $15, %dl + jz L(second_two_words) + test $3, %dl + jz L(second_word) + movzwl -16(%edi), %eax + movzwl -16(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 L(second_word): - movzwl -14(%edi), %eax - movzwl -14(%esi), %ebx - subl %ebx, %eax - RETURN + movzwl -14(%edi), %eax + movzwl -14(%esi), %ebx + subl %ebx, %eax + RETURN - .p2align 4 + .p2align 4 L(second_two_words): - test $63, %dl - jz L(fourth_word) - movzwl -12(%edi), %eax - movzwl -12(%esi), %ebx - subl %ebx, %eax - RETURN - - .p2align 4 + test $63, %dl + jz L(fourth_word) + movzwl -12(%edi), %eax + movzwl -12(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 L(fourth_word): - movzwl -10(%edi), %eax - movzwl -10(%esi), %ebx - subl %ebx, %eax - RETURN + movzwl -10(%edi), %eax + movzwl -10(%esi), %ebx + subl %ebx, %eax + RETURN - .p2align 4 + .p2align 4 L(next_four_words): - test $15, %dh - jz L(fourth_two_words) - test $3, %dh - jz L(sixth_word) - movzwl -8(%edi), %eax - movzwl -8(%esi), %ebx - subl %ebx, %eax - RETURN - - .p2align 4 + test $15, %dh + jz L(fourth_two_words) + test $3, %dh + jz L(sixth_word) + movzwl -8(%edi), %eax + movzwl -8(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 L(sixth_word): - movzwl -6(%edi), %eax - movzwl -6(%esi), %ebx - subl %ebx, %eax - RETURN + movzwl -6(%edi), %eax + movzwl -6(%esi), %ebx + subl %ebx, %eax + RETURN - .p2align 4 + .p2align 4 L(fourth_two_words): - test $63, %dh - jz L(eighth_word) - movzwl -4(%edi), %eax - movzwl -4(%esi), %ebx - subl %ebx, %eax - RETURN - - .p2align 4 + test $63, %dh + jz L(eighth_word) + movzwl -4(%edi), %eax + movzwl -4(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 L(eighth_word): - movzwl -2(%edi), %eax - movzwl -2(%esi), %ebx - subl %ebx, %eax - RETURN + movzwl -2(%edi), %eax + movzwl -2(%esi), %ebx + subl %ebx, %eax + RETURN - CFI_PUSH (%ebx) + CFI_PUSH (%ebx) - .p2align 4 + .p2align 4 L(more8bytes): - cmp $16, %ecx - jae L(more16bytes) - cmp $8, %ecx - je L(8bytes) - cmp $10, %ecx - je L(10bytes) - cmp $12, %ecx - je L(12bytes) - jmp L(14bytes) - - .p2align 4 + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) + cmp $10, %ecx + je L(10bytes) + cmp $12, %ecx + je L(12bytes) + jmp L(14bytes) + + .p2align 4 L(more16bytes): - cmp $24, %ecx - jae L(more24bytes) - cmp $16, %ecx - je L(16bytes) - cmp $18, %ecx - je L(18bytes) - cmp $20, %ecx - je L(20bytes) - jmp L(22bytes) - - .p2align 4 + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) + cmp $18, %ecx + je L(18bytes) + cmp $20, %ecx + je L(20bytes) + jmp L(22bytes) + + .p2align 4 L(more24bytes): - cmp $32, %ecx - jae L(more32bytes) - cmp $24, %ecx - je L(24bytes) - cmp $26, %ecx - je L(26bytes) - cmp $28, %ecx - je L(28bytes) - jmp L(30bytes) - - .p2align 4 + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) + cmp $26, %ecx + je L(26bytes) + cmp $28, %ecx + je L(28bytes) + jmp L(30bytes) + + .p2align 4 L(more32bytes): - cmp $40, %ecx - jae L(more40bytes) - cmp $32, %ecx - je L(32bytes) - cmp $34, %ecx - je L(34bytes) - cmp $36, %ecx - je L(36bytes) - jmp L(38bytes) - - .p2align 4 + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) + cmp $34, %ecx + je L(34bytes) + cmp $36, %ecx + je L(36bytes) + jmp L(38bytes) + + .p2align 4 L(less48bytes): - cmp $8, %ecx - jae L(more8bytes) - cmp $2, %ecx - je L(2bytes) - cmp $4, %ecx - je L(4bytes) - jmp L(6bytes) - - .p2align 4 + cmp $8, %ecx + jae L(more8bytes) + cmp $2, %ecx + je L(2bytes) + cmp $4, %ecx + je L(4bytes) + jmp L(6bytes) + + .p2align 4 L(more40bytes): - cmp $40, %ecx - je L(40bytes) - cmp $42, %ecx - je L(42bytes) - cmp $44, %ecx - je L(44bytes) - jmp L(46bytes) - - .p2align 4 + cmp $40, %ecx + je L(40bytes) + cmp $42, %ecx + je L(42bytes) + cmp $44, %ecx + je L(44bytes) + jmp L(46bytes) + + .p2align 4 L(46bytes): - movzwl -46(%eax), %ecx - movzwl -46(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -46(%eax), %ecx + movzwl -46(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(44bytes): - movzwl -44(%eax), %ecx - movzwl -44(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -44(%eax), %ecx + movzwl -44(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(42bytes): - movzwl -42(%eax), %ecx - movzwl -42(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -42(%eax), %ecx + movzwl -42(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(40bytes): - movzwl -40(%eax), %ecx - movzwl -40(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -40(%eax), %ecx + movzwl -40(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(38bytes): - movzwl -38(%eax), %ecx - movzwl -38(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -38(%eax), %ecx + movzwl -38(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(36bytes): - movzwl -36(%eax), %ecx - movzwl -36(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -36(%eax), %ecx + movzwl -36(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(34bytes): - movzwl -34(%eax), %ecx - movzwl -34(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -34(%eax), %ecx + movzwl -34(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(32bytes): - movzwl -32(%eax), %ecx - movzwl -32(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -32(%eax), %ecx + movzwl -32(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(30bytes): - movzwl -30(%eax), %ecx - movzwl -30(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -30(%eax), %ecx + movzwl -30(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(28bytes): - movzwl -28(%eax), %ecx - movzwl -28(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -28(%eax), %ecx + movzwl -28(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(26bytes): - movzwl -26(%eax), %ecx - movzwl -26(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -26(%eax), %ecx + movzwl -26(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(24bytes): - movzwl -24(%eax), %ecx - movzwl -24(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -24(%eax), %ecx + movzwl -24(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(22bytes): - movzwl -22(%eax), %ecx - movzwl -22(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -22(%eax), %ecx + movzwl -22(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(20bytes): - movzwl -20(%eax), %ecx - movzwl -20(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -20(%eax), %ecx + movzwl -20(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(18bytes): - movzwl -18(%eax), %ecx - movzwl -18(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -18(%eax), %ecx + movzwl -18(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(16bytes): - movzwl -16(%eax), %ecx - movzwl -16(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -16(%eax), %ecx + movzwl -16(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(14bytes): - movzwl -14(%eax), %ecx - movzwl -14(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -14(%eax), %ecx + movzwl -14(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(12bytes): - movzwl -12(%eax), %ecx - movzwl -12(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -12(%eax), %ecx + movzwl -12(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(10bytes): - movzwl -10(%eax), %ecx - movzwl -10(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -10(%eax), %ecx + movzwl -10(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(8bytes): - movzwl -8(%eax), %ecx - movzwl -8(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -8(%eax), %ecx + movzwl -8(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(6bytes): - movzwl -6(%eax), %ecx - movzwl -6(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -6(%eax), %ecx + movzwl -6(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(4bytes): - movzwl -4(%eax), %ecx - movzwl -4(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -4(%eax), %ecx + movzwl -4(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(2bytes): - movzwl -2(%eax), %eax - movzwl -2(%edx), %ebx - subl %ebx, %eax - POP (%ebx) - ret - CFI_PUSH (%ebx) - - .p2align 4 + movzwl -2(%eax), %eax + movzwl -2(%edx), %ebx + subl %ebx, %eax + POP (%ebx) + ret + CFI_PUSH (%ebx) + + .p2align 4 L(memcmp16_exit): - POP (%ebx) - mov %ecx, %eax - ret + POP (%ebx) + mov %ecx, %eax + ret END_FUNCTION MEMCMP diff --git a/runtime/arch/x86_64/memcmp16_x86_64.S b/runtime/arch/x86_64/memcmp16_x86_64.S new file mode 100755 index 0000000000..46e4ba36cf --- /dev/null +++ b/runtime/arch/x86_64/memcmp16_x86_64.S @@ -0,0 +1,1210 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "asm_support_x86_64.S" + +#define MEMCMP __memcmp16 + +/* + * Half of Silvermont L1 Data Cache size + *(see original file cache.h in bionic/libc/arch-x86_64/). + * This value is used for specific optimization on big lengths. + */ +#define DATA_CACHE_SIZE_HALF (12*1024) + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef ALIGN +# define ALIGN(n) .p2align n +#endif + +#define JMPTBL(I, B) (I - B) + +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), %rcx; \ + add %r11, %rcx; \ + jmp *%rcx; \ + ud2 + +DEFINE_FUNCTION MEMCMP + pxor %xmm0, %xmm0 + shl $1, %rdx + cmp $79, %rdx + ja L(79bytesormore) + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + + ALIGN (4) +L(79bytesormore): + movdqu (%rsi), %xmm1 + movdqu (%rdi), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + mov %rsi, %rcx + and $-16, %rsi + add $16, %rsi + sub %rsi, %rcx + + sub %rcx, %rdi + add %rcx, %rdx + test $0xf, %rdi + jz L(2aligned) + + cmp $128, %rdx + ja L(128bytesormore) +L(less128bytes): + sub $64, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + +L(128bytesormore): + cmp $512, %rdx + ja L(512bytesormore) + cmp $256, %rdx + ja L(less512bytes) +L(less256bytes): + sub $128, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin128) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + +L(less512bytes): + sub $256, %rdx + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqu 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqu 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqu 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqu 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqu 192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqu 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqu 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqu 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytes) + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin256) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + + ALIGN (4) +L(512bytesormore): +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %r8 +#else + mov __x86_64_data_cache_size_half(%rip), %r8 +#endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_unaglined) + sub $64, %rdx + ALIGN (4) +L(64bytesormore_loop): + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + +L(L2_L3_cache_unaglined): + sub $64, %rdx + ALIGN (4) +L(L2_L3_unaligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_unaligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + +/* + * This case is for machines which are sensitive for unaligned instructions. + */ + ALIGN (4) +L(2aligned): + cmp $128, %rdx + ja L(128bytesormorein2aligned) +L(less128bytesin2aligned): + sub $64, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64in2alinged) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64in2alinged): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + + ALIGN (4) +L(128bytesormorein2aligned): + cmp $512, %rdx + ja L(512bytesormorein2aligned) + cmp $256, %rdx + ja L(256bytesormorein2aligned) +L(less256bytesin2alinged): + sub $128, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin128in2aligned) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128in2aligned): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + + ALIGN (4) +L(256bytesormorein2aligned): + + sub $256, %rdx + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqa 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqa 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqa 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqa 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqa 192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqa 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqa 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqa 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytesin2alinged) + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin256in2alinged) + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256in2alinged): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + + ALIGN (4) +L(512bytesormorein2aligned): +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %r8 +#else + mov __x86_64_data_cache_size_half(%rip), %r8 +#endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_aglined) + + sub $64, %rdx + ALIGN (4) +L(64bytesormore_loopin2aligned): + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loopin2aligned) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) +L(L2_L3_cache_aglined): + sub $64, %rdx + ALIGN (4) +L(L2_L3_aligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_aligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + + + ALIGN (4) +L(64bytesormore_loop_end): + add $16, %rdi + add $16, %rsi + ptest %xmm2, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm3, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm4, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + jmp L(16bytes) + +L(256bytesin256): + add $256, %rdi + add $256, %rsi + jmp L(16bytes) +L(240bytesin256): + add $240, %rdi + add $240, %rsi + jmp L(16bytes) +L(224bytesin256): + add $224, %rdi + add $224, %rsi + jmp L(16bytes) +L(208bytesin256): + add $208, %rdi + add $208, %rsi + jmp L(16bytes) +L(192bytesin256): + add $192, %rdi + add $192, %rsi + jmp L(16bytes) +L(176bytesin256): + add $176, %rdi + add $176, %rsi + jmp L(16bytes) +L(160bytesin256): + add $160, %rdi + add $160, %rsi + jmp L(16bytes) +L(144bytesin256): + add $144, %rdi + add $144, %rsi + jmp L(16bytes) +L(128bytesin256): + add $128, %rdi + add $128, %rsi + jmp L(16bytes) +L(112bytesin256): + add $112, %rdi + add $112, %rsi + jmp L(16bytes) +L(96bytesin256): + add $96, %rdi + add $96, %rsi + jmp L(16bytes) +L(80bytesin256): + add $80, %rdi + add $80, %rsi + jmp L(16bytes) +L(64bytesin256): + add $64, %rdi + add $64, %rsi + jmp L(16bytes) +L(48bytesin256): + add $16, %rdi + add $16, %rsi +L(32bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytes): + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(8bytes): + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(12bytes): + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(4bytes): + mov -4(%rsi), %ecx + mov -4(%rdi), %eax + cmp %eax, %ecx + jne L(diffin4bytes) +L(0bytes): + xor %eax, %eax + ret + + ALIGN (4) +L(66bytes): + movdqu -66(%rdi), %xmm1 + movdqu -66(%rsi), %xmm2 + mov $-66, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(50bytes): + movdqu -50(%rdi), %xmm1 + movdqu -50(%rsi), %xmm2 + mov $-50, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + movdqu -34(%rdi), %xmm1 + movdqu -34(%rsi), %xmm2 + mov $-34, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%rdi), %rax + mov -18(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(10bytes): + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(14bytes): + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(6bytes): + mov -6(%rdi), %eax + mov -6(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) +L(2bytes): + movzwl -2(%rsi), %ecx + movzwl -2(%rdi), %eax + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(68bytes): + movdqu -68(%rdi), %xmm2 + movdqu -68(%rsi), %xmm1 + mov $-68, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(52bytes): + movdqu -52(%rdi), %xmm2 + movdqu -52(%rsi), %xmm1 + mov $-52, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%rdi), %xmm2 + movdqu -36(%rsi), %xmm1 + mov $-36, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%rdi), %xmm2 + movdqu -20(%rsi), %xmm1 + mov $-20, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(70bytes): + movdqu -70(%rsi), %xmm1 + movdqu -70(%rdi), %xmm2 + mov $-70, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(54bytes): + movdqu -54(%rsi), %xmm1 + movdqu -54(%rdi), %xmm2 + mov $-54, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + movdqu -38(%rsi), %xmm1 + movdqu -38(%rdi), %xmm2 + mov $-38, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(22bytes): + movdqu -22(%rsi), %xmm1 + movdqu -22(%rdi), %xmm2 + mov $-22, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(72bytes): + movdqu -72(%rsi), %xmm1 + movdqu -72(%rdi), %xmm2 + mov $-72, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(56bytes): + movdqu -56(%rdi), %xmm2 + movdqu -56(%rsi), %xmm1 + mov $-56, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + movdqu -40(%rdi), %xmm2 + movdqu -40(%rsi), %xmm1 + mov $-40, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + movdqu -24(%rdi), %xmm2 + movdqu -24(%rsi), %xmm1 + mov $-24, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(74bytes): + movdqu -74(%rsi), %xmm1 + movdqu -74(%rdi), %xmm2 + mov $-74, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(58bytes): + movdqu -58(%rdi), %xmm2 + movdqu -58(%rsi), %xmm1 + mov $-58, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + movdqu -42(%rdi), %xmm2 + movdqu -42(%rsi), %xmm1 + mov $-42, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + movdqu -26(%rdi), %xmm2 + movdqu -26(%rsi), %xmm1 + mov $-26, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + jmp L(end) + + ALIGN (4) +L(76bytes): + movdqu -76(%rsi), %xmm1 + movdqu -76(%rdi), %xmm2 + mov $-76, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(60bytes): + movdqu -60(%rdi), %xmm2 + movdqu -60(%rsi), %xmm1 + mov $-60, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + movdqu -44(%rdi), %xmm2 + movdqu -44(%rsi), %xmm1 + mov $-44, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + movdqu -28(%rdi), %xmm2 + movdqu -28(%rsi), %xmm1 + mov $-28, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(78bytes): + movdqu -78(%rsi), %xmm1 + movdqu -78(%rdi), %xmm2 + mov $-78, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(62bytes): + movdqu -62(%rdi), %xmm2 + movdqu -62(%rsi), %xmm1 + mov $-62, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + movdqu -46(%rdi), %xmm2 + movdqu -46(%rsi), %xmm1 + mov $-46, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + movdqu -30(%rdi), %xmm2 + movdqu -30(%rsi), %xmm1 + mov $-30, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(64bytes): + movdqu -64(%rdi), %xmm2 + movdqu -64(%rsi), %xmm1 + mov $-64, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%rdi), %xmm2 + movdqu -48(%rsi), %xmm1 + mov $-48, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%rdi), %xmm2 + movdqu -32(%rsi), %xmm1 + mov $-32, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + +/* + * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. + */ + ALIGN (3) +L(less16bytes): + movsbq %dl, %rdx + mov (%rsi, %rdx), %rcx + mov (%rdi, %rdx), %rax + cmp %rax, %rcx + jne L(diffin8bytes) + mov 8(%rsi, %rdx), %rcx + mov 8(%rdi, %rdx), %rax +L(diffin8bytes): + cmp %eax, %ecx + jne L(diffin4bytes) + shr $32, %rcx + shr $32, %rax +L(diffin4bytes): + cmp %cx, %ax + jne L(end) + shr $16, %ecx + shr $16, %eax + jmp L(end) + + ALIGN (4) +L(end): + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + +END_FUNCTION MEMCMP + + ALIGN (3) +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(10bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(54bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(62bytes), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) + .int JMPTBL (L(66bytes), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(70bytes), L(table_64bytes)) + .int JMPTBL (L(72bytes), L(table_64bytes)) + .int JMPTBL (L(74bytes), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(78bytes), L(table_64bytes)) diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc index c1e3b257f7..f0b1b9578f 100644 --- a/runtime/class_linker.cc +++ b/runtime/class_linker.cc @@ -3516,14 +3516,19 @@ mirror::ArtMethod* ClassLinker::CreateProxyConstructor(Thread* self, proxy_class->GetDirectMethods(); CHECK_EQ(proxy_direct_methods->GetLength(), 16); mirror::ArtMethod* proxy_constructor = proxy_direct_methods->Get(2); - // Clone the existing constructor of Proxy (our constructor would just invoke it so steal its - // code_ too) - mirror::ArtMethod* constructor = - down_cast<mirror::ArtMethod*>(proxy_constructor->Clone(self)); - if (constructor == NULL) { + mirror::ArtMethod* constructor = down_cast<mirror::ArtMethod*>(proxy_constructor->Clone(self)); + if (constructor == nullptr) { CHECK(self->IsExceptionPending()); // OOME. - return NULL; + return nullptr; } + // Make the proxy constructor's code always point to the uninstrumented code. This avoids + // getting a method enter event for the proxy constructor as the proxy constructor doesn't + // have an activation. + bool have_portable_code; + constructor->SetEntryPointFromQuickCompiledCode(GetQuickOatCodeFor(proxy_constructor)); + constructor->SetEntryPointFromPortableCompiledCode(GetPortableOatCodeFor(proxy_constructor, + &have_portable_code)); + // Make this constructor public and fix the class to be our Proxy version constructor->SetAccessFlags((constructor->GetAccessFlags() & ~kAccProtected) | kAccPublic); constructor->SetDeclaringClass(klass.Get()); diff --git a/runtime/common_runtime_test.cc b/runtime/common_runtime_test.cc index 8e363c4fa3..997236200c 100644 --- a/runtime/common_runtime_test.cc +++ b/runtime/common_runtime_test.cc @@ -137,7 +137,17 @@ void CommonRuntimeTest::SetEnvironmentVariables(std::string& android_data) { } // On target, Cannot use /mnt/sdcard because it is mounted noexec, so use subdir of dalvik-cache - android_data = (IsHost() ? "/tmp/art-data-XXXXXX" : "/data/dalvik-cache/art-data-XXXXXX"); + if (IsHost()) { + const char* tmpdir = getenv("TMPDIR"); + if (tmpdir != nullptr && tmpdir[0] != 0) { + android_data = tmpdir; + } else { + android_data = "/tmp"; + } + } else { + android_data = "/data/dalvik-cache"; + } + android_data += "/art-data-XXXXXX"; if (mkdtemp(&android_data[0]) == nullptr) { PLOG(FATAL) << "mkdtemp(\"" << &android_data[0] << "\") failed"; } @@ -212,7 +222,7 @@ void CommonRuntimeTest::ClearDirectory(const char* dirpath) { if ((strcmp(e->d_name, ".") == 0) || (strcmp(e->d_name, "..") == 0)) { continue; } - std::string filename(dalvik_cache_); + std::string filename(dirpath); filename.push_back('/'); filename.append(e->d_name); int stat_result = lstat(filename.c_str(), &s); @@ -265,6 +275,19 @@ std::string CommonRuntimeTest::GetDexFileName(const std::string& jar_prefix) { return StringPrintf("%s/framework/%s.jar", GetAndroidRoot(), jar_prefix.c_str()); } +std::string CommonRuntimeTest::GetLibCoreOatFileName() { + return GetOatFileName("core"); +} + +std::string CommonRuntimeTest::GetOatFileName(const std::string& oat_prefix) { + if (IsHost()) { + const char* host_dir = getenv("ANDROID_HOST_OUT"); + CHECK(host_dir != nullptr); + return StringPrintf("%s/framework/%s.art", host_dir, oat_prefix.c_str()); + } + return StringPrintf("%s/framework/%s.art", GetAndroidRoot(), oat_prefix.c_str()); +} + std::string CommonRuntimeTest::GetTestAndroidRoot() { if (IsHost()) { const char* host_dir = getenv("ANDROID_HOST_OUT"); diff --git a/runtime/common_runtime_test.h b/runtime/common_runtime_test.h index eb963525e8..363d8daaf8 100644 --- a/runtime/common_runtime_test.h +++ b/runtime/common_runtime_test.h @@ -85,10 +85,18 @@ class CommonRuntimeTest : public testing::Test { virtual void TearDown(); + // Gets the path of the libcore dex file. std::string GetLibCoreDexFileName(); + // Gets the path of the specified dex file for host or target. std::string GetDexFileName(const std::string& jar_prefix); + // Gets the path of the libcore oat file. + std::string GetLibCoreOatFileName(); + + // Gets the path of the specified oat file for host or target. + std::string GetOatFileName(const std::string& oat_prefix); + std::string GetTestAndroidRoot(); std::vector<const DexFile*> OpenTestDexFiles(const char* name) diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc index 49bb65f488..fa198d7ef5 100644 --- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc @@ -592,8 +592,7 @@ extern "C" uint64_t artQuickProxyInvokeHandler(mirror::ArtMethod* proxy_method, const char* old_cause = self->StartAssertNoThreadSuspension("Adding to IRT proxy object arguments"); // Register the top of the managed stack, making stack crawlable. - DCHECK_EQ(sp->AsMirrorPtr(), proxy_method) - << PrettyMethod(proxy_method); + DCHECK_EQ(sp->AsMirrorPtr(), proxy_method) << PrettyMethod(proxy_method); self->SetTopOfStack(sp, 0); DCHECK_EQ(proxy_method->GetFrameSizeInBytes(), Runtime::Current()->GetCalleeSaveMethod(Runtime::kRefsAndArgs)->GetFrameSizeInBytes()) diff --git a/runtime/gc/accounting/card_table.cc b/runtime/gc/accounting/card_table.cc index ceb42e5936..049855000b 100644 --- a/runtime/gc/accounting/card_table.cc +++ b/runtime/gc/accounting/card_table.cc @@ -28,6 +28,11 @@ namespace art { namespace gc { namespace accounting { +constexpr size_t CardTable::kCardShift; +constexpr size_t CardTable::kCardSize; +constexpr uint8_t CardTable::kCardClean; +constexpr uint8_t CardTable::kCardDirty; + /* * Maintain a card table from the write barrier. All writes of * non-NULL values to heap addresses should go through an entry in @@ -55,9 +60,9 @@ CardTable* CardTable::Create(const byte* heap_begin, size_t heap_capacity) { size_t capacity = heap_capacity / kCardSize; /* Allocate an extra 256 bytes to allow fixed low-byte of base */ std::string error_msg; - std::unique_ptr<MemMap> mem_map(MemMap::MapAnonymous("card table", NULL, - capacity + 256, PROT_READ | PROT_WRITE, - false, &error_msg)); + std::unique_ptr<MemMap> mem_map( + MemMap::MapAnonymous("card table", nullptr, capacity + 256, PROT_READ | PROT_WRITE, + false, &error_msg)); CHECK(mem_map.get() != NULL) << "couldn't allocate card table: " << error_msg; // All zeros is the correct initial value; all clean. Anonymous mmaps are initialized to zero, we // don't clear the card table to avoid unnecessary pages being allocated @@ -67,17 +72,17 @@ CardTable* CardTable::Create(const byte* heap_begin, size_t heap_capacity) { CHECK(cardtable_begin != NULL); // We allocated up to a bytes worth of extra space to allow biased_begin's byte value to equal - // GC_CARD_DIRTY, compute a offset value to make this the case + // kCardDirty, compute a offset value to make this the case size_t offset = 0; byte* biased_begin = reinterpret_cast<byte*>(reinterpret_cast<uintptr_t>(cardtable_begin) - (reinterpret_cast<uintptr_t>(heap_begin) >> kCardShift)); - if (((uintptr_t)biased_begin & 0xff) != kCardDirty) { - int delta = kCardDirty - (reinterpret_cast<uintptr_t>(biased_begin) & 0xff); + uintptr_t biased_byte = reinterpret_cast<uintptr_t>(biased_begin) & 0xff; + if (biased_byte != kCardDirty) { + int delta = kCardDirty - biased_byte; offset = delta + (delta < 0 ? 0x100 : 0); biased_begin += offset; } CHECK_EQ(reinterpret_cast<uintptr_t>(biased_begin) & 0xff, kCardDirty); - return new CardTable(mem_map.release(), biased_begin, offset); } diff --git a/runtime/gc/accounting/card_table.h b/runtime/gc/accounting/card_table.h index 7934974081..fbeea85554 100644 --- a/runtime/gc/accounting/card_table.h +++ b/runtime/gc/accounting/card_table.h @@ -46,10 +46,10 @@ template<size_t kAlignment> class SpaceBitmap; // WriteBarrier, and from there to here. class CardTable { public: - static const size_t kCardShift = 7; - static const size_t kCardSize = (1 << kCardShift); - static const uint8_t kCardClean = 0x0; - static const uint8_t kCardDirty = 0x70; + static constexpr size_t kCardShift = 7; + static constexpr size_t kCardSize = 1 << kCardShift; + static constexpr uint8_t kCardClean = 0x0; + static constexpr uint8_t kCardDirty = 0x70; static CardTable* Create(const byte* heap_begin, size_t heap_capacity); diff --git a/runtime/gc/accounting/card_table_test.cc b/runtime/gc/accounting/card_table_test.cc new file mode 100644 index 0000000000..a88b2c9881 --- /dev/null +++ b/runtime/gc/accounting/card_table_test.cc @@ -0,0 +1,143 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "card_table-inl.h" + +#include <string> + +#include "atomic.h" +#include "common_runtime_test.h" +#include "handle_scope-inl.h" +#include "mirror/class-inl.h" +#include "mirror/string-inl.h" // Strings are easiest to allocate +#include "scoped_thread_state_change.h" +#include "thread_pool.h" +#include "utils.h" + +namespace art { + +namespace mirror { + class Object; +} // namespace mirror + +class CardTableTest : public CommonRuntimeTest { + public: + std::unique_ptr<gc::accounting::CardTable> card_table_; + static constexpr size_t kCardSize = gc::accounting::CardTable::kCardSize; + + void CommonSetup() { + if (card_table_.get() == nullptr) { + card_table_.reset(gc::accounting::CardTable::Create(heap_begin_, heap_size_)); + EXPECT_TRUE(card_table_.get() != nullptr); + } else { + ClearCardTable(); + } + } + // Default values for the test, not random to avoid undeterministic behaviour. + CardTableTest() : heap_begin_(reinterpret_cast<byte*>(0x2000000)), heap_size_(2 * MB) { + } + void ClearCardTable() { + card_table_->ClearCardTable(); + } + byte* HeapBegin() const { + return heap_begin_; + } + byte* HeapLimit() const { + return HeapBegin() + heap_size_; + } + byte PRandCard(const byte* addr) const { + size_t offset = RoundDown(addr - heap_begin_, kCardSize); + return 1 + offset % 254; + } + void FillRandom() { + for (const byte* addr = HeapBegin(); addr != HeapLimit(); addr += kCardSize) { + EXPECT_TRUE(card_table_->AddrIsInCardTable(addr)); + byte* card = card_table_->CardFromAddr(addr); + *card = PRandCard(addr); + } + } + + private: + byte* const heap_begin_; + const size_t heap_size_; +}; + +TEST_F(CardTableTest, TestMarkCard) { + CommonSetup(); + for (const byte* addr = HeapBegin(); addr < HeapLimit(); addr += kObjectAlignment) { + auto obj = reinterpret_cast<const mirror::Object*>(addr); + EXPECT_EQ(card_table_->GetCard(obj), gc::accounting::CardTable::kCardClean); + EXPECT_TRUE(!card_table_->IsDirty(obj)); + card_table_->MarkCard(addr); + EXPECT_TRUE(card_table_->IsDirty(obj)); + EXPECT_EQ(card_table_->GetCard(obj), gc::accounting::CardTable::kCardDirty); + byte* card_addr = card_table_->CardFromAddr(addr); + EXPECT_EQ(*card_addr, gc::accounting::CardTable::kCardDirty); + *card_addr = gc::accounting::CardTable::kCardClean; + EXPECT_EQ(*card_addr, gc::accounting::CardTable::kCardClean); + } +} + +class UpdateVisitor { + public: + byte operator()(byte c) const { + return c * 93 + 123; + } + void operator()(byte* /*card*/, byte /*expected_value*/, byte /*new_value*/) const { + } +}; + +TEST_F(CardTableTest, TestModifyCardsAtomic) { + CommonSetup(); + FillRandom(); + const size_t delta = std::min(static_cast<size_t>(HeapLimit() - HeapBegin()), 8U * kCardSize); + UpdateVisitor visitor; + size_t start_offset = 0; + for (byte* cstart = HeapBegin(); cstart < HeapBegin() + delta; cstart += kCardSize) { + start_offset = (start_offset + kObjectAlignment) % kCardSize; + size_t end_offset = 0; + for (byte* cend = HeapLimit() - delta; cend < HeapLimit(); cend += kCardSize) { + // Don't always start at a card boundary. + byte* start = cstart + start_offset; + byte* end = cend - end_offset; + end_offset = (end_offset + kObjectAlignment) % kCardSize; + // Modify cards. + card_table_->ModifyCardsAtomic(start, end, visitor, visitor); + // Check adjacent cards not modified. + for (byte* cur = start - kCardSize; cur >= HeapBegin(); cur -= kCardSize) { + EXPECT_EQ(card_table_->GetCard(reinterpret_cast<mirror::Object*>(cur)), PRandCard(cur)); + } + for (byte* cur = end + kCardSize; cur < HeapLimit(); cur += kCardSize) { + EXPECT_EQ(card_table_->GetCard(reinterpret_cast<mirror::Object*>(cur)), PRandCard(cur)); + } + // Verify Range. + for (byte* cur = start; cur < AlignUp(end, kCardSize); cur += kCardSize) { + byte* card = card_table_->CardFromAddr(cur); + byte value = PRandCard(cur); + if (visitor(value) != *card) { + LOG(ERROR) << reinterpret_cast<void*>(start) << " " << reinterpret_cast<void*>(cur) << " " << reinterpret_cast<void*>(end); + } + EXPECT_EQ(visitor(value), *card); + // Restore for next iteration. + *card = value; + } + } + } +} + +// TODO: Add test for CardTable::Scan. + +} // namespace art diff --git a/runtime/mirror/art_method.cc b/runtime/mirror/art_method.cc index 4882728e0d..8eacb1c3d7 100644 --- a/runtime/mirror/art_method.cc +++ b/runtime/mirror/art_method.cc @@ -157,12 +157,12 @@ ArtMethod* ArtMethod::FindOverriddenMethod() { } } } -#ifndef NDEBUG - StackHandleScope<2> hs(Thread::Current()); - MethodHelper result_mh(hs.NewHandle(result)); - MethodHelper this_mh(hs.NewHandle(this)); - DCHECK(result == NULL || this_mh.HasSameNameAndSignature(&result_mh)); -#endif + if (kIsDebugBuild) { + StackHandleScope<2> hs(Thread::Current()); + MethodHelper result_mh(hs.NewHandle(result)); + MethodHelper this_mh(hs.NewHandle(this)); + DCHECK(result == nullptr || this_mh.HasSameNameAndSignature(&result_mh)); + } return result; } diff --git a/runtime/monitor_pool.cc b/runtime/monitor_pool.cc index 440a6be07b..4964aa06c5 100644 --- a/runtime/monitor_pool.cc +++ b/runtime/monitor_pool.cc @@ -52,7 +52,7 @@ void MonitorPool::AllocateChunk() { monitor_chunks_.StoreRelaxed(new_backing); capacity_ = new_capacity; old_chunk_arrays_.push_back(old_backing); - LOG(INFO) << "Resizing to capacity " << capacity_; + VLOG(monitor) << "Resizing to capacity " << capacity_; } } @@ -64,7 +64,7 @@ void MonitorPool::AllocateChunk() { CHECK_EQ(0U, reinterpret_cast<uintptr_t>(chunk) % kMonitorAlignment); // Add the chunk. - *(monitor_chunks_.LoadRelaxed()+num_chunks_) = reinterpret_cast<uintptr_t>(chunk); + *(monitor_chunks_.LoadRelaxed() + num_chunks_) = reinterpret_cast<uintptr_t>(chunk); num_chunks_++; // Set up the free list @@ -96,7 +96,7 @@ Monitor* MonitorPool::CreateMonitorInPool(Thread* self, Thread* owner, mirror::O // Enough space, or need to resize? if (first_free_ == nullptr) { - LOG(INFO) << "Allocating a new chunk."; + VLOG(monitor) << "Allocating a new chunk."; AllocateChunk(); } diff --git a/runtime/proxy_test.cc b/runtime/proxy_test.cc index bd6656dda1..308142157c 100644 --- a/runtime/proxy_test.cc +++ b/runtime/proxy_test.cc @@ -17,14 +17,14 @@ #include <jni.h> #include <vector> -#include "common_compiler_test.h" +#include "common_runtime_test.h" #include "field_helper.h" #include "mirror/art_field-inl.h" #include "scoped_thread_state_change.h" namespace art { -class ProxyTest : public CommonCompilerTest { +class ProxyTest : public CommonRuntimeTest { public: // Generate a proxy class with the given name and interfaces. This is a simplification from what // libcore does to fit to our test needs. We do not check for duplicated interfaces or methods and @@ -103,6 +103,12 @@ class ProxyTest : public CommonCompilerTest { soa.Self()->AssertNoPendingException(); return proxyClass; } + + protected: + void SetUpRuntimeOptions(RuntimeOptions *options) OVERRIDE { + options->push_back(std::make_pair(StringPrintf("-Ximage:%s", GetLibCoreOatFileName().c_str()), + nullptr)); + } }; // Creates a proxy class and check ClassHelper works correctly. diff --git a/test/015-switch/expected.txt b/test/015-switch/expected.txt index ca3b518f03..91b47142d1 100644 --- a/test/015-switch/expected.txt +++ b/test/015-switch/expected.txt @@ -8,3 +8,9 @@ CORRECT (not found) CORRECT (default only) CORRECT big sparse / first CORRECT big sparse / last +default +254 +255 +256 +257 +default diff --git a/test/015-switch/src/Main.java b/test/015-switch/src/Main.java index 7198e2b7b9..dd97a8c60b 100644 --- a/test/015-switch/src/Main.java +++ b/test/015-switch/src/Main.java @@ -101,5 +101,15 @@ public class Main { case 100: System.out.print("CORRECT big sparse / last\n"); break; default: System.out.print("blah!\n"); break; } + + for (a = 253; a <= 258; a++) { + switch (a) { + case 254: System.out.println("254"); break; + case 255: System.out.println("255"); break; + case 256: System.out.println("256"); break; + case 257: System.out.println("257"); break; + default: System.out.println("default"); break; + } + } } } diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk index 5c1bc0395f..d7ee383db7 100644 --- a/test/Android.run-test.mk +++ b/test/Android.run-test.mk @@ -81,38 +81,10 @@ endif # Tests that are broken in --trace mode. TEST_ART_BROKEN_TRACE_RUN_TESTS := \ - 003-omnibus-opcodes \ - 004-InterfaceTest \ 004-SignalTest \ - 004-ThreadStress \ - 005-annotations \ - 012-math \ 018-stack-overflow \ - 023-many-interfaces \ - 027-arithmetic \ - 031-class-attributes \ - 037-inherit \ - 044-proxy \ - 046-reflect \ - 051-thread \ - 055-enum-performance \ - 062-character-encodings \ - 064-field-access \ - 074-gc-thrash \ - 078-polymorphic-virtual \ - 080-oom-throw \ - 082-inline-execute \ - 083-compiler-regressions \ - 093-serialization \ 097-duplicate-method \ - 100-reflect2 \ - 102-concurrent-gc \ - 103-string-append \ - 107-int-math2 \ - 112-double-math \ - 114-ParallelGC \ - 700-LoadArgRegs \ - 701-easy-div-rem + 107-int-math2 ART_TEST_KNOWN_BROKEN += $(foreach test, $(TEST_ART_BROKEN_TRACE_RUN_TESTS), $(call all-run-test-names,$(test),-trace,-relocate)) ART_TEST_KNOWN_BROKEN += $(foreach test, $(TEST_ART_BROKEN_TRACE_RUN_TESTS), $(call all-run-test-names,$(test),-trace,-no-prebuild)) diff --git a/test/run-test b/test/run-test index aef7c52120..ca7e68c263 100755 --- a/test/run-test +++ b/test/run-test @@ -33,7 +33,11 @@ cd "${progdir}" progdir=`pwd` prog="${progdir}"/`basename "${prog}"` test_dir="test-$$" -tmp_dir="/tmp/$USER/${test_dir}" +if [ -z "$TMPDIR" ]; then + tmp_dir="/tmp/$USER/${test_dir}" +else + tmp_dir="${TMPDIR}/$USER/${test_dir}" +fi export JAVA="java" export JAVAC="javac -g" |