summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Android.mk7
-rw-r--r--build/Android.gtest.mk3
-rw-r--r--build/Android.oat.mk4
-rw-r--r--compiler/compilers.cc14
-rw-r--r--compiler/compilers.h11
-rw-r--r--compiler/dex/quick/arm/call_arm.cc6
-rw-r--r--compiler/dex/quick/arm/codegen_arm.h4
-rw-r--r--compiler/dex/quick/arm64/call_arm64.cc6
-rw-r--r--compiler/dex/quick/arm64/codegen_arm64.h4
-rw-r--r--compiler/dex/quick/gen_common.cc88
-rw-r--r--compiler/dex/quick/local_optimizations.cc4
-rw-r--r--compiler/dex/quick/mips/call_mips.cc6
-rw-r--r--compiler/dex/quick/mips/codegen_mips.h4
-rw-r--r--compiler/dex/quick/mir_to_lir.h18
-rw-r--r--compiler/dex/quick/x86/call_x86.cc6
-rw-r--r--compiler/dex/quick/x86/codegen_x86.h10
-rwxr-xr-xcompiler/dex/quick/x86/target_x86.cc83
-rw-r--r--compiler/driver/compiler_driver.cc73
-rw-r--r--compiler/driver/compiler_driver.h19
-rw-r--r--compiler/elf_writer_quick.cc222
-rw-r--r--compiler/elf_writer_quick.h6
-rw-r--r--compiler/oat_writer.cc48
-rw-r--r--compiler/oat_writer.h14
-rw-r--r--compiler/utils/x86_64/assembler_x86_64.h4
-rw-r--r--compiler/utils/x86_64/assembler_x86_64_test.cc11
-rw-r--r--runtime/Android.mk1
-rw-r--r--runtime/arch/arm/quick_entrypoints_arm.S3
-rw-r--r--runtime/arch/memcmp16.h2
-rw-r--r--runtime/arch/x86/memcmp16_x86.S1812
-rwxr-xr-xruntime/arch/x86_64/memcmp16_x86_64.S1210
-rw-r--r--runtime/class_linker.cc17
-rw-r--r--runtime/common_runtime_test.cc27
-rw-r--r--runtime/common_runtime_test.h8
-rw-r--r--runtime/entrypoints/quick/quick_trampoline_entrypoints.cc3
-rw-r--r--runtime/gc/accounting/card_table.cc19
-rw-r--r--runtime/gc/accounting/card_table.h8
-rw-r--r--runtime/gc/accounting/card_table_test.cc143
-rw-r--r--runtime/mirror/art_method.cc12
-rw-r--r--runtime/monitor_pool.cc6
-rw-r--r--runtime/proxy_test.cc10
-rw-r--r--test/015-switch/expected.txt6
-rw-r--r--test/015-switch/src/Main.java10
-rw-r--r--test/Android.run-test.mk30
-rwxr-xr-xtest/run-test6
44 files changed, 2762 insertions, 1246 deletions
diff --git a/Android.mk b/Android.mk
index b006bfe129..15e8308ba0 100644
--- a/Android.mk
+++ b/Android.mk
@@ -67,8 +67,13 @@ ifdef TARGET_2ND_ARCH
rm -f $(2ND_TARGET_OUT_INTERMEDIATES)/JAVA_LIBRARIES/*_intermediates/javalib.odex
rm -f $(2ND_TARGET_OUT_INTERMEDIATES)/APPS/*_intermediates/*.odex
endif
+ifneq ($(TMPDIR),)
+ rm -rf $(TMPDIR)/$(USER)/test-*/dalvik-cache/*
+ rm -rf $(TMPDIR)/android-data/dalvik-cache/*
+else
rm -rf /tmp/$(USER)/test-*/dalvik-cache/*
rm -rf /tmp/android-data/dalvik-cache/*
+endif
.PHONY: clean-oat-target
clean-oat-target:
@@ -309,7 +314,7 @@ else
.PHONY: oat-target-$(1)
oat-target-$(1): $$(OUT_OAT_FILE)
-$$(OUT_OAT_FILE): $(PRODUCT_OUT)/$(1) $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(DEX2OATD)
+$$(OUT_OAT_FILE): $(PRODUCT_OUT)/$(1) $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(DEX2OATD_DEPENDENCY)
@mkdir -p $$(dir $$@)
$(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_XMS) --runtime-arg -Xmx$(DEX2OAT_XMX) \
--boot-image=$(DEFAULT_DEX_PREOPT_BUILT_IMAGE) --dex-file=$(PRODUCT_OUT)/$(1) \
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 6e27190bd3..17c478c6d1 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -91,6 +91,7 @@ RUNTIME_GTEST_COMMON_SRC_FILES := \
runtime/entrypoints/quick/quick_trampoline_entrypoints_test.cc \
runtime/entrypoints_order_test.cc \
runtime/exception_test.cc \
+ runtime/gc/accounting/card_table_test.cc \
runtime/gc/accounting/space_bitmap_test.cc \
runtime/gc/heap_test.cc \
runtime/gc/space/dlmalloc_space_base_test.cc \
@@ -113,6 +114,7 @@ RUNTIME_GTEST_COMMON_SRC_FILES := \
runtime/monitor_pool_test.cc \
runtime/monitor_test.cc \
runtime/parsed_options_test.cc \
+ runtime/proxy_test.cc \
runtime/reference_table_test.cc \
runtime/thread_pool_test.cc \
runtime/transaction_test.cc \
@@ -123,7 +125,6 @@ RUNTIME_GTEST_COMMON_SRC_FILES := \
COMPILER_GTEST_COMMON_SRC_FILES := \
runtime/jni_internal_test.cc \
- runtime/proxy_test.cc \
runtime/reflection_test.cc \
compiler/dex/global_value_numbering_test.cc \
compiler/dex/local_value_numbering_test.cc \
diff --git a/build/Android.oat.mk b/build/Android.oat.mk
index cd6b13aac3..10936a45d6 100644
--- a/build/Android.oat.mk
+++ b/build/Android.oat.mk
@@ -26,7 +26,7 @@ include art/build/Android.common_path.mk
# Use dex2oat debug version for better error reporting
# $(1): 2ND_ or undefined, 2ND_ for 32-bit host builds.
define create-core-oat-host-rules
-$$($(1)HOST_CORE_IMG_OUT): $$(HOST_CORE_DEX_FILES) $$(DEX2OATD)
+$$($(1)HOST_CORE_IMG_OUT): $$(HOST_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY)
@echo "host dex2oat: $$@ ($$?)"
@mkdir -p $$(dir $$@)
$$(hide) $$(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_IMAGE_XMS) --runtime-arg -Xmx$(DEX2OAT_IMAGE_XMX) \
@@ -49,7 +49,7 @@ $(eval $(call create-core-oat-host-rules,2ND_))
endif
define create-core-oat-target-rules
-$$($(1)TARGET_CORE_IMG_OUT): $$($(1)TARGET_CORE_DEX_FILES) $$(DEX2OATD)
+$$($(1)TARGET_CORE_IMG_OUT): $$($(1)TARGET_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY)
@echo "target dex2oat: $$@ ($$?)"
@mkdir -p $$(dir $$@)
$$(hide) $$(DEX2OATD) --runtime-arg -Xms$(DEX2OAT_XMS) --runtime-arg -Xmx$(DEX2OAT_XMX) \
diff --git a/compiler/compilers.cc b/compiler/compilers.cc
index 250924ad30..5cf846f8db 100644
--- a/compiler/compilers.cc
+++ b/compiler/compilers.cc
@@ -38,9 +38,6 @@ extern "C" art::CompiledMethod* ArtQuickJniCompileMethod(art::CompilerDriver* dr
uint32_t access_flags, uint32_t method_idx,
const art::DexFile& dex_file);
-// Hack for CFI CIE initialization
-extern std::vector<uint8_t>* X86CFIInitialization(bool is_x86_64);
-
void QuickCompiler::Init() const {
ArtInitQuickCompilerContext(GetCompilerDriver());
}
@@ -126,17 +123,6 @@ Backend* QuickCompiler::GetCodeGenerator(CompilationUnit* cu, void* compilation_
return mir_to_lir;
}
-std::vector<uint8_t>* QuickCompiler::GetCallFrameInformationInitialization(
- const CompilerDriver& driver) const {
- if (driver.GetInstructionSet() == kX86) {
- return X86CFIInitialization(false);
- }
- if (driver.GetInstructionSet() == kX86_64) {
- return X86CFIInitialization(true);
- }
- return nullptr;
-}
-
CompiledMethod* OptimizingCompiler::Compile(const DexFile::CodeItem* code_item,
uint32_t access_flags,
InvokeType invoke_type,
diff --git a/compiler/compilers.h b/compiler/compilers.h
index 2c231e1212..151bf6fa6c 100644
--- a/compiler/compilers.h
+++ b/compiler/compilers.h
@@ -56,17 +56,6 @@ class QuickCompiler : public Compiler {
void InitCompilationUnit(CompilationUnit& cu) const OVERRIDE {}
- /*
- * @brief Generate and return Dwarf CFI initialization, if supported by the
- * backend.
- * @param driver CompilerDriver for this compile.
- * @returns nullptr if not supported by backend or a vector of bytes for CFI DWARF
- * information.
- * @note This is used for backtrace information in generated code.
- */
- std::vector<uint8_t>* GetCallFrameInformationInitialization(const CompilerDriver& driver) const
- OVERRIDE;
-
private:
DISALLOW_COPY_AND_ASSIGN(QuickCompiler);
};
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index 5059c5fc9a..b1339916f0 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -43,8 +43,7 @@ namespace art {
* add rARM_PC, r_disp ; This is the branch from which we compute displacement
* cbnz r_idx, lp
*/
-void ArmMir2Lir::GenSparseSwitch(MIR* mir, uint32_t table_offset,
- RegLocation rl_src) {
+void ArmMir2Lir::GenLargeSparseSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpSparseSwitchTable(table);
@@ -92,8 +91,7 @@ void ArmMir2Lir::GenSparseSwitch(MIR* mir, uint32_t table_offset,
}
-void ArmMir2Lir::GenPackedSwitch(MIR* mir, uint32_t table_offset,
- RegLocation rl_src) {
+void ArmMir2Lir::GenLargePackedSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpPackedSwitchTable(table);
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 21322a6f15..072acbeaa7 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -130,8 +130,8 @@ class ArmMir2Lir FINAL : public Mir2Lir {
int first_bit, int second_bit);
void GenNegDouble(RegLocation rl_dest, RegLocation rl_src);
void GenNegFloat(RegLocation rl_dest, RegLocation rl_src);
- void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
- void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
+ void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
+ void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
// Required for target - single operation generators.
LIR* OpUnconditionalBranch(LIR* target);
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index 6fa8a4aca5..7c5c4fac40 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -43,8 +43,7 @@ namespace art {
* br r_base
* quit:
*/
-void Arm64Mir2Lir::GenSparseSwitch(MIR* mir, uint32_t table_offset,
- RegLocation rl_src) {
+void Arm64Mir2Lir::GenLargeSparseSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpSparseSwitchTable(table);
@@ -96,8 +95,7 @@ void Arm64Mir2Lir::GenSparseSwitch(MIR* mir, uint32_t table_offset,
}
-void Arm64Mir2Lir::GenPackedSwitch(MIR* mir, uint32_t table_offset,
- RegLocation rl_src) {
+void Arm64Mir2Lir::GenLargePackedSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpPackedSwitchTable(table);
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index b182cc027e..2cd24c6874 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -196,8 +196,8 @@ class Arm64Mir2Lir FINAL : public Mir2Lir {
int first_bit, int second_bit) OVERRIDE;
void GenNegDouble(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
void GenNegFloat(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
- void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
- void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
+ void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
+ void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
// Required for target - single operation generators.
LIR* OpUnconditionalBranch(LIR* target) OVERRIDE;
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 0054f34322..f6c77fcea8 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -2008,4 +2008,92 @@ void Mir2Lir::GenConstWide(RegLocation rl_dest, int64_t value) {
StoreValueWide(rl_dest, rl_result);
}
+void Mir2Lir::GenSmallPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
+ const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
+ const uint16_t entries = table[1];
+ // Chained cmp-and-branch.
+ const int32_t* as_int32 = reinterpret_cast<const int32_t*>(&table[2]);
+ int32_t current_key = as_int32[0];
+ const int32_t* targets = &as_int32[1];
+ rl_src = LoadValue(rl_src, kCoreReg);
+ int i = 0;
+ for (; i < entries; i++, current_key++) {
+ if (!InexpensiveConstantInt(current_key, Instruction::Code::IF_EQ)) {
+ // Switch to using a temp and add.
+ break;
+ }
+ BasicBlock* case_block =
+ mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
+ OpCmpImmBranch(kCondEq, rl_src.reg, current_key, &block_label_list_[case_block->id]);
+ }
+ if (i < entries) {
+ // The rest do not seem to be inexpensive. Try to allocate a temp and use add.
+ RegStorage key_temp = AllocTypedTemp(false, kCoreReg, false);
+ if (key_temp.Valid()) {
+ LoadConstantNoClobber(key_temp, current_key);
+ for (; i < entries - 1; i++, current_key++) {
+ BasicBlock* case_block =
+ mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
+ OpCmpBranch(kCondEq, rl_src.reg, key_temp, &block_label_list_[case_block->id]);
+ OpRegImm(kOpAdd, key_temp, 1); // Increment key.
+ }
+ BasicBlock* case_block =
+ mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
+ OpCmpBranch(kCondEq, rl_src.reg, key_temp, &block_label_list_[case_block->id]);
+ } else {
+ // No free temp, just finish the old loop.
+ for (; i < entries; i++, current_key++) {
+ BasicBlock* case_block =
+ mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
+ OpCmpImmBranch(kCondEq, rl_src.reg, current_key, &block_label_list_[case_block->id]);
+ }
+ }
+ }
+}
+
+void Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
+ const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
+ if (cu_->verbose) {
+ DumpSparseSwitchTable(table);
+ }
+
+ const uint16_t entries = table[1];
+ if (entries <= kSmallSwitchThreshold) {
+ GenSmallPackedSwitch(mir, table_offset, rl_src);
+ } else {
+ // Use the backend-specific implementation.
+ GenLargePackedSwitch(mir, table_offset, rl_src);
+ }
+}
+
+void Mir2Lir::GenSmallSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
+ const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
+ const uint16_t entries = table[1];
+ // Chained cmp-and-branch.
+ const int32_t* keys = reinterpret_cast<const int32_t*>(&table[2]);
+ const int32_t* targets = &keys[entries];
+ rl_src = LoadValue(rl_src, kCoreReg);
+ for (int i = 0; i < entries; i++) {
+ int key = keys[i];
+ BasicBlock* case_block =
+ mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
+ OpCmpImmBranch(kCondEq, rl_src.reg, key, &block_label_list_[case_block->id]);
+ }
+}
+
+void Mir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
+ const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
+ if (cu_->verbose) {
+ DumpSparseSwitchTable(table);
+ }
+
+ const uint16_t entries = table[1];
+ if (entries <= kSmallSwitchThreshold) {
+ GenSmallSparseSwitch(mir, table_offset, rl_src);
+ } else {
+ // Use the backend-specific implementation.
+ GenLargeSparseSwitch(mir, table_offset, rl_src);
+ }
+}
+
} // namespace art
diff --git a/compiler/dex/quick/local_optimizations.cc b/compiler/dex/quick/local_optimizations.cc
index eec2b32726..e0f4691063 100644
--- a/compiler/dex/quick/local_optimizations.cc
+++ b/compiler/dex/quick/local_optimizations.cc
@@ -200,7 +200,7 @@ void Mir2Lir::ApplyLoadStoreElimination(LIR* head_lir, LIR* tail_lir) {
/* Initialize alias list */
alias_list.clear();
ResourceMask alias_reg_list_mask = kEncodeNone;
- if (!this_mem_mask.Intersects(kEncodeLiteral)) {
+ if (!this_mem_mask.Intersects(kEncodeMem) && !this_mem_mask.Intersects(kEncodeLiteral)) {
alias_list.push_back(dest_reg_id);
SetupRegMask(&alias_reg_list_mask, dest_reg_id);
}
@@ -248,7 +248,7 @@ void Mir2Lir::ApplyLoadStoreElimination(LIR* head_lir, LIR* tail_lir) {
bool is_check_lir_load = check_flags & IS_LOAD;
bool reg_compatible = RegStorage::SameRegType(check_lir->operands[0], native_reg_id);
- if (alias_mem_mask.Equals(kEncodeLiteral)) {
+ if (!alias_mem_mask.Intersects(kEncodeMem) && alias_mem_mask.Equals(kEncodeLiteral)) {
DCHECK(check_flags & IS_LOAD);
/* Same value && same register type */
if (reg_compatible && (this_lir->target == check_lir->target)) {
diff --git a/compiler/dex/quick/mips/call_mips.cc b/compiler/dex/quick/mips/call_mips.cc
index 9adddf079d..4577a4c904 100644
--- a/compiler/dex/quick/mips/call_mips.cc
+++ b/compiler/dex/quick/mips/call_mips.cc
@@ -61,8 +61,7 @@ bool MipsMir2Lir::GenSpecialCase(BasicBlock* bb, MIR* mir,
* done:
*
*/
-void MipsMir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset,
- RegLocation rl_src) {
+void MipsMir2Lir::GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpSparseSwitchTable(table);
@@ -139,8 +138,7 @@ void MipsMir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset,
* jr rRA
* done:
*/
-void MipsMir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset,
- RegLocation rl_src) {
+void MipsMir2Lir::GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpPackedSwitchTable(table);
diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h
index bd0c020ce8..43cbde7781 100644
--- a/compiler/dex/quick/mips/codegen_mips.h
+++ b/compiler/dex/quick/mips/codegen_mips.h
@@ -128,8 +128,8 @@ class MipsMir2Lir FINAL : public Mir2Lir {
int first_bit, int second_bit);
void GenNegDouble(RegLocation rl_dest, RegLocation rl_src);
void GenNegFloat(RegLocation rl_dest, RegLocation rl_src);
- void GenPackedSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src);
- void GenSparseSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src);
+ void GenLargePackedSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src);
+ void GenLargeSparseSwitch(MIR* mir, uint32_t table_offset, RegLocation rl_src);
bool GenSpecialCase(BasicBlock* bb, MIR* mir, const InlineMethod& special);
// Required for target - single operation generators.
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 4ed9929338..0e6f36bdeb 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -229,6 +229,9 @@ class Mir2Lir : public Backend {
static constexpr bool kFailOnSizeError = true && kIsDebugBuild;
static constexpr bool kReportSizeError = true && kIsDebugBuild;
+ // TODO: If necessary, this could be made target-dependent.
+ static constexpr uint16_t kSmallSwitchThreshold = 5;
+
/*
* Auxiliary information describing the location of data embedded in the Dalvik
* byte code stream.
@@ -1355,8 +1358,19 @@ class Mir2Lir : public Backend {
int first_bit, int second_bit) = 0;
virtual void GenNegDouble(RegLocation rl_dest, RegLocation rl_src) = 0;
virtual void GenNegFloat(RegLocation rl_dest, RegLocation rl_src) = 0;
- virtual void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) = 0;
- virtual void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) = 0;
+
+ // Create code for switch statements. Will decide between short and long versions below.
+ void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
+ void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
+
+ // Potentially backend-specific versions of switch instructions for shorter switch statements.
+ // The default implementation will create a chained compare-and-branch.
+ virtual void GenSmallPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
+ virtual void GenSmallSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
+ // Backend-specific versions of switch instructions for longer switch statements.
+ virtual void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) = 0;
+ virtual void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) = 0;
+
virtual void GenArrayGet(int opt_flags, OpSize size, RegLocation rl_array,
RegLocation rl_index, RegLocation rl_dest, int scale) = 0;
virtual void GenArrayPut(int opt_flags, OpSize size, RegLocation rl_array,
diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc
index 15aae9ef1e..f5f86717b4 100644
--- a/compiler/dex/quick/x86/call_x86.cc
+++ b/compiler/dex/quick/x86/call_x86.cc
@@ -27,8 +27,7 @@ namespace art {
* The sparse table in the literal pool is an array of <key,displacement>
* pairs.
*/
-void X86Mir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset,
- RegLocation rl_src) {
+void X86Mir2Lir::GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpSparseSwitchTable(table);
@@ -61,8 +60,7 @@ void X86Mir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset,
* jmp r_start_of_method
* done:
*/
-void X86Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset,
- RegLocation rl_src) {
+void X86Mir2Lir::GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset;
if (cu_->verbose) {
DumpPackedSwitchTable(table);
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 266191adcb..1d8deae3c8 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -246,8 +246,8 @@ class X86Mir2Lir : public Mir2Lir {
int first_bit, int second_bit) OVERRIDE;
void GenNegDouble(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
void GenNegFloat(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
- void GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
- void GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
+ void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
+ void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
/**
* @brief Implement instanceof a final class with x86 specific code.
@@ -355,12 +355,6 @@ class X86Mir2Lir : public Mir2Lir {
void InstallLiteralPools() OVERRIDE;
/*
- * @brief Generate the debug_frame CFI information.
- * @returns pointer to vector containing CFE information
- */
- static std::vector<uint8_t>* ReturnCommonCallFrameInformation(bool is_x86_64);
-
- /*
* @brief Generate the debug_frame FDE information.
* @returns pointer to vector containing CFE information
*/
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index bd2e0f31bb..62a75a598b 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -1437,11 +1437,6 @@ static void AdvanceLoc(std::vector<uint8_t>&buf, uint32_t increment) {
}
}
-
-std::vector<uint8_t>* X86CFIInitialization(bool is_x86_64) {
- return X86Mir2Lir::ReturnCommonCallFrameInformation(is_x86_64);
-}
-
static void EncodeUnsignedLeb128(std::vector<uint8_t>& buf, uint32_t value) {
uint8_t buffer[12];
uint8_t *ptr = EncodeUnsignedLeb128(buffer, value);
@@ -1458,84 +1453,6 @@ static void EncodeSignedLeb128(std::vector<uint8_t>& buf, int32_t value) {
}
}
-std::vector<uint8_t>* X86Mir2Lir::ReturnCommonCallFrameInformation(bool is_x86_64) {
- std::vector<uint8_t>*cfi_info = new std::vector<uint8_t>;
-
- // Length (will be filled in later in this routine).
- PushWord(*cfi_info, 0);
-
- // CIE id: always 0.
- PushWord(*cfi_info, 0);
-
- // Version: always 1.
- cfi_info->push_back(0x01);
-
- // Augmentation: 'zR\0'
- cfi_info->push_back(0x7a);
- cfi_info->push_back(0x52);
- cfi_info->push_back(0x0);
-
- // Code alignment: 1.
- EncodeUnsignedLeb128(*cfi_info, 1);
-
- // Data alignment.
- if (is_x86_64) {
- EncodeSignedLeb128(*cfi_info, -8);
- } else {
- EncodeSignedLeb128(*cfi_info, -4);
- }
-
- // Return address register.
- if (is_x86_64) {
- // R16(RIP)
- cfi_info->push_back(0x10);
- } else {
- // R8(EIP)
- cfi_info->push_back(0x08);
- }
-
- // Augmentation length: 1.
- cfi_info->push_back(1);
-
- // Augmentation data: 0x03 ((DW_EH_PE_absptr << 4) | DW_EH_PE_udata4).
- cfi_info->push_back(0x03);
-
- // Initial instructions.
- if (is_x86_64) {
- // DW_CFA_def_cfa R7(RSP) 8.
- cfi_info->push_back(0x0c);
- cfi_info->push_back(0x07);
- cfi_info->push_back(0x08);
-
- // DW_CFA_offset R16(RIP) 1 (* -8).
- cfi_info->push_back(0x90);
- cfi_info->push_back(0x01);
- } else {
- // DW_CFA_def_cfa R4(ESP) 4.
- cfi_info->push_back(0x0c);
- cfi_info->push_back(0x04);
- cfi_info->push_back(0x04);
-
- // DW_CFA_offset R8(EIP) 1 (* -4).
- cfi_info->push_back(0x88);
- cfi_info->push_back(0x01);
- }
-
- // Padding to a multiple of 4
- while ((cfi_info->size() & 3) != 0) {
- // DW_CFA_nop is encoded as 0.
- cfi_info->push_back(0);
- }
-
- // Set the length of the CIE inside the generated bytes.
- uint32_t length = cfi_info->size() - 4;
- (*cfi_info)[0] = length;
- (*cfi_info)[1] = length >> 8;
- (*cfi_info)[2] = length >> 16;
- (*cfi_info)[3] = length >> 24;
- return cfi_info;
-}
-
static bool ARTRegIDToDWARFRegID(bool is_x86_64, int art_reg_id, int* dwarf_reg_id) {
if (is_x86_64) {
switch (art_reg_id) {
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index ed126adfe9..810761f0d8 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -353,7 +353,6 @@ CompilerDriver::CompilerDriver(const CompilerOptions* compiler_options,
compiler_enable_auto_elf_loading_(NULL),
compiler_get_method_code_addr_(NULL),
support_boot_image_fixup_(instruction_set != kMips),
- cfi_info_(nullptr),
dedupe_code_("dedupe code"),
dedupe_mapping_table_("dedupe mapping table"),
dedupe_vmap_table_("dedupe vmap table"),
@@ -376,11 +375,6 @@ CompilerDriver::CompilerDriver(const CompilerOptions* compiler_options,
CHECK(image_classes_.get() == nullptr);
}
- // Are we generating CFI information?
- if (compiler_options->GetGenerateGDBInformation()) {
- cfi_info_.reset(compiler_->GetCallFrameInformationInitialization(*this));
- }
-
// Read the profile file if one is provided.
if (!profile_file.empty()) {
profile_present_ = profile_file_.LoadFile(profile_file);
@@ -597,7 +591,7 @@ void CompilerDriver::Resolve(jobject class_loader, const std::vector<const DexFi
for (size_t i = 0; i != dex_files.size(); ++i) {
const DexFile* dex_file = dex_files[i];
CHECK(dex_file != nullptr);
- ResolveDexFile(class_loader, *dex_file, thread_pool, timings);
+ ResolveDexFile(class_loader, *dex_file, dex_files, thread_pool, timings);
}
}
@@ -1357,12 +1351,14 @@ class ParallelCompilationManager {
jobject class_loader,
CompilerDriver* compiler,
const DexFile* dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool)
: index_(0),
class_linker_(class_linker),
class_loader_(class_loader),
compiler_(compiler),
dex_file_(dex_file),
+ dex_files_(dex_files),
thread_pool_(thread_pool) {}
ClassLinker* GetClassLinker() const {
@@ -1384,6 +1380,10 @@ class ParallelCompilationManager {
return dex_file_;
}
+ const std::vector<const DexFile*>& GetDexFiles() const {
+ return dex_files_;
+ }
+
void ForAll(size_t begin, size_t end, Callback callback, size_t work_units) {
Thread* self = Thread::Current();
self->AssertNoPendingException();
@@ -1441,11 +1441,24 @@ class ParallelCompilationManager {
const jobject class_loader_;
CompilerDriver* const compiler_;
const DexFile* const dex_file_;
+ const std::vector<const DexFile*>& dex_files_;
ThreadPool* const thread_pool_;
DISALLOW_COPY_AND_ASSIGN(ParallelCompilationManager);
};
+static bool SkipClassCheckClassPath(const char* descriptor, const DexFile& dex_file,
+ const std::vector<const DexFile*>& classpath) {
+ DexFile::ClassPathEntry pair = DexFile::FindInClassPath(descriptor, classpath);
+ CHECK(pair.second != NULL);
+ if (pair.first != &dex_file) {
+ LOG(WARNING) << "Skipping class " << descriptor << " from " << dex_file.GetLocation()
+ << " previously found in " << pair.first->GetLocation();
+ return true;
+ }
+ return false;
+}
+
// Return true if the class should be skipped during compilation.
//
// The first case where we skip is for redundant class definitions in
@@ -1454,20 +1467,23 @@ class ParallelCompilationManager {
// The second case where we skip is when an app bundles classes found
// in the boot classpath. Since at runtime we will select the class from
// the boot classpath, we ignore the one from the app.
+//
+// The third case is if the app itself has the class defined in multiple dex files. Then we skip
+// it if it is not the first occurrence.
static bool SkipClass(ClassLinker* class_linker, jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
const DexFile::ClassDef& class_def) {
const char* descriptor = dex_file.GetClassDescriptor(class_def);
+
if (class_loader == NULL) {
- DexFile::ClassPathEntry pair = DexFile::FindInClassPath(descriptor, class_linker->GetBootClassPath());
- CHECK(pair.second != NULL);
- if (pair.first != &dex_file) {
- LOG(WARNING) << "Skipping class " << descriptor << " from " << dex_file.GetLocation()
- << " previously found in " << pair.first->GetLocation();
- return true;
- }
- return false;
+ return SkipClassCheckClassPath(descriptor, dex_file, class_linker->GetBootClassPath());
}
- return class_linker->IsInBootClassPath(descriptor);
+
+ if (class_linker->IsInBootClassPath(descriptor)) {
+ return true;
+ }
+
+ return SkipClassCheckClassPath(descriptor, dex_file, dex_files);
}
// A fast version of SkipClass above if the class pointer is available
@@ -1525,7 +1541,7 @@ static void ResolveClassFieldsAndMethods(const ParallelCompilationManager* manag
// definitions, since many of them many never be referenced by
// generated code.
const DexFile::ClassDef& class_def = dex_file.GetClassDef(class_def_index);
- if (!SkipClass(class_linker, jclass_loader, dex_file, class_def)) {
+ if (!SkipClass(class_linker, jclass_loader, dex_file, manager->GetDexFiles(), class_def)) {
ScopedObjectAccess soa(self);
StackHandleScope<2> hs(soa.Self());
Handle<mirror::ClassLoader> class_loader(
@@ -1632,13 +1648,15 @@ static void ResolveType(const ParallelCompilationManager* manager, size_t type_i
}
void CompilerDriver::ResolveDexFile(jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings) {
ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
// TODO: we could resolve strings here, although the string table is largely filled with class
// and method names.
- ParallelCompilationManager context(class_linker, class_loader, this, &dex_file, thread_pool);
+ ParallelCompilationManager context(class_linker, class_loader, this, &dex_file, dex_files,
+ thread_pool);
if (IsImage()) {
// For images we resolve all types, such as array, whereas for applications just those with
// classdefs are resolved by ResolveClassFieldsAndMethods.
@@ -1655,7 +1673,7 @@ void CompilerDriver::Verify(jobject class_loader, const std::vector<const DexFil
for (size_t i = 0; i != dex_files.size(); ++i) {
const DexFile* dex_file = dex_files[i];
CHECK(dex_file != NULL);
- VerifyDexFile(class_loader, *dex_file, thread_pool, timings);
+ VerifyDexFile(class_loader, *dex_file, dex_files, thread_pool, timings);
}
}
@@ -1707,10 +1725,12 @@ static void VerifyClass(const ParallelCompilationManager* manager, size_t class_
}
void CompilerDriver::VerifyDexFile(jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings) {
TimingLogger::ScopedTiming t("Verify Dex File", timings);
ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
- ParallelCompilationManager context(class_linker, class_loader, this, &dex_file, thread_pool);
+ ParallelCompilationManager context(class_linker, class_loader, this, &dex_file, dex_files,
+ thread_pool);
context.ForAll(0, dex_file.NumClassDefs(), VerifyClass, thread_count_);
}
@@ -1800,10 +1820,12 @@ static void InitializeClass(const ParallelCompilationManager* manager, size_t cl
}
void CompilerDriver::InitializeClasses(jobject jni_class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings) {
TimingLogger::ScopedTiming t("InitializeNoClinit", timings);
ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
- ParallelCompilationManager context(class_linker, jni_class_loader, this, &dex_file, thread_pool);
+ ParallelCompilationManager context(class_linker, jni_class_loader, this, &dex_file, dex_files,
+ thread_pool);
size_t thread_count;
if (IsImage()) {
// TODO: remove this when transactional mode supports multithreading.
@@ -1824,7 +1846,7 @@ void CompilerDriver::InitializeClasses(jobject class_loader,
for (size_t i = 0; i != dex_files.size(); ++i) {
const DexFile* dex_file = dex_files[i];
CHECK(dex_file != NULL);
- InitializeClasses(class_loader, *dex_file, thread_pool, timings);
+ InitializeClasses(class_loader, *dex_file, dex_files, thread_pool, timings);
}
}
@@ -1833,7 +1855,7 @@ void CompilerDriver::Compile(jobject class_loader, const std::vector<const DexFi
for (size_t i = 0; i != dex_files.size(); ++i) {
const DexFile* dex_file = dex_files[i];
CHECK(dex_file != NULL);
- CompileDexFile(class_loader, *dex_file, thread_pool, timings);
+ CompileDexFile(class_loader, *dex_file, dex_files, thread_pool, timings);
}
}
@@ -1843,7 +1865,7 @@ void CompilerDriver::CompileClass(const ParallelCompilationManager* manager, siz
const DexFile& dex_file = *manager->GetDexFile();
const DexFile::ClassDef& class_def = dex_file.GetClassDef(class_def_index);
ClassLinker* class_linker = manager->GetClassLinker();
- if (SkipClass(class_linker, jclass_loader, dex_file, class_def)) {
+ if (SkipClass(class_linker, jclass_loader, dex_file, manager->GetDexFiles(), class_def)) {
return;
}
ClassReference ref(&dex_file, class_def_index);
@@ -1912,10 +1934,11 @@ void CompilerDriver::CompileClass(const ParallelCompilationManager* manager, siz
}
void CompilerDriver::CompileDexFile(jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings) {
TimingLogger::ScopedTiming t("Compile Dex File", timings);
ParallelCompilationManager context(Runtime::Current()->GetClassLinker(), class_loader, this,
- &dex_file, thread_pool);
+ &dex_file, dex_files, thread_pool);
context.ForAll(0, dex_file.NumClassDefs(), CompilerDriver::CompileClass, thread_count_);
}
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 6dae398372..e6927d1c8d 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -402,10 +402,6 @@ class CompilerDriver {
return dump_passes_;
}
- bool DidIncludeDebugSymbols() const {
- return compiler_options_->GetIncludeDebugSymbols();
- }
-
CumulativeLogger* GetTimingsLogger() const {
return timings_logger_;
}
@@ -599,14 +595,6 @@ class CompilerDriver {
std::vector<uint8_t>* DeduplicateGCMap(const std::vector<uint8_t>& code);
std::vector<uint8_t>* DeduplicateCFIInfo(const std::vector<uint8_t>* cfi_info);
- /*
- * @brief return the pointer to the Call Frame Information.
- * @return pointer to call frame information for this compilation.
- */
- std::vector<uint8_t>* GetCallFrameInformation() const {
- return cfi_info_.get();
- }
-
ProfileFile profile_file_;
bool profile_present_;
@@ -658,12 +646,14 @@ class CompilerDriver {
ThreadPool* thread_pool, TimingLogger* timings)
LOCKS_EXCLUDED(Locks::mutator_lock_);
void ResolveDexFile(jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings)
LOCKS_EXCLUDED(Locks::mutator_lock_);
void Verify(jobject class_loader, const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings);
void VerifyDexFile(jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings)
LOCKS_EXCLUDED(Locks::mutator_lock_);
@@ -671,6 +661,7 @@ class CompilerDriver {
ThreadPool* thread_pool, TimingLogger* timings)
LOCKS_EXCLUDED(Locks::mutator_lock_);
void InitializeClasses(jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings)
LOCKS_EXCLUDED(Locks::mutator_lock_, compiled_classes_lock_);
@@ -681,6 +672,7 @@ class CompilerDriver {
void Compile(jobject class_loader, const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings);
void CompileDexFile(jobject class_loader, const DexFile& dex_file,
+ const std::vector<const DexFile*>& dex_files,
ThreadPool* thread_pool, TimingLogger* timings)
LOCKS_EXCLUDED(Locks::mutator_lock_);
void CompileMethod(const DexFile::CodeItem* code_item, uint32_t access_flags,
@@ -766,9 +758,6 @@ class CompilerDriver {
bool support_boot_image_fixup_;
- // Call Frame Information, which might be generated to help stack tracebacks.
- std::unique_ptr<std::vector<uint8_t>> cfi_info_;
-
// DeDuplication data structures, these own the corresponding byte arrays.
class DedupeHashFunc {
public:
diff --git a/compiler/elf_writer_quick.cc b/compiler/elf_writer_quick.cc
index 1fde12ecaf..71f02d3b5d 100644
--- a/compiler/elf_writer_quick.cc
+++ b/compiler/elf_writer_quick.cc
@@ -24,6 +24,7 @@
#include "elf_utils.h"
#include "file_output_stream.h"
#include "globals.h"
+#include "leb128.h"
#include "oat.h"
#include "oat_writer.h"
#include "utils.h"
@@ -38,6 +39,25 @@ static uint8_t MakeStInfo(uint8_t binding, uint8_t type) {
return ((binding) << 4) + ((type) & 0xf);
}
+static void UpdateWord(std::vector<uint8_t>* buf, int offset, int data) {
+ (*buf)[offset+0] = data;
+ (*buf)[offset+1] = data >> 8;
+ (*buf)[offset+2] = data >> 16;
+ (*buf)[offset+3] = data >> 24;
+}
+
+static void PushWord(std::vector<uint8_t>* buf, int data) {
+ buf->push_back(data & 0xff);
+ buf->push_back((data >> 8) & 0xff);
+ buf->push_back((data >> 16) & 0xff);
+ buf->push_back((data >> 24) & 0xff);
+}
+
+static void PushHalf(std::vector<uint8_t>* buf, int data) {
+ buf->push_back(data & 0xff);
+ buf->push_back((data >> 8) & 0xff);
+}
+
bool ElfWriterQuick::ElfBuilder::Write() {
// The basic layout of the elf file. Order may be different in final output.
// +-------------------------+
@@ -822,37 +842,131 @@ void ElfWriterQuick::ReservePatchSpace(std::vector<uint8_t>* buffer, bool debug)
}
}
+static void EncodeUnsignedLeb128(uint32_t data, std::vector<uint8_t>* dst) {
+ size_t encoded_size = UnsignedLeb128Size(data);
+ size_t cur_index = dst->size();
+ dst->resize(dst->size() + encoded_size);
+ uint8_t* write_pos = &((*dst)[cur_index]);
+ uint8_t* write_pos_after = EncodeUnsignedLeb128(write_pos, data);
+ DCHECK_EQ(static_cast<size_t>(write_pos_after - write_pos), encoded_size);
+}
+
+static void EncodeSignedLeb128(int32_t data, std::vector<uint8_t>* dst) {
+ size_t encoded_size = SignedLeb128Size(data);
+ size_t cur_index = dst->size();
+ dst->resize(dst->size() + encoded_size);
+ uint8_t* write_pos = &((*dst)[cur_index]);
+ uint8_t* write_pos_after = EncodeSignedLeb128(write_pos, data);
+ DCHECK_EQ(static_cast<size_t>(write_pos_after - write_pos), encoded_size);
+}
+
+std::vector<uint8_t>* ConstructCIEFrameX86(bool is_x86_64) {
+ std::vector<uint8_t>*cfi_info = new std::vector<uint8_t>;
+
+ // Length (will be filled in later in this routine).
+ PushWord(cfi_info, 0);
+
+ // CIE id: always 0.
+ PushWord(cfi_info, 0);
+
+ // Version: always 1.
+ cfi_info->push_back(0x01);
+
+ // Augmentation: 'zR\0'
+ cfi_info->push_back(0x7a);
+ cfi_info->push_back(0x52);
+ cfi_info->push_back(0x0);
+
+ // Code alignment: 1.
+ EncodeUnsignedLeb128(1, cfi_info);
+
+ // Data alignment.
+ if (is_x86_64) {
+ EncodeSignedLeb128(-8, cfi_info);
+ } else {
+ EncodeSignedLeb128(-4, cfi_info);
+ }
+
+ // Return address register.
+ if (is_x86_64) {
+ // R16(RIP)
+ cfi_info->push_back(0x10);
+ } else {
+ // R8(EIP)
+ cfi_info->push_back(0x08);
+ }
+
+ // Augmentation length: 1.
+ cfi_info->push_back(1);
+
+ // Augmentation data: 0x03 ((DW_EH_PE_absptr << 4) | DW_EH_PE_udata4).
+ cfi_info->push_back(0x03);
+
+ // Initial instructions.
+ if (is_x86_64) {
+ // DW_CFA_def_cfa R7(RSP) 8.
+ cfi_info->push_back(0x0c);
+ cfi_info->push_back(0x07);
+ cfi_info->push_back(0x08);
+
+ // DW_CFA_offset R16(RIP) 1 (* -8).
+ cfi_info->push_back(0x90);
+ cfi_info->push_back(0x01);
+ } else {
+ // DW_CFA_def_cfa R4(ESP) 4.
+ cfi_info->push_back(0x0c);
+ cfi_info->push_back(0x04);
+ cfi_info->push_back(0x04);
+
+ // DW_CFA_offset R8(EIP) 1 (* -4).
+ cfi_info->push_back(0x88);
+ cfi_info->push_back(0x01);
+ }
+
+ // Padding to a multiple of 4
+ while ((cfi_info->size() & 3) != 0) {
+ // DW_CFA_nop is encoded as 0.
+ cfi_info->push_back(0);
+ }
+
+ // Set the length of the CIE inside the generated bytes.
+ uint32_t length = cfi_info->size() - 4;
+ (*cfi_info)[0] = length;
+ (*cfi_info)[1] = length >> 8;
+ (*cfi_info)[2] = length >> 16;
+ (*cfi_info)[3] = length >> 24;
+ return cfi_info;
+}
+
+std::vector<uint8_t>* ConstructCIEFrame(InstructionSet isa) {
+ switch (isa) {
+ case kX86:
+ return ConstructCIEFrameX86(false);
+ case kX86_64:
+ return ConstructCIEFrameX86(true);
+
+ default:
+ // Not implemented.
+ return nullptr;
+ }
+}
+
bool ElfWriterQuick::Write(OatWriter* oat_writer,
const std::vector<const DexFile*>& dex_files_unused,
const std::string& android_root_unused,
bool is_host_unused) {
- const bool debug = false;
- const bool add_symbols = oat_writer->DidAddSymbols();
+ constexpr bool debug = false;
const OatHeader& oat_header = oat_writer->GetOatHeader();
Elf32_Word oat_data_size = oat_header.GetExecutableOffset();
uint32_t oat_exec_size = oat_writer->GetSize() - oat_data_size;
ElfBuilder builder(oat_writer, elf_file_, compiler_driver_->GetInstructionSet(), 0,
- oat_data_size, oat_data_size, oat_exec_size, add_symbols, debug);
+ oat_data_size, oat_data_size, oat_exec_size,
+ compiler_driver_->GetCompilerOptions().GetIncludeDebugSymbols(),
+ debug);
- if (add_symbols) {
- AddDebugSymbols(builder, oat_writer, debug);
- }
-
- bool generateDebugInformation = compiler_driver_->GetCallFrameInformation() != nullptr;
- if (generateDebugInformation) {
- ElfRawSectionBuilder debug_info(".debug_info", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
- ElfRawSectionBuilder debug_abbrev(".debug_abbrev", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
- ElfRawSectionBuilder debug_str(".debug_str", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
- ElfRawSectionBuilder eh_frame(".eh_frame", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, 4, 0);
- eh_frame.SetBuffer(*compiler_driver_->GetCallFrameInformation());
-
- FillInCFIInformation(oat_writer, debug_info.GetBuffer(),
- debug_abbrev.GetBuffer(), debug_str.GetBuffer());
- builder.RegisterRawSection(debug_info);
- builder.RegisterRawSection(debug_abbrev);
- builder.RegisterRawSection(eh_frame);
- builder.RegisterRawSection(debug_str);
+ if (compiler_driver_->GetCompilerOptions().GetIncludeDebugSymbols()) {
+ WriteDebugSymbols(builder, oat_writer);
}
if (compiler_driver_->GetCompilerOptions().GetIncludePatchInformation()) {
@@ -865,32 +979,62 @@ bool ElfWriterQuick::Write(OatWriter* oat_writer,
return builder.Write();
}
-void ElfWriterQuick::AddDebugSymbols(ElfBuilder& builder, OatWriter* oat_writer, bool debug) {
+void ElfWriterQuick::WriteDebugSymbols(ElfBuilder& builder, OatWriter* oat_writer) {
+ std::unique_ptr<std::vector<uint8_t>> cfi_info(
+ ConstructCIEFrame(compiler_driver_->GetInstructionSet()));
+
+ // Iterate over the compiled methods.
const std::vector<OatWriter::DebugInfo>& method_info = oat_writer->GetCFIMethodInfo();
ElfSymtabBuilder* symtab = &builder.symtab_builder_;
for (auto it = method_info.begin(); it != method_info.end(); ++it) {
symtab->AddSymbol(it->method_name_, &builder.text_builder_, it->low_pc_, true,
it->high_pc_ - it->low_pc_, STB_GLOBAL, STT_FUNC);
- }
-}
-static void UpdateWord(std::vector<uint8_t>*buf, int offset, int data) {
- (*buf)[offset+0] = data;
- (*buf)[offset+1] = data >> 8;
- (*buf)[offset+2] = data >> 16;
- (*buf)[offset+3] = data >> 24;
-}
+ // Include CFI for compiled method, if possible.
+ if (cfi_info.get() != nullptr) {
+ DCHECK(it->compiled_method_ != nullptr);
+
+ // Copy in the FDE, if present
+ const std::vector<uint8_t>* fde = it->compiled_method_->GetCFIInfo();
+ if (fde != nullptr) {
+ // Copy the information into cfi_info and then fix the address in the new copy.
+ int cur_offset = cfi_info->size();
+ cfi_info->insert(cfi_info->end(), fde->begin(), fde->end());
+
+ // Set the 'CIE_pointer' field to cur_offset+4.
+ uint32_t CIE_pointer = cur_offset + 4;
+ uint32_t offset_to_update = cur_offset + sizeof(uint32_t);
+ (*cfi_info)[offset_to_update+0] = CIE_pointer;
+ (*cfi_info)[offset_to_update+1] = CIE_pointer >> 8;
+ (*cfi_info)[offset_to_update+2] = CIE_pointer >> 16;
+ (*cfi_info)[offset_to_update+3] = CIE_pointer >> 24;
+
+ // Set the 'initial_location' field to address the start of the method.
+ offset_to_update = cur_offset + 2*sizeof(uint32_t);
+ const uint32_t quick_code_start = it->low_pc_;
+ (*cfi_info)[offset_to_update+0] = quick_code_start;
+ (*cfi_info)[offset_to_update+1] = quick_code_start >> 8;
+ (*cfi_info)[offset_to_update+2] = quick_code_start >> 16;
+ (*cfi_info)[offset_to_update+3] = quick_code_start >> 24;
+ }
+ }
+ }
-static void PushWord(std::vector<uint8_t>*buf, int data) {
- buf->push_back(data & 0xff);
- buf->push_back((data >> 8) & 0xff);
- buf->push_back((data >> 16) & 0xff);
- buf->push_back((data >> 24) & 0xff);
-}
+ if (cfi_info.get() != nullptr) {
+ // Now lay down the Elf sections.
+ ElfRawSectionBuilder debug_info(".debug_info", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
+ ElfRawSectionBuilder debug_abbrev(".debug_abbrev", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
+ ElfRawSectionBuilder debug_str(".debug_str", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
+ ElfRawSectionBuilder eh_frame(".eh_frame", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, 4, 0);
+ eh_frame.SetBuffer(std::move(*cfi_info.get()));
-static void PushHalf(std::vector<uint8_t>*buf, int data) {
- buf->push_back(data & 0xff);
- buf->push_back((data >> 8) & 0xff);
+ FillInCFIInformation(oat_writer, debug_info.GetBuffer(), debug_abbrev.GetBuffer(),
+ debug_str.GetBuffer());
+ builder.RegisterRawSection(debug_info);
+ builder.RegisterRawSection(debug_abbrev);
+ builder.RegisterRawSection(eh_frame);
+ builder.RegisterRawSection(debug_str);
+ }
}
void ElfWriterQuick::FillInCFIInformation(OatWriter* oat_writer,
diff --git a/compiler/elf_writer_quick.h b/compiler/elf_writer_quick.h
index a0d36df471..8cfe550495 100644
--- a/compiler/elf_writer_quick.h
+++ b/compiler/elf_writer_quick.h
@@ -48,9 +48,7 @@ class ElfWriterQuick FINAL : public ElfWriter {
~ElfWriterQuick() {}
class ElfBuilder;
- void AddDebugSymbols(ElfBuilder& builder,
- OatWriter* oat_writer,
- bool debug);
+ void WriteDebugSymbols(ElfBuilder& builder, OatWriter* oat_writer);
void ReservePatchSpace(std::vector<uint8_t>* buffer, bool debug);
class ElfSectionBuilder {
@@ -126,7 +124,7 @@ class ElfWriterQuick FINAL : public ElfWriter {
: ElfSectionBuilder(sec_name, type, flags, link, info, align, entsize) {}
~ElfRawSectionBuilder() {}
std::vector<uint8_t>* GetBuffer() { return &buf_; }
- void SetBuffer(std::vector<uint8_t> buf) { buf_ = buf; }
+ void SetBuffer(std::vector<uint8_t>&& buf) { buf_ = buf; }
protected:
std::vector<uint8_t> buf_;
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index 9da59ab7ff..1ba5d3218e 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -357,7 +357,6 @@ class OatWriter::InitCodeMethodVisitor : public OatDexMethodVisitor {
uint32_t thumb_offset = compiled_method->CodeDelta();
quick_code_offset = offset_ + sizeof(OatQuickMethodHeader) + thumb_offset;
- bool force_debug_capture = false;
bool deduped = false;
// Deduplicate code arrays.
@@ -400,47 +399,22 @@ class OatWriter::InitCodeMethodVisitor : public OatDexMethodVisitor {
offset_ += code_size;
}
- uint32_t quick_code_start = quick_code_offset - writer_->oat_header_->GetExecutableOffset();
- std::vector<uint8_t>* cfi_info = writer_->compiler_driver_->GetCallFrameInformation();
- if (cfi_info != nullptr) {
- // Copy in the FDE, if present
- const std::vector<uint8_t>* fde = compiled_method->GetCFIInfo();
- if (fde != nullptr) {
- // Copy the information into cfi_info and then fix the address in the new copy.
- int cur_offset = cfi_info->size();
- cfi_info->insert(cfi_info->end(), fde->begin(), fde->end());
-
- // Set the 'CIE_pointer' field to cur_offset+4.
- uint32_t CIE_pointer = cur_offset + 4;
- uint32_t offset_to_update = cur_offset + sizeof(uint32_t);
- (*cfi_info)[offset_to_update+0] = CIE_pointer;
- (*cfi_info)[offset_to_update+1] = CIE_pointer >> 8;
- (*cfi_info)[offset_to_update+2] = CIE_pointer >> 16;
- (*cfi_info)[offset_to_update+3] = CIE_pointer >> 24;
-
- // Set the 'initial_location' field to address the start of the method.
- offset_to_update = cur_offset + 2*sizeof(uint32_t);
- (*cfi_info)[offset_to_update+0] = quick_code_start;
- (*cfi_info)[offset_to_update+1] = quick_code_start >> 8;
- (*cfi_info)[offset_to_update+2] = quick_code_start >> 16;
- (*cfi_info)[offset_to_update+3] = quick_code_start >> 24;
- force_debug_capture = true;
- }
- }
+ if (writer_->compiler_driver_->GetCompilerOptions().GetIncludeDebugSymbols()) {
+ // Record debug information for this function if we are doing that.
-
- if (writer_->compiler_driver_->DidIncludeDebugSymbols() || force_debug_capture) {
- // Record debug information for this function if we are doing that or
- // we have CFI and so need it.
std::string name = PrettyMethod(it.GetMemberIndex(), *dex_file_, true);
if (deduped) {
- // TODO We should place the DEDUPED tag on the first instance of a
- // deduplicated symbol so that it will show up in a debuggerd crash
- // report.
+ // TODO We should place the DEDUPED tag on the first instance of a deduplicated symbol
+ // so that it will show up in a debuggerd crash report.
name += " [ DEDUPED ]";
}
- writer_->method_info_.push_back(DebugInfo(name, quick_code_start,
- quick_code_start + code_size));
+
+ const uint32_t quick_code_start = quick_code_offset -
+ writer_->oat_header_->GetExecutableOffset();
+ writer_->method_info_.push_back(DebugInfo(name,
+ quick_code_start,
+ quick_code_start + code_size,
+ compiled_method));
}
}
diff --git a/compiler/oat_writer.h b/compiler/oat_writer.h
index 945048ecb7..ef5fd6b1c2 100644
--- a/compiler/oat_writer.h
+++ b/compiler/oat_writer.h
@@ -30,6 +30,7 @@
namespace art {
class BitVector;
+class CompiledMethod;
class OutputStream;
// OatHeader variable length with count of D OatDexFiles
@@ -97,22 +98,21 @@ class OatWriter {
~OatWriter();
struct DebugInfo {
- DebugInfo(const std::string& method_name, uint32_t low_pc, uint32_t high_pc)
- : method_name_(method_name), low_pc_(low_pc), high_pc_(high_pc) {
+ DebugInfo(const std::string& method_name, uint32_t low_pc, uint32_t high_pc,
+ CompiledMethod* compiled_method)
+ : method_name_(method_name), low_pc_(low_pc), high_pc_(high_pc),
+ compiled_method_(compiled_method) {
}
- std::string method_name_;
+ std::string method_name_; // Note: this name is a pretty-printed name.
uint32_t low_pc_;
uint32_t high_pc_;
+ CompiledMethod* compiled_method_;
};
const std::vector<DebugInfo>& GetCFIMethodInfo() const {
return method_info_;
}
- bool DidAddSymbols() const {
- return compiler_driver_->DidIncludeDebugSymbols();
- }
-
private:
// The DataAccess classes are helper classes that provide access to members related to
// a given map, i.e. GC map, mapping table or vmap table. By abstracting these away
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 1d6655c24e..2f814dfaf4 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -124,8 +124,8 @@ class Operand {
if (index.NeedsRex()) {
rex_ |= 0x42; // REX.00X0
}
- encoding_[1] = (scale << 6) | (static_cast<uint8_t>(index.AsRegister()) << 3) |
- static_cast<uint8_t>(base.AsRegister());
+ encoding_[1] = (scale << 6) | (static_cast<uint8_t>(index.LowBits()) << 3) |
+ static_cast<uint8_t>(base.LowBits());
length_ = 2;
}
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 1f4d727abb..4ed7b2015a 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -128,9 +128,18 @@ TEST_F(AssemblerX86_64Test, XorqImm) {
TEST_F(AssemblerX86_64Test, Movl) {
GetAssembler()->movl(x86_64::CpuRegister(x86_64::R8), x86_64::CpuRegister(x86_64::R11));
GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::CpuRegister(x86_64::R11));
+ GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::Address(
+ x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
+ GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::Address(
+ x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
+ GetAssembler()->movl(x86_64::CpuRegister(x86_64::R8), x86_64::Address(
+ x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
const char* expected =
"movl %R11d, %R8d\n"
- "movl %R11d, %EAX\n";
+ "movl %R11d, %EAX\n"
+ "movl 0xc(%RDI,%RBX,4), %EAX\n"
+ "movl 0xc(%RDI,%R9,4), %EAX\n"
+ "movl 0xc(%RDI,%R9,4), %R8d\n";
DriverStr(expected, "movl");
}
diff --git a/runtime/Android.mk b/runtime/Android.mk
index 8fc5e343e7..302e835e71 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -252,6 +252,7 @@ LIBART_SRC_FILES_x86_64 := \
arch/x86_64/context_x86_64.cc \
arch/x86_64/entrypoints_init_x86_64.cc \
arch/x86_64/jni_entrypoints_x86_64.S \
+ arch/x86_64/memcmp16_x86_64.S \
arch/x86_64/portable_entrypoints_x86_64.S \
arch/x86_64/quick_entrypoints_x86_64.S \
arch/x86_64/thread_x86_64.cc \
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 4939610e60..86cb16aab5 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -365,8 +365,9 @@ END art_quick_invoke_stub
ARM_ENTRY art_quick_do_long_jump
vldm r1, {s0-s31} @ load all fprs from argument fprs_
ldr r2, [r0, #60] @ r2 = r15 (PC from gprs_ 60=4*15)
+ ldr r14, [r0, #56] @ (LR from gprs_ 56=4*14)
add r0, r0, #12 @ increment r0 to skip gprs_[0..2] 12=4*3
- ldm r0, {r3-r14} @ load remaining gprs from argument gprs_
+ ldm r0, {r3-r13} @ load remaining gprs from argument gprs_
mov r0, #0 @ clear result registers r0 and r1
mov r1, #0
bx r2 @ do long jump
diff --git a/runtime/arch/memcmp16.h b/runtime/arch/memcmp16.h
index 65d2f92d74..14dc1e3880 100644
--- a/runtime/arch/memcmp16.h
+++ b/runtime/arch/memcmp16.h
@@ -30,7 +30,7 @@
//
// In both cases, MemCmp16 is declared.
-#if defined(__aarch64__) || defined(__arm__) || defined(__mips) || defined(__i386__)
+#if defined(__aarch64__) || defined(__arm__) || defined(__mips) || defined(__i386__) || defined(__x86_64__)
extern "C" uint32_t __memcmp16(const uint16_t* s0, const uint16_t* s1, size_t count);
#define MemCmp16 __memcmp16
diff --git a/runtime/arch/x86/memcmp16_x86.S b/runtime/arch/x86/memcmp16_x86.S
index 17662fae6b..a315a378ea 100644
--- a/runtime/arch/x86/memcmp16_x86.S
+++ b/runtime/arch/x86/memcmp16_x86.S
@@ -21,1018 +21,1018 @@
/* int32_t memcmp16_compare(const uint16_t* s0, const uint16_t* s1, size_t count); */
#ifndef L
-# define L(label) .L##label
+# define L(label) .L##label
#endif
-#define CFI_PUSH(REG) \
- CFI_ADJUST_CFA_OFFSET(4); \
- CFI_REL_OFFSET(REG, 0)
+#define CFI_PUSH(REG) \
+ CFI_ADJUST_CFA_OFFSET(4); \
+ CFI_REL_OFFSET(REG, 0)
-#define CFI_POP(REG) \
- CFI_ADJUST_CFA_OFFSET(-4); \
- CFI_RESTORE(REG)
+#define CFI_POP(REG) \
+ CFI_ADJUST_CFA_OFFSET(-4); \
+ CFI_RESTORE(REG)
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
-#define PARMS 4
-#define BLK1 PARMS
-#define BLK2 BLK1+4
-#define LEN BLK2+4
-#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
-#define RETURN RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE
+#define PARMS 4
+#define BLK1 PARMS
+#define BLK2 BLK1+4
+#define LEN BLK2+4
+#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
+#define RETURN RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE
DEFINE_FUNCTION MEMCMP
- movl LEN(%esp), %ecx
+ movl LEN(%esp), %ecx
- shl $1, %ecx
- jz L(zero)
+ shl $1, %ecx
+ jz L(zero)
- movl BLK1(%esp), %eax
- cmp $48, %ecx
- movl BLK2(%esp), %edx
- jae L(48bytesormore)
+ movl BLK1(%esp), %eax
+ cmp $48, %ecx
+ movl BLK2(%esp), %edx
+ jae L(48bytesormore)
- PUSH (%ebx)
- add %ecx, %edx
- add %ecx, %eax
- jmp L(less48bytes)
+ PUSH (%ebx)
+ add %ecx, %edx
+ add %ecx, %eax
+ jmp L(less48bytes)
- CFI_POP (%ebx)
+ CFI_POP (%ebx)
- .p2align 4
+ .p2align 4
L(zero):
- xor %eax, %eax
- ret
+ xor %eax, %eax
+ ret
- .p2align 4
+ .p2align 4
L(48bytesormore):
- PUSH (%ebx)
- PUSH (%esi)
- PUSH (%edi)
- CFI_REMEMBER_STATE
- movdqu (%eax), %xmm3
- movdqu (%edx), %xmm0
- movl %eax, %edi
- movl %edx, %esi
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
- lea 16(%edi), %edi
-
- sub $0xffff, %edx
- lea 16(%esi), %esi
- jnz L(less16bytes)
- mov %edi, %edx
- and $0xf, %edx
- xor %edx, %edi
- sub %edx, %esi
- add %edx, %ecx
- mov %esi, %edx
- and $0xf, %edx
- jz L(shr_0)
- xor %edx, %esi
-
- cmp $0, %edx
- je L(shr_0)
- cmp $2, %edx
- je L(shr_2)
- cmp $4, %edx
- je L(shr_4)
- cmp $6, %edx
- je L(shr_6)
- cmp $8, %edx
- je L(shr_8)
- cmp $10, %edx
- je L(shr_10)
- cmp $12, %edx
- je L(shr_12)
- jmp L(shr_14)
-
- .p2align 4
+ PUSH (%ebx)
+ PUSH (%esi)
+ PUSH (%edi)
+ CFI_REMEMBER_STATE
+ movdqu (%eax), %xmm3
+ movdqu (%edx), %xmm0
+ movl %eax, %edi
+ movl %edx, %esi
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%edi), %edi
+
+ sub $0xffff, %edx
+ lea 16(%esi), %esi
+ jnz L(less16bytes)
+ mov %edi, %edx
+ and $0xf, %edx
+ xor %edx, %edi
+ sub %edx, %esi
+ add %edx, %ecx
+ mov %esi, %edx
+ and $0xf, %edx
+ jz L(shr_0)
+ xor %edx, %esi
+
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $2, %edx
+ je L(shr_2)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $6, %edx
+ je L(shr_6)
+ cmp $8, %edx
+ je L(shr_8)
+ cmp $10, %edx
+ je L(shr_10)
+ cmp $12, %edx
+ je L(shr_12)
+ jmp L(shr_14)
+
+ .p2align 4
L(shr_0):
- cmp $80, %ecx
- jae L(shr_0_gobble)
- lea -48(%ecx), %ecx
- xor %eax, %eax
- movaps (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
- movaps 16(%esi), %xmm2
- pcmpeqb 16(%edi), %xmm2
- pand %xmm1, %xmm2
- pmovmskb %xmm2, %edx
- add $32, %edi
- add $32, %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ jae L(shr_0_gobble)
+ lea -48(%ecx), %ecx
+ xor %eax, %eax
+ movaps (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+ movaps 16(%esi), %xmm2
+ pcmpeqb 16(%edi), %xmm2
+ pand %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
+ add $32, %edi
+ add $32, %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea (%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_0_gobble):
- lea -48(%ecx), %ecx
- movdqa (%esi), %xmm0
- xor %eax, %eax
- pcmpeqb (%edi), %xmm0
- sub $32, %ecx
- movdqa 16(%esi), %xmm2
- pcmpeqb 16(%edi), %xmm2
+ lea -48(%ecx), %ecx
+ movdqa (%esi), %xmm0
+ xor %eax, %eax
+ pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm2
+ pcmpeqb 16(%edi), %xmm2
L(shr_0_gobble_loop):
- pand %xmm0, %xmm2
- sub $32, %ecx
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- movdqa 32(%esi), %xmm0
- movdqa 48(%esi), %xmm2
- sbb $0xffff, %edx
- pcmpeqb 32(%edi), %xmm0
- pcmpeqb 48(%edi), %xmm2
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- jz L(shr_0_gobble_loop)
-
- pand %xmm0, %xmm2
- cmp $0, %ecx
- jge L(shr_0_gobble_loop_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm2
+ sub $32, %ecx
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ movdqa 32(%esi), %xmm0
+ movdqa 48(%esi), %xmm2
+ sbb $0xffff, %edx
+ pcmpeqb 32(%edi), %xmm0
+ pcmpeqb 48(%edi), %xmm2
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ jz L(shr_0_gobble_loop)
+
+ pand %xmm0, %xmm2
+ cmp $0, %ecx
+ jge L(shr_0_gobble_loop_next)
+ inc %edx
+ add $32, %ecx
L(shr_0_gobble_loop_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea (%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_2):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_2_gobble)
-
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $2,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
-
- movdqa 32(%esi), %xmm3
- palignr $2,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_2_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $2,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $2,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 2(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_2_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $2,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $2,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $2,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $2,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_2_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%esi), %xmm3
- palignr $2,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $2,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
-
- lea 32(%edi), %edi
- jz L(shr_2_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %ecx
- jge L(shr_2_gobble_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $2,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $2,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_2_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_2_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_2_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 2(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_4):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_4_gobble)
-
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $4,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
-
- movdqa 32(%esi), %xmm3
- palignr $4,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_4_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $4,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $4,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 4(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_4_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $4,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $4,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $4,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $4,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_4_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%esi), %xmm3
- palignr $4,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $4,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
-
- lea 32(%edi), %edi
- jz L(shr_4_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %ecx
- jge L(shr_4_gobble_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $4,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $4,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_4_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_4_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_4_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 4(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_6):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_6_gobble)
-
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $6,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
-
- movdqa 32(%esi), %xmm3
- palignr $6,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_6_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $6,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $6,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 6(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_6_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $6,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $6,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $6,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $6,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_6_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%esi), %xmm3
- palignr $6,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $6,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
-
- lea 32(%edi), %edi
- jz L(shr_6_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %ecx
- jge L(shr_6_gobble_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $6,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $6,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_6_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_6_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_6_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 6(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_8):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_8_gobble)
-
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $8,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
-
- movdqa 32(%esi), %xmm3
- palignr $8,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_8_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $8,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $8,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 8(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_8_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $8,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $8,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $8,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $8,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_8_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%esi), %xmm3
- palignr $8,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $8,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
-
- lea 32(%edi), %edi
- jz L(shr_8_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %ecx
- jge L(shr_8_gobble_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $8,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $8,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_8_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_8_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_8_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 8(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_10):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_10_gobble)
-
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $10, (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
-
- movdqa 32(%esi), %xmm3
- palignr $10,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_10_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $10, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $10,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 10(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_10_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $10, (%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $10, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $10, 16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $10, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_10_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%esi), %xmm3
- palignr $10,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $10,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
-
- lea 32(%edi), %edi
- jz L(shr_10_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %ecx
- jge L(shr_10_gobble_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $10,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $10,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_10_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_10_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_10_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 10(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_12):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_12_gobble)
-
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $12, (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
-
- movdqa 32(%esi), %xmm3
- palignr $12, %xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_12_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $12, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $12, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 12(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_12_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $12, (%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $12, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $12, 16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $12, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_12_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%esi), %xmm3
- palignr $12,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $12,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
-
- lea 32(%edi), %edi
- jz L(shr_12_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %ecx
- jge L(shr_12_gobble_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $12,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $12,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_12_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_12_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_12_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 12(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_14):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_14_gobble)
-
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $14, (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
-
- movdqa 32(%esi), %xmm3
- palignr $14, %xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_14_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $14, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $14, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 14(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_14_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $14, (%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $14, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $14, 16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $14, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_14_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%esi), %xmm3
- palignr $14,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $14,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
-
- lea 32(%edi), %edi
- jz L(shr_14_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %ecx
- jge L(shr_14_gobble_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $14,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $14,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_14_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_14_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_14_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea (%ecx, %edi,1), %eax
- lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
-
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 14(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(exit):
- pmovmskb %xmm1, %ebx
- sub $0xffff, %ebx
- jz L(first16bytes)
- lea -16(%esi), %esi
- lea -16(%edi), %edi
- mov %ebx, %edx
+ pmovmskb %xmm1, %ebx
+ sub $0xffff, %ebx
+ jz L(first16bytes)
+ lea -16(%esi), %esi
+ lea -16(%edi), %edi
+ mov %ebx, %edx
L(first16bytes):
- add %eax, %esi
+ add %eax, %esi
L(less16bytes):
- test %dl, %dl
- jz L(next_four_words)
- test $15, %dl
- jz L(second_two_words)
- test $3, %dl
- jz L(second_word)
- movzwl -16(%edi), %eax
- movzwl -16(%esi), %ebx
- subl %ebx, %eax
- RETURN
-
- .p2align 4
+ test %dl, %dl
+ jz L(next_four_words)
+ test $15, %dl
+ jz L(second_two_words)
+ test $3, %dl
+ jz L(second_word)
+ movzwl -16(%edi), %eax
+ movzwl -16(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
+
+ .p2align 4
L(second_word):
- movzwl -14(%edi), %eax
- movzwl -14(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -14(%edi), %eax
+ movzwl -14(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(second_two_words):
- test $63, %dl
- jz L(fourth_word)
- movzwl -12(%edi), %eax
- movzwl -12(%esi), %ebx
- subl %ebx, %eax
- RETURN
-
- .p2align 4
+ test $63, %dl
+ jz L(fourth_word)
+ movzwl -12(%edi), %eax
+ movzwl -12(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
+
+ .p2align 4
L(fourth_word):
- movzwl -10(%edi), %eax
- movzwl -10(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -10(%edi), %eax
+ movzwl -10(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(next_four_words):
- test $15, %dh
- jz L(fourth_two_words)
- test $3, %dh
- jz L(sixth_word)
- movzwl -8(%edi), %eax
- movzwl -8(%esi), %ebx
- subl %ebx, %eax
- RETURN
-
- .p2align 4
+ test $15, %dh
+ jz L(fourth_two_words)
+ test $3, %dh
+ jz L(sixth_word)
+ movzwl -8(%edi), %eax
+ movzwl -8(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
+
+ .p2align 4
L(sixth_word):
- movzwl -6(%edi), %eax
- movzwl -6(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -6(%edi), %eax
+ movzwl -6(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(fourth_two_words):
- test $63, %dh
- jz L(eighth_word)
- movzwl -4(%edi), %eax
- movzwl -4(%esi), %ebx
- subl %ebx, %eax
- RETURN
-
- .p2align 4
+ test $63, %dh
+ jz L(eighth_word)
+ movzwl -4(%edi), %eax
+ movzwl -4(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
+
+ .p2align 4
L(eighth_word):
- movzwl -2(%edi), %eax
- movzwl -2(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -2(%edi), %eax
+ movzwl -2(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- CFI_PUSH (%ebx)
+ CFI_PUSH (%ebx)
- .p2align 4
+ .p2align 4
L(more8bytes):
- cmp $16, %ecx
- jae L(more16bytes)
- cmp $8, %ecx
- je L(8bytes)
- cmp $10, %ecx
- je L(10bytes)
- cmp $12, %ecx
- je L(12bytes)
- jmp L(14bytes)
-
- .p2align 4
+ cmp $16, %ecx
+ jae L(more16bytes)
+ cmp $8, %ecx
+ je L(8bytes)
+ cmp $10, %ecx
+ je L(10bytes)
+ cmp $12, %ecx
+ je L(12bytes)
+ jmp L(14bytes)
+
+ .p2align 4
L(more16bytes):
- cmp $24, %ecx
- jae L(more24bytes)
- cmp $16, %ecx
- je L(16bytes)
- cmp $18, %ecx
- je L(18bytes)
- cmp $20, %ecx
- je L(20bytes)
- jmp L(22bytes)
-
- .p2align 4
+ cmp $24, %ecx
+ jae L(more24bytes)
+ cmp $16, %ecx
+ je L(16bytes)
+ cmp $18, %ecx
+ je L(18bytes)
+ cmp $20, %ecx
+ je L(20bytes)
+ jmp L(22bytes)
+
+ .p2align 4
L(more24bytes):
- cmp $32, %ecx
- jae L(more32bytes)
- cmp $24, %ecx
- je L(24bytes)
- cmp $26, %ecx
- je L(26bytes)
- cmp $28, %ecx
- je L(28bytes)
- jmp L(30bytes)
-
- .p2align 4
+ cmp $32, %ecx
+ jae L(more32bytes)
+ cmp $24, %ecx
+ je L(24bytes)
+ cmp $26, %ecx
+ je L(26bytes)
+ cmp $28, %ecx
+ je L(28bytes)
+ jmp L(30bytes)
+
+ .p2align 4
L(more32bytes):
- cmp $40, %ecx
- jae L(more40bytes)
- cmp $32, %ecx
- je L(32bytes)
- cmp $34, %ecx
- je L(34bytes)
- cmp $36, %ecx
- je L(36bytes)
- jmp L(38bytes)
-
- .p2align 4
+ cmp $40, %ecx
+ jae L(more40bytes)
+ cmp $32, %ecx
+ je L(32bytes)
+ cmp $34, %ecx
+ je L(34bytes)
+ cmp $36, %ecx
+ je L(36bytes)
+ jmp L(38bytes)
+
+ .p2align 4
L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $4, %ecx
- je L(4bytes)
- jmp L(6bytes)
-
- .p2align 4
+ cmp $8, %ecx
+ jae L(more8bytes)
+ cmp $2, %ecx
+ je L(2bytes)
+ cmp $4, %ecx
+ je L(4bytes)
+ jmp L(6bytes)
+
+ .p2align 4
L(more40bytes):
- cmp $40, %ecx
- je L(40bytes)
- cmp $42, %ecx
- je L(42bytes)
- cmp $44, %ecx
- je L(44bytes)
- jmp L(46bytes)
-
- .p2align 4
+ cmp $40, %ecx
+ je L(40bytes)
+ cmp $42, %ecx
+ je L(42bytes)
+ cmp $44, %ecx
+ je L(44bytes)
+ jmp L(46bytes)
+
+ .p2align 4
L(46bytes):
- movzwl -46(%eax), %ecx
- movzwl -46(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -46(%eax), %ecx
+ movzwl -46(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(44bytes):
- movzwl -44(%eax), %ecx
- movzwl -44(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -44(%eax), %ecx
+ movzwl -44(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(42bytes):
- movzwl -42(%eax), %ecx
- movzwl -42(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -42(%eax), %ecx
+ movzwl -42(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(40bytes):
- movzwl -40(%eax), %ecx
- movzwl -40(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -40(%eax), %ecx
+ movzwl -40(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(38bytes):
- movzwl -38(%eax), %ecx
- movzwl -38(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -38(%eax), %ecx
+ movzwl -38(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(36bytes):
- movzwl -36(%eax), %ecx
- movzwl -36(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -36(%eax), %ecx
+ movzwl -36(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(34bytes):
- movzwl -34(%eax), %ecx
- movzwl -34(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -34(%eax), %ecx
+ movzwl -34(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(32bytes):
- movzwl -32(%eax), %ecx
- movzwl -32(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -32(%eax), %ecx
+ movzwl -32(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(30bytes):
- movzwl -30(%eax), %ecx
- movzwl -30(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -30(%eax), %ecx
+ movzwl -30(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(28bytes):
- movzwl -28(%eax), %ecx
- movzwl -28(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -28(%eax), %ecx
+ movzwl -28(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(26bytes):
- movzwl -26(%eax), %ecx
- movzwl -26(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -26(%eax), %ecx
+ movzwl -26(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(24bytes):
- movzwl -24(%eax), %ecx
- movzwl -24(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -24(%eax), %ecx
+ movzwl -24(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(22bytes):
- movzwl -22(%eax), %ecx
- movzwl -22(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -22(%eax), %ecx
+ movzwl -22(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(20bytes):
- movzwl -20(%eax), %ecx
- movzwl -20(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -20(%eax), %ecx
+ movzwl -20(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(18bytes):
- movzwl -18(%eax), %ecx
- movzwl -18(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -18(%eax), %ecx
+ movzwl -18(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(16bytes):
- movzwl -16(%eax), %ecx
- movzwl -16(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -16(%eax), %ecx
+ movzwl -16(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(14bytes):
- movzwl -14(%eax), %ecx
- movzwl -14(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -14(%eax), %ecx
+ movzwl -14(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(12bytes):
- movzwl -12(%eax), %ecx
- movzwl -12(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -12(%eax), %ecx
+ movzwl -12(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(10bytes):
- movzwl -10(%eax), %ecx
- movzwl -10(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -10(%eax), %ecx
+ movzwl -10(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(8bytes):
- movzwl -8(%eax), %ecx
- movzwl -8(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -8(%eax), %ecx
+ movzwl -8(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(6bytes):
- movzwl -6(%eax), %ecx
- movzwl -6(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -6(%eax), %ecx
+ movzwl -6(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(4bytes):
- movzwl -4(%eax), %ecx
- movzwl -4(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -4(%eax), %ecx
+ movzwl -4(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(2bytes):
- movzwl -2(%eax), %eax
- movzwl -2(%edx), %ebx
- subl %ebx, %eax
- POP (%ebx)
- ret
- CFI_PUSH (%ebx)
-
- .p2align 4
+ movzwl -2(%eax), %eax
+ movzwl -2(%edx), %ebx
+ subl %ebx, %eax
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+
+ .p2align 4
L(memcmp16_exit):
- POP (%ebx)
- mov %ecx, %eax
- ret
+ POP (%ebx)
+ mov %ecx, %eax
+ ret
END_FUNCTION MEMCMP
diff --git a/runtime/arch/x86_64/memcmp16_x86_64.S b/runtime/arch/x86_64/memcmp16_x86_64.S
new file mode 100755
index 0000000000..46e4ba36cf
--- /dev/null
+++ b/runtime/arch/x86_64/memcmp16_x86_64.S
@@ -0,0 +1,1210 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "asm_support_x86_64.S"
+
+#define MEMCMP __memcmp16
+
+/*
+ * Half of Silvermont L1 Data Cache size
+ *(see original file cache.h in bionic/libc/arch-x86_64/).
+ * This value is used for specific optimization on big lengths.
+ */
+#define DATA_CACHE_SIZE_HALF (12*1024)
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+#define JMPTBL(I, B) (I - B)
+
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), %rcx; \
+ add %r11, %rcx; \
+ jmp *%rcx; \
+ ud2
+
+DEFINE_FUNCTION MEMCMP
+ pxor %xmm0, %xmm0
+ shl $1, %rdx
+ cmp $79, %rdx
+ ja L(79bytesormore)
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(79bytesormore):
+ movdqu (%rsi), %xmm1
+ movdqu (%rdi), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+ mov %rsi, %rcx
+ and $-16, %rsi
+ add $16, %rsi
+ sub %rsi, %rcx
+
+ sub %rcx, %rdi
+ add %rcx, %rdx
+ test $0xf, %rdi
+ jz L(2aligned)
+
+ cmp $128, %rdx
+ ja L(128bytesormore)
+L(less128bytes):
+ sub $64, %rdx
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+ cmp $32, %rdx
+ jb L(less32bytesin64)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin64):
+ add $64, %rdi
+ add $64, %rsi
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+L(128bytesormore):
+ cmp $512, %rdx
+ ja L(512bytesormore)
+ cmp $256, %rdx
+ ja L(less512bytes)
+L(less256bytes):
+ sub $128, %rdx
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqu 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqu 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ add $128, %rsi
+ add $128, %rdi
+
+ cmp $64, %rdx
+ jae L(less128bytes)
+
+ cmp $32, %rdx
+ jb L(less32bytesin128)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin128):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+L(less512bytes):
+ sub $256, %rdx
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqu 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqu 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ movdqu 128(%rdi), %xmm2
+ pxor 128(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(144bytesin256)
+
+ movdqu 144(%rdi), %xmm2
+ pxor 144(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(160bytesin256)
+
+ movdqu 160(%rdi), %xmm2
+ pxor 160(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(176bytesin256)
+
+ movdqu 176(%rdi), %xmm2
+ pxor 176(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(192bytesin256)
+
+ movdqu 192(%rdi), %xmm2
+ pxor 192(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(208bytesin256)
+
+ movdqu 208(%rdi), %xmm2
+ pxor 208(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(224bytesin256)
+
+ movdqu 224(%rdi), %xmm2
+ pxor 224(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(240bytesin256)
+
+ movdqu 240(%rdi), %xmm2
+ pxor 240(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(256bytesin256)
+
+ add $256, %rsi
+ add $256, %rdi
+
+ cmp $128, %rdx
+ jae L(less256bytes)
+
+ cmp $64, %rdx
+ jae L(less128bytes)
+
+ cmp $32, %rdx
+ jb L(less32bytesin256)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin256):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(512bytesormore):
+#ifdef DATA_CACHE_SIZE_HALF
+ mov $DATA_CACHE_SIZE_HALF, %r8
+#else
+ mov __x86_64_data_cache_size_half(%rip), %r8
+#endif
+ mov %r8, %r9
+ shr $1, %r8
+ add %r9, %r8
+ cmp %r8, %rdx
+ ja L(L2_L3_cache_unaglined)
+ sub $64, %rdx
+ ALIGN (4)
+L(64bytesormore_loop):
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqu 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqu 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqu 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(64bytesormore_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+L(L2_L3_cache_unaglined):
+ sub $64, %rdx
+ ALIGN (4)
+L(L2_L3_unaligned_128bytes_loop):
+ prefetchnta 0x1c0(%rdi)
+ prefetchnta 0x1c0(%rsi)
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqu 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqu 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqu 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(L2_L3_unaligned_128bytes_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+/*
+ * This case is for machines which are sensitive for unaligned instructions.
+ */
+ ALIGN (4)
+L(2aligned):
+ cmp $128, %rdx
+ ja L(128bytesormorein2aligned)
+L(less128bytesin2aligned):
+ sub $64, %rdx
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+ cmp $32, %rdx
+ jb L(less32bytesin64in2alinged)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin64in2alinged):
+ add $64, %rdi
+ add $64, %rsi
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(128bytesormorein2aligned):
+ cmp $512, %rdx
+ ja L(512bytesormorein2aligned)
+ cmp $256, %rdx
+ ja L(256bytesormorein2aligned)
+L(less256bytesin2alinged):
+ sub $128, %rdx
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqa 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqa 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ add $128, %rsi
+ add $128, %rdi
+
+ cmp $64, %rdx
+ jae L(less128bytesin2aligned)
+
+ cmp $32, %rdx
+ jb L(less32bytesin128in2aligned)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin128in2aligned):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(256bytesormorein2aligned):
+
+ sub $256, %rdx
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqa 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqa 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ movdqa 128(%rdi), %xmm2
+ pxor 128(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(144bytesin256)
+
+ movdqa 144(%rdi), %xmm2
+ pxor 144(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(160bytesin256)
+
+ movdqa 160(%rdi), %xmm2
+ pxor 160(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(176bytesin256)
+
+ movdqa 176(%rdi), %xmm2
+ pxor 176(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(192bytesin256)
+
+ movdqa 192(%rdi), %xmm2
+ pxor 192(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(208bytesin256)
+
+ movdqa 208(%rdi), %xmm2
+ pxor 208(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(224bytesin256)
+
+ movdqa 224(%rdi), %xmm2
+ pxor 224(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(240bytesin256)
+
+ movdqa 240(%rdi), %xmm2
+ pxor 240(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(256bytesin256)
+
+ add $256, %rsi
+ add $256, %rdi
+
+ cmp $128, %rdx
+ jae L(less256bytesin2alinged)
+
+ cmp $64, %rdx
+ jae L(less128bytesin2aligned)
+
+ cmp $32, %rdx
+ jb L(less32bytesin256in2alinged)
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin256in2alinged):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(512bytesormorein2aligned):
+#ifdef DATA_CACHE_SIZE_HALF
+ mov $DATA_CACHE_SIZE_HALF, %r8
+#else
+ mov __x86_64_data_cache_size_half(%rip), %r8
+#endif
+ mov %r8, %r9
+ shr $1, %r8
+ add %r9, %r8
+ cmp %r8, %rdx
+ ja L(L2_L3_cache_aglined)
+
+ sub $64, %rdx
+ ALIGN (4)
+L(64bytesormore_loopin2aligned):
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqa 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqa 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqa 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(64bytesormore_loopin2aligned)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+L(L2_L3_cache_aglined):
+ sub $64, %rdx
+ ALIGN (4)
+L(L2_L3_aligned_128bytes_loop):
+ prefetchnta 0x1c0(%rdi)
+ prefetchnta 0x1c0(%rsi)
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqa 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqa 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqa 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(L2_L3_aligned_128bytes_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+
+ ALIGN (4)
+L(64bytesormore_loop_end):
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm2, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm3, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm4, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ jmp L(16bytes)
+
+L(256bytesin256):
+ add $256, %rdi
+ add $256, %rsi
+ jmp L(16bytes)
+L(240bytesin256):
+ add $240, %rdi
+ add $240, %rsi
+ jmp L(16bytes)
+L(224bytesin256):
+ add $224, %rdi
+ add $224, %rsi
+ jmp L(16bytes)
+L(208bytesin256):
+ add $208, %rdi
+ add $208, %rsi
+ jmp L(16bytes)
+L(192bytesin256):
+ add $192, %rdi
+ add $192, %rsi
+ jmp L(16bytes)
+L(176bytesin256):
+ add $176, %rdi
+ add $176, %rsi
+ jmp L(16bytes)
+L(160bytesin256):
+ add $160, %rdi
+ add $160, %rsi
+ jmp L(16bytes)
+L(144bytesin256):
+ add $144, %rdi
+ add $144, %rsi
+ jmp L(16bytes)
+L(128bytesin256):
+ add $128, %rdi
+ add $128, %rsi
+ jmp L(16bytes)
+L(112bytesin256):
+ add $112, %rdi
+ add $112, %rsi
+ jmp L(16bytes)
+L(96bytesin256):
+ add $96, %rdi
+ add $96, %rsi
+ jmp L(16bytes)
+L(80bytesin256):
+ add $80, %rdi
+ add $80, %rsi
+ jmp L(16bytes)
+L(64bytesin256):
+ add $64, %rdi
+ add $64, %rsi
+ jmp L(16bytes)
+L(48bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(32bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(16bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(16bytes):
+ mov -16(%rdi), %rax
+ mov -16(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(8bytes):
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(12bytes):
+ mov -12(%rdi), %rax
+ mov -12(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(4bytes):
+ mov -4(%rsi), %ecx
+ mov -4(%rdi), %eax
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+L(0bytes):
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(66bytes):
+ movdqu -66(%rdi), %xmm1
+ movdqu -66(%rsi), %xmm2
+ mov $-66, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(50bytes):
+ movdqu -50(%rdi), %xmm1
+ movdqu -50(%rsi), %xmm2
+ mov $-50, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(34bytes):
+ movdqu -34(%rdi), %xmm1
+ movdqu -34(%rsi), %xmm2
+ mov $-34, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(18bytes):
+ mov -18(%rdi), %rax
+ mov -18(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(10bytes):
+ mov -10(%rdi), %rax
+ mov -10(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(14bytes):
+ mov -14(%rdi), %rax
+ mov -14(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(6bytes):
+ mov -6(%rdi), %eax
+ mov -6(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+L(2bytes):
+ movzwl -2(%rsi), %ecx
+ movzwl -2(%rdi), %eax
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(68bytes):
+ movdqu -68(%rdi), %xmm2
+ movdqu -68(%rsi), %xmm1
+ mov $-68, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(52bytes):
+ movdqu -52(%rdi), %xmm2
+ movdqu -52(%rsi), %xmm1
+ mov $-52, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(36bytes):
+ movdqu -36(%rdi), %xmm2
+ movdqu -36(%rsi), %xmm1
+ mov $-36, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(20bytes):
+ movdqu -20(%rdi), %xmm2
+ movdqu -20(%rsi), %xmm1
+ mov $-20, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(70bytes):
+ movdqu -70(%rsi), %xmm1
+ movdqu -70(%rdi), %xmm2
+ mov $-70, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(54bytes):
+ movdqu -54(%rsi), %xmm1
+ movdqu -54(%rdi), %xmm2
+ mov $-54, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(38bytes):
+ movdqu -38(%rsi), %xmm1
+ movdqu -38(%rdi), %xmm2
+ mov $-38, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(22bytes):
+ movdqu -22(%rsi), %xmm1
+ movdqu -22(%rdi), %xmm2
+ mov $-22, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(72bytes):
+ movdqu -72(%rsi), %xmm1
+ movdqu -72(%rdi), %xmm2
+ mov $-72, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(56bytes):
+ movdqu -56(%rdi), %xmm2
+ movdqu -56(%rsi), %xmm1
+ mov $-56, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(40bytes):
+ movdqu -40(%rdi), %xmm2
+ movdqu -40(%rsi), %xmm1
+ mov $-40, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(24bytes):
+ movdqu -24(%rdi), %xmm2
+ movdqu -24(%rsi), %xmm1
+ mov $-24, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(74bytes):
+ movdqu -74(%rsi), %xmm1
+ movdqu -74(%rdi), %xmm2
+ mov $-74, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(58bytes):
+ movdqu -58(%rdi), %xmm2
+ movdqu -58(%rsi), %xmm1
+ mov $-58, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(42bytes):
+ movdqu -42(%rdi), %xmm2
+ movdqu -42(%rsi), %xmm1
+ mov $-42, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(26bytes):
+ movdqu -26(%rdi), %xmm2
+ movdqu -26(%rsi), %xmm1
+ mov $-26, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -10(%rdi), %rax
+ mov -10(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ jmp L(end)
+
+ ALIGN (4)
+L(76bytes):
+ movdqu -76(%rsi), %xmm1
+ movdqu -76(%rdi), %xmm2
+ mov $-76, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(60bytes):
+ movdqu -60(%rdi), %xmm2
+ movdqu -60(%rsi), %xmm1
+ mov $-60, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(44bytes):
+ movdqu -44(%rdi), %xmm2
+ movdqu -44(%rsi), %xmm1
+ mov $-44, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(28bytes):
+ movdqu -28(%rdi), %xmm2
+ movdqu -28(%rsi), %xmm1
+ mov $-28, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -12(%rdi), %rax
+ mov -12(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(78bytes):
+ movdqu -78(%rsi), %xmm1
+ movdqu -78(%rdi), %xmm2
+ mov $-78, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(62bytes):
+ movdqu -62(%rdi), %xmm2
+ movdqu -62(%rsi), %xmm1
+ mov $-62, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(46bytes):
+ movdqu -46(%rdi), %xmm2
+ movdqu -46(%rsi), %xmm1
+ mov $-46, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(30bytes):
+ movdqu -30(%rdi), %xmm2
+ movdqu -30(%rsi), %xmm1
+ mov $-30, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -14(%rdi), %rax
+ mov -14(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(64bytes):
+ movdqu -64(%rdi), %xmm2
+ movdqu -64(%rsi), %xmm1
+ mov $-64, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(48bytes):
+ movdqu -48(%rdi), %xmm2
+ movdqu -48(%rsi), %xmm1
+ mov $-48, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(32bytes):
+ movdqu -32(%rdi), %xmm2
+ movdqu -32(%rsi), %xmm1
+ mov $-32, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -16(%rdi), %rax
+ mov -16(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+/*
+ * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block.
+ */
+ ALIGN (3)
+L(less16bytes):
+ movsbq %dl, %rdx
+ mov (%rsi, %rdx), %rcx
+ mov (%rdi, %rdx), %rax
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov 8(%rsi, %rdx), %rcx
+ mov 8(%rdi, %rdx), %rax
+L(diffin8bytes):
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ shr $32, %rcx
+ shr $32, %rax
+L(diffin4bytes):
+ cmp %cx, %ax
+ jne L(end)
+ shr $16, %ecx
+ shr $16, %eax
+ jmp L(end)
+
+ ALIGN (4)
+L(end):
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+END_FUNCTION MEMCMP
+
+ ALIGN (3)
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(2bytes), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(6bytes), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(10bytes), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(14bytes), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(18bytes), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(22bytes), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(26bytes), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(30bytes), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(34bytes), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(38bytes), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(42bytes), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(46bytes), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(50bytes), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(54bytes), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(58bytes), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(62bytes), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+ .int JMPTBL (L(66bytes), L(table_64bytes))
+ .int JMPTBL (L(68bytes), L(table_64bytes))
+ .int JMPTBL (L(70bytes), L(table_64bytes))
+ .int JMPTBL (L(72bytes), L(table_64bytes))
+ .int JMPTBL (L(74bytes), L(table_64bytes))
+ .int JMPTBL (L(76bytes), L(table_64bytes))
+ .int JMPTBL (L(78bytes), L(table_64bytes))
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index c1e3b257f7..f0b1b9578f 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -3516,14 +3516,19 @@ mirror::ArtMethod* ClassLinker::CreateProxyConstructor(Thread* self,
proxy_class->GetDirectMethods();
CHECK_EQ(proxy_direct_methods->GetLength(), 16);
mirror::ArtMethod* proxy_constructor = proxy_direct_methods->Get(2);
- // Clone the existing constructor of Proxy (our constructor would just invoke it so steal its
- // code_ too)
- mirror::ArtMethod* constructor =
- down_cast<mirror::ArtMethod*>(proxy_constructor->Clone(self));
- if (constructor == NULL) {
+ mirror::ArtMethod* constructor = down_cast<mirror::ArtMethod*>(proxy_constructor->Clone(self));
+ if (constructor == nullptr) {
CHECK(self->IsExceptionPending()); // OOME.
- return NULL;
+ return nullptr;
}
+ // Make the proxy constructor's code always point to the uninstrumented code. This avoids
+ // getting a method enter event for the proxy constructor as the proxy constructor doesn't
+ // have an activation.
+ bool have_portable_code;
+ constructor->SetEntryPointFromQuickCompiledCode(GetQuickOatCodeFor(proxy_constructor));
+ constructor->SetEntryPointFromPortableCompiledCode(GetPortableOatCodeFor(proxy_constructor,
+ &have_portable_code));
+
// Make this constructor public and fix the class to be our Proxy version
constructor->SetAccessFlags((constructor->GetAccessFlags() & ~kAccProtected) | kAccPublic);
constructor->SetDeclaringClass(klass.Get());
diff --git a/runtime/common_runtime_test.cc b/runtime/common_runtime_test.cc
index 8e363c4fa3..997236200c 100644
--- a/runtime/common_runtime_test.cc
+++ b/runtime/common_runtime_test.cc
@@ -137,7 +137,17 @@ void CommonRuntimeTest::SetEnvironmentVariables(std::string& android_data) {
}
// On target, Cannot use /mnt/sdcard because it is mounted noexec, so use subdir of dalvik-cache
- android_data = (IsHost() ? "/tmp/art-data-XXXXXX" : "/data/dalvik-cache/art-data-XXXXXX");
+ if (IsHost()) {
+ const char* tmpdir = getenv("TMPDIR");
+ if (tmpdir != nullptr && tmpdir[0] != 0) {
+ android_data = tmpdir;
+ } else {
+ android_data = "/tmp";
+ }
+ } else {
+ android_data = "/data/dalvik-cache";
+ }
+ android_data += "/art-data-XXXXXX";
if (mkdtemp(&android_data[0]) == nullptr) {
PLOG(FATAL) << "mkdtemp(\"" << &android_data[0] << "\") failed";
}
@@ -212,7 +222,7 @@ void CommonRuntimeTest::ClearDirectory(const char* dirpath) {
if ((strcmp(e->d_name, ".") == 0) || (strcmp(e->d_name, "..") == 0)) {
continue;
}
- std::string filename(dalvik_cache_);
+ std::string filename(dirpath);
filename.push_back('/');
filename.append(e->d_name);
int stat_result = lstat(filename.c_str(), &s);
@@ -265,6 +275,19 @@ std::string CommonRuntimeTest::GetDexFileName(const std::string& jar_prefix) {
return StringPrintf("%s/framework/%s.jar", GetAndroidRoot(), jar_prefix.c_str());
}
+std::string CommonRuntimeTest::GetLibCoreOatFileName() {
+ return GetOatFileName("core");
+}
+
+std::string CommonRuntimeTest::GetOatFileName(const std::string& oat_prefix) {
+ if (IsHost()) {
+ const char* host_dir = getenv("ANDROID_HOST_OUT");
+ CHECK(host_dir != nullptr);
+ return StringPrintf("%s/framework/%s.art", host_dir, oat_prefix.c_str());
+ }
+ return StringPrintf("%s/framework/%s.art", GetAndroidRoot(), oat_prefix.c_str());
+}
+
std::string CommonRuntimeTest::GetTestAndroidRoot() {
if (IsHost()) {
const char* host_dir = getenv("ANDROID_HOST_OUT");
diff --git a/runtime/common_runtime_test.h b/runtime/common_runtime_test.h
index eb963525e8..363d8daaf8 100644
--- a/runtime/common_runtime_test.h
+++ b/runtime/common_runtime_test.h
@@ -85,10 +85,18 @@ class CommonRuntimeTest : public testing::Test {
virtual void TearDown();
+ // Gets the path of the libcore dex file.
std::string GetLibCoreDexFileName();
+ // Gets the path of the specified dex file for host or target.
std::string GetDexFileName(const std::string& jar_prefix);
+ // Gets the path of the libcore oat file.
+ std::string GetLibCoreOatFileName();
+
+ // Gets the path of the specified oat file for host or target.
+ std::string GetOatFileName(const std::string& oat_prefix);
+
std::string GetTestAndroidRoot();
std::vector<const DexFile*> OpenTestDexFiles(const char* name)
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 49bb65f488..fa198d7ef5 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -592,8 +592,7 @@ extern "C" uint64_t artQuickProxyInvokeHandler(mirror::ArtMethod* proxy_method,
const char* old_cause =
self->StartAssertNoThreadSuspension("Adding to IRT proxy object arguments");
// Register the top of the managed stack, making stack crawlable.
- DCHECK_EQ(sp->AsMirrorPtr(), proxy_method)
- << PrettyMethod(proxy_method);
+ DCHECK_EQ(sp->AsMirrorPtr(), proxy_method) << PrettyMethod(proxy_method);
self->SetTopOfStack(sp, 0);
DCHECK_EQ(proxy_method->GetFrameSizeInBytes(),
Runtime::Current()->GetCalleeSaveMethod(Runtime::kRefsAndArgs)->GetFrameSizeInBytes())
diff --git a/runtime/gc/accounting/card_table.cc b/runtime/gc/accounting/card_table.cc
index ceb42e5936..049855000b 100644
--- a/runtime/gc/accounting/card_table.cc
+++ b/runtime/gc/accounting/card_table.cc
@@ -28,6 +28,11 @@ namespace art {
namespace gc {
namespace accounting {
+constexpr size_t CardTable::kCardShift;
+constexpr size_t CardTable::kCardSize;
+constexpr uint8_t CardTable::kCardClean;
+constexpr uint8_t CardTable::kCardDirty;
+
/*
* Maintain a card table from the write barrier. All writes of
* non-NULL values to heap addresses should go through an entry in
@@ -55,9 +60,9 @@ CardTable* CardTable::Create(const byte* heap_begin, size_t heap_capacity) {
size_t capacity = heap_capacity / kCardSize;
/* Allocate an extra 256 bytes to allow fixed low-byte of base */
std::string error_msg;
- std::unique_ptr<MemMap> mem_map(MemMap::MapAnonymous("card table", NULL,
- capacity + 256, PROT_READ | PROT_WRITE,
- false, &error_msg));
+ std::unique_ptr<MemMap> mem_map(
+ MemMap::MapAnonymous("card table", nullptr, capacity + 256, PROT_READ | PROT_WRITE,
+ false, &error_msg));
CHECK(mem_map.get() != NULL) << "couldn't allocate card table: " << error_msg;
// All zeros is the correct initial value; all clean. Anonymous mmaps are initialized to zero, we
// don't clear the card table to avoid unnecessary pages being allocated
@@ -67,17 +72,17 @@ CardTable* CardTable::Create(const byte* heap_begin, size_t heap_capacity) {
CHECK(cardtable_begin != NULL);
// We allocated up to a bytes worth of extra space to allow biased_begin's byte value to equal
- // GC_CARD_DIRTY, compute a offset value to make this the case
+ // kCardDirty, compute a offset value to make this the case
size_t offset = 0;
byte* biased_begin = reinterpret_cast<byte*>(reinterpret_cast<uintptr_t>(cardtable_begin) -
(reinterpret_cast<uintptr_t>(heap_begin) >> kCardShift));
- if (((uintptr_t)biased_begin & 0xff) != kCardDirty) {
- int delta = kCardDirty - (reinterpret_cast<uintptr_t>(biased_begin) & 0xff);
+ uintptr_t biased_byte = reinterpret_cast<uintptr_t>(biased_begin) & 0xff;
+ if (biased_byte != kCardDirty) {
+ int delta = kCardDirty - biased_byte;
offset = delta + (delta < 0 ? 0x100 : 0);
biased_begin += offset;
}
CHECK_EQ(reinterpret_cast<uintptr_t>(biased_begin) & 0xff, kCardDirty);
-
return new CardTable(mem_map.release(), biased_begin, offset);
}
diff --git a/runtime/gc/accounting/card_table.h b/runtime/gc/accounting/card_table.h
index 7934974081..fbeea85554 100644
--- a/runtime/gc/accounting/card_table.h
+++ b/runtime/gc/accounting/card_table.h
@@ -46,10 +46,10 @@ template<size_t kAlignment> class SpaceBitmap;
// WriteBarrier, and from there to here.
class CardTable {
public:
- static const size_t kCardShift = 7;
- static const size_t kCardSize = (1 << kCardShift);
- static const uint8_t kCardClean = 0x0;
- static const uint8_t kCardDirty = 0x70;
+ static constexpr size_t kCardShift = 7;
+ static constexpr size_t kCardSize = 1 << kCardShift;
+ static constexpr uint8_t kCardClean = 0x0;
+ static constexpr uint8_t kCardDirty = 0x70;
static CardTable* Create(const byte* heap_begin, size_t heap_capacity);
diff --git a/runtime/gc/accounting/card_table_test.cc b/runtime/gc/accounting/card_table_test.cc
new file mode 100644
index 0000000000..a88b2c9881
--- /dev/null
+++ b/runtime/gc/accounting/card_table_test.cc
@@ -0,0 +1,143 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "card_table-inl.h"
+
+#include <string>
+
+#include "atomic.h"
+#include "common_runtime_test.h"
+#include "handle_scope-inl.h"
+#include "mirror/class-inl.h"
+#include "mirror/string-inl.h" // Strings are easiest to allocate
+#include "scoped_thread_state_change.h"
+#include "thread_pool.h"
+#include "utils.h"
+
+namespace art {
+
+namespace mirror {
+ class Object;
+} // namespace mirror
+
+class CardTableTest : public CommonRuntimeTest {
+ public:
+ std::unique_ptr<gc::accounting::CardTable> card_table_;
+ static constexpr size_t kCardSize = gc::accounting::CardTable::kCardSize;
+
+ void CommonSetup() {
+ if (card_table_.get() == nullptr) {
+ card_table_.reset(gc::accounting::CardTable::Create(heap_begin_, heap_size_));
+ EXPECT_TRUE(card_table_.get() != nullptr);
+ } else {
+ ClearCardTable();
+ }
+ }
+ // Default values for the test, not random to avoid undeterministic behaviour.
+ CardTableTest() : heap_begin_(reinterpret_cast<byte*>(0x2000000)), heap_size_(2 * MB) {
+ }
+ void ClearCardTable() {
+ card_table_->ClearCardTable();
+ }
+ byte* HeapBegin() const {
+ return heap_begin_;
+ }
+ byte* HeapLimit() const {
+ return HeapBegin() + heap_size_;
+ }
+ byte PRandCard(const byte* addr) const {
+ size_t offset = RoundDown(addr - heap_begin_, kCardSize);
+ return 1 + offset % 254;
+ }
+ void FillRandom() {
+ for (const byte* addr = HeapBegin(); addr != HeapLimit(); addr += kCardSize) {
+ EXPECT_TRUE(card_table_->AddrIsInCardTable(addr));
+ byte* card = card_table_->CardFromAddr(addr);
+ *card = PRandCard(addr);
+ }
+ }
+
+ private:
+ byte* const heap_begin_;
+ const size_t heap_size_;
+};
+
+TEST_F(CardTableTest, TestMarkCard) {
+ CommonSetup();
+ for (const byte* addr = HeapBegin(); addr < HeapLimit(); addr += kObjectAlignment) {
+ auto obj = reinterpret_cast<const mirror::Object*>(addr);
+ EXPECT_EQ(card_table_->GetCard(obj), gc::accounting::CardTable::kCardClean);
+ EXPECT_TRUE(!card_table_->IsDirty(obj));
+ card_table_->MarkCard(addr);
+ EXPECT_TRUE(card_table_->IsDirty(obj));
+ EXPECT_EQ(card_table_->GetCard(obj), gc::accounting::CardTable::kCardDirty);
+ byte* card_addr = card_table_->CardFromAddr(addr);
+ EXPECT_EQ(*card_addr, gc::accounting::CardTable::kCardDirty);
+ *card_addr = gc::accounting::CardTable::kCardClean;
+ EXPECT_EQ(*card_addr, gc::accounting::CardTable::kCardClean);
+ }
+}
+
+class UpdateVisitor {
+ public:
+ byte operator()(byte c) const {
+ return c * 93 + 123;
+ }
+ void operator()(byte* /*card*/, byte /*expected_value*/, byte /*new_value*/) const {
+ }
+};
+
+TEST_F(CardTableTest, TestModifyCardsAtomic) {
+ CommonSetup();
+ FillRandom();
+ const size_t delta = std::min(static_cast<size_t>(HeapLimit() - HeapBegin()), 8U * kCardSize);
+ UpdateVisitor visitor;
+ size_t start_offset = 0;
+ for (byte* cstart = HeapBegin(); cstart < HeapBegin() + delta; cstart += kCardSize) {
+ start_offset = (start_offset + kObjectAlignment) % kCardSize;
+ size_t end_offset = 0;
+ for (byte* cend = HeapLimit() - delta; cend < HeapLimit(); cend += kCardSize) {
+ // Don't always start at a card boundary.
+ byte* start = cstart + start_offset;
+ byte* end = cend - end_offset;
+ end_offset = (end_offset + kObjectAlignment) % kCardSize;
+ // Modify cards.
+ card_table_->ModifyCardsAtomic(start, end, visitor, visitor);
+ // Check adjacent cards not modified.
+ for (byte* cur = start - kCardSize; cur >= HeapBegin(); cur -= kCardSize) {
+ EXPECT_EQ(card_table_->GetCard(reinterpret_cast<mirror::Object*>(cur)), PRandCard(cur));
+ }
+ for (byte* cur = end + kCardSize; cur < HeapLimit(); cur += kCardSize) {
+ EXPECT_EQ(card_table_->GetCard(reinterpret_cast<mirror::Object*>(cur)), PRandCard(cur));
+ }
+ // Verify Range.
+ for (byte* cur = start; cur < AlignUp(end, kCardSize); cur += kCardSize) {
+ byte* card = card_table_->CardFromAddr(cur);
+ byte value = PRandCard(cur);
+ if (visitor(value) != *card) {
+ LOG(ERROR) << reinterpret_cast<void*>(start) << " " << reinterpret_cast<void*>(cur) << " " << reinterpret_cast<void*>(end);
+ }
+ EXPECT_EQ(visitor(value), *card);
+ // Restore for next iteration.
+ *card = value;
+ }
+ }
+ }
+}
+
+// TODO: Add test for CardTable::Scan.
+
+} // namespace art
diff --git a/runtime/mirror/art_method.cc b/runtime/mirror/art_method.cc
index 4882728e0d..8eacb1c3d7 100644
--- a/runtime/mirror/art_method.cc
+++ b/runtime/mirror/art_method.cc
@@ -157,12 +157,12 @@ ArtMethod* ArtMethod::FindOverriddenMethod() {
}
}
}
-#ifndef NDEBUG
- StackHandleScope<2> hs(Thread::Current());
- MethodHelper result_mh(hs.NewHandle(result));
- MethodHelper this_mh(hs.NewHandle(this));
- DCHECK(result == NULL || this_mh.HasSameNameAndSignature(&result_mh));
-#endif
+ if (kIsDebugBuild) {
+ StackHandleScope<2> hs(Thread::Current());
+ MethodHelper result_mh(hs.NewHandle(result));
+ MethodHelper this_mh(hs.NewHandle(this));
+ DCHECK(result == nullptr || this_mh.HasSameNameAndSignature(&result_mh));
+ }
return result;
}
diff --git a/runtime/monitor_pool.cc b/runtime/monitor_pool.cc
index 440a6be07b..4964aa06c5 100644
--- a/runtime/monitor_pool.cc
+++ b/runtime/monitor_pool.cc
@@ -52,7 +52,7 @@ void MonitorPool::AllocateChunk() {
monitor_chunks_.StoreRelaxed(new_backing);
capacity_ = new_capacity;
old_chunk_arrays_.push_back(old_backing);
- LOG(INFO) << "Resizing to capacity " << capacity_;
+ VLOG(monitor) << "Resizing to capacity " << capacity_;
}
}
@@ -64,7 +64,7 @@ void MonitorPool::AllocateChunk() {
CHECK_EQ(0U, reinterpret_cast<uintptr_t>(chunk) % kMonitorAlignment);
// Add the chunk.
- *(monitor_chunks_.LoadRelaxed()+num_chunks_) = reinterpret_cast<uintptr_t>(chunk);
+ *(monitor_chunks_.LoadRelaxed() + num_chunks_) = reinterpret_cast<uintptr_t>(chunk);
num_chunks_++;
// Set up the free list
@@ -96,7 +96,7 @@ Monitor* MonitorPool::CreateMonitorInPool(Thread* self, Thread* owner, mirror::O
// Enough space, or need to resize?
if (first_free_ == nullptr) {
- LOG(INFO) << "Allocating a new chunk.";
+ VLOG(monitor) << "Allocating a new chunk.";
AllocateChunk();
}
diff --git a/runtime/proxy_test.cc b/runtime/proxy_test.cc
index bd6656dda1..308142157c 100644
--- a/runtime/proxy_test.cc
+++ b/runtime/proxy_test.cc
@@ -17,14 +17,14 @@
#include <jni.h>
#include <vector>
-#include "common_compiler_test.h"
+#include "common_runtime_test.h"
#include "field_helper.h"
#include "mirror/art_field-inl.h"
#include "scoped_thread_state_change.h"
namespace art {
-class ProxyTest : public CommonCompilerTest {
+class ProxyTest : public CommonRuntimeTest {
public:
// Generate a proxy class with the given name and interfaces. This is a simplification from what
// libcore does to fit to our test needs. We do not check for duplicated interfaces or methods and
@@ -103,6 +103,12 @@ class ProxyTest : public CommonCompilerTest {
soa.Self()->AssertNoPendingException();
return proxyClass;
}
+
+ protected:
+ void SetUpRuntimeOptions(RuntimeOptions *options) OVERRIDE {
+ options->push_back(std::make_pair(StringPrintf("-Ximage:%s", GetLibCoreOatFileName().c_str()),
+ nullptr));
+ }
};
// Creates a proxy class and check ClassHelper works correctly.
diff --git a/test/015-switch/expected.txt b/test/015-switch/expected.txt
index ca3b518f03..91b47142d1 100644
--- a/test/015-switch/expected.txt
+++ b/test/015-switch/expected.txt
@@ -8,3 +8,9 @@ CORRECT (not found)
CORRECT (default only)
CORRECT big sparse / first
CORRECT big sparse / last
+default
+254
+255
+256
+257
+default
diff --git a/test/015-switch/src/Main.java b/test/015-switch/src/Main.java
index 7198e2b7b9..dd97a8c60b 100644
--- a/test/015-switch/src/Main.java
+++ b/test/015-switch/src/Main.java
@@ -101,5 +101,15 @@ public class Main {
case 100: System.out.print("CORRECT big sparse / last\n"); break;
default: System.out.print("blah!\n"); break;
}
+
+ for (a = 253; a <= 258; a++) {
+ switch (a) {
+ case 254: System.out.println("254"); break;
+ case 255: System.out.println("255"); break;
+ case 256: System.out.println("256"); break;
+ case 257: System.out.println("257"); break;
+ default: System.out.println("default"); break;
+ }
+ }
}
}
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index 5c1bc0395f..d7ee383db7 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -81,38 +81,10 @@ endif
# Tests that are broken in --trace mode.
TEST_ART_BROKEN_TRACE_RUN_TESTS := \
- 003-omnibus-opcodes \
- 004-InterfaceTest \
004-SignalTest \
- 004-ThreadStress \
- 005-annotations \
- 012-math \
018-stack-overflow \
- 023-many-interfaces \
- 027-arithmetic \
- 031-class-attributes \
- 037-inherit \
- 044-proxy \
- 046-reflect \
- 051-thread \
- 055-enum-performance \
- 062-character-encodings \
- 064-field-access \
- 074-gc-thrash \
- 078-polymorphic-virtual \
- 080-oom-throw \
- 082-inline-execute \
- 083-compiler-regressions \
- 093-serialization \
097-duplicate-method \
- 100-reflect2 \
- 102-concurrent-gc \
- 103-string-append \
- 107-int-math2 \
- 112-double-math \
- 114-ParallelGC \
- 700-LoadArgRegs \
- 701-easy-div-rem
+ 107-int-math2
ART_TEST_KNOWN_BROKEN += $(foreach test, $(TEST_ART_BROKEN_TRACE_RUN_TESTS), $(call all-run-test-names,$(test),-trace,-relocate))
ART_TEST_KNOWN_BROKEN += $(foreach test, $(TEST_ART_BROKEN_TRACE_RUN_TESTS), $(call all-run-test-names,$(test),-trace,-no-prebuild))
diff --git a/test/run-test b/test/run-test
index aef7c52120..ca7e68c263 100755
--- a/test/run-test
+++ b/test/run-test
@@ -33,7 +33,11 @@ cd "${progdir}"
progdir=`pwd`
prog="${progdir}"/`basename "${prog}"`
test_dir="test-$$"
-tmp_dir="/tmp/$USER/${test_dir}"
+if [ -z "$TMPDIR" ]; then
+ tmp_dir="/tmp/$USER/${test_dir}"
+else
+ tmp_dir="${TMPDIR}/$USER/${test_dir}"
+fi
export JAVA="java"
export JAVAC="javac -g"