| /* |
| * Copyright (C) 2014 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "asm_support_x86_64.S" |
| |
| #define MEMCMP __memcmp16 |
| |
| /* |
| * Half of Silvermont L1 Data Cache size |
| *(see original file cache.h in bionic/libc/arch-x86_64/). |
| * This value is used for specific optimization on big lengths. |
| */ |
| #define DATA_CACHE_SIZE_HALF (12*1024) |
| |
| #ifndef L |
| # define L(label) .L##label |
| #endif |
| |
| #ifndef ALIGN |
| # define ALIGN(n) .p2align n |
| #endif |
| |
| #define JMPTBL(I, B) (I - B) |
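
/*
 * Each jump table entry is stored as a 32-bit offset relative to the
 * table base (JMPTBL).  BRANCH_TO_JMPTBL_ENTRY loads the entry at
 * TABLE + INDEX * SCALE, adds the table base back and jumps there;
 * the trailing ud2 traps if control ever runs past the indirect jump.
 */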
| |
| #define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ |
| lea TABLE(%rip), %r11; \ |
| movslq (%r11, INDEX, SCALE), %rcx; \ |
| add %r11, %rcx; \ |
| jmp *%rcx; \ |
| ud2 |
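
/*
 * Roughly: int32_t __memcmp16(const uint16_t* lhs, const uint16_t* rhs,
 * size_t count) under the SysV AMD64 convention (parameter names here are
 * illustrative only): %rdi = lhs, %rsi = rhs, %rdx = count in 16-bit
 * units.  Returns 0 if the arrays are equal, otherwise the difference
 * between the first pair of 16-bit elements that differ (%rdi element
 * minus %rsi element).
 */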
| |
| DEFINE_FUNCTION MEMCMP |
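/*
 * %xmm0 stays zero for the whole routine: xor'ing two 16-byte chunks
 * into a scratch register and ptest'ing the result against %xmm0 sets
 * CF exactly when the chunks are identical, so every jnc below branches
 * to a mismatch handler.  The 16-bit unit count in %rdx is doubled into
 * a byte count; small lengths (at most 78 bytes) are dispatched directly
 * through L(table_64bytes).
 */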
| pxor %xmm0, %xmm0 |
| shl $1, %rdx |
| cmp $79, %rdx |
| ja L(79bytesormore) |
| add %rdx, %rsi |
| add %rdx, %rdi |
| BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) |
| |
| ALIGN (4) |
| L(79bytesormore): |
| movdqu (%rsi), %xmm1 |
| movdqu (%rdi), %xmm2 |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(16bytesin256) |
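/*
 * The first 16 bytes matched.  Round %rsi up to the next 16-byte
 * boundary, move %rdi forward by the same distance and shrink the
 * remaining length accordingly, so all further loads from %rsi are
 * aligned.  If %rdi ends up 16-byte aligned too, take the fully
 * aligned path at L(2aligned).
 */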
| mov %rsi, %rcx |
| and $-16, %rsi |
| add $16, %rsi |
| sub %rsi, %rcx |
| |
| sub %rcx, %rdi |
| add %rcx, %rdx |
| test $0xf, %rdi |
| jz L(2aligned) |
| |
| cmp $128, %rdx |
| ja L(128bytesormore) |
| L(less128bytes): |
| sub $64, %rdx |
| |
| movdqu (%rdi), %xmm2 |
| pxor (%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(16bytesin256) |
| |
| movdqu 16(%rdi), %xmm2 |
| pxor 16(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(32bytesin256) |
| |
| movdqu 32(%rdi), %xmm2 |
| pxor 32(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(48bytesin256) |
| |
| movdqu 48(%rdi), %xmm2 |
| pxor 48(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(64bytesin256) |
| cmp $32, %rdx |
| jb L(less32bytesin64) |
| |
| movdqu 64(%rdi), %xmm2 |
| pxor 64(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(80bytesin256) |
| |
| movdqu 80(%rdi), %xmm2 |
| pxor 80(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(96bytesin256) |
| sub $32, %rdx |
| add $32, %rdi |
| add $32, %rsi |
| L(less32bytesin64): |
| add $64, %rdi |
| add $64, %rsi |
| add %rdx, %rsi |
| add %rdx, %rdi |
| BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) |
| |
| L(128bytesormore): |
| cmp $512, %rdx |
| ja L(512bytesormore) |
| cmp $256, %rdx |
| ja L(less512bytes) |
| L(less256bytes): |
| sub $128, %rdx |
| |
| movdqu (%rdi), %xmm2 |
| pxor (%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(16bytesin256) |
| |
| movdqu 16(%rdi), %xmm2 |
| pxor 16(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(32bytesin256) |
| |
| movdqu 32(%rdi), %xmm2 |
| pxor 32(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(48bytesin256) |
| |
| movdqu 48(%rdi), %xmm2 |
| pxor 48(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(64bytesin256) |
| |
| movdqu 64(%rdi), %xmm2 |
| pxor 64(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(80bytesin256) |
| |
| movdqu 80(%rdi), %xmm2 |
| pxor 80(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(96bytesin256) |
| |
| movdqu 96(%rdi), %xmm2 |
| pxor 96(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(112bytesin256) |
| |
| movdqu 112(%rdi), %xmm2 |
| pxor 112(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(128bytesin256) |
| |
| add $128, %rsi |
| add $128, %rdi |
| |
| cmp $64, %rdx |
| jae L(less128bytes) |
| |
| cmp $32, %rdx |
| jb L(less32bytesin128) |
| |
| movdqu (%rdi), %xmm2 |
| pxor (%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(16bytesin256) |
| |
| movdqu 16(%rdi), %xmm2 |
| pxor 16(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(32bytesin256) |
| sub $32, %rdx |
| add $32, %rdi |
| add $32, %rsi |
| L(less32bytesin128): |
| add %rdx, %rsi |
| add %rdx, %rdi |
| BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) |
| |
| L(less512bytes): |
| sub $256, %rdx |
| movdqu (%rdi), %xmm2 |
| pxor (%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(16bytesin256) |
| |
| movdqu 16(%rdi), %xmm2 |
| pxor 16(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(32bytesin256) |
| |
| movdqu 32(%rdi), %xmm2 |
| pxor 32(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(48bytesin256) |
| |
| movdqu 48(%rdi), %xmm2 |
| pxor 48(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(64bytesin256) |
| |
| movdqu 64(%rdi), %xmm2 |
| pxor 64(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(80bytesin256) |
| |
| movdqu 80(%rdi), %xmm2 |
| pxor 80(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(96bytesin256) |
| |
| movdqu 96(%rdi), %xmm2 |
| pxor 96(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(112bytesin256) |
| |
| movdqu 112(%rdi), %xmm2 |
| pxor 112(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(128bytesin256) |
| |
| movdqu 128(%rdi), %xmm2 |
| pxor 128(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(144bytesin256) |
| |
| movdqu 144(%rdi), %xmm2 |
| pxor 144(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(160bytesin256) |
| |
| movdqu 160(%rdi), %xmm2 |
| pxor 160(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(176bytesin256) |
| |
| movdqu 176(%rdi), %xmm2 |
| pxor 176(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(192bytesin256) |
| |
| movdqu 192(%rdi), %xmm2 |
| pxor 192(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(208bytesin256) |
| |
| movdqu 208(%rdi), %xmm2 |
| pxor 208(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(224bytesin256) |
| |
| movdqu 224(%rdi), %xmm2 |
| pxor 224(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(240bytesin256) |
| |
| movdqu 240(%rdi), %xmm2 |
| pxor 240(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(256bytesin256) |
| |
| add $256, %rsi |
| add $256, %rdi |
| |
| cmp $128, %rdx |
| jae L(less256bytes) |
| |
| cmp $64, %rdx |
| jae L(less128bytes) |
| |
| cmp $32, %rdx |
| jb L(less32bytesin256) |
| |
| movdqu (%rdi), %xmm2 |
| pxor (%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(16bytesin256) |
| |
| movdqu 16(%rdi), %xmm2 |
| pxor 16(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(32bytesin256) |
| sub $32, %rdx |
| add $32, %rdi |
| add $32, %rsi |
| L(less32bytesin256): |
| add %rdx, %rsi |
| add %rdx, %rdi |
| BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) |
| |
| ALIGN (4) |
| L(512bytesormore): |
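/*
 * Threshold below: 1.5 * DATA_CACHE_SIZE_HALF, i.e. three quarters of
 * the Silvermont L1 data cache.  Buffers larger than that use the
 * non-temporal prefetching loop instead.
 */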
| #ifdef DATA_CACHE_SIZE_HALF |
| mov $DATA_CACHE_SIZE_HALF, %r8 |
| #else |
| mov __x86_64_data_cache_size_half(%rip), %r8 |
| #endif |
| mov %r8, %r9 |
| shr $1, %r8 |
| add %r9, %r8 |
| cmp %r8, %rdx |
ja L(L2_L3_cache_unaligned)
| sub $64, %rdx |
| ALIGN (4) |
| L(64bytesormore_loop): |
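/*
 * Main loop for medium lengths: xor four 16-byte chunks per iteration
 * and OR the results into %xmm1, so a single ptest/branch pair detects
 * a mismatch anywhere in the 64-byte block.
 */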
| movdqu (%rdi), %xmm2 |
| pxor (%rsi), %xmm2 |
| movdqa %xmm2, %xmm1 |
| |
| movdqu 16(%rdi), %xmm3 |
| pxor 16(%rsi), %xmm3 |
| por %xmm3, %xmm1 |
| |
| movdqu 32(%rdi), %xmm4 |
| pxor 32(%rsi), %xmm4 |
| por %xmm4, %xmm1 |
| |
| movdqu 48(%rdi), %xmm5 |
| pxor 48(%rsi), %xmm5 |
| por %xmm5, %xmm1 |
| |
| ptest %xmm1, %xmm0 |
| jnc L(64bytesormore_loop_end) |
| add $64, %rsi |
| add $64, %rdi |
| sub $64, %rdx |
| jae L(64bytesormore_loop) |
| |
| add $64, %rdx |
| add %rdx, %rsi |
| add %rdx, %rdi |
| BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) |
| |
L(L2_L3_cache_unaligned):
| sub $64, %rdx |
| ALIGN (4) |
| L(L2_L3_unaligned_128bytes_loop): |
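/*
 * Same 64-bytes-per-iteration loop, but with non-temporal prefetches
 * 0x1c0 bytes ahead to limit cache pollution on large buffers.
 */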
| prefetchnta 0x1c0(%rdi) |
| prefetchnta 0x1c0(%rsi) |
| movdqu (%rdi), %xmm2 |
| pxor (%rsi), %xmm2 |
| movdqa %xmm2, %xmm1 |
| |
| movdqu 16(%rdi), %xmm3 |
| pxor 16(%rsi), %xmm3 |
| por %xmm3, %xmm1 |
| |
| movdqu 32(%rdi), %xmm4 |
| pxor 32(%rsi), %xmm4 |
| por %xmm4, %xmm1 |
| |
| movdqu 48(%rdi), %xmm5 |
| pxor 48(%rsi), %xmm5 |
| por %xmm5, %xmm1 |
| |
| ptest %xmm1, %xmm0 |
| jnc L(64bytesormore_loop_end) |
| add $64, %rsi |
| add $64, %rdi |
| sub $64, %rdx |
| jae L(L2_L3_unaligned_128bytes_loop) |
| |
| add $64, %rdx |
| add %rdx, %rsi |
| add %rdx, %rdi |
| BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) |
| |
| /* |
| * This case is for machines which are sensitive for unaligned instructions. |
| */ |
| ALIGN (4) |
| L(2aligned): |
| cmp $128, %rdx |
| ja L(128bytesormorein2aligned) |
| L(less128bytesin2aligned): |
| sub $64, %rdx |
| |
| movdqa (%rdi), %xmm2 |
| pxor (%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(16bytesin256) |
| |
| movdqa 16(%rdi), %xmm2 |
| pxor 16(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(32bytesin256) |
| |
| movdqa 32(%rdi), %xmm2 |
| pxor 32(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(48bytesin256) |
| |
| movdqa 48(%rdi), %xmm2 |
| pxor 48(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(64bytesin256) |
| cmp $32, %rdx |
jb L(less32bytesin64in2aligned)
| |
| movdqa 64(%rdi), %xmm2 |
| pxor 64(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(80bytesin256) |
| |
| movdqa 80(%rdi), %xmm2 |
| pxor 80(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(96bytesin256) |
| sub $32, %rdx |
| add $32, %rdi |
| add $32, %rsi |
L(less32bytesin64in2aligned):
| add $64, %rdi |
| add $64, %rsi |
| add %rdx, %rsi |
| add %rdx, %rdi |
| BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) |
| |
| ALIGN (4) |
| L(128bytesormorein2aligned): |
| cmp $512, %rdx |
| ja L(512bytesormorein2aligned) |
| cmp $256, %rdx |
| ja L(256bytesormorein2aligned) |
L(less256bytesin2aligned):
| sub $128, %rdx |
| |
| movdqa (%rdi), %xmm2 |
| pxor (%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(16bytesin256) |
| |
| movdqa 16(%rdi), %xmm2 |
| pxor 16(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(32bytesin256) |
| |
| movdqa 32(%rdi), %xmm2 |
| pxor 32(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(48bytesin256) |
| |
| movdqa 48(%rdi), %xmm2 |
| pxor 48(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(64bytesin256) |
| |
| movdqa 64(%rdi), %xmm2 |
| pxor 64(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(80bytesin256) |
| |
| movdqa 80(%rdi), %xmm2 |
| pxor 80(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(96bytesin256) |
| |
| movdqa 96(%rdi), %xmm2 |
| pxor 96(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(112bytesin256) |
| |
| movdqa 112(%rdi), %xmm2 |
| pxor 112(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(128bytesin256) |
| |
| add $128, %rsi |
| add $128, %rdi |
| |
| cmp $64, %rdx |
| jae L(less128bytesin2aligned) |
| |
| cmp $32, %rdx |
| jb L(less32bytesin128in2aligned) |
| |
| movdqu (%rdi), %xmm2 |
| pxor (%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(16bytesin256) |
| |
| movdqu 16(%rdi), %xmm2 |
| pxor 16(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(32bytesin256) |
| sub $32, %rdx |
| add $32, %rdi |
| add $32, %rsi |
| L(less32bytesin128in2aligned): |
| add %rdx, %rsi |
| add %rdx, %rdi |
| BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) |
| |
| ALIGN (4) |
| L(256bytesormorein2aligned): |
| |
| sub $256, %rdx |
| movdqa (%rdi), %xmm2 |
| pxor (%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(16bytesin256) |
| |
| movdqa 16(%rdi), %xmm2 |
| pxor 16(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(32bytesin256) |
| |
| movdqa 32(%rdi), %xmm2 |
| pxor 32(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(48bytesin256) |
| |
| movdqa 48(%rdi), %xmm2 |
| pxor 48(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(64bytesin256) |
| |
| movdqa 64(%rdi), %xmm2 |
| pxor 64(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(80bytesin256) |
| |
| movdqa 80(%rdi), %xmm2 |
| pxor 80(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(96bytesin256) |
| |
| movdqa 96(%rdi), %xmm2 |
| pxor 96(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(112bytesin256) |
| |
| movdqa 112(%rdi), %xmm2 |
| pxor 112(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(128bytesin256) |
| |
| movdqa 128(%rdi), %xmm2 |
| pxor 128(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(144bytesin256) |
| |
| movdqa 144(%rdi), %xmm2 |
| pxor 144(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(160bytesin256) |
| |
| movdqa 160(%rdi), %xmm2 |
| pxor 160(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(176bytesin256) |
| |
| movdqa 176(%rdi), %xmm2 |
| pxor 176(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(192bytesin256) |
| |
| movdqa 192(%rdi), %xmm2 |
| pxor 192(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(208bytesin256) |
| |
| movdqa 208(%rdi), %xmm2 |
| pxor 208(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(224bytesin256) |
| |
| movdqa 224(%rdi), %xmm2 |
| pxor 224(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(240bytesin256) |
| |
| movdqa 240(%rdi), %xmm2 |
| pxor 240(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(256bytesin256) |
| |
| add $256, %rsi |
| add $256, %rdi |
| |
| cmp $128, %rdx |
jae L(less256bytesin2aligned)
| |
| cmp $64, %rdx |
| jae L(less128bytesin2aligned) |
| |
| cmp $32, %rdx |
jb L(less32bytesin256in2aligned)
| |
| movdqa (%rdi), %xmm2 |
| pxor (%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(16bytesin256) |
| |
| movdqa 16(%rdi), %xmm2 |
| pxor 16(%rsi), %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(32bytesin256) |
| sub $32, %rdx |
| add $32, %rdi |
| add $32, %rsi |
L(less32bytesin256in2aligned):
| add %rdx, %rsi |
| add %rdx, %rdi |
| BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) |
| |
| ALIGN (4) |
| L(512bytesormorein2aligned): |
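/*
 * Mirror of L(512bytesormore) for the case where both pointers are
 * 16-byte aligned, using aligned (movdqa) loads.
 */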
| #ifdef DATA_CACHE_SIZE_HALF |
| mov $DATA_CACHE_SIZE_HALF, %r8 |
| #else |
| mov __x86_64_data_cache_size_half(%rip), %r8 |
| #endif |
| mov %r8, %r9 |
| shr $1, %r8 |
| add %r9, %r8 |
| cmp %r8, %rdx |
ja L(L2_L3_cache_aligned)
| |
| sub $64, %rdx |
| ALIGN (4) |
| L(64bytesormore_loopin2aligned): |
| movdqa (%rdi), %xmm2 |
| pxor (%rsi), %xmm2 |
| movdqa %xmm2, %xmm1 |
| |
| movdqa 16(%rdi), %xmm3 |
| pxor 16(%rsi), %xmm3 |
| por %xmm3, %xmm1 |
| |
| movdqa 32(%rdi), %xmm4 |
| pxor 32(%rsi), %xmm4 |
| por %xmm4, %xmm1 |
| |
| movdqa 48(%rdi), %xmm5 |
| pxor 48(%rsi), %xmm5 |
| por %xmm5, %xmm1 |
| |
| ptest %xmm1, %xmm0 |
| jnc L(64bytesormore_loop_end) |
| add $64, %rsi |
| add $64, %rdi |
| sub $64, %rdx |
| jae L(64bytesormore_loopin2aligned) |
| |
| add $64, %rdx |
| add %rdx, %rsi |
| add %rdx, %rdi |
| BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) |
L(L2_L3_cache_aligned):
| sub $64, %rdx |
| ALIGN (4) |
| L(L2_L3_aligned_128bytes_loop): |
| prefetchnta 0x1c0(%rdi) |
| prefetchnta 0x1c0(%rsi) |
| movdqa (%rdi), %xmm2 |
| pxor (%rsi), %xmm2 |
| movdqa %xmm2, %xmm1 |
| |
| movdqa 16(%rdi), %xmm3 |
| pxor 16(%rsi), %xmm3 |
| por %xmm3, %xmm1 |
| |
| movdqa 32(%rdi), %xmm4 |
| pxor 32(%rsi), %xmm4 |
| por %xmm4, %xmm1 |
| |
| movdqa 48(%rdi), %xmm5 |
| pxor 48(%rsi), %xmm5 |
| por %xmm5, %xmm1 |
| |
| ptest %xmm1, %xmm0 |
| jnc L(64bytesormore_loop_end) |
| add $64, %rsi |
| add $64, %rdi |
| sub $64, %rdx |
| jae L(L2_L3_aligned_128bytes_loop) |
| |
| add $64, %rdx |
| add %rdx, %rsi |
| add %rdx, %rdi |
| BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) |
| |
| |
| ALIGN (4) |
| L(64bytesormore_loop_end): |
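/*
 * A difference was found somewhere in the last 64-byte block.  Retest
 * %xmm2..%xmm4 one chunk at a time, advancing both pointers, so that
 * %rdi/%rsi end up just past the differing 16-byte chunk before falling
 * into L(16bytes).  The fourth chunk needs no test: if the first three
 * matched, the mismatch must be there.
 */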
| add $16, %rdi |
| add $16, %rsi |
| ptest %xmm2, %xmm0 |
| jnc L(16bytes) |
| |
| add $16, %rdi |
| add $16, %rsi |
| ptest %xmm3, %xmm0 |
| jnc L(16bytes) |
| |
| add $16, %rdi |
| add $16, %rsi |
| ptest %xmm4, %xmm0 |
| jnc L(16bytes) |
| |
| add $16, %rdi |
| add $16, %rsi |
| jmp L(16bytes) |
| |
| L(256bytesin256): |
| add $256, %rdi |
| add $256, %rsi |
| jmp L(16bytes) |
| L(240bytesin256): |
| add $240, %rdi |
| add $240, %rsi |
| jmp L(16bytes) |
| L(224bytesin256): |
| add $224, %rdi |
| add $224, %rsi |
| jmp L(16bytes) |
| L(208bytesin256): |
| add $208, %rdi |
| add $208, %rsi |
| jmp L(16bytes) |
| L(192bytesin256): |
| add $192, %rdi |
| add $192, %rsi |
| jmp L(16bytes) |
| L(176bytesin256): |
| add $176, %rdi |
| add $176, %rsi |
| jmp L(16bytes) |
| L(160bytesin256): |
| add $160, %rdi |
| add $160, %rsi |
| jmp L(16bytes) |
| L(144bytesin256): |
| add $144, %rdi |
| add $144, %rsi |
| jmp L(16bytes) |
| L(128bytesin256): |
| add $128, %rdi |
| add $128, %rsi |
| jmp L(16bytes) |
| L(112bytesin256): |
| add $112, %rdi |
| add $112, %rsi |
| jmp L(16bytes) |
| L(96bytesin256): |
| add $96, %rdi |
| add $96, %rsi |
| jmp L(16bytes) |
| L(80bytesin256): |
| add $80, %rdi |
| add $80, %rsi |
| jmp L(16bytes) |
| L(64bytesin256): |
| add $64, %rdi |
| add $64, %rsi |
| jmp L(16bytes) |
| L(48bytesin256): |
| add $16, %rdi |
| add $16, %rsi |
| L(32bytesin256): |
| add $16, %rdi |
| add $16, %rsi |
| L(16bytesin256): |
| add $16, %rdi |
| add $16, %rsi |
| L(16bytes): |
| mov -16(%rdi), %rax |
| mov -16(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| L(8bytes): |
| mov -8(%rdi), %rax |
| mov -8(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| xor %eax, %eax |
| ret |
| |
| ALIGN (4) |
| L(12bytes): |
| mov -12(%rdi), %rax |
| mov -12(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| L(4bytes): |
| mov -4(%rsi), %ecx |
| mov -4(%rdi), %eax |
| cmp %eax, %ecx |
| jne L(diffin4bytes) |
| L(0bytes): |
| xor %eax, %eax |
| ret |
| |
| ALIGN (4) |
| L(66bytes): |
| movdqu -66(%rdi), %xmm1 |
| movdqu -66(%rsi), %xmm2 |
| mov $-66, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(50bytes): |
| movdqu -50(%rdi), %xmm1 |
| movdqu -50(%rsi), %xmm2 |
| mov $-50, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(34bytes): |
| movdqu -34(%rdi), %xmm1 |
| movdqu -34(%rsi), %xmm2 |
| mov $-34, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(18bytes): |
| mov -18(%rdi), %rax |
| mov -18(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| L(10bytes): |
| mov -10(%rdi), %rax |
| mov -10(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| movzwl -2(%rdi), %eax |
| movzwl -2(%rsi), %ecx |
| cmp %cl, %al |
| jne L(end) |
| and $0xffff, %eax |
| and $0xffff, %ecx |
| sub %ecx, %eax |
| ret |
| |
| ALIGN (4) |
| L(14bytes): |
| mov -14(%rdi), %rax |
| mov -14(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| mov -8(%rdi), %rax |
| mov -8(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| xor %eax, %eax |
| ret |
| |
| ALIGN (4) |
| L(6bytes): |
| mov -6(%rdi), %eax |
| mov -6(%rsi), %ecx |
| cmp %eax, %ecx |
| jne L(diffin4bytes) |
| L(2bytes): |
| movzwl -2(%rsi), %ecx |
| movzwl -2(%rdi), %eax |
| cmp %cl, %al |
| jne L(end) |
| and $0xffff, %eax |
| and $0xffff, %ecx |
| sub %ecx, %eax |
| ret |
| |
| ALIGN (4) |
| L(68bytes): |
| movdqu -68(%rdi), %xmm2 |
| movdqu -68(%rsi), %xmm1 |
| mov $-68, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(52bytes): |
| movdqu -52(%rdi), %xmm2 |
| movdqu -52(%rsi), %xmm1 |
| mov $-52, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(36bytes): |
| movdqu -36(%rdi), %xmm2 |
| movdqu -36(%rsi), %xmm1 |
| mov $-36, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(20bytes): |
| movdqu -20(%rdi), %xmm2 |
| movdqu -20(%rsi), %xmm1 |
| mov $-20, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| mov -4(%rdi), %eax |
| mov -4(%rsi), %ecx |
| cmp %eax, %ecx |
| jne L(diffin4bytes) |
| xor %eax, %eax |
| ret |
| |
| ALIGN (4) |
| L(70bytes): |
| movdqu -70(%rsi), %xmm1 |
| movdqu -70(%rdi), %xmm2 |
| mov $-70, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(54bytes): |
| movdqu -54(%rsi), %xmm1 |
| movdqu -54(%rdi), %xmm2 |
| mov $-54, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(38bytes): |
| movdqu -38(%rsi), %xmm1 |
| movdqu -38(%rdi), %xmm2 |
| mov $-38, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(22bytes): |
| movdqu -22(%rsi), %xmm1 |
| movdqu -22(%rdi), %xmm2 |
| mov $-22, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| mov -8(%rdi), %rax |
| mov -8(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| xor %eax, %eax |
| ret |
| |
| ALIGN (4) |
| L(72bytes): |
| movdqu -72(%rsi), %xmm1 |
| movdqu -72(%rdi), %xmm2 |
| mov $-72, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(56bytes): |
| movdqu -56(%rdi), %xmm2 |
| movdqu -56(%rsi), %xmm1 |
| mov $-56, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(40bytes): |
| movdqu -40(%rdi), %xmm2 |
| movdqu -40(%rsi), %xmm1 |
| mov $-40, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(24bytes): |
| movdqu -24(%rdi), %xmm2 |
| movdqu -24(%rsi), %xmm1 |
| mov $-24, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| mov -8(%rdi), %rax |
| mov -8(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| xor %eax, %eax |
| ret |
| |
| ALIGN (4) |
| L(74bytes): |
| movdqu -74(%rsi), %xmm1 |
| movdqu -74(%rdi), %xmm2 |
| mov $-74, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(58bytes): |
| movdqu -58(%rdi), %xmm2 |
| movdqu -58(%rsi), %xmm1 |
| mov $-58, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(42bytes): |
| movdqu -42(%rdi), %xmm2 |
| movdqu -42(%rsi), %xmm1 |
| mov $-42, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(26bytes): |
| movdqu -26(%rdi), %xmm2 |
| movdqu -26(%rsi), %xmm1 |
| mov $-26, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| mov -10(%rdi), %rax |
| mov -10(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| movzwl -2(%rdi), %eax |
| movzwl -2(%rsi), %ecx |
| jmp L(end) |
| |
| ALIGN (4) |
| L(76bytes): |
| movdqu -76(%rsi), %xmm1 |
| movdqu -76(%rdi), %xmm2 |
| mov $-76, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(60bytes): |
| movdqu -60(%rdi), %xmm2 |
| movdqu -60(%rsi), %xmm1 |
| mov $-60, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(44bytes): |
| movdqu -44(%rdi), %xmm2 |
| movdqu -44(%rsi), %xmm1 |
| mov $-44, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(28bytes): |
| movdqu -28(%rdi), %xmm2 |
| movdqu -28(%rsi), %xmm1 |
| mov $-28, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| mov -12(%rdi), %rax |
| mov -12(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| mov -4(%rdi), %eax |
| mov -4(%rsi), %ecx |
| cmp %eax, %ecx |
| jne L(diffin4bytes) |
| xor %eax, %eax |
| ret |
| |
| ALIGN (4) |
| L(78bytes): |
| movdqu -78(%rsi), %xmm1 |
| movdqu -78(%rdi), %xmm2 |
| mov $-78, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(62bytes): |
| movdqu -62(%rdi), %xmm2 |
| movdqu -62(%rsi), %xmm1 |
| mov $-62, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(46bytes): |
| movdqu -46(%rdi), %xmm2 |
| movdqu -46(%rsi), %xmm1 |
| mov $-46, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(30bytes): |
| movdqu -30(%rdi), %xmm2 |
| movdqu -30(%rsi), %xmm1 |
| mov $-30, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| mov -14(%rdi), %rax |
| mov -14(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| mov -8(%rdi), %rax |
| mov -8(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| xor %eax, %eax |
| ret |
| |
| ALIGN (4) |
| L(64bytes): |
| movdqu -64(%rdi), %xmm2 |
| movdqu -64(%rsi), %xmm1 |
| mov $-64, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(48bytes): |
| movdqu -48(%rdi), %xmm2 |
| movdqu -48(%rsi), %xmm1 |
| mov $-48, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| L(32bytes): |
| movdqu -32(%rdi), %xmm2 |
| movdqu -32(%rsi), %xmm1 |
| mov $-32, %dl |
| pxor %xmm1, %xmm2 |
| ptest %xmm2, %xmm0 |
| jnc L(less16bytes) |
| |
| mov -16(%rdi), %rax |
| mov -16(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| |
| mov -8(%rdi), %rax |
| mov -8(%rsi), %rcx |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| xor %eax, %eax |
| ret |
| |
| /* |
| * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. |
| */ |
| ALIGN (3) |
| L(less16bytes): |
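/*
 * %dl holds the negative offset, from the end pointers, of the 16-byte
 * chunk in which a difference was detected, so (%rsi,%rdx) and
 * (%rdi,%rdx) address that chunk.  Compare its two quadwords to locate
 * the difference.
 */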
| movsbq %dl, %rdx |
| mov (%rsi, %rdx), %rcx |
| mov (%rdi, %rdx), %rax |
| cmp %rax, %rcx |
| jne L(diffin8bytes) |
| mov 8(%rsi, %rdx), %rcx |
| mov 8(%rdi, %rdx), %rax |
| L(diffin8bytes): |
| cmp %eax, %ecx |
| jne L(diffin4bytes) |
| shr $32, %rcx |
| shr $32, %rax |
| L(diffin4bytes): |
| cmp %cx, %ax |
| jne L(end) |
| shr $16, %ecx |
| shr $16, %eax |
| jmp L(end) |
| |
| ALIGN (4) |
| L(end): |
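/*
 * The low 16 bits of %eax and %ecx hold the first differing 16-bit
 * elements (from %rdi and %rsi respectively); mask off the upper bits
 * and return their difference.
 */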
| and $0xffff, %eax |
| and $0xffff, %ecx |
| sub %ecx, %eax |
| ret |
| |
| END_FUNCTION MEMCMP |
| |
| ALIGN (3) |
| L(table_64bytes): |
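/*
 * Dispatch table for tails of 0..78 bytes, one 4-byte entry per even
 * byte count.  BRANCH_TO_JMPTBL_ENTRY scales the (always even) byte
 * count by 2, which selects entry count/2 since entries are 4 bytes
 * wide; each entry is an offset relative to the table base.
 */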
| .int JMPTBL (L(0bytes), L(table_64bytes)) |
| .int JMPTBL (L(2bytes), L(table_64bytes)) |
| .int JMPTBL (L(4bytes), L(table_64bytes)) |
| .int JMPTBL (L(6bytes), L(table_64bytes)) |
| .int JMPTBL (L(8bytes), L(table_64bytes)) |
| .int JMPTBL (L(10bytes), L(table_64bytes)) |
| .int JMPTBL (L(12bytes), L(table_64bytes)) |
| .int JMPTBL (L(14bytes), L(table_64bytes)) |
| .int JMPTBL (L(16bytes), L(table_64bytes)) |
| .int JMPTBL (L(18bytes), L(table_64bytes)) |
| .int JMPTBL (L(20bytes), L(table_64bytes)) |
| .int JMPTBL (L(22bytes), L(table_64bytes)) |
| .int JMPTBL (L(24bytes), L(table_64bytes)) |
| .int JMPTBL (L(26bytes), L(table_64bytes)) |
| .int JMPTBL (L(28bytes), L(table_64bytes)) |
| .int JMPTBL (L(30bytes), L(table_64bytes)) |
| .int JMPTBL (L(32bytes), L(table_64bytes)) |
| .int JMPTBL (L(34bytes), L(table_64bytes)) |
| .int JMPTBL (L(36bytes), L(table_64bytes)) |
| .int JMPTBL (L(38bytes), L(table_64bytes)) |
| .int JMPTBL (L(40bytes), L(table_64bytes)) |
| .int JMPTBL (L(42bytes), L(table_64bytes)) |
| .int JMPTBL (L(44bytes), L(table_64bytes)) |
| .int JMPTBL (L(46bytes), L(table_64bytes)) |
| .int JMPTBL (L(48bytes), L(table_64bytes)) |
| .int JMPTBL (L(50bytes), L(table_64bytes)) |
| .int JMPTBL (L(52bytes), L(table_64bytes)) |
| .int JMPTBL (L(54bytes), L(table_64bytes)) |
| .int JMPTBL (L(56bytes), L(table_64bytes)) |
| .int JMPTBL (L(58bytes), L(table_64bytes)) |
| .int JMPTBL (L(60bytes), L(table_64bytes)) |
| .int JMPTBL (L(62bytes), L(table_64bytes)) |
| .int JMPTBL (L(64bytes), L(table_64bytes)) |
| .int JMPTBL (L(66bytes), L(table_64bytes)) |
| .int JMPTBL (L(68bytes), L(table_64bytes)) |
| .int JMPTBL (L(70bytes), L(table_64bytes)) |
| .int JMPTBL (L(72bytes), L(table_64bytes)) |
| .int JMPTBL (L(74bytes), L(table_64bytes)) |
| .int JMPTBL (L(76bytes), L(table_64bytes)) |
| .int JMPTBL (L(78bytes), L(table_64bytes)) |