| /* |
| * Normally compiler builtins are used, but sometimes the compiler calls out |
| * of line code. Based on asm-i386/string.h. |
| * |
| * This assembly file is re-written from memmove_64.c file. |
| * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> |
| */ |
| #define _STRING_C |
| #include <linux/linkage.h> |
| #include <asm/dwarf2.h> |
| |
| #undef memmove |
| |
| /* |
| * Implement memmove(). This can handle overlap between src and dst. |
| * |
| * Input: |
| * rdi: dest |
| * rsi: src |
| * rdx: count |
| * |
| * Output: |
| * rax: dest |
| */ |
| ENTRY(memmove) |
| CFI_STARTPROC |
/*
 * Set up the return value first (memmove returns dest), then
 * dispatch: sizes of 32 bytes and above go through the 32-byte
 * block loops, smaller copies jump straight to the tail
 * handling at label 1.
 */
| mov %rdi, %rax |
| cmp $0x20, %rdx |
| jb 1f |
| |
/*
 * Decide forward/backward copy mode: when src is below dest an
 * overlapping forward copy could clobber source bytes before
 * they are read, so those cases are copied backward.
 */
| cmp %rdi, %rsi |
| jb 2f |
| |
| /* |
| * movsq instruction have many startup latency |
| * so we handle small size by general register. |
| */ |
| cmp $680, %rdx |
| jb 3f |
| /* |
| * movsq instruction is only good for aligned case. |
| */ |
| |
| cmpb %dil, %sil |
| je 4f |
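/*
 * Short copy or differing alignment: use the 32-bytes-per-
 * iteration general-register loop.
 */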
| 3: |
| sub $0x20, %rdx |
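/*
 * The extra 0x20 bias above makes the subtract at the top of
 * the loop set CF exactly when, after the current 32-byte
 * copy, fewer than 32 bytes will remain; movq and leaq leave
 * the flags untouched, so the trailing jae still sees that
 * subtract's result.
 */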
| /* |
| * We gobble 32byts forward in each loop. |
| */ |
| 5: |
| sub $0x20, %rdx |
| movq 0*8(%rsi), %r11 |
| movq 1*8(%rsi), %r10 |
| movq 2*8(%rsi), %r9 |
| movq 3*8(%rsi), %r8 |
| leaq 4*8(%rsi), %rsi |
| |
| movq %r11, 0*8(%rdi) |
| movq %r10, 1*8(%rdi) |
| movq %r9, 2*8(%rdi) |
| movq %r8, 3*8(%rdi) |
| leaq 4*8(%rdi), %rdi |
| jae 5b |
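/* Undo the bias: rdx is now the 0..31 byte remainder. */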
| addq $0x20, %rdx |
| jmp 1f |
| /* |
| * Handle data forward by movsq. |
| */ |
| .p2align 4 |
| 4: |
| movq %rdx, %rcx |
| movq -8(%rsi, %rdx), %r11 |
| lea -8(%rdi, %rdx), %r10 |
| shrq $3, %rcx |
| rep movsq |
| movq %r11, (%r10) |
| jmp 13f |
| /* |
| * Handle data backward by movsq. |
| */ |
| .p2align 4 |
| 7: |
| movq %rdx, %rcx |
| movq (%rsi), %r11 |
| movq %rdi, %r10 |
| leaq -8(%rsi, %rdx), %rsi |
| leaq -8(%rdi, %rdx), %rdi |
| shrq $3, %rcx |
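/* Copy downwards: std/cld bracket the rep movsq. */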
| std |
| rep movsq |
| cld |
| movq %r11, (%r10) |
| jmp 13f |
| |
| /* |
| * Start to prepare for backward copy. |
| */ |
| .p2align 4 |
| 2: |
| cmp $680, %rdx |
| jb 6f |
cmpb %dil, %sil
| je 7b |
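/*
 * Short copy or differing alignment: fall through to the
 * backward general-register loop.
 */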
| 6: |
| /* |
| * Calculate copy position to tail. |
| */ |
| addq %rdx, %rsi |
| addq %rdx, %rdi |
| subq $0x20, %rdx |
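/*
 * Same 0x20 bias trick as in the forward loop above: the jae
 * below tests the carry from the subtract at the top of the
 * loop.
 */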
| /* |
| * We gobble 32byts backward in each loop. |
| */ |
| 8: |
| subq $0x20, %rdx |
| movq -1*8(%rsi), %r11 |
| movq -2*8(%rsi), %r10 |
| movq -3*8(%rsi), %r9 |
| movq -4*8(%rsi), %r8 |
| leaq -4*8(%rsi), %rsi |
| |
| movq %r11, -1*8(%rdi) |
| movq %r10, -2*8(%rdi) |
| movq %r9, -3*8(%rdi) |
| movq %r8, -4*8(%rdi) |
| leaq -4*8(%rdi), %rdi |
| jae 8b |
| /* |
| * Calculate copy position to head. |
| */ |
| addq $0x20, %rdx |
| subq %rdx, %rsi |
| subq %rdx, %rdi |
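/*
 * Common tail for both directions: at most 31 bytes remain.
 * Each size class below uses a pair of overlapping head/tail
 * moves; all loads happen before any store, so this is safe
 * for any overlap.
 */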
| 1: |
| cmpq $16, %rdx |
| jb 9f |
| /* |
| * Move data from 16 bytes to 31 bytes. |
| */ |
| movq 0*8(%rsi), %r11 |
| movq 1*8(%rsi), %r10 |
| movq -2*8(%rsi, %rdx), %r9 |
| movq -1*8(%rsi, %rdx), %r8 |
| movq %r11, 0*8(%rdi) |
| movq %r10, 1*8(%rdi) |
| movq %r9, -2*8(%rdi, %rdx) |
| movq %r8, -1*8(%rdi, %rdx) |
| jmp 13f |
| .p2align 4 |
| 9: |
| cmpq $8, %rdx |
| jb 10f |
| /* |
| * Move data from 8 bytes to 15 bytes. |
| */ |
| movq 0*8(%rsi), %r11 |
| movq -1*8(%rsi, %rdx), %r10 |
| movq %r11, 0*8(%rdi) |
| movq %r10, -1*8(%rdi, %rdx) |
| jmp 13f |
| 10: |
| cmpq $4, %rdx |
| jb 11f |
| /* |
| * Move data from 4 bytes to 7 bytes. |
| */ |
| movl (%rsi), %r11d |
| movl -4(%rsi, %rdx), %r10d |
| movl %r11d, (%rdi) |
| movl %r10d, -4(%rdi, %rdx) |
| jmp 13f |
| 11: |
| cmp $2, %rdx |
| jb 12f |
| /* |
| * Move data from 2 bytes to 3 bytes. |
| */ |
| movw (%rsi), %r11w |
| movw -2(%rsi, %rdx), %r10w |
| movw %r11w, (%rdi) |
| movw %r10w, -2(%rdi, %rdx) |
| jmp 13f |
| 12: |
| cmp $1, %rdx |
| jb 13f |
| /* |
| * Move data for 1 byte. |
| */ |
| movb (%rsi), %r11b |
| movb %r11b, (%rdi) |
| 13: |
| retq |
| CFI_ENDPROC |
| ENDPROC(memmove) |