/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count
 *
 * Output:
 *	rax original destination
 */
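
/*
 * For orientation (an illustrative sketch, not part of the original
 * code): the unrolled copy below behaves roughly like the following C,
 * where the three loops correspond to .Lloop_64, .Lloop_8 and .Lloop_1.
 * memcpy_sketch is a made-up name; the real entry points are the
 * memcpy/__memcpy labels further down. Note that only the low 32 bits
 * of the count are used (movl %edx, %ecx).
 *
 *	static void *memcpy_sketch(void *dest, const void *src, unsigned int count)
 *	{
 *		char *d = dest;
 *		const char *s = src;
 *		unsigned int i;
 *
 *		while (count >= 64) {
 *			for (i = 0; i < 64; i++)
 *				d[i] = s[i];
 *			d += 64; s += 64; count -= 64;
 *		}
 *		while (count >= 8) {
 *			for (i = 0; i < 8; i++)
 *				d[i] = s[i];
 *			d += 8; s += 8; count -= 8;
 *		}
 *		while (count--)
 *			*d++ = *s++;
 *
 *		return dest;
 *	}
 */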

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
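/*
 * The replacement saves the original destination in %rax (the return
 * value), copies count/8 quadwords with REP MOVSQ, then copies the
 * remaining count%8 bytes with REP MOVSB. .Lmemcpy_e marks the end of
 * the replacement so that its size can be computed for the
 * .altinstructions entry at the bottom of this file:
 */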
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax

	movl %edx, %ecx
	shrl $3, %ecx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e:
	.previous

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * Tail portion is handled at the end:
	 */
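	/*
	 * For example, count = 200 gives %ecx = 3 full 64-byte blocks
	 * here; the remaining 8 bytes are then copied by a single
	 * quadword move in the tail code below.
	 */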
	movq %rdi, %rax
	movl %edx, %ecx
	shrl $6, %ecx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero flag is
	 * checked at the end of the loop (the instructions in between
	 * do not change the zero flag; leaq is used for the pointer
	 * updates below precisely because, unlike addq, it does not
	 * affect the flags):
	 */
	decl %ecx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
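	/*
	 * %r8-%r11 are call-clobbered in the x86-64 ABI, so they are
	 * free to use as scratch registers here; each pair of loads is
	 * issued ahead of the corresponding stores, presumably to give
	 * the loads some latency to hide:
	 */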
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r8
	movq %r11, 0*8(%rdi)
	movq %r8, 1*8(%rdi)

	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r10
	movq %r9, 2*8(%rdi)
	movq %r10, 3*8(%rdi)

	movq 4*8(%rsi), %r11
	movq 5*8(%rsi), %r8
	movq %r11, 4*8(%rdi)
	movq %r8, 5*8(%rdi)

	movq 6*8(%rsi), %r9
	movq 7*8(%rsi), %r10
	movq %r9, 6*8(%rdi)
	movq %r10, 7*8(%rdi)

	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz .Lloop_64

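	/*
	 * Handle the tail: at most 63 bytes (count & 63) remain. Copy
	 * them eight bytes at a time first, then byte by byte:
	 */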
.Lhandle_tail:
	movl %edx, %ecx
	andl $63, %ecx
	shrl $3, %ecx
	jz .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rdi), %rdi
	leaq 8(%rsi), %rsi
	jnz .Lloop_8

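	/*
	 * Copy the final count & 7 bytes, if any, one byte at a time:
	 */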
.Lhandle_7:
	movl %edx, %ecx
	andl $7, %ecx
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

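	/*
	 * %rax still holds the original destination pointer - the
	 * return value of memcpy:
	 */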
.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * Some CPUs run faster using the string copy instructions. The
 * REP-based variant is also a lot simpler. Use it when possible:
 */
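
/*
 * At boot, apply_alternatives() patches the start of memcpy with the
 * REP MOVSQ based .Lmemcpy_c sequence above when the CPU advertises
 * X86_FEATURE_REP_GOOD. Each entry below records the original
 * instruction address, the replacement address, the required feature
 * bit, and the two length bytes described further down:
 */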

	.section .altinstructions, "a"
	.align 8
	.quad memcpy
	.quad .Lmemcpy_c
	.byte X86_FEATURE_REP_GOOD

	/*
	 * Replace only the beginning: memcpy itself is used while the
	 * alternatives are being applied, so padding the rest of it
	 * with NOPs mid-flight would be silly - a reboot would be the
	 * only outcome. Both length bytes are therefore set to the
	 * size of the replacement:
	 */
	.byte .Lmemcpy_e - .Lmemcpy_c
	.byte .Lmemcpy_e - .Lmemcpy_c
	.previous
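
/*
 * For reference (a sketch, not taken from this file): each entry in
 * the .altinstructions section above is read by the kernel as a
 * struct alt_instr from <asm/alternative.h>, whose leading fields
 * match the directives emitted here; the real struct may also contain
 * padding fields not shown:
 *
 *	struct alt_instr {
 *		u8 *instr;		original instruction to patch
 *		u8 *replacement;	replacement code
 *		u8  cpuid;		CPU feature bit required
 *		u8  instrlen;		length of the patched region
 *		u8  replacementlen;	length of the replacement
 *	};
 */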