/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count
 *
 * Output:
 *	rax original destination
 */
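
/*
 * For orientation (an illustrative sketch, not part of the original
 * code): the unrolled copy below behaves roughly like the following C,
 * where the three loops correspond to .Lloop_64, .Lloop_8 and .Lloop_1.
 * memcpy_sketch is a made-up name; the real entry points are the
 * memcpy/__memcpy labels further down. Note that only the low 32 bits
 * of the count are used (movl %edx, %ecx).
 *
 *	static void *memcpy_sketch(void *dest, const void *src, unsigned int count)
 *	{
 *		char *d = dest;
 *		const char *s = src;
 *		unsigned int i;
 *
 *		while (count >= 64) {
 *			for (i = 0; i < 64; i++)
 *				d[i] = s[i];
 *			d += 64; s += 64; count -= 64;
 *		}
 *		while (count >= 8) {
 *			for (i = 0; i < 8; i++)
 *				d[i] = s[i];
 *			d += 8; s += 8; count -= 8;
 *		}
 *		while (count--)
 *			*d++ = *s++;
 *
 *		return dest;
 *	}
 */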

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
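/*
 * The replacement saves the original destination in %rax (the return
 * value), copies count/8 quadwords with REP MOVSQ, then copies the
 * remaining count%8 bytes with REP MOVSB. .Lmemcpy_e marks the end of
 * the replacement so that its size can be computed for the
 * .altinstructions entry at the bottom of this file:
 */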
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax

	movl %edx, %ecx
	shrl $3, %ecx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e:
	.previous

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * Tail portion is handled at the end:
	 */
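	/*
	 * For example, count = 200 gives %ecx = 3 full 64-byte blocks
	 * here; the remaining 8 bytes are then copied by a single
	 * quadword move in the tail code below.
	 */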
	movq %rdi, %rax
	movl %edx, %ecx
	shrl $6, %ecx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero flag is
	 * checked at the end of the loop (the instructions in between
	 * do not change the zero flag; leaq is used for the pointer
	 * updates below precisely because, unlike addq, it does not
	 * affect the flags):
	 */
	decl %ecx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
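	/*
	 * %r8-%r11 are call-clobbered in the x86-64 ABI, so they are
	 * free to use as scratch registers here; each pair of loads is
	 * issued ahead of the corresponding stores, presumably to give
	 * the loads some latency to hide:
	 */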
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r8
	movq %r11, 0*8(%rdi)
	movq %r8, 1*8(%rdi)

	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r10
	movq %r9, 2*8(%rdi)
	movq %r10, 3*8(%rdi)

	movq 4*8(%rsi), %r11
	movq 5*8(%rsi), %r8
	movq %r11, 4*8(%rdi)
	movq %r8, 5*8(%rdi)

	movq 6*8(%rsi), %r9
	movq 7*8(%rsi), %r10
	movq %r9, 6*8(%rdi)
	movq %r10, 7*8(%rdi)

	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz .Lloop_64

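	/*
	 * Handle the tail: at most 63 bytes (count & 63) remain. Copy
	 * them eight bytes at a time first, then byte by byte:
	 */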
.Lhandle_tail:
	movl %edx, %ecx
	andl $63, %ecx
	shrl $3, %ecx
	jz .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rdi), %rdi
	leaq 8(%rsi), %rsi
	jnz .Lloop_8

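	/*
	 * Copy the final count & 7 bytes, if any, one byte at a time:
	 */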
.Lhandle_7:
	movl %edx, %ecx
	andl $7, %ecx
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

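	/*
	 * %rax still holds the original destination pointer - the
	 * return value of memcpy:
	 */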
.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * Some CPUs run faster using the string copy instructions. The
 * REP-based variant is also a lot simpler. Use it when possible:
 */
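
/*
 * At boot, apply_alternatives() patches the start of memcpy with the
 * REP MOVSQ based .Lmemcpy_c sequence above when the CPU advertises
 * X86_FEATURE_REP_GOOD. Each entry below records the original
 * instruction address, the replacement address, the required feature
 * bit, and the two length bytes described further down:
 */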

	.section .altinstructions, "a"
	.align 8
	.quad memcpy
	.quad .Lmemcpy_c
	.byte X86_FEATURE_REP_GOOD

	/*
	 * Replace only the beginning: memcpy itself is used while the
	 * alternatives are being applied, so padding the rest of it
	 * with NOPs mid-flight would be silly - a reboot would be the
	 * only outcome. Both length bytes are therefore set to the
	 * size of the replacement:
	 */
	.byte .Lmemcpy_e - .Lmemcpy_c
	.byte .Lmemcpy_e - .Lmemcpy_c
	.previous
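
/*
 * For reference (a sketch, not taken from this file): each entry in
 * the .altinstructions section above is read by the kernel as a
 * struct alt_instr from <asm/alternative.h>, whose leading fields
 * match the directives emitted here; the real struct may also contain
 * padding fields not shown:
 *
 *	struct alt_instr {
 *		u8 *instr;		original instruction to patch
 *		u8 *replacement;	replacement code
 *		u8  cpuid;		CPU feature bit required
 *		u8  instrlen;		length of the patched region
 *		u8  replacementlen;	length of the replacement
 *	};
 */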