/* Copyright 2002 Andi Kleen */

/*
 * NOTE(review): the original "#include" line had lost its operand.
 * X86_FEATURE_REP_GOOD (used below) has to come from a header;
 * <asm/cpufeature.h> is presumed -- confirm against the surrounding tree.
 */
#include <asm/cpufeature.h>

/*
 * memcpy - Copy a memory block.
 *
 * Syntax: GNU as, AT&T operand order (source, destination).
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count (all 64 bits are honoured)
 *
 * Output:
 *	rax original destination
 *
 * Clobbers:
 *	rcx, rsi, rdi, r8-r11, flags (open-coded variant)
 *	rcx, rdx, rsi, rdi, flags (memcpy_c variant)
 *
 * No stack is used and no callee-saved register is touched.
 */
	.globl	__memcpy
	.globl	memcpy
	.p2align 4
__memcpy:
memcpy:
	movq	%rdi,%rax		/* return value: original destination */

	/*
	 * 64-bit arithmetic on the chunk count so that copies of 4GB and
	 * more are not silently truncated.
	 */
	movq	%rdx,%rcx
	shrq	$6,%rcx			/* rcx = number of whole 64-byte chunks */
	jz	.Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * The decrement sits at the top: every instruction between it and
	 * the jnz is a mov or lea, none of which write flags, so ZF from
	 * the decq is still intact when the branch is taken.
	 */
	decq	%rcx

	movq	(%rsi),%r11
	movq	1*8(%rsi),%r8
	movq	%r11,(%rdi)
	movq	%r8,1*8(%rdi)

	movq	2*8(%rsi),%r9
	movq	3*8(%rsi),%r10
	movq	%r9,2*8(%rdi)
	movq	%r10,3*8(%rdi)

	movq	4*8(%rsi),%r11
	movq	5*8(%rsi),%r8
	movq	%r11,4*8(%rdi)
	movq	%r8,5*8(%rdi)

	movq	6*8(%rsi),%r9
	movq	7*8(%rsi),%r10
	movq	%r9,6*8(%rdi)
	movq	%r10,7*8(%rdi)

	leaq	64(%rsi),%rsi		/* lea, not add: must not disturb ZF */
	leaq	64(%rdi),%rdi
	jnz	.Lloop_64

.Lhandle_tail:
	/* Only the low 6 bits of the count matter from here on. */
	movl	%edx,%ecx
	andl	$63,%ecx
	shrl	$3,%ecx			/* ecx = remaining whole qwords (0..7) */
	jz	.Lhandle_7

	.p2align 4
.Lloop_8:
	decl	%ecx			/* ZF survives the mov/lea below */
	movq	(%rsi),%r8
	movq	%r8,(%rdi)
	leaq	8(%rdi),%rdi
	leaq	8(%rsi),%rsi
	jnz	.Lloop_8

.Lhandle_7:
	movl	%edx,%ecx
	andl	$7,%ecx			/* ecx = trailing bytes (0..7) */
	jz	.Lende

	.p2align 4
.Lloop_1:
	movb	(%rsi),%r8b
	movb	%r8b,(%rdi)
	incq	%rdi
	incq	%rsi
	decl	%ecx
	jnz	.Lloop_1

.Lende:
	ret
.Lfinal:

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible.
	 *
	 * The record below lists, in order: the address of the original
	 * code, the address of the replacement, the feature bit, the
	 * length of the original (memcpy up to .Lfinal) and the length of
	 * the replacement. Both lengths are emitted as single bytes, so
	 * each must stay below 256.
	 * NOTE(review): presumably the replacement must also be no longer
	 * than the original it is patched over -- confirm with the
	 * consumer of .altinstructions.
	 */
	.section .altinstructions,"a"
	.align 8
	.quad	memcpy
	.quad	memcpy_c
	.byte	X86_FEATURE_REP_GOOD
	.byte	.Lfinal-memcpy
	.byte	memcpy_c_end-memcpy_c
	.previous

	.section .altinstr_replacement,"ax"
	/*
	 * String-instruction variant.
	 *
	 * rdi destination
	 * rsi source
	 * rdx count
	 *
	 * Contains no relative branches, so the bytes between memcpy_c and
	 * memcpy_c_end do not depend on the address they execute from.
	 */
memcpy_c:
	movq	%rdi,%rax		/* return value: original destination */
	movq	%rdx,%rcx
	shrq	$3,%rcx			/* rcx = whole qwords, full 64-bit count */
	andl	$7,%edx			/* edx = trailing bytes (0..7) */
	rep movsq
	movl	%edx,%ecx
	rep movsb
	ret
memcpy_c_end:
	.previous