diff options
Diffstat (limited to 'arch/x86_64/lib/memcpy.S')
-rw-r--r-- | arch/x86_64/lib/memcpy.S | 121 |
1 files changed, 121 insertions, 0 deletions
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S new file mode 100644 index 0000000..c6c4649 --- /dev/null +++ b/arch/x86_64/lib/memcpy.S @@ -0,0 +1,121 @@ +/* Copyright 2002 Andi Kleen */ + + #include <asm/cpufeature.h> +/* + * memcpy - Copy a memory block. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * rax original destination + */ + + .globl __memcpy + .globl memcpy + .p2align 4 +__memcpy: +memcpy: + pushq %rbx + movq %rdi,%rax + + movl %edx,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + + movq (%rsi),%r11 + movq 8(%rsi),%r8 + + movq %r11,(%rdi) + movq %r8,1*8(%rdi) + + movq 2*8(%rsi),%r9 + movq 3*8(%rsi),%r10 + + movq %r9,2*8(%rdi) + movq %r10,3*8(%rdi) + + movq 4*8(%rsi),%r11 + movq 5*8(%rsi),%r8 + + movq %r11,4*8(%rdi) + movq %r8,5*8(%rdi) + + movq 6*8(%rsi),%r9 + movq 7*8(%rsi),%r10 + + movq %r9,6*8(%rdi) + movq %r10,7*8(%rdi) + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jnz .Lloop_64 + +.Lhandle_tail: + movl %edx,%ecx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + .p2align 4 +.Lloop_8: + decl %ecx + movq (%rsi),%r8 + movq %r8,(%rdi) + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + movb (%rsi),%r8b + movb %r8b,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + +.Lende: + popq %rbx + ret +.Lfinal: + + /* C stepping K8 run faster using the string copy instructions. + It is also a lot simpler. Use this when possible */ + + .section .altinstructions,"a" + .align 8 + .quad memcpy + .quad memcpy_c + .byte X86_FEATURE_K8_C + .byte .Lfinal-memcpy + .byte memcpy_c_end-memcpy_c + .previous + + .section .altinstr_replacement,"ax" + /* rdi destination + * rsi source + * rdx count + */ +memcpy_c: + movq %rdi,%rax + movl %edx,%ecx + shrl $3,%ecx + andl $7,%edx + rep + movsq + movl %edx,%ecx + rep + movsb + ret +memcpy_c_end: + .previous |