Diffstat (limited to 'arch/x86_64/lib/memcpy.S')
-rw-r--r--   arch/x86_64/lib/memcpy.S   93
1 files changed, 2 insertions, 91 deletions
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index c6c4649..92dd805 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,6 +11,8 @@
  *
  * Output:
  * rax original destination
+ *
+ * TODO: check best memcpy for PSC
  */
 
 	.globl __memcpy
@@ -18,95 +20,6 @@
 	.p2align 4
 __memcpy:
 memcpy:
-	pushq %rbx
-	movq %rdi,%rax
-
-	movl %edx,%ecx
-	shrl $6,%ecx
-	jz .Lhandle_tail
-
-	.p2align 4
-.Lloop_64:
-	decl %ecx
-
-	movq (%rsi),%r11
-	movq 8(%rsi),%r8
-
-	movq %r11,(%rdi)
-	movq %r8,1*8(%rdi)
-
-	movq 2*8(%rsi),%r9
-	movq 3*8(%rsi),%r10
-
-	movq %r9,2*8(%rdi)
-	movq %r10,3*8(%rdi)
-
-	movq 4*8(%rsi),%r11
-	movq 5*8(%rsi),%r8
-
-	movq %r11,4*8(%rdi)
-	movq %r8,5*8(%rdi)
-
-	movq 6*8(%rsi),%r9
-	movq 7*8(%rsi),%r10
-
-	movq %r9,6*8(%rdi)
-	movq %r10,7*8(%rdi)
-
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-	jnz .Lloop_64
-
-.Lhandle_tail:
-	movl %edx,%ecx
-	andl $63,%ecx
-	shrl $3,%ecx
-	jz .Lhandle_7
-	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi),%r8
-	movq %r8,(%rdi)
-	leaq 8(%rdi),%rdi
-	leaq 8(%rsi),%rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
-	movl %edx,%ecx
-	andl $7,%ecx
-	jz .Lende
-	.p2align 4
-.Lloop_1:
-	movb (%rsi),%r8b
-	movb %r8b,(%rdi)
-	incq %rdi
-	incq %rsi
-	decl %ecx
-	jnz .Lloop_1
-
-.Lende:
-	popq %rbx
-	ret
-.Lfinal:
-
-	/* C stepping K8 run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
-
-	.section .altinstructions,"a"
-	.align 8
-	.quad memcpy
-	.quad memcpy_c
-	.byte X86_FEATURE_K8_C
-	.byte .Lfinal-memcpy
-	.byte memcpy_c_end-memcpy_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
- /* rdi destination
-  * rsi source
-  * rdx count
-  */
-memcpy_c:
 	movq %rdi,%rax
 	movl %edx,%ecx
 	shrl $3,%ecx
@@ -117,5 +30,3 @@ memcpy_c:
 	rep movsb
 	ret
-memcpy_c_end:
-	.previous
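
For orientation, here is a hedged C model of the copy strategy the patch leaves in place. The visible context lines compute a quadword count with shrl $3,%ecx and finish with rep movsb, which corresponds to copying count / 8 eight-byte words and then the remaining count % 8 bytes; the rep movsq presumed in the middle of the routine falls outside the hunks shown, so that part is an assumption. The function name memcpy_rep_model and the plain C loops below are purely illustrative, not the kernel code, which uses the x86 string instructions directly.

#include <stddef.h>
#include <stdint.h>

/*
 * Illustrative sketch (not the kernel source) of a rep movsq/rep movsb
 * style memcpy:
 *   - copy count / 8 quadwords   (shrl $3,%ecx, then presumably rep movsq)
 *   - copy count % 8 tail bytes  (rep movsb)
 *   - return the original destination (kept in rax by the asm version)
 */
static void *memcpy_rep_model(void *dst, const void *src, size_t count)
{
	uint64_t *d8 = dst;
	const uint64_t *s8 = src;
	size_t quads = count >> 3;	/* count / 8, as shrl $3,%ecx does */
	size_t tail  = count & 7;	/* leftover bytes for the byte loop */

	/* Word loop; assumes unaligned 8-byte accesses are acceptable,
	 * as they are on x86-64. */
	while (quads--)
		*d8++ = *s8++;

	/* Byte tail, the equivalent of rep movsb on the remainder. */
	unsigned char *db = (unsigned char *)d8;
	const unsigned char *sb = (const unsigned char *)s8;
	while (tail--)
		*db++ = *sb++;

	return dst;
}

The removed comment in the deleted block gives the rationale for relying on this simpler form: on C-stepping K8 parts the string copy instructions run faster as well as being simpler, which is why the unrolled 64-byte loop and the altinstruction patching machinery could be dropped.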