summaryrefslogtreecommitdiffstats
path: root/arch/x86_64/lib/memcpy.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64/lib/memcpy.S')
-rw-r--r--arch/x86_64/lib/memcpy.S121
1 files changed, 121 insertions, 0 deletions
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
new file mode 100644
index 0000000..c6c4649
--- /dev/null
+++ b/arch/x86_64/lib/memcpy.S
@@ -0,0 +1,121 @@
+/* Copyright 2002 Andi Kleen */
+
+ #include <asm/cpufeature.h>
+/*
+ * memcpy - Copy a memory block.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * rax original destination
+ */
+
+ .globl __memcpy
+ .globl memcpy
+ .p2align 4
+__memcpy:
+memcpy:
+ pushq %rbx
+ movq %rdi,%rax
+
+ movl %edx,%ecx
+ shrl $6,%ecx
+ jz .Lhandle_tail
+
+ .p2align 4
+.Lloop_64:
+ decl %ecx
+
+ movq (%rsi),%r11
+ movq 8(%rsi),%r8
+
+ movq %r11,(%rdi)
+ movq %r8,1*8(%rdi)
+
+ movq 2*8(%rsi),%r9
+ movq 3*8(%rsi),%r10
+
+ movq %r9,2*8(%rdi)
+ movq %r10,3*8(%rdi)
+
+ movq 4*8(%rsi),%r11
+ movq 5*8(%rsi),%r8
+
+ movq %r11,4*8(%rdi)
+ movq %r8,5*8(%rdi)
+
+ movq 6*8(%rsi),%r9
+ movq 7*8(%rsi),%r10
+
+ movq %r9,6*8(%rdi)
+ movq %r10,7*8(%rdi)
+
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+ jnz .Lloop_64
+
+.Lhandle_tail:
+ movl %edx,%ecx
+ andl $63,%ecx
+ shrl $3,%ecx
+ jz .Lhandle_7
+ .p2align 4
+.Lloop_8:
+ decl %ecx
+ movq (%rsi),%r8
+ movq %r8,(%rdi)
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %edx,%ecx
+ andl $7,%ecx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+ movb (%rsi),%r8b
+ movb %r8b,(%rdi)
+ incq %rdi
+ incq %rsi
+ decl %ecx
+ jnz .Lloop_1
+
+.Lende:
+ popq %rbx
+ ret
+.Lfinal:
+
+ /* C stepping K8 run faster using the string copy instructions.
+ It is also a lot simpler. Use this when possible */
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad memcpy
+ .quad memcpy_c
+ .byte X86_FEATURE_K8_C
+ .byte .Lfinal-memcpy
+ .byte memcpy_c_end-memcpy_c
+ .previous
+
+ .section .altinstr_replacement,"ax"
+ /* rdi destination
+ * rsi source
+ * rdx count
+ */
+memcpy_c:
+ movq %rdi,%rax
+ movl %edx,%ecx
+ shrl $3,%ecx
+ andl $7,%edx
+ rep
+ movsq
+ movl %edx,%ecx
+ rep
+ movsb
+ ret
+memcpy_c_end:
+ .previous
OpenPOWER on IntegriCloud