From 2a6c7b6bbfefbe37dbbc31a209dbf0166ee51a07 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 Oct 2007 11:15:55 +0200 Subject: x86_64: prepare shared lib/memset.S Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86_64/lib/Makefile | 2 +- arch/x86_64/lib/memset.S | 133 -------------------------------------------- arch/x86_64/lib/memset_64.S | 133 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+), 134 deletions(-) delete mode 100644 arch/x86_64/lib/memset.S create mode 100644 arch/x86_64/lib/memset_64.S (limited to 'arch') diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index 6bdf7d8..09c1ffa 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile @@ -10,4 +10,4 @@ obj-$(CONFIG_SMP) += msr-on-cpu.o lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \ usercopy_64.o getuser_64.o putuser_64.o \ thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o -lib-y += memcpy_64.o memmove_64.o memset.o copy_user.o rwlock_64.o copy_user_nocache_64.o +lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user.o rwlock_64.o copy_user_nocache_64.o diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S deleted file mode 100644 index 2c59481..0000000 --- a/arch/x86_64/lib/memset.S +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright 2002 Andi Kleen, SuSE Labs */ - -#include -#include - -/* - * ISO C memset - set a memory block to a byte value. - * - * rdi destination - * rsi value (char) - * rdx count (bytes) - * - * rax original destination - */ - ALIGN -memset_c: - CFI_STARTPROC - movq %rdi,%r9 - movl %edx,%r8d - andl $7,%r8d - movl %edx,%ecx - shrl $3,%ecx - /* expand byte value */ - movzbl %sil,%esi - movabs $0x0101010101010101,%rax - mulq %rsi /* with rax, clobbers rdx */ - rep stosq - movl %r8d,%ecx - rep stosb - movq %r9,%rax - ret - CFI_ENDPROC -ENDPROC(memset_c) - -ENTRY(memset) -ENTRY(__memset) - CFI_STARTPROC - movq %rdi,%r10 - movq %rdx,%r11 - - /* expand byte value */ - movzbl %sil,%ecx - movabs $0x0101010101010101,%rax - mul %rcx /* with rax, clobbers rdx */ - - /* align dst */ - movl %edi,%r9d - andl $7,%r9d - jnz .Lbad_alignment - CFI_REMEMBER_STATE -.Lafter_bad_alignment: - - movl %r11d,%ecx - shrl $6,%ecx - jz .Lhandle_tail - - .p2align 4 -.Lloop_64: - decl %ecx - movq %rax,(%rdi) - movq %rax,8(%rdi) - movq %rax,16(%rdi) - movq %rax,24(%rdi) - movq %rax,32(%rdi) - movq %rax,40(%rdi) - movq %rax,48(%rdi) - movq %rax,56(%rdi) - leaq 64(%rdi),%rdi - jnz .Lloop_64 - - /* Handle tail in loops. The loops should be faster than hard - to predict jump tables. */ - .p2align 4 -.Lhandle_tail: - movl %r11d,%ecx - andl $63&(~7),%ecx - jz .Lhandle_7 - shrl $3,%ecx - .p2align 4 -.Lloop_8: - decl %ecx - movq %rax,(%rdi) - leaq 8(%rdi),%rdi - jnz .Lloop_8 - -.Lhandle_7: - movl %r11d,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: - decl %ecx - movb %al,(%rdi) - leaq 1(%rdi),%rdi - jnz .Lloop_1 - -.Lende: - movq %r10,%rax - ret - - CFI_RESTORE_STATE -.Lbad_alignment: - cmpq $7,%r11 - jbe .Lhandle_7 - movq %rax,(%rdi) /* unaligned store */ - movq $8,%r8 - subq %r9,%r8 - addq %r8,%rdi - subq %r8,%r11 - jmp .Lafter_bad_alignment -.Lfinal: - CFI_ENDPROC -ENDPROC(memset) -ENDPROC(__memset) - - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ - -#include - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp */ - .byte (memset_c - memset) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - .align 8 - .quad memset - .quad 1b - .byte X86_FEATURE_REP_GOOD - .byte .Lfinal - memset - .byte 2b - 1b - .previous diff --git a/arch/x86_64/lib/memset_64.S b/arch/x86_64/lib/memset_64.S new file mode 100644 index 0000000..2c59481 --- /dev/null +++ b/arch/x86_64/lib/memset_64.S @@ -0,0 +1,133 @@ +/* Copyright 2002 Andi Kleen, SuSE Labs */ + +#include +#include + +/* + * ISO C memset - set a memory block to a byte value. + * + * rdi destination + * rsi value (char) + * rdx count (bytes) + * + * rax original destination + */ + ALIGN +memset_c: + CFI_STARTPROC + movq %rdi,%r9 + movl %edx,%r8d + andl $7,%r8d + movl %edx,%ecx + shrl $3,%ecx + /* expand byte value */ + movzbl %sil,%esi + movabs $0x0101010101010101,%rax + mulq %rsi /* with rax, clobbers rdx */ + rep stosq + movl %r8d,%ecx + rep stosb + movq %r9,%rax + ret + CFI_ENDPROC +ENDPROC(memset_c) + +ENTRY(memset) +ENTRY(__memset) + CFI_STARTPROC + movq %rdi,%r10 + movq %rdx,%r11 + + /* expand byte value */ + movzbl %sil,%ecx + movabs $0x0101010101010101,%rax + mul %rcx /* with rax, clobbers rdx */ + + /* align dst */ + movl %edi,%r9d + andl $7,%r9d + jnz .Lbad_alignment + CFI_REMEMBER_STATE +.Lafter_bad_alignment: + + movl %r11d,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + movq %rax,(%rdi) + movq %rax,8(%rdi) + movq %rax,16(%rdi) + movq %rax,24(%rdi) + movq %rax,32(%rdi) + movq %rax,40(%rdi) + movq %rax,48(%rdi) + movq %rax,56(%rdi) + leaq 64(%rdi),%rdi + jnz .Lloop_64 + + /* Handle tail in loops. The loops should be faster than hard + to predict jump tables. */ + .p2align 4 +.Lhandle_tail: + movl %r11d,%ecx + andl $63&(~7),%ecx + jz .Lhandle_7 + shrl $3,%ecx + .p2align 4 +.Lloop_8: + decl %ecx + movq %rax,(%rdi) + leaq 8(%rdi),%rdi + jnz .Lloop_8 + +.Lhandle_7: + movl %r11d,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + decl %ecx + movb %al,(%rdi) + leaq 1(%rdi),%rdi + jnz .Lloop_1 + +.Lende: + movq %r10,%rax + ret + + CFI_RESTORE_STATE +.Lbad_alignment: + cmpq $7,%r11 + jbe .Lhandle_7 + movq %rax,(%rdi) /* unaligned store */ + movq $8,%r8 + subq %r9,%r8 + addq %r8,%rdi + subq %r8,%r11 + jmp .Lafter_bad_alignment +.Lfinal: + CFI_ENDPROC +ENDPROC(memset) +ENDPROC(__memset) + + /* Some CPUs run faster using the string instructions. + It is also a lot simpler. Use this when possible */ + +#include + + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (memset_c - memset) - (2f - 1b) /* offset */ +2: + .previous + .section .altinstructions,"a" + .align 8 + .quad memset + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal - memset + .byte 2b - 1b + .previous -- cgit v1.1