diff options
author | dg <dg@FreeBSD.org> | 1995-12-28 23:14:40 +0000 |
---|---|---|
committer | dg <dg@FreeBSD.org> | 1995-12-28 23:14:40 +0000 |
commit | 102fe26c6a99225063775f2d2056e2d663becf10 (patch) | |
tree | d8053e4719d0aa194ae2a356b292b81cf937402a | |
parent | 2991bbce58decd6385778e050db0047dfea117e6 (diff) | |
download | FreeBSD-src-102fe26c6a99225063775f2d2056e2d663becf10.zip FreeBSD-src-102fe26c6a99225063775f2d2056e2d663becf10.tar.gz |
Made bzero a function vector and added a 586/686-optimized version of
bzero.
Deprecated blkclr (removed it).
Removed some old cruft from cpufunc.h.
The optimized bzero was submitted by Torbjorn Granlund <tege@matematik.su.se>.
The kernel adaptation and other changes are by me.
-rw-r--r-- | sys/amd64/amd64/machdep.c | 9 | ||||
-rw-r--r-- | sys/amd64/amd64/support.S | 87 | ||||
-rw-r--r-- | sys/amd64/amd64/support.s | 87 | ||||
-rw-r--r-- | sys/i386/i386/machdep.c | 9 | ||||
-rw-r--r-- | sys/i386/i386/support.s | 87 |
5 files changed, 226 insertions, 53 deletions
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 885605a..b001966 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -35,7 +35,7 @@ * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.163 1995/12/24 08:10:41 davidg Exp $ + * $Id: machdep.c,v 1.164 1995/12/25 01:02:32 davidg Exp $ */ #include "npx.h" @@ -117,6 +117,10 @@ extern int ptrace_single_step __P((struct proc *p)); extern int ptrace_write_u __P((struct proc *p, vm_offset_t off, int data)); extern void dblfault_handler __P((void)); +extern void i486_bzero __P((void *, size_t)); +extern void i586_bzero __P((void *, size_t)); +extern void i686_bzero __P((void *, size_t)); + static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) @@ -565,6 +569,7 @@ identifycpu() #if defined(I486_CPU) case CPUCLASS_486: printf("486"); + bzero = i486_bzero; break; #endif #if defined(I586_CPU) @@ -573,6 +578,7 @@ identifycpu() ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100, ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100); printf("586"); + bzero = i586_bzero; break; #endif #if defined(I686_CPU) @@ -581,6 +587,7 @@ identifycpu() ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100, ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100); printf("686"); + bzero = i686_bzero; break; #endif default: diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index a158e54..10dfe2f 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -30,7 +30,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $Id: support.s,v 1.29 1995/12/26 13:58:11 bde Exp $ + * $Id: support.s,v 1.30 1995/12/27 18:54:51 davidg Exp $ */ #include "assym.s" /* system definitions */ @@ -41,6 +41,13 @@ #define KDSEL 0x10 /* kernel data selector */ #define IDXSHIFT 10 + + .data + .globl _bzero +_bzero: .long _generic_bzero + + .text + /* * Support for reading real time clock registers */ @@ -55,22 +62,10 @@ ENTRY(rtcin) /* rtcin(val) */ /* * bcopy family - */ - -/* * void bzero(void *base, u_int cnt) - * Special code for I486 because stosl uses lots - * of clocks. Makes little or no difference on DX2 type - * machines, but stosl is about 1/2 as fast as - * memory moves on a standard DX !!!!! */ -ALTENTRY(blkclr) -ENTRY(bzero) -#if defined(I486_CPU) - cmpl $CPUCLASS_486,_cpu_class - jz 1f -#endif +ENTRY(generic_bzero) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx @@ -87,8 +82,7 @@ ENTRY(bzero) ret #if defined(I486_CPU) - SUPERALIGN_TEXT -1: +ENTRY(i486_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx xorl %eax,%eax @@ -185,7 +179,66 @@ do1: SUPERALIGN_TEXT do0: ret -#endif /* I486_CPU */ +#endif + +#if defined(I586_CPU) || defined(I686_CPU) +ALTENTRY(i586_bzero) +ENTRY(i686_bzero) + pushl %edi + movl 8(%esp),%edi /* destination pointer */ + movl 12(%esp),%edx /* size (in 8-bit words) */ + + xorl %eax,%eax /* store data */ + cld + +/* If less than 100 bytes to write, skip tricky code. */ + cmpl $100,%edx + movl %edx,%ecx /* needed when branch is taken! */ + jl 2f + +/* First write 0-3 bytes to make the pointer 32-bit aligned. */ + movl %edi,%ecx /* Copy ptr to ecx... */ + negl %ecx /* ...and negate that and... */ + andl $3,%ecx /* ...mask to get byte count. 
*/ + subl %ecx,%edx /* adjust global byte count */ + rep + stosb + + subl $32,%edx /* offset count for unrolled loop */ + movl (%edi),%ecx /* Fetch destination cache line */ + + .align 2,0x90 /* supply 0x90 for broken assemblers */ +1: + movl 28(%edi),%ecx /* allocate cache line for destination */ + subl $32,%edx /* decr loop count */ + movl %eax,0(%edi) /* store words pairwise */ + movl %eax,4(%edi) + movl %eax,8(%edi) + movl %eax,12(%edi) + movl %eax,16(%edi) + movl %eax,20(%edi) + movl %eax,24(%edi) + movl %eax,28(%edi) + + leal 32(%edi),%edi /* update destination pointer */ + jge 1b + leal 32(%edx),%ecx + +/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */ +2: + shrl $2,%ecx + rep + stosl + +/* Finally write the last 0-3 bytes. */ + movl %edx,%ecx + andl $3,%ecx + rep + stosb + + popl %edi + ret +#endif /* fillw(pat, base, cnt) */ ENTRY(fillw) diff --git a/sys/amd64/amd64/support.s b/sys/amd64/amd64/support.s index a158e54..10dfe2f 100644 --- a/sys/amd64/amd64/support.s +++ b/sys/amd64/amd64/support.s @@ -30,7 +30,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: support.s,v 1.29 1995/12/26 13:58:11 bde Exp $ + * $Id: support.s,v 1.30 1995/12/27 18:54:51 davidg Exp $ */ #include "assym.s" /* system definitions */ @@ -41,6 +41,13 @@ #define KDSEL 0x10 /* kernel data selector */ #define IDXSHIFT 10 + + .data + .globl _bzero +_bzero: .long _generic_bzero + + .text + /* * Support for reading real time clock registers */ @@ -55,22 +62,10 @@ ENTRY(rtcin) /* rtcin(val) */ /* * bcopy family - */ - -/* * void bzero(void *base, u_int cnt) - * Special code for I486 because stosl uses lots - * of clocks. Makes little or no difference on DX2 type - * machines, but stosl is about 1/2 as fast as - * memory moves on a standard DX !!!!! 
*/ -ALTENTRY(blkclr) -ENTRY(bzero) -#if defined(I486_CPU) - cmpl $CPUCLASS_486,_cpu_class - jz 1f -#endif +ENTRY(generic_bzero) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx @@ -87,8 +82,7 @@ ENTRY(bzero) ret #if defined(I486_CPU) - SUPERALIGN_TEXT -1: +ENTRY(i486_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx xorl %eax,%eax @@ -185,7 +179,66 @@ do1: SUPERALIGN_TEXT do0: ret -#endif /* I486_CPU */ +#endif + +#if defined(I586_CPU) || defined(I686_CPU) +ALTENTRY(i586_bzero) +ENTRY(i686_bzero) + pushl %edi + movl 8(%esp),%edi /* destination pointer */ + movl 12(%esp),%edx /* size (in 8-bit words) */ + + xorl %eax,%eax /* store data */ + cld + +/* If less than 100 bytes to write, skip tricky code. */ + cmpl $100,%edx + movl %edx,%ecx /* needed when branch is taken! */ + jl 2f + +/* First write 0-3 bytes to make the pointer 32-bit aligned. */ + movl %edi,%ecx /* Copy ptr to ecx... */ + negl %ecx /* ...and negate that and... */ + andl $3,%ecx /* ...mask to get byte count. */ + subl %ecx,%edx /* adjust global byte count */ + rep + stosb + + subl $32,%edx /* offset count for unrolled loop */ + movl (%edi),%ecx /* Fetch destination cache line */ + + .align 2,0x90 /* supply 0x90 for broken assemblers */ +1: + movl 28(%edi),%ecx /* allocate cache line for destination */ + subl $32,%edx /* decr loop count */ + movl %eax,0(%edi) /* store words pairwise */ + movl %eax,4(%edi) + movl %eax,8(%edi) + movl %eax,12(%edi) + movl %eax,16(%edi) + movl %eax,20(%edi) + movl %eax,24(%edi) + movl %eax,28(%edi) + + leal 32(%edi),%edi /* update destination pointer */ + jge 1b + leal 32(%edx),%ecx + +/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */ +2: + shrl $2,%ecx + rep + stosl + +/* Finally write the last 0-3 bytes. 
*/ + movl %edx,%ecx + andl $3,%ecx + rep + stosb + + popl %edi + ret +#endif /* fillw(pat, base, cnt) */ ENTRY(fillw) diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index 885605a..b001966 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -35,7 +35,7 @@ * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.163 1995/12/24 08:10:41 davidg Exp $ + * $Id: machdep.c,v 1.164 1995/12/25 01:02:32 davidg Exp $ */ #include "npx.h" @@ -117,6 +117,10 @@ extern int ptrace_single_step __P((struct proc *p)); extern int ptrace_write_u __P((struct proc *p, vm_offset_t off, int data)); extern void dblfault_handler __P((void)); +extern void i486_bzero __P((void *, size_t)); +extern void i586_bzero __P((void *, size_t)); +extern void i686_bzero __P((void *, size_t)); + static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) @@ -565,6 +569,7 @@ identifycpu() #if defined(I486_CPU) case CPUCLASS_486: printf("486"); + bzero = i486_bzero; break; #endif #if defined(I586_CPU) @@ -573,6 +578,7 @@ identifycpu() ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100, ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100); printf("586"); + bzero = i586_bzero; break; #endif #if defined(I686_CPU) @@ -581,6 +587,7 @@ identifycpu() ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100, ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100); printf("686"); + bzero = i686_bzero; break; #endif default: diff --git a/sys/i386/i386/support.s b/sys/i386/i386/support.s index a158e54..10dfe2f 100644 --- a/sys/i386/i386/support.s +++ b/sys/i386/i386/support.s @@ -30,7 +30,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $Id: support.s,v 1.29 1995/12/26 13:58:11 bde Exp $ + * $Id: support.s,v 1.30 1995/12/27 18:54:51 davidg Exp $ */ #include "assym.s" /* system definitions */ @@ -41,6 +41,13 @@ #define KDSEL 0x10 /* kernel data selector */ #define IDXSHIFT 10 + + .data + .globl _bzero +_bzero: .long _generic_bzero + + .text + /* * Support for reading real time clock registers */ @@ -55,22 +62,10 @@ ENTRY(rtcin) /* rtcin(val) */ /* * bcopy family - */ - -/* * void bzero(void *base, u_int cnt) - * Special code for I486 because stosl uses lots - * of clocks. Makes little or no difference on DX2 type - * machines, but stosl is about 1/2 as fast as - * memory moves on a standard DX !!!!! */ -ALTENTRY(blkclr) -ENTRY(bzero) -#if defined(I486_CPU) - cmpl $CPUCLASS_486,_cpu_class - jz 1f -#endif +ENTRY(generic_bzero) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx @@ -87,8 +82,7 @@ ENTRY(bzero) ret #if defined(I486_CPU) - SUPERALIGN_TEXT -1: +ENTRY(i486_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx xorl %eax,%eax @@ -185,7 +179,66 @@ do1: SUPERALIGN_TEXT do0: ret -#endif /* I486_CPU */ +#endif + +#if defined(I586_CPU) || defined(I686_CPU) +ALTENTRY(i586_bzero) +ENTRY(i686_bzero) + pushl %edi + movl 8(%esp),%edi /* destination pointer */ + movl 12(%esp),%edx /* size (in 8-bit words) */ + + xorl %eax,%eax /* store data */ + cld + +/* If less than 100 bytes to write, skip tricky code. */ + cmpl $100,%edx + movl %edx,%ecx /* needed when branch is taken! */ + jl 2f + +/* First write 0-3 bytes to make the pointer 32-bit aligned. */ + movl %edi,%ecx /* Copy ptr to ecx... */ + negl %ecx /* ...and negate that and... */ + andl $3,%ecx /* ...mask to get byte count. 
*/ + subl %ecx,%edx /* adjust global byte count */ + rep + stosb + + subl $32,%edx /* offset count for unrolled loop */ + movl (%edi),%ecx /* Fetch destination cache line */ + + .align 2,0x90 /* supply 0x90 for broken assemblers */ +1: + movl 28(%edi),%ecx /* allocate cache line for destination */ + subl $32,%edx /* decr loop count */ + movl %eax,0(%edi) /* store words pairwise */ + movl %eax,4(%edi) + movl %eax,8(%edi) + movl %eax,12(%edi) + movl %eax,16(%edi) + movl %eax,20(%edi) + movl %eax,24(%edi) + movl %eax,28(%edi) + + leal 32(%edi),%edi /* update destination pointer */ + jge 1b + leal 32(%edx),%ecx + +/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */ +2: + shrl $2,%ecx + rep + stosl + +/* Finally write the last 0-3 bytes. */ + movl %edx,%ecx + andl $3,%ecx + rep + stosb + + popl %edi + ret +#endif /* fillw(pat, base, cnt) */ ENTRY(fillw) |