summaryrefslogtreecommitdiffstats
path: root/sys
diff options
context:
space:
mode:
authordg <dg@FreeBSD.org>1995-12-28 23:14:40 +0000
committerdg <dg@FreeBSD.org>1995-12-28 23:14:40 +0000
commit102fe26c6a99225063775f2d2056e2d663becf10 (patch)
treed8053e4719d0aa194ae2a356b292b81cf937402a /sys
parent2991bbce58decd6385778e050db0047dfea117e6 (diff)
downloadFreeBSD-src-102fe26c6a99225063775f2d2056e2d663becf10.zip
FreeBSD-src-102fe26c6a99225063775f2d2056e2d663becf10.tar.gz
Made bzero a function vector and added a 586/686 optimized version of
bzero. Deprecated blkclr (removed it). Removed some old cruft from cpufunc.h. The optimized bzero was submitted by Torbjorn Granlund <tege@matematik.su.se>. The kernel adaptation and other changes are by me.
Diffstat (limited to 'sys')
-rw-r--r--sys/amd64/amd64/machdep.c9
-rw-r--r--sys/amd64/amd64/support.S87
-rw-r--r--sys/amd64/amd64/support.s87
-rw-r--r--sys/i386/i386/machdep.c9
-rw-r--r--sys/i386/i386/support.s87
5 files changed, 226 insertions, 53 deletions
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 885605a..b001966 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -35,7 +35,7 @@
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
- * $Id: machdep.c,v 1.163 1995/12/24 08:10:41 davidg Exp $
+ * $Id: machdep.c,v 1.164 1995/12/25 01:02:32 davidg Exp $
*/
#include "npx.h"
@@ -117,6 +117,10 @@ extern int ptrace_single_step __P((struct proc *p));
extern int ptrace_write_u __P((struct proc *p, vm_offset_t off, int data));
extern void dblfault_handler __P((void));
+extern void i486_bzero __P((void *, size_t));
+extern void i586_bzero __P((void *, size_t));
+extern void i686_bzero __P((void *, size_t));
+
static void cpu_startup __P((void *));
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL)
@@ -565,6 +569,7 @@ identifycpu()
#if defined(I486_CPU)
case CPUCLASS_486:
printf("486");
+ bzero = i486_bzero;
break;
#endif
#if defined(I586_CPU)
@@ -573,6 +578,7 @@ identifycpu()
((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100,
((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100);
printf("586");
+ bzero = i586_bzero;
break;
#endif
#if defined(I686_CPU)
@@ -581,6 +587,7 @@ identifycpu()
((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100,
((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100);
printf("686");
+ bzero = i686_bzero;
break;
#endif
default:
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index a158e54..10dfe2f 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -30,7 +30,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $Id: support.s,v 1.29 1995/12/26 13:58:11 bde Exp $
+ * $Id: support.s,v 1.30 1995/12/27 18:54:51 davidg Exp $
*/
#include "assym.s" /* system definitions */
@@ -41,6 +41,13 @@
#define KDSEL 0x10 /* kernel data selector */
#define IDXSHIFT 10
+
+ .data
+ .globl _bzero
+_bzero: .long _generic_bzero
+
+ .text
+
/*
* Support for reading real time clock registers
*/
@@ -55,22 +62,10 @@ ENTRY(rtcin) /* rtcin(val) */
/*
* bcopy family
- */
-
-/*
* void bzero(void *base, u_int cnt)
- * Special code for I486 because stosl uses lots
- * of clocks. Makes little or no difference on DX2 type
- * machines, but stosl is about 1/2 as fast as
- * memory moves on a standard DX !!!!!
*/
-ALTENTRY(blkclr)
-ENTRY(bzero)
-#if defined(I486_CPU)
- cmpl $CPUCLASS_486,_cpu_class
- jz 1f
-#endif
+ENTRY(generic_bzero)
pushl %edi
movl 8(%esp),%edi
movl 12(%esp),%ecx
@@ -87,8 +82,7 @@ ENTRY(bzero)
ret
#if defined(I486_CPU)
- SUPERALIGN_TEXT
-1:
+ENTRY(i486_bzero)
movl 4(%esp),%edx
movl 8(%esp),%ecx
xorl %eax,%eax
@@ -185,7 +179,66 @@ do1:
SUPERALIGN_TEXT
do0:
ret
-#endif /* I486_CPU */
+#endif
+
+#if defined(I586_CPU) || defined(I686_CPU)
+ALTENTRY(i586_bzero)
+ENTRY(i686_bzero)
+ pushl %edi
+ movl 8(%esp),%edi /* destination pointer */
+ movl 12(%esp),%edx /* size (in 8-bit words) */
+
+ xorl %eax,%eax /* store data */
+ cld
+
+/* If less than 100 bytes to write, skip tricky code. */
+ cmpl $100,%edx
+ movl %edx,%ecx /* needed when branch is taken! */
+ jl 2f
+
+/* First write 0-3 bytes to make the pointer 32-bit aligned. */
+ movl %edi,%ecx /* Copy ptr to ecx... */
+ negl %ecx /* ...and negate that and... */
+ andl $3,%ecx /* ...mask to get byte count. */
+ subl %ecx,%edx /* adjust global byte count */
+ rep
+ stosb
+
+ subl $32,%edx /* offset count for unrolled loop */
+ movl (%edi),%ecx /* Fetch destination cache line */
+
+ .align 2,0x90 /* supply 0x90 for broken assemblers */
+1:
+ movl 28(%edi),%ecx /* allocate cache line for destination */
+ subl $32,%edx /* decr loop count */
+ movl %eax,0(%edi) /* store words pairwise */
+ movl %eax,4(%edi)
+ movl %eax,8(%edi)
+ movl %eax,12(%edi)
+ movl %eax,16(%edi)
+ movl %eax,20(%edi)
+ movl %eax,24(%edi)
+ movl %eax,28(%edi)
+
+ leal 32(%edi),%edi /* update destination pointer */
+ jge 1b
+ leal 32(%edx),%ecx
+
+/* Write last 0-7 full 32-bit words (up to 24 words if loop was skipped, i.e. size < 100). */
+2:
+ shrl $2,%ecx
+ rep
+ stosl
+
+/* Finally write the last 0-3 bytes. */
+ movl %edx,%ecx
+ andl $3,%ecx
+ rep
+ stosb
+
+ popl %edi
+ ret
+#endif
/* fillw(pat, base, cnt) */
ENTRY(fillw)
diff --git a/sys/amd64/amd64/support.s b/sys/amd64/amd64/support.s
index a158e54..10dfe2f 100644
--- a/sys/amd64/amd64/support.s
+++ b/sys/amd64/amd64/support.s
@@ -30,7 +30,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $Id: support.s,v 1.29 1995/12/26 13:58:11 bde Exp $
+ * $Id: support.s,v 1.30 1995/12/27 18:54:51 davidg Exp $
*/
#include "assym.s" /* system definitions */
@@ -41,6 +41,13 @@
#define KDSEL 0x10 /* kernel data selector */
#define IDXSHIFT 10
+
+ .data
+ .globl _bzero
+_bzero: .long _generic_bzero
+
+ .text
+
/*
* Support for reading real time clock registers
*/
@@ -55,22 +62,10 @@ ENTRY(rtcin) /* rtcin(val) */
/*
* bcopy family
- */
-
-/*
* void bzero(void *base, u_int cnt)
- * Special code for I486 because stosl uses lots
- * of clocks. Makes little or no difference on DX2 type
- * machines, but stosl is about 1/2 as fast as
- * memory moves on a standard DX !!!!!
*/
-ALTENTRY(blkclr)
-ENTRY(bzero)
-#if defined(I486_CPU)
- cmpl $CPUCLASS_486,_cpu_class
- jz 1f
-#endif
+ENTRY(generic_bzero)
pushl %edi
movl 8(%esp),%edi
movl 12(%esp),%ecx
@@ -87,8 +82,7 @@ ENTRY(bzero)
ret
#if defined(I486_CPU)
- SUPERALIGN_TEXT
-1:
+ENTRY(i486_bzero)
movl 4(%esp),%edx
movl 8(%esp),%ecx
xorl %eax,%eax
@@ -185,7 +179,66 @@ do1:
SUPERALIGN_TEXT
do0:
ret
-#endif /* I486_CPU */
+#endif
+
+#if defined(I586_CPU) || defined(I686_CPU)
+ALTENTRY(i586_bzero)
+ENTRY(i686_bzero)
+ pushl %edi
+ movl 8(%esp),%edi /* destination pointer */
+ movl 12(%esp),%edx /* size (in 8-bit words) */
+
+ xorl %eax,%eax /* store data */
+ cld
+
+/* If less than 100 bytes to write, skip tricky code. */
+ cmpl $100,%edx
+ movl %edx,%ecx /* needed when branch is taken! */
+ jl 2f
+
+/* First write 0-3 bytes to make the pointer 32-bit aligned. */
+ movl %edi,%ecx /* Copy ptr to ecx... */
+ negl %ecx /* ...and negate that and... */
+ andl $3,%ecx /* ...mask to get byte count. */
+ subl %ecx,%edx /* adjust global byte count */
+ rep
+ stosb
+
+ subl $32,%edx /* offset count for unrolled loop */
+ movl (%edi),%ecx /* Fetch destination cache line */
+
+ .align 2,0x90 /* supply 0x90 for broken assemblers */
+1:
+ movl 28(%edi),%ecx /* allocate cache line for destination */
+ subl $32,%edx /* decr loop count */
+ movl %eax,0(%edi) /* store words pairwise */
+ movl %eax,4(%edi)
+ movl %eax,8(%edi)
+ movl %eax,12(%edi)
+ movl %eax,16(%edi)
+ movl %eax,20(%edi)
+ movl %eax,24(%edi)
+ movl %eax,28(%edi)
+
+ leal 32(%edi),%edi /* update destination pointer */
+ jge 1b
+ leal 32(%edx),%ecx
+
+/* Write last 0-7 full 32-bit words (up to 24 words if loop was skipped, i.e. size < 100). */
+2:
+ shrl $2,%ecx
+ rep
+ stosl
+
+/* Finally write the last 0-3 bytes. */
+ movl %edx,%ecx
+ andl $3,%ecx
+ rep
+ stosb
+
+ popl %edi
+ ret
+#endif
/* fillw(pat, base, cnt) */
ENTRY(fillw)
diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c
index 885605a..b001966 100644
--- a/sys/i386/i386/machdep.c
+++ b/sys/i386/i386/machdep.c
@@ -35,7 +35,7 @@
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
- * $Id: machdep.c,v 1.163 1995/12/24 08:10:41 davidg Exp $
+ * $Id: machdep.c,v 1.164 1995/12/25 01:02:32 davidg Exp $
*/
#include "npx.h"
@@ -117,6 +117,10 @@ extern int ptrace_single_step __P((struct proc *p));
extern int ptrace_write_u __P((struct proc *p, vm_offset_t off, int data));
extern void dblfault_handler __P((void));
+extern void i486_bzero __P((void *, size_t));
+extern void i586_bzero __P((void *, size_t));
+extern void i686_bzero __P((void *, size_t));
+
static void cpu_startup __P((void *));
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL)
@@ -565,6 +569,7 @@ identifycpu()
#if defined(I486_CPU)
case CPUCLASS_486:
printf("486");
+ bzero = i486_bzero;
break;
#endif
#if defined(I586_CPU)
@@ -573,6 +578,7 @@ identifycpu()
((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100,
((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100);
printf("586");
+ bzero = i586_bzero;
break;
#endif
#if defined(I686_CPU)
@@ -581,6 +587,7 @@ identifycpu()
((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100,
((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100);
printf("686");
+ bzero = i686_bzero;
break;
#endif
default:
diff --git a/sys/i386/i386/support.s b/sys/i386/i386/support.s
index a158e54..10dfe2f 100644
--- a/sys/i386/i386/support.s
+++ b/sys/i386/i386/support.s
@@ -30,7 +30,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $Id: support.s,v 1.29 1995/12/26 13:58:11 bde Exp $
+ * $Id: support.s,v 1.30 1995/12/27 18:54:51 davidg Exp $
*/
#include "assym.s" /* system definitions */
@@ -41,6 +41,13 @@
#define KDSEL 0x10 /* kernel data selector */
#define IDXSHIFT 10
+
+ .data
+ .globl _bzero
+_bzero: .long _generic_bzero
+
+ .text
+
/*
* Support for reading real time clock registers
*/
@@ -55,22 +62,10 @@ ENTRY(rtcin) /* rtcin(val) */
/*
* bcopy family
- */
-
-/*
* void bzero(void *base, u_int cnt)
- * Special code for I486 because stosl uses lots
- * of clocks. Makes little or no difference on DX2 type
- * machines, but stosl is about 1/2 as fast as
- * memory moves on a standard DX !!!!!
*/
-ALTENTRY(blkclr)
-ENTRY(bzero)
-#if defined(I486_CPU)
- cmpl $CPUCLASS_486,_cpu_class
- jz 1f
-#endif
+ENTRY(generic_bzero)
pushl %edi
movl 8(%esp),%edi
movl 12(%esp),%ecx
@@ -87,8 +82,7 @@ ENTRY(bzero)
ret
#if defined(I486_CPU)
- SUPERALIGN_TEXT
-1:
+ENTRY(i486_bzero)
movl 4(%esp),%edx
movl 8(%esp),%ecx
xorl %eax,%eax
@@ -185,7 +179,66 @@ do1:
SUPERALIGN_TEXT
do0:
ret
-#endif /* I486_CPU */
+#endif
+
+#if defined(I586_CPU) || defined(I686_CPU)
+ALTENTRY(i586_bzero)
+ENTRY(i686_bzero)
+ pushl %edi
+ movl 8(%esp),%edi /* destination pointer */
+ movl 12(%esp),%edx /* size (in 8-bit words) */
+
+ xorl %eax,%eax /* store data */
+ cld
+
+/* If less than 100 bytes to write, skip tricky code. */
+ cmpl $100,%edx
+ movl %edx,%ecx /* needed when branch is taken! */
+ jl 2f
+
+/* First write 0-3 bytes to make the pointer 32-bit aligned. */
+ movl %edi,%ecx /* Copy ptr to ecx... */
+ negl %ecx /* ...and negate that and... */
+ andl $3,%ecx /* ...mask to get byte count. */
+ subl %ecx,%edx /* adjust global byte count */
+ rep
+ stosb
+
+ subl $32,%edx /* offset count for unrolled loop */
+ movl (%edi),%ecx /* Fetch destination cache line */
+
+ .align 2,0x90 /* supply 0x90 for broken assemblers */
+1:
+ movl 28(%edi),%ecx /* allocate cache line for destination */
+ subl $32,%edx /* decr loop count */
+ movl %eax,0(%edi) /* store words pairwise */
+ movl %eax,4(%edi)
+ movl %eax,8(%edi)
+ movl %eax,12(%edi)
+ movl %eax,16(%edi)
+ movl %eax,20(%edi)
+ movl %eax,24(%edi)
+ movl %eax,28(%edi)
+
+ leal 32(%edi),%edi /* update destination pointer */
+ jge 1b
+ leal 32(%edx),%ecx
+
+/* Write last 0-7 full 32-bit words (up to 24 words if loop was skipped, i.e. size < 100). */
+2:
+ shrl $2,%ecx
+ rep
+ stosl
+
+/* Finally write the last 0-3 bytes. */
+ movl %edx,%ecx
+ andl $3,%ecx
+ rep
+ stosb
+
+ popl %edi
+ ret
+#endif
/* fillw(pat, base, cnt) */
ENTRY(fillw)
OpenPOWER on IntegriCloud