Diffstat:
-rw-r--r--  sys/amd64/amd64/support.S  343
-rw-r--r--  sys/amd64/amd64/support.s  343
-rw-r--r--  sys/i386/i386/support.s    343
3 files changed, 861 insertions, 168 deletions
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index e583aee..8a4d66e 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -30,10 +30,10 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $Id: support.s,v 1.38 1996/09/10 08:31:57 bde Exp $
+ * $Id: support.s,v 1.39 1996/09/20 16:52:09 bde Exp $
*/
-#include <sys/errno.h>
+#include "opt_temporary.h" /* for I586_*_B* */
#include <machine/asmacros.h>
#include <machine/cputypes.h>
@@ -44,10 +44,19 @@
#define KDSEL 0x10 /* kernel data selector */
#define IDXSHIFT 10
-
.data
+ .globl _bcopy_vector
+_bcopy_vector:
+ .long _generic_bcopy
.globl _bzero
-_bzero: .long _generic_bzero
+_bzero:
+ .long _generic_bzero
+ .globl _ovbcopy_vector
+_ovbcopy_vector:
+ .long _generic_bcopy
+kernel_fpu_lock:
+ .byte 0xfe
+ .space 3
.text
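
The new kernel_fpu_lock byte is initialized to 0xfe so that a single `sarb $1` later in the file can act as a try-lock: the bit shifted out lands in the carry flag, and the arithmetic shift leaves the byte at 0xff (busy) either way. A minimal C model of that bit trick, purely illustrative and not part of the commit (the helper names are invented; the real code relies on sarb being one uninterruptible read-modify-write instruction, which plain C cannot express):

    static unsigned char kernel_fpu_lock = 0xfe;   /* 0xfe = free, 0xff = held */

    static int kernel_fpu_trylock(void)
    {
            /*
             * sarb $1,kernel_fpu_lock:
             *   free (0xfe): low bit 0 -> carry clear, byte becomes 0xff
             *   held (0xff): low bit 1 -> carry set,  byte stays   0xff
             */
            int carry = kernel_fpu_lock & 1;        /* bit shifted out by sarb */
            kernel_fpu_lock = 0xff;                 /* arithmetic shift of 0xfe or 0xff */
            return carry == 0;                      /* jc ... bails out when held */
    }

    static void kernel_fpu_unlock(void)
    {
            kernel_fpu_lock = 0xfe;                 /* movb $0xfe,kernel_fpu_lock */
    }
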
@@ -174,66 +183,147 @@ do0:
ret
#endif
-#if 0 /* Actually lowers performance in real-world cases */
#if defined(I586_CPU) || defined(I686_CPU)
-ALTENTRY(i586_bzero)
-ENTRY(i686_bzero)
- pushl %edi
- movl 8(%esp),%edi /* destination pointer */
- movl 12(%esp),%edx /* size (in 8-bit words) */
+ENTRY(i586_bzero)
+ movl 4(%esp),%edx
+ movl 8(%esp),%ecx
- xorl %eax,%eax /* store data */
- cld
+ /*
+ * The FPU register method is twice as fast as the integer register
+ * method unless the target is in the L1 cache and we pre-allocate a
+ * cache line for it (then the integer register method is 4-5 times
+ * faster). However, we never pre-allocate cache lines, since that
+ * would make the integer method 25% or more slower for the common
+ * case when the target isn't in either the L1 cache or the L2 cache.
+ * Thus we normally use the FPU register method unless the overhead
+ * would be too large.
+ */
+ cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */
+ jb intreg_i586_bzero
-/* If less than 100 bytes to write, skip tricky code. */
- cmpl $100,%edx
- movl %edx,%ecx /* needed when branch is taken! */
- jl 2f
+ /*
+ * The FPU registers may belong to an application or to fastmove()
+ * or to another invocation of bcopy() or ourself in a higher level
+ * interrupt or trap handler. Preserving the registers is
+ * complicated since we avoid it if possible at all levels. We
+ * want to localize the complications even when that increases them.
+ * Here the extra work involves preserving CR0_TS in TS.
+ * `npxproc != NULL' is supposed to be the condition that all the
+ * FPU resources belong to an application, but npxproc and CR0_TS
+ * aren't set atomically enough for this condition to work in
+ * interrupt handlers.
+ *
+ * Case 1: FPU registers belong to the application: we must preserve
+ * the registers if we use them, so we only use the FPU register
+ * method if the target size is large enough to amortize the extra
+ * overhead for preserving them. CR0_TS must be preserved although
+ * it is very likely to end up as set.
+ *
+ * Case 2: FPU registers belong to fastmove(): fastmove() currently
+ * makes the registers look like they belong to an application so
+ * that cpu_switch() and savectx() don't have to know about it, so
+ * this case reduces to case 1.
+ *
+ * Case 3: FPU registers belong to the kernel: don't use the FPU
+ * register method. This case is unlikely, and supporting it would
+ * be more complicated and might take too much stack.
+ *
+ * Case 4: FPU registers don't belong to anyone: the FPU registers
+ * don't need to be preserved, so we always use the FPU register
+ * method. CR0_TS must be preserved although it is very likely to
+ * always end up as clear.
+ */
+ cmpl $0,_npxproc
+ je i586_bz1
+ cmpl $256+184,%ecx /* empirical; not quite 2*108 more */
+ jb intreg_i586_bzero
+ sarb $1,kernel_fpu_lock
+ jc intreg_i586_bzero
+ smsw %ax
+ clts
+ subl $108,%esp
+ fnsave 0(%esp)
+ jmp i586_bz2
-/* First write 0-3 bytes to make the pointer 32-bit aligned. */
- movl %edi,%ecx /* Copy ptr to ecx... */
- negl %ecx /* ...and negate that and... */
- andl $3,%ecx /* ...mask to get byte count. */
- subl %ecx,%edx /* adjust global byte count */
- rep
- stosb
+i586_bz1:
+ sarb $1,kernel_fpu_lock
+ jc intreg_i586_bzero
+ smsw %ax
+ clts
+ fninit /* XXX should avoid needing this */
+i586_bz2:
+ fldz
- subl $32,%edx /* offset count for unrolled loop */
- movl (%edi),%ecx /* Fetch destination cache line */
+ /*
+ * Align to an 8 byte boundary (misalignment in the main loop would
+ * cost a factor of >= 2). Avoid jumps (at little cost if it is
+ * already aligned) by always zeroing 8 bytes and using the part up
+ * to the _next_ alignment position.
+ */
+ fstl 0(%edx)
+ addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */
+ addl $8,%edx
+ andl $~7,%edx
+ subl %edx,%ecx
- .align 2,0x90 /* supply 0x90 for broken assemblers */
-1:
- movl 28(%edi),%ecx /* allocate cache line for destination */
- subl $32,%edx /* decr loop count */
- movl %eax,0(%edi) /* store words pairwise */
- movl %eax,4(%edi)
- movl %eax,8(%edi)
- movl %eax,12(%edi)
- movl %eax,16(%edi)
- movl %eax,20(%edi)
- movl %eax,24(%edi)
- movl %eax,28(%edi)
-
- leal 32(%edi),%edi /* update destination pointer */
- jge 1b
- leal 32(%edx),%ecx
-
-/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */
-2:
+ /*
+ * Similarly align `len' to a multiple of 8.
+ */
+ fstl -8(%edx,%ecx)
+ decl %ecx
+ andl $~7,%ecx
+
+ /*
+ * This wouldn't be any faster if it were unrolled, since the loop
+ * control instructions are much faster than the fstl and/or done
+ * in parallel with it so their overhead is insignificant.
+ */
+fpureg_i586_bzero_loop:
+ fstl 0(%edx)
+ addl $8,%edx
+ subl $8,%ecx
+ cmpl $8,%ecx
+ jae fpureg_i586_bzero_loop
+
+ cmpl $0,_npxproc
+ je i586_bz3
+ frstor 0(%esp)
+ addl $108,%esp
+ lmsw %ax
+ movb $0xfe,kernel_fpu_lock
+ ret
+
+i586_bz3:
+ fstpl %st(0)
+ lmsw %ax
+ movb $0xfe,kernel_fpu_lock
+ ret
+
+intreg_i586_bzero:
+ /*
+ * `rep stos' seems to be the best method in practice for small
+ * counts. Fancy methods usually take too long to start up due
+ * to cache and BTB misses.
+ */
+ pushl %edi
+ movl %edx,%edi
+ xorl %eax,%eax
shrl $2,%ecx
+ cld
rep
stosl
-
-/* Finally write the last 0-3 bytes. */
- movl %edx,%ecx
+ movl 12(%esp),%ecx
andl $3,%ecx
+ jne 1f
+ popl %edi
+ ret
+
+1:
rep
stosb
-
popl %edi
ret
-#endif
-#endif
+#endif /* I586_CPU || I686_CPU */
/* fillw(pat, base, cnt) */
ENTRY(fillw)
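
The alignment arithmetic in i586_bzero above is branch-free: it always stores 8 bytes at the raw start and at the raw end of the buffer, then rounds the pointer up and the count down to 8-byte multiples so the fstl loop only ever does aligned stores. A C sketch of the same arithmetic, assuming a 32-bit kernel and a length of at least 256 bytes (the cmpl $256 gate above); the function name is made up and memcpy of 8 zero bytes stands in for storing the 0.0 loaded by fldz:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void i586_bzero_sketch(void *buf, size_t len)    /* len >= 256 */
    {
            static const unsigned char zero8[8];            /* fldz kept in %st(0) */
            uintptr_t d = (uintptr_t)buf;

            /* fstl 0(%edx): zero 8 bytes at the possibly misaligned start,
             * then advance to the next 8-byte boundary and shrink len by
             * however much was skipped. */
            memcpy((void *)d, zero8, 8);
            uintptr_t aligned = (d + 8) & ~(uintptr_t)7;
            len -= aligned - d;
            d = aligned;

            /* fstl -8(%edx,%ecx): zero the last 8 bytes, then round len
             * down to a multiple of 8 so the loop cannot overrun. */
            memcpy((void *)(d + len - 8), zero8, 8);
            len = (len - 1) & ~(size_t)7;

            /* fpureg_i586_bzero_loop: one aligned 8-byte store per pass. */
            while (len >= 8) {
                    memcpy((void *)d, zero8, 8);
                    d += 8;
                    len -= 8;
            }
    }
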
@@ -256,7 +346,7 @@ bcopyb:
movl 20(%esp),%ecx
movl %edi,%eax
subl %esi,%eax
- cmpl %ecx,%eax /* overlapping? */
+ cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
cld /* nope, copy forwards */
rep
@@ -279,13 +369,19 @@ bcopyb:
cld
ret
+ENTRY(bcopy)
+ MEXITCOUNT
+ jmp *_bcopy_vector
+
+ENTRY(ovbcopy)
+ MEXITCOUNT
+ jmp *_ovbcopy_vector
+
/*
- * (ov)bcopy(src, dst, cnt)
+ * generic_bcopy(src, dst, cnt)
* ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
*/
-ALTENTRY(ovbcopy)
-ENTRY(bcopy)
-bcopy:
+ENTRY(generic_bcopy)
pushl %esi
pushl %edi
movl 12(%esp),%esi
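
bcopy() and ovbcopy() above become two-instruction stubs that tail-jump through function pointers kept in .data, so CPU-identification code elsewhere in the kernel (not part of this diff) can retarget them to i586_bcopy once it knows the FPU-based copy pays off. Roughly the same scheme in C, with stub bodies and a hypothetical setup hook standing in for the parts this diff does not show:

    #include <stddef.h>
    #include <string.h>

    /* Stand-ins for the assembly routines; the real ones are above. */
    static void generic_bcopy(const void *src, void *dst, size_t len)
    {
            memmove(dst, src, len);
    }
    static void i586_bcopy(const void *src, void *dst, size_t len)
    {
            memmove(dst, src, len);          /* the real one copies via the FPU */
    }

    /* ".long _generic_bcopy" in .data: pointers with a safe default. */
    static void (*bcopy_vector)(const void *, void *, size_t) = generic_bcopy;
    static void (*ovbcopy_vector)(const void *, void *, size_t) = generic_bcopy;

    /* "ENTRY(bcopy): jmp *_bcopy_vector" -- a jump through the pointer. */
    void bcopy(const void *src, void *dst, size_t len)
    {
            (*bcopy_vector)(src, dst, len);
    }
    void ovbcopy(const void *src, void *dst, size_t len)
    {
            (*ovbcopy_vector)(src, dst, len);
    }

    /* Hypothetical hook: the real retargeting happens in CPU setup code
     * outside this diff. */
    void bcopy_use_i586(void)
    {
            bcopy_vector = i586_bcopy;
    }
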
@@ -294,7 +390,7 @@ bcopy:
movl %edi,%eax
subl %esi,%eax
- cmpl %ecx,%eax /* overlapping? */
+ cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
shrl $2,%ecx /* copy by 32-bit words */
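
The reworded comment makes the trick explicit: one unsigned compare of (dst - src) against the length detects the only layout that needs a backwards copy. The same test in C (helper name invented; bcopyb, generic_bcopy and i586_bcopy all share this prologue):

    #include <stddef.h>
    #include <stdint.h>

    /* "movl %edi,%eax; subl %esi,%eax; cmpl %ecx,%eax; jb 1f":
     * copy backwards only when the destination starts inside the source
     * region, i.e. src < dst < src + len (dst == src is harmless either
     * way).  If dst is below src the subtraction wraps to a huge unsigned
     * value, the test fails, and the cheap forward copy is used. */
    static int must_copy_backwards(const void *src, const void *dst, size_t len)
    {
            return (uintptr_t)dst - (uintptr_t)src < len;
    }
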
@@ -330,6 +426,141 @@ bcopy:
cld
ret
+ENTRY(i586_bcopy)
+ pushl %esi
+ pushl %edi
+ movl 12(%esp),%esi
+ movl 16(%esp),%edi
+ movl 20(%esp),%ecx
+
+ movl %edi,%eax
+ subl %esi,%eax
+ cmpl %ecx,%eax /* overlapping && src < dst? */
+ jb 1f
+
+ cmpl $1024,%ecx
+ jb small_i586_bcopy
+
+ sarb $1,kernel_fpu_lock
+ jc small_i586_bcopy
+ cmpl $0,_npxproc
+ je i586_bc1
+ smsw %dx
+ clts
+ subl $108,%esp
+ fnsave 0(%esp)
+ jmp 4f
+
+i586_bc1:
+ smsw %dx
+ clts
+ fninit /* XXX should avoid needing this */
+
+ ALIGN_TEXT
+4:
+ pushl %ecx
+#define DCACHE_SIZE 8192
+ cmpl $(DCACHE_SIZE-512)/2,%ecx
+ jbe 2f
+ movl $(DCACHE_SIZE-512)/2,%ecx
+2:
+ subl %ecx,0(%esp)
+ cmpl $256,%ecx
+ jb 5f /* XXX should prefetch if %ecx >= 32 */
+ pushl %esi
+ pushl %ecx
+ ALIGN_TEXT
+3:
+ movl 0(%esi),%eax
+ movl 32(%esi),%eax
+ movl 64(%esi),%eax
+ movl 96(%esi),%eax
+ movl 128(%esi),%eax
+ movl 160(%esi),%eax
+ movl 192(%esi),%eax
+ movl 224(%esi),%eax
+ addl $256,%esi
+ subl $256,%ecx
+ cmpl $256,%ecx
+ jae 3b
+ popl %ecx
+ popl %esi
+5:
+ ALIGN_TEXT
+large_i586_bcopy_loop:
+ fildq 0(%esi)
+ fildq 8(%esi)
+ fildq 16(%esi)
+ fildq 24(%esi)
+ fildq 32(%esi)
+ fildq 40(%esi)
+ fildq 48(%esi)
+ fildq 56(%esi)
+ fistpq 56(%edi)
+ fistpq 48(%edi)
+ fistpq 40(%edi)
+ fistpq 32(%edi)
+ fistpq 24(%edi)
+ fistpq 16(%edi)
+ fistpq 8(%edi)
+ fistpq 0(%edi)
+ addl $64,%esi
+ addl $64,%edi
+ subl $64,%ecx
+ cmpl $64,%ecx
+ jae large_i586_bcopy_loop
+ popl %eax
+ addl %eax,%ecx
+ cmpl $64,%ecx
+ jae 4b
+
+ cmpl $0,_npxproc
+ je i586_bc2
+ frstor 0(%esp)
+ addl $108,%esp
+i586_bc2:
+ lmsw %dx
+ movb $0xfe,kernel_fpu_lock
+
+/*
+ * This is a duplicate of the main part of generic_bcopy. See the comments
+ * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and
+ * would mess up high resolution profiling.
+ */
+ ALIGN_TEXT
+small_i586_bcopy:
+ shrl $2,%ecx
+ cld
+ rep
+ movsl
+ movl 20(%esp),%ecx
+ andl $3,%ecx
+ rep
+ movsb
+ popl %edi
+ popl %esi
+ ret
+
+ ALIGN_TEXT
+1:
+ addl %ecx,%edi
+ addl %ecx,%esi
+ decl %edi
+ decl %esi
+ andl $3,%ecx
+ std
+ rep
+ movsb
+ movl 20(%esp),%ecx
+ shrl $2,%ecx
+ subl $3,%esi
+ subl $3,%edi
+ rep
+ movsl
+ popl %edi
+ popl %esi
+ cld
+ ret
/*
* Note: memcpy does not support overlapping copies
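
The large-copy path of i586_bcopy above works on blocks of at most (DCACHE_SIZE-512)/2 bytes (presumably leaving slack for the destination and other working data) and first walks the source once, one load per 32-byte line, to pull it into the Pentium's 8 KB L1 data cache before the fildq/fistpq loop moves 64 bytes per iteration. A rough C outline of that blocking strategy, not the actual routine; the function name is invented and memcpy stands in for the FPU copy loop and the rep/movs tail:

    #include <stddef.h>
    #include <string.h>

    #define DCACHE_SIZE 8192                  /* Pentium L1 data cache */
    #define BLOCK ((DCACHE_SIZE - 512) / 2)   /* per-pass chunk, as in the diff */

    static void i586_bcopy_outline(const char *src, char *dst, size_t len)
    {
            while (len >= 64) {
                    size_t chunk = len < BLOCK ? len : BLOCK;

                    /* Prefetch pass ("movl 0(%esi),%eax ... movl 224(%esi),%eax"):
                     * touch one word per 32-byte cache line so the chunk of source
                     * data sits in L1 before the copy loop reads it again. */
                    if (chunk >= 256) {
                            const volatile char *p = src;
                            size_t off, line;
                            for (off = 0; off + 256 <= chunk; off += 256)
                                    for (line = 0; line < 256; line += 32)
                                            (void)p[off + line];
                    }

                    /* Copy pass: the assembly moves 64 bytes per iteration with
                     * fildq/fistpq pairs; plain memcpy stands in for it here. */
                    size_t n = chunk & ~(size_t)63;
                    memcpy(dst, src, n);
                    src += n;
                    dst += n;
                    len -= n;
            }
            if (len != 0)
                    memcpy(dst, src, len);    /* tail goes through the rep;movs path */
    }
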