Diffstat:
-rw-r--r--  sys/amd64/amd64/support.S  343
-rw-r--r--  sys/amd64/amd64/support.s  343
-rw-r--r--  sys/i386/i386/support.s    343
3 files changed, 861 insertions, 168 deletions
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index e583aee..8a4d66e 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -30,10 +30,10 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $Id: support.s,v 1.38 1996/09/10 08:31:57 bde Exp $
+ * $Id: support.s,v 1.39 1996/09/20 16:52:09 bde Exp $
*/
-#include <sys/errno.h>
+#include "opt_temporary.h" /* for I586_*_B* */
#include <machine/asmacros.h>
#include <machine/cputypes.h>
@@ -44,10 +44,19 @@
#define KDSEL 0x10 /* kernel data selector */
#define IDXSHIFT 10
-
.data
+ .globl _bcopy_vector
+_bcopy_vector:
+ .long _generic_bcopy
.globl _bzero
-_bzero: .long _generic_bzero
+_bzero:
+ .long _generic_bzero
+ .globl _ovbcopy_vector
+_ovbcopy_vector:
+ .long _generic_bcopy
+kernel_fpu_lock:
+ .byte 0xfe
+ .space 3
.text
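
The new kernel_fpu_lock byte is initialized to 0xfe so that a single `sarb $1` later in the file can act as a try-lock: the bit shifted out lands in the carry flag, and the arithmetic shift leaves the byte at 0xff (busy) either way. A minimal C model of that bit trick, purely illustrative and not part of the commit (the helper names are invented; the real code relies on sarb being one uninterruptible read-modify-write instruction, which plain C cannot express):

    static unsigned char kernel_fpu_lock = 0xfe;   /* 0xfe = free, 0xff = held */

    static int kernel_fpu_trylock(void)
    {
            /*
             * sarb $1,kernel_fpu_lock:
             *   free (0xfe): low bit 0 -> carry clear, byte becomes 0xff
             *   held (0xff): low bit 1 -> carry set,  byte stays   0xff
             */
            int carry = kernel_fpu_lock & 1;        /* bit shifted out by sarb */
            kernel_fpu_lock = 0xff;                 /* arithmetic shift of 0xfe or 0xff */
            return carry == 0;                      /* jc ... bails out when held */
    }

    static void kernel_fpu_unlock(void)
    {
            kernel_fpu_lock = 0xfe;                 /* movb $0xfe,kernel_fpu_lock */
    }
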
@@ -174,66 +183,147 @@ do0:
ret
#endif
-#if 0 /* Actually lowers performance in real-world cases */
#if defined(I586_CPU) || defined(I686_CPU)
-ALTENTRY(i586_bzero)
-ENTRY(i686_bzero)
- pushl %edi
- movl 8(%esp),%edi /* destination pointer */
- movl 12(%esp),%edx /* size (in 8-bit words) */
+ENTRY(i586_bzero)
+ movl 4(%esp),%edx
+ movl 8(%esp),%ecx
- xorl %eax,%eax /* store data */
- cld
+ /*
+ * The FPU register method is twice as fast as the integer register
+ * method unless the target is in the L1 cache and we pre-allocate a
+ * cache line for it (then the integer register method is 4-5 times
+ * faster). However, we never pre-allocate cache lines, since that
+ * would make the integer method 25% or more slower for the common
+ * case when the target isn't in either the L1 cache or the L2 cache.
+ * Thus we normally use the FPU register method unless the overhead
+ * would be too large.
+ */
+ cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */
+ jb intreg_i586_bzero
-/* If less than 100 bytes to write, skip tricky code. */
- cmpl $100,%edx
- movl %edx,%ecx /* needed when branch is taken! */
- jl 2f
+ /*
+ * The FPU registers may belong to an application or to fastmove()
+ * or to another invocation of bcopy() or ourself in a higher level
+ * interrupt or trap handler. Preserving the registers is
+ * complicated since we avoid it if possible at all levels. We
+ * want to localize the complications even when that increases them.
+ * Here the extra work involves preserving CR0_TS in TS.
+ * `npxproc != NULL' is supposed to be the condition that all the
+ * FPU resources belong to an application, but npxproc and CR0_TS
+ * aren't set atomically enough for this condition to work in
+ * interrupt handlers.
+ *
+ * Case 1: FPU registers belong to the application: we must preserve
+ * the registers if we use them, so we only use the FPU register
+ * method if the target size is large enough to amortize the extra
+ * overhead for preserving them. CR0_TS must be preserved although
+ * it is very likely to end up as set.
+ *
+ * Case 2: FPU registers belong to fastmove(): fastmove() currently
+ * makes the registers look like they belong to an application so
+ * that cpu_switch() and savectx() don't have to know about it, so
+ * this case reduces to case 1.
+ *
+ * Case 3: FPU registers belong to the kernel: don't use the FPU
+ * register method. This case is unlikely, and supporting it would
+ * be more complicated and might take too much stack.
+ *
+ * Case 4: FPU registers don't belong to anyone: the FPU registers
+ * don't need to be preserved, so we always use the FPU register
+ * method. CR0_TS must be preserved although it is very likely to
+ * always end up as clear.
+ */
+ cmpl $0,_npxproc
+ je i586_bz1
+ cmpl $256+184,%ecx /* empirical; not quite 2*108 more */
+ jb intreg_i586_bzero
+ sarb $1,kernel_fpu_lock
+ jc intreg_i586_bzero
+ smsw %ax
+ clts
+ subl $108,%esp
+ fnsave 0(%esp)
+ jmp i586_bz2
-/* First write 0-3 bytes to make the pointer 32-bit aligned. */
- movl %edi,%ecx /* Copy ptr to ecx... */
- negl %ecx /* ...and negate that and... */
- andl $3,%ecx /* ...mask to get byte count. */
- subl %ecx,%edx /* adjust global byte count */
- rep
- stosb
+i586_bz1:
+ sarb $1,kernel_fpu_lock
+ jc intreg_i586_bzero
+ smsw %ax
+ clts
+ fninit /* XXX should avoid needing this */
+i586_bz2:
+ fldz
- subl $32,%edx /* offset count for unrolled loop */
- movl (%edi),%ecx /* Fetch destination cache line */
+ /*
+ * Align to an 8 byte boundary (misalignment in the main loop would
+ * cost a factor of >= 2). Avoid jumps (at little cost if it is
+ * already aligned) by always zeroing 8 bytes and using the part up
+ * to the _next_ alignment position.
+ */
+ fstl 0(%edx)
+ addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */
+ addl $8,%edx
+ andl $~7,%edx
+ subl %edx,%ecx
- .align 2,0x90 /* supply 0x90 for broken assemblers */
-1:
- movl 28(%edi),%ecx /* allocate cache line for destination */
- subl $32,%edx /* decr loop count */
- movl %eax,0(%edi) /* store words pairwise */
- movl %eax,4(%edi)
- movl %eax,8(%edi)
- movl %eax,12(%edi)
- movl %eax,16(%edi)
- movl %eax,20(%edi)
- movl %eax,24(%edi)
- movl %eax,28(%edi)
-
- leal 32(%edi),%edi /* update destination pointer */
- jge 1b
- leal 32(%edx),%ecx
-
-/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */
-2:
+ /*
+ * Similarly align `len' to a multiple of 8.
+ */
+ fstl -8(%edx,%ecx)
+ decl %ecx
+ andl $~7,%ecx
+
+ /*
+ * This wouldn't be any faster if it were unrolled, since the loop
+ * control instructions are much faster than the fstl and/or done
+ * in parallel with it so their overhead is insignificant.
+ */
+fpureg_i586_bzero_loop:
+ fstl 0(%edx)
+ addl $8,%edx
+ subl $8,%ecx
+ cmpl $8,%ecx
+ jae fpureg_i586_bzero_loop
+
+ cmpl $0,_npxproc
+ je i586_bz3
+ frstor 0(%esp)
+ addl $108,%esp
+ lmsw %ax
+ movb $0xfe,kernel_fpu_lock
+ ret
+
+i586_bz3:
+ fstpl %st(0)
+ lmsw %ax
+ movb $0xfe,kernel_fpu_lock
+ ret
+
+intreg_i586_bzero:
+ /*
+ * `rep stos' seems to be the best method in practice for small
+ * counts. Fancy methods usually take too long to start up due
+ * to cache and BTB misses.
+ */
+ pushl %edi
+ movl %edx,%edi
+ xorl %eax,%eax
shrl $2,%ecx
+ cld
rep
stosl
-
-/* Finally write the last 0-3 bytes. */
- movl %edx,%ecx
+ movl 12(%esp),%ecx
andl $3,%ecx
+ jne 1f
+ popl %edi
+ ret
+
+1:
rep
stosb
-
popl %edi
ret
-#endif
-#endif
+#endif /* I586_CPU || I686_CPU */
/* fillw(pat, base, cnt) */
ENTRY(fillw)
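
The alignment arithmetic in i586_bzero above is branch-free: it always stores 8 bytes at the raw start and at the raw end of the buffer, then rounds the pointer up and the count down to 8-byte multiples so the fstl loop only ever does aligned stores. A C sketch of the same arithmetic, assuming a 32-bit kernel and a length of at least 256 bytes (the cmpl $256 gate above); the function name is made up and memcpy of 8 zero bytes stands in for storing the 0.0 loaded by fldz:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void i586_bzero_sketch(void *buf, size_t len)    /* len >= 256 */
    {
            static const unsigned char zero8[8];            /* fldz kept in %st(0) */
            uintptr_t d = (uintptr_t)buf;

            /* fstl 0(%edx): zero 8 bytes at the possibly misaligned start,
             * then advance to the next 8-byte boundary and shrink len by
             * however much was skipped. */
            memcpy((void *)d, zero8, 8);
            uintptr_t aligned = (d + 8) & ~(uintptr_t)7;
            len -= aligned - d;
            d = aligned;

            /* fstl -8(%edx,%ecx): zero the last 8 bytes, then round len
             * down to a multiple of 8 so the loop cannot overrun. */
            memcpy((void *)(d + len - 8), zero8, 8);
            len = (len - 1) & ~(size_t)7;

            /* fpureg_i586_bzero_loop: one aligned 8-byte store per pass. */
            while (len >= 8) {
                    memcpy((void *)d, zero8, 8);
                    d += 8;
                    len -= 8;
            }
    }
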
@@ -256,7 +346,7 @@ bcopyb:
movl 20(%esp),%ecx
movl %edi,%eax
subl %esi,%eax
- cmpl %ecx,%eax /* overlapping? */
+ cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
cld /* nope, copy forwards */
rep
@@ -279,13 +369,19 @@ bcopyb:
cld
ret
+ENTRY(bcopy)
+ MEXITCOUNT
+ jmp *_bcopy_vector
+
+ENTRY(ovbcopy)
+ MEXITCOUNT
+ jmp *_ovbcopy_vector
+
/*
- * (ov)bcopy(src, dst, cnt)
+ * generic_bcopy(src, dst, cnt)
* ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
*/
-ALTENTRY(ovbcopy)
-ENTRY(bcopy)
-bcopy:
+ENTRY(generic_bcopy)
pushl %esi
pushl %edi
movl 12(%esp),%esi
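
bcopy() and ovbcopy() above become two-instruction stubs that tail-jump through function pointers kept in .data, so CPU-identification code elsewhere in the kernel (not part of this diff) can retarget them to i586_bcopy once it knows the FPU-based copy pays off. Roughly the same scheme in C, with stub bodies and a hypothetical setup hook standing in for the parts this diff does not show:

    #include <stddef.h>
    #include <string.h>

    /* Stand-ins for the assembly routines; the real ones are above. */
    static void generic_bcopy(const void *src, void *dst, size_t len)
    {
            memmove(dst, src, len);
    }
    static void i586_bcopy(const void *src, void *dst, size_t len)
    {
            memmove(dst, src, len);          /* the real one copies via the FPU */
    }

    /* ".long _generic_bcopy" in .data: pointers with a safe default. */
    static void (*bcopy_vector)(const void *, void *, size_t) = generic_bcopy;
    static void (*ovbcopy_vector)(const void *, void *, size_t) = generic_bcopy;

    /* "ENTRY(bcopy): jmp *_bcopy_vector" -- a jump through the pointer. */
    void bcopy(const void *src, void *dst, size_t len)
    {
            (*bcopy_vector)(src, dst, len);
    }
    void ovbcopy(const void *src, void *dst, size_t len)
    {
            (*ovbcopy_vector)(src, dst, len);
    }

    /* Hypothetical hook: the real retargeting happens in CPU setup code
     * outside this diff. */
    void bcopy_use_i586(void)
    {
            bcopy_vector = i586_bcopy;
    }
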
@@ -294,7 +390,7 @@ bcopy:
movl %edi,%eax
subl %esi,%eax
- cmpl %ecx,%eax /* overlapping? */
+ cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
shrl $2,%ecx /* copy by 32-bit words */
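
The reworded comment makes the trick explicit: one unsigned compare of (dst - src) against the length detects the only layout that needs a backwards copy. The same test in C (helper name invented; bcopyb, generic_bcopy and i586_bcopy all share this prologue):

    #include <stddef.h>
    #include <stdint.h>

    /* "movl %edi,%eax; subl %esi,%eax; cmpl %ecx,%eax; jb 1f":
     * copy backwards only when the destination starts inside the source
     * region, i.e. src < dst < src + len (dst == src is harmless either
     * way).  If dst is below src the subtraction wraps to a huge unsigned
     * value, the test fails, and the cheap forward copy is used. */
    static int must_copy_backwards(const void *src, const void *dst, size_t len)
    {
            return (uintptr_t)dst - (uintptr_t)src < len;
    }
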
@@ -330,6 +426,141 @@ bcopy:
cld
ret
+ENTRY(i586_bcopy)
+ pushl %esi
+ pushl %edi
+ movl 12(%esp),%esi
+ movl 16(%esp),%edi
+ movl 20(%esp),%ecx
+
+ movl %edi,%eax
+ subl %esi,%eax
+ cmpl %ecx,%eax /* overlapping && src < dst? */
+ jb 1f
+
+ cmpl $1024,%ecx
+ jb small_i586_bcopy
+
+ sarb $1,kernel_fpu_lock
+ jc small_i586_bcopy
+ cmpl $0,_npxproc
+ je i586_bc1
+ smsw %dx
+ clts
+ subl $108,%esp
+ fnsave 0(%esp)
+ jmp 4f
+
+i586_bc1:
+ smsw %dx
+ clts
+ fninit /* XXX should avoid needing this */
+
+ ALIGN_TEXT
+4:
+ pushl %ecx
+#define DCACHE_SIZE 8192
+ cmpl $(DCACHE_SIZE-512)/2,%ecx
+ jbe 2f
+ movl $(DCACHE_SIZE-512)/2,%ecx
+2:
+ subl %ecx,0(%esp)
+ cmpl $256,%ecx
+ jb 5f /* XXX should prefetch if %ecx >= 32 */
+ pushl %esi
+ pushl %ecx
+ ALIGN_TEXT
+3:
+ movl 0(%esi),%eax
+ movl 32(%esi),%eax
+ movl 64(%esi),%eax
+ movl 96(%esi),%eax
+ movl 128(%esi),%eax
+ movl 160(%esi),%eax
+ movl 192(%esi),%eax
+ movl 224(%esi),%eax
+ addl $256,%esi
+ subl $256,%ecx
+ cmpl $256,%ecx
+ jae 3b
+ popl %ecx
+ popl %esi
+5:
+ ALIGN_TEXT
+large_i586_bcopy_loop:
+ fildq 0(%esi)
+ fildq 8(%esi)
+ fildq 16(%esi)
+ fildq 24(%esi)
+ fildq 32(%esi)
+ fildq 40(%esi)
+ fildq 48(%esi)
+ fildq 56(%esi)
+ fistpq 56(%edi)
+ fistpq 48(%edi)
+ fistpq 40(%edi)
+ fistpq 32(%edi)
+ fistpq 24(%edi)
+ fistpq 16(%edi)
+ fistpq 8(%edi)
+ fistpq 0(%edi)
+ addl $64,%esi
+ addl $64,%edi
+ subl $64,%ecx
+ cmpl $64,%ecx
+ jae large_i586_bcopy_loop
+ popl %eax
+ addl %eax,%ecx
+ cmpl $64,%ecx
+ jae 4b
+
+ cmpl $0,_npxproc
+ je i586_bc2
+ frstor 0(%esp)
+ addl $108,%esp
+i586_bc2:
+ lmsw %dx
+ movb $0xfe,kernel_fpu_lock
+
+/*
+ * This is a duplicate of the main part of generic_bcopy. See the comments
+ * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and
+ * would mess up high resolution profiling.
+ */
+ ALIGN_TEXT
+small_i586_bcopy:
+ shrl $2,%ecx
+ cld
+ rep
+ movsl
+ movl 20(%esp),%ecx
+ andl $3,%ecx
+ rep
+ movsb
+ popl %edi
+ popl %esi
+ ret
+
+ ALIGN_TEXT
+1:
+ addl %ecx,%edi
+ addl %ecx,%esi
+ decl %edi
+ decl %esi
+ andl $3,%ecx
+ std
+ rep
+ movsb
+ movl 20(%esp),%ecx
+ shrl $2,%ecx
+ subl $3,%esi
+ subl $3,%edi
+ rep
+ movsl
+ popl %edi
+ popl %esi
+ cld
+ ret
/*
* Note: memcpy does not support overlapping copies
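
The large-copy path of i586_bcopy above works on blocks of at most (DCACHE_SIZE-512)/2 bytes (presumably leaving slack for the destination and other working data) and first walks the source once, one load per 32-byte line, to pull it into the Pentium's 8 KB L1 data cache before the fildq/fistpq loop moves 64 bytes per iteration. A rough C outline of that blocking strategy, not the actual routine; the function name is invented and memcpy stands in for the FPU copy loop and the rep/movs tail:

    #include <stddef.h>
    #include <string.h>

    #define DCACHE_SIZE 8192                  /* Pentium L1 data cache */
    #define BLOCK ((DCACHE_SIZE - 512) / 2)   /* per-pass chunk, as in the diff */

    static void i586_bcopy_outline(const char *src, char *dst, size_t len)
    {
            while (len >= 64) {
                    size_t chunk = len < BLOCK ? len : BLOCK;

                    /* Prefetch pass ("movl 0(%esi),%eax ... movl 224(%esi),%eax"):
                     * touch one word per 32-byte cache line so the chunk of source
                     * data sits in L1 before the copy loop reads it again. */
                    if (chunk >= 256) {
                            const volatile char *p = src;
                            size_t off, line;
                            for (off = 0; off + 256 <= chunk; off += 256)
                                    for (line = 0; line < 256; line += 32)
                                            (void)p[off + line];
                    }

                    /* Copy pass: the assembly moves 64 bytes per iteration with
                     * fildq/fistpq pairs; plain memcpy stands in for it here. */
                    size_t n = chunk & ~(size_t)63;
                    memcpy(dst, src, n);
                    src += n;
                    dst += n;
                    len -= n;
            }
            if (len != 0)
                    memcpy(dst, src, len);    /* tail goes through the rep;movs path */
    }
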