author    bde <bde@FreeBSD.org>    1996-10-09 18:16:17 +0000
committer bde <bde@FreeBSD.org>    1996-10-09 18:16:17 +0000
commit    6b1de9ceec6ac5b2aae04ed58a3374a3fe4734d8 (patch)
tree      9a89e274bcfa89690389ef0bf942f4d76eb8c47d
parent    3d953050fb2396c16164170de0f2f546c1db5154 (diff)
download  FreeBSD-src-6b1de9ceec6ac5b2aae04ed58a3374a3fe4734d8.zip
          FreeBSD-src-6b1de9ceec6ac5b2aae04ed58a3374a3fe4734d8.tar.gz
Added i586-optimized bcopy() and bzero().
These are based on using the FPU to do 64-bit stores. They also use i586-optimized instruction ordering, i586-optimized cache management and a couple of other tricks. They should work on any i*86 with a h/w FPU, but are slower on at least i386's and i486's. They come close to saturating the memory bus on i586's. bzero() can maintain a 3-3-3-3 burst cycle to 66 MHz non-EDO main memory on a P133 (but is too slow to keep up with a 2-2-2-2 burst cycle for EDO - someone with EDO should fix this). bcopy() is several cycles short of keeping up with a 3-3-3-3 cycle for writing. For a P133 writing to 66 MHz main memory, it just manages an N-3-3-3, 3-3-3-3 pair of burst cycles, where N is typically 6.

The new routines are not used by default. They are always configured and can be enabled at runtime using a debugger or an lkm to change their function pointer, or at compile time using new options (see another log message).

Removed old, dead i586_bzero() and i686_bzero(). Read-before-write is usually bad for i586's. It doubles the memory traffic unless the data is already cached, and data is (or should be) very rarely cached for large bzero()s (the system should prefer uncached pages for cleaning), and the amount of data handled by small bzero()s is relatively small in the kernel.

Improved comments about overlapping copies.

Removed unused #include.
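A minimal C sketch of the function-pointer dispatch described above. Only the bcopy_vector, generic_bcopy and i586_bcopy names come from the patch below; kernel_bcopy and the memmove-based bodies are illustrative stand-ins, not FreeBSD source.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

static void
generic_bcopy(const void *src, void *dst, size_t len)
{
        memmove(dst, src, len);         /* stand-in for the `rep movsl' routine */
}

static void
i586_bcopy(const void *src, void *dst, size_t len)
{
        memmove(dst, src, len);         /* stand-in for the FPU-based routine */
}

/* Defaults to the generic routine, as `.long _generic_bcopy' does in the patch. */
static void (*bcopy_vector)(const void *, void *, size_t) = generic_bcopy;

/* ENTRY(bcopy) in the patch is just `jmp *_bcopy_vector'. */
static void
kernel_bcopy(const void *src, void *dst, size_t len)
{
        (*bcopy_vector)(src, dst, len);
}

int
main(void)
{
        char buf[16];

        kernel_bcopy("generic", buf, 8);        /* dispatched to generic_bcopy */
        puts(buf);

        bcopy_vector = i586_bcopy;              /* what a debugger or an lkm would poke */
        kernel_bcopy("i586", buf, 5);           /* now dispatched to i586_bcopy */
        puts(buf);
        return (0);
}

Keeping the default pointer at the generic routine is what makes the optimized code safe to ship always-configured but disabled.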
-rw-r--r--   sys/amd64/amd64/support.S   343
-rw-r--r--   sys/amd64/amd64/support.s   343
-rw-r--r--   sys/i386/i386/support.s     343
3 files changed, 861 insertions, 168 deletions
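Before the patch text itself, here is a minimal C sketch of the head/tail alignment trick and 8-byte store loop used by the new i586_bzero() in the patch. The real routine keeps 0.0 in an FPU register and issues fstl stores; sketch_i586_bzero, memcpy and uint64_t are illustrative stand-ins, and the assembly only takes this path for counts of 256 bytes or more.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
sketch_i586_bzero(void *buf, size_t len)
{
        uint64_t zero = 0;
        uintptr_t p = (uintptr_t)buf;
        uintptr_t next;

        /*
         * Head: zero 8 bytes unconditionally, then step to the next 8-byte
         * boundary; the deliberate overlap replaces an alignment branch.
         */
        memcpy((void *)p, &zero, 8);
        next = (p + 8) & ~(uintptr_t)7;
        len -= next - p;
        p = next;

        /*
         * Tail: zero the last 8 bytes now, then round len down to a
         * multiple of 8 for the main loop.
         */
        memcpy((void *)(p + len - 8), &zero, 8);
        len = (len - 1) & ~(size_t)7;

        /* Main loop: one 8-byte store per iteration (fstl in the assembly). */
        while (len >= 8) {
                memcpy((void *)p, &zero, 8);
                p += 8;
                len -= 8;
        }
}

int
main(void)
{
        static char buf[1000];

        memset(buf, 0xff, sizeof(buf));
        sketch_i586_bzero(buf + 3, 700);        /* misaligned start, as the asm allows */
        return (buf[3] == 0 && buf[702] == 0 &&
            (unsigned char)buf[703] == 0xff ? 0 : 1);
}

The two unconditional stores at the edges are why the main loop never has to handle a partial word, which keeps its body down to a single store plus loop control.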
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index e583aee..8a4d66e 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -30,10 +30,10 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $Id: support.s,v 1.38 1996/09/10 08:31:57 bde Exp $
+ * $Id: support.s,v 1.39 1996/09/20 16:52:09 bde Exp $
*/
-#include <sys/errno.h>
+#include "opt_temporary.h" /* for I586_*_B* */
#include <machine/asmacros.h>
#include <machine/cputypes.h>
@@ -44,10 +44,19 @@
#define KDSEL 0x10 /* kernel data selector */
#define IDXSHIFT 10
-
.data
+ .globl _bcopy_vector
+_bcopy_vector:
+ .long _generic_bcopy
.globl _bzero
-_bzero: .long _generic_bzero
+_bzero:
+ .long _generic_bzero
+ .globl _ovbcopy_vector
+_ovbcopy_vector:
+ .long _generic_bcopy
+kernel_fpu_lock:
+ .byte 0xfe
+ .space 3
.text
@@ -174,66 +183,147 @@ do0:
ret
#endif
-#if 0 /* Actually lowers performance in real-world cases */
#if defined(I586_CPU) || defined(I686_CPU)
-ALTENTRY(i586_bzero)
-ENTRY(i686_bzero)
- pushl %edi
- movl 8(%esp),%edi /* destination pointer */
- movl 12(%esp),%edx /* size (in 8-bit words) */
+ENTRY(i586_bzero)
+ movl 4(%esp),%edx
+ movl 8(%esp),%ecx
- xorl %eax,%eax /* store data */
- cld
+ /*
+ * The FPU register method is twice as fast as the integer register
+ * method unless the target is in the L1 cache and we pre-allocate a
+ * cache line for it (then the integer register method is 4-5 times
+ * faster). However, we never pre-allocate cache lines, since that
+ * would make the integer method 25% or more slower for the common
+ * case when the target isn't in either the L1 cache or the L2 cache.
+ * Thus we normally use the FPU register method unless the overhead
+ * would be too large.
+ */
+ cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */
+ jb intreg_i586_bzero
-/* If less than 100 bytes to write, skip tricky code. */
- cmpl $100,%edx
- movl %edx,%ecx /* needed when branch is taken! */
- jl 2f
+ /*
+ * The FPU registers may belong to an application or to fastmove()
+ * or to another invocation of bcopy() or ourself in a higher level
+ * interrupt or trap handler. Preserving the registers is
+ * complicated since we avoid it if possible at all levels. We
+ * want to localize the complications even when that increases them.
+ * Here the extra work involves preserving CR0_TS in TS.
+ * `npxproc != NULL' is supposed to be the condition that all the
+ * FPU resources belong to an application, but npxproc and CR0_TS
+ * aren't set atomically enough for this condition to work in
+ * interrupt handlers.
+ *
+ * Case 1: FPU registers belong to the application: we must preserve
+ * the registers if we use them, so we only use the FPU register
+ * method if the target size is large enough to amortize the extra
+ * overhead for preserving them. CR0_TS must be preserved although
+ * it is very likely to end up as set.
+ *
+ * Case 2: FPU registers belong to fastmove(): fastmove() currently
+ * makes the registers look like they belong to an application so
+ * that cpu_switch() and savectx() don't have to know about it, so
+ * this case reduces to case 1.
+ *
+ * Case 3: FPU registers belong to the kernel: don't use the FPU
+ * register method. This case is unlikely, and supporting it would
+ * be more complicated and might take too much stack.
+ *
+ * Case 4: FPU registers don't belong to anyone: the FPU registers
+ * don't need to be preserved, so we always use the FPU register
+ * method. CR0_TS must be preserved although it is very likely to
+ * always end up as clear.
+ */
+ cmpl $0,_npxproc
+ je i586_bz1
+ cmpl $256+184,%ecx /* empirical; not quite 2*108 more */
+ jb intreg_i586_bzero
+ sarb $1,kernel_fpu_lock
+ jc intreg_i586_bzero
+ smsw %ax
+ clts
+ subl $108,%esp
+ fnsave 0(%esp)
+ jmp i586_bz2
-/* First write 0-3 bytes to make the pointer 32-bit aligned. */
- movl %edi,%ecx /* Copy ptr to ecx... */
- negl %ecx /* ...and negate that and... */
- andl $3,%ecx /* ...mask to get byte count. */
- subl %ecx,%edx /* adjust global byte count */
- rep
- stosb
+i586_bz1:
+ sarb $1,kernel_fpu_lock
+ jc intreg_i586_bzero
+ smsw %ax
+ clts
+ fninit /* XXX should avoid needing this */
+i586_bz2:
+ fldz
- subl $32,%edx /* offset count for unrolled loop */
- movl (%edi),%ecx /* Fetch destination cache line */
+ /*
+ * Align to an 8 byte boundary (misalignment in the main loop would
+ * cost a factor of >= 2). Avoid jumps (at little cost if it is
+ * already aligned) by always zeroing 8 bytes and using the part up
+ * to the _next_ alignment position.
+ */
+ fstl 0(%edx)
+ addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */
+ addl $8,%edx
+ andl $~7,%edx
+ subl %edx,%ecx
- .align 2,0x90 /* supply 0x90 for broken assemblers */
-1:
- movl 28(%edi),%ecx /* allocate cache line for destination */
- subl $32,%edx /* decr loop count */
- movl %eax,0(%edi) /* store words pairwise */
- movl %eax,4(%edi)
- movl %eax,8(%edi)
- movl %eax,12(%edi)
- movl %eax,16(%edi)
- movl %eax,20(%edi)
- movl %eax,24(%edi)
- movl %eax,28(%edi)
-
- leal 32(%edi),%edi /* update destination pointer */
- jge 1b
- leal 32(%edx),%ecx
-
-/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */
-2:
+ /*
+ * Similarly align `len' to a multiple of 8.
+ */
+ fstl -8(%edx,%ecx)
+ decl %ecx
+ andl $~7,%ecx
+
+ /*
+ * This wouldn't be any faster if it were unrolled, since the loop
+ * control instructions are much faster than the fstl and/or done
+ * in parallel with it so their overhead is insignificant.
+ */
+fpureg_i586_bzero_loop:
+ fstl 0(%edx)
+ addl $8,%edx
+ subl $8,%ecx
+ cmpl $8,%ecx
+ jae fpureg_i586_bzero_loop
+
+ cmpl $0,_npxproc
+ je i586_bz3
+ frstor 0(%esp)
+ addl $108,%esp
+ lmsw %ax
+ movb $0xfe,kernel_fpu_lock
+ ret
+
+i586_bz3:
+ fstpl %st(0)
+ lmsw %ax
+ movb $0xfe,kernel_fpu_lock
+ ret
+
+intreg_i586_bzero:
+ /*
+ * `rep stos' seems to be the best method in practice for small
+ * counts. Fancy methods usually take too long to start up due
+ * to cache and BTB misses.
+ */
+ pushl %edi
+ movl %edx,%edi
+ xorl %eax,%eax
shrl $2,%ecx
+ cld
rep
stosl
-
-/* Finally write the last 0-3 bytes. */
- movl %edx,%ecx
+ movl 12(%esp),%ecx
andl $3,%ecx
+ jne 1f
+ popl %edi
+ ret
+
+1:
rep
stosb
-
popl %edi
ret
-#endif
-#endif
+#endif /* I586_CPU || I686_CPU */
/* fillw(pat, base, cnt) */
ENTRY(fillw)
@@ -256,7 +346,7 @@ bcopyb:
movl 20(%esp),%ecx
movl %edi,%eax
subl %esi,%eax
- cmpl %ecx,%eax /* overlapping? */
+ cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
cld /* nope, copy forwards */
rep
@@ -279,13 +369,19 @@ bcopyb:
cld
ret
+ENTRY(bcopy)
+ MEXITCOUNT
+ jmp *_bcopy_vector
+
+ENTRY(ovbcopy)
+ MEXITCOUNT
+ jmp *_ovbcopy_vector
+
/*
- * (ov)bcopy(src, dst, cnt)
+ * generic_bcopy(src, dst, cnt)
* ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
*/
-ALTENTRY(ovbcopy)
-ENTRY(bcopy)
-bcopy:
+ENTRY(generic_bcopy)
pushl %esi
pushl %edi
movl 12(%esp),%esi
@@ -294,7 +390,7 @@ bcopy:
movl %edi,%eax
subl %esi,%eax
- cmpl %ecx,%eax /* overlapping? */
+ cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
shrl $2,%ecx /* copy by 32-bit words */
@@ -330,6 +426,141 @@ bcopy:
cld
ret
+ENTRY(i586_bcopy)
+ pushl %esi
+ pushl %edi
+ movl 12(%esp),%esi
+ movl 16(%esp),%edi
+ movl 20(%esp),%ecx
+
+ movl %edi,%eax
+ subl %esi,%eax
+ cmpl %ecx,%eax /* overlapping && src < dst? */
+ jb 1f
+
+ cmpl $1024,%ecx
+ jb small_i586_bcopy
+
+ sarb $1,kernel_fpu_lock
+ jc small_i586_bcopy
+ cmpl $0,_npxproc
+ je i586_bc1
+ smsw %dx
+ clts
+ subl $108,%esp
+ fnsave 0(%esp)
+ jmp 4f
+
+i586_bc1:
+ smsw %dx
+ clts
+ fninit /* XXX should avoid needing this */
+
+ ALIGN_TEXT
+4:
+ pushl %ecx
+#define DCACHE_SIZE 8192
+ cmpl $(DCACHE_SIZE-512)/2,%ecx
+ jbe 2f
+ movl $(DCACHE_SIZE-512)/2,%ecx
+2:
+ subl %ecx,0(%esp)
+ cmpl $256,%ecx
+ jb 5f /* XXX should prefetch if %ecx >= 32 */
+ pushl %esi
+ pushl %ecx
+ ALIGN_TEXT
+3:
+ movl 0(%esi),%eax
+ movl 32(%esi),%eax
+ movl 64(%esi),%eax
+ movl 96(%esi),%eax
+ movl 128(%esi),%eax
+ movl 160(%esi),%eax
+ movl 192(%esi),%eax
+ movl 224(%esi),%eax
+ addl $256,%esi
+ subl $256,%ecx
+ cmpl $256,%ecx
+ jae 3b
+ popl %ecx
+ popl %esi
+5:
+ ALIGN_TEXT
+large_i586_bcopy_loop:
+ fildq 0(%esi)
+ fildq 8(%esi)
+ fildq 16(%esi)
+ fildq 24(%esi)
+ fildq 32(%esi)
+ fildq 40(%esi)
+ fildq 48(%esi)
+ fildq 56(%esi)
+ fistpq 56(%edi)
+ fistpq 48(%edi)
+ fistpq 40(%edi)
+ fistpq 32(%edi)
+ fistpq 24(%edi)
+ fistpq 16(%edi)
+ fistpq 8(%edi)
+ fistpq 0(%edi)
+ addl $64,%esi
+ addl $64,%edi
+ subl $64,%ecx
+ cmpl $64,%ecx
+ jae large_i586_bcopy_loop
+ popl %eax
+ addl %eax,%ecx
+ cmpl $64,%ecx
+ jae 4b
+
+ cmpl $0,_npxproc
+ je i586_bc2
+ frstor 0(%esp)
+ addl $108,%esp
+i586_bc2:
+ lmsw %dx
+ movb $0xfe,kernel_fpu_lock
+
+/*
+ * This is a duplicate of the main part of generic_bcopy. See the comments
+ * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and
+ * would mess up high resolution profiling.
+ */
+ ALIGN_TEXT
+small_i586_bcopy:
+ shrl $2,%ecx
+ cld
+ rep
+ movsl
+ movl 20(%esp),%ecx
+ andl $3,%ecx
+ rep
+ movsb
+ popl %edi
+ popl %esi
+ ret
+
+ ALIGN_TEXT
+1:
+ addl %ecx,%edi
+ addl %ecx,%esi
+ decl %edi
+ decl %esi
+ andl $3,%ecx
+ std
+ rep
+ movsb
+ movl 20(%esp),%ecx
+ shrl $2,%ecx
+ subl $3,%esi
+ subl $3,%edi
+ rep
+ movsl
+ popl %edi
+ popl %esi
+ cld
+ ret
/*
* Note: memcpy does not support overlapping copies
diff --git a/sys/amd64/amd64/support.s b/sys/amd64/amd64/support.s
index e583aee..8a4d66e 100644
--- a/sys/amd64/amd64/support.s
+++ b/sys/amd64/amd64/support.s
@@ -30,10 +30,10 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $Id: support.s,v 1.38 1996/09/10 08:31:57 bde Exp $
+ * $Id: support.s,v 1.39 1996/09/20 16:52:09 bde Exp $
*/
-#include <sys/errno.h>
+#include "opt_temporary.h" /* for I586_*_B* */
#include <machine/asmacros.h>
#include <machine/cputypes.h>
@@ -44,10 +44,19 @@
#define KDSEL 0x10 /* kernel data selector */
#define IDXSHIFT 10
-
.data
+ .globl _bcopy_vector
+_bcopy_vector:
+ .long _generic_bcopy
.globl _bzero
-_bzero: .long _generic_bzero
+_bzero:
+ .long _generic_bzero
+ .globl _ovbcopy_vector
+_ovbcopy_vector:
+ .long _generic_bcopy
+kernel_fpu_lock:
+ .byte 0xfe
+ .space 3
.text
@@ -174,66 +183,147 @@ do0:
ret
#endif
-#if 0 /* Actually lowers performance in real-world cases */
#if defined(I586_CPU) || defined(I686_CPU)
-ALTENTRY(i586_bzero)
-ENTRY(i686_bzero)
- pushl %edi
- movl 8(%esp),%edi /* destination pointer */
- movl 12(%esp),%edx /* size (in 8-bit words) */
+ENTRY(i586_bzero)
+ movl 4(%esp),%edx
+ movl 8(%esp),%ecx
- xorl %eax,%eax /* store data */
- cld
+ /*
+ * The FPU register method is twice as fast as the integer register
+ * method unless the target is in the L1 cache and we pre-allocate a
+ * cache line for it (then the integer register method is 4-5 times
+ * faster). However, we never pre-allocate cache lines, since that
+ * would make the integer method 25% or more slower for the common
+ * case when the target isn't in either the L1 cache or the L2 cache.
+ * Thus we normally use the FPU register method unless the overhead
+ * would be too large.
+ */
+ cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */
+ jb intreg_i586_bzero
-/* If less than 100 bytes to write, skip tricky code. */
- cmpl $100,%edx
- movl %edx,%ecx /* needed when branch is taken! */
- jl 2f
+ /*
+ * The FPU registers may belong to an application or to fastmove()
+ * or to another invocation of bcopy() or ourself in a higher level
+ * interrupt or trap handler. Preserving the registers is
+ * complicated since we avoid it if possible at all levels. We
+ * want to localize the complications even when that increases them.
+ * Here the extra work involves preserving CR0_TS in TS.
+ * `npxproc != NULL' is supposed to be the condition that all the
+ * FPU resources belong to an application, but npxproc and CR0_TS
+ * aren't set atomically enough for this condition to work in
+ * interrupt handlers.
+ *
+ * Case 1: FPU registers belong to the application: we must preserve
+ * the registers if we use them, so we only use the FPU register
+ * method if the target size is large enough to amortize the extra
+ * overhead for preserving them. CR0_TS must be preserved although
+ * it is very likely to end up as set.
+ *
+ * Case 2: FPU registers belong to fastmove(): fastmove() currently
+ * makes the registers look like they belong to an application so
+ * that cpu_switch() and savectx() don't have to know about it, so
+ * this case reduces to case 1.
+ *
+ * Case 3: FPU registers belong to the kernel: don't use the FPU
+ * register method. This case is unlikely, and supporting it would
+ * be more complicated and might take too much stack.
+ *
+ * Case 4: FPU registers don't belong to anyone: the FPU registers
+ * don't need to be preserved, so we always use the FPU register
+ * method. CR0_TS must be preserved although it is very likely to
+ * always end up as clear.
+ */
+ cmpl $0,_npxproc
+ je i586_bz1
+ cmpl $256+184,%ecx /* empirical; not quite 2*108 more */
+ jb intreg_i586_bzero
+ sarb $1,kernel_fpu_lock
+ jc intreg_i586_bzero
+ smsw %ax
+ clts
+ subl $108,%esp
+ fnsave 0(%esp)
+ jmp i586_bz2
-/* First write 0-3 bytes to make the pointer 32-bit aligned. */
- movl %edi,%ecx /* Copy ptr to ecx... */
- negl %ecx /* ...and negate that and... */
- andl $3,%ecx /* ...mask to get byte count. */
- subl %ecx,%edx /* adjust global byte count */
- rep
- stosb
+i586_bz1:
+ sarb $1,kernel_fpu_lock
+ jc intreg_i586_bzero
+ smsw %ax
+ clts
+ fninit /* XXX should avoid needing this */
+i586_bz2:
+ fldz
- subl $32,%edx /* offset count for unrolled loop */
- movl (%edi),%ecx /* Fetch destination cache line */
+ /*
+ * Align to an 8 byte boundary (misalignment in the main loop would
+ * cost a factor of >= 2). Avoid jumps (at little cost if it is
+ * already aligned) by always zeroing 8 bytes and using the part up
+ * to the _next_ alignment position.
+ */
+ fstl 0(%edx)
+ addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */
+ addl $8,%edx
+ andl $~7,%edx
+ subl %edx,%ecx
- .align 2,0x90 /* supply 0x90 for broken assemblers */
-1:
- movl 28(%edi),%ecx /* allocate cache line for destination */
- subl $32,%edx /* decr loop count */
- movl %eax,0(%edi) /* store words pairwise */
- movl %eax,4(%edi)
- movl %eax,8(%edi)
- movl %eax,12(%edi)
- movl %eax,16(%edi)
- movl %eax,20(%edi)
- movl %eax,24(%edi)
- movl %eax,28(%edi)
-
- leal 32(%edi),%edi /* update destination pointer */
- jge 1b
- leal 32(%edx),%ecx
-
-/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */
-2:
+ /*
+ * Similarly align `len' to a multiple of 8.
+ */
+ fstl -8(%edx,%ecx)
+ decl %ecx
+ andl $~7,%ecx
+
+ /*
+ * This wouldn't be any faster if it were unrolled, since the loop
+ * control instructions are much faster than the fstl and/or done
+ * in parallel with it so their overhead is insignificant.
+ */
+fpureg_i586_bzero_loop:
+ fstl 0(%edx)
+ addl $8,%edx
+ subl $8,%ecx
+ cmpl $8,%ecx
+ jae fpureg_i586_bzero_loop
+
+ cmpl $0,_npxproc
+ je i586_bz3
+ frstor 0(%esp)
+ addl $108,%esp
+ lmsw %ax
+ movb $0xfe,kernel_fpu_lock
+ ret
+
+i586_bz3:
+ fstpl %st(0)
+ lmsw %ax
+ movb $0xfe,kernel_fpu_lock
+ ret
+
+intreg_i586_bzero:
+ /*
+ * `rep stos' seems to be the best method in practice for small
+ * counts. Fancy methods usually take too long to start up due
+ * to cache and BTB misses.
+ */
+ pushl %edi
+ movl %edx,%edi
+ xorl %eax,%eax
shrl $2,%ecx
+ cld
rep
stosl
-
-/* Finally write the last 0-3 bytes. */
- movl %edx,%ecx
+ movl 12(%esp),%ecx
andl $3,%ecx
+ jne 1f
+ popl %edi
+ ret
+
+1:
rep
stosb
-
popl %edi
ret
-#endif
-#endif
+#endif /* I586_CPU || I686_CPU */
/* fillw(pat, base, cnt) */
ENTRY(fillw)
@@ -256,7 +346,7 @@ bcopyb:
movl 20(%esp),%ecx
movl %edi,%eax
subl %esi,%eax
- cmpl %ecx,%eax /* overlapping? */
+ cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
cld /* nope, copy forwards */
rep
@@ -279,13 +369,19 @@ bcopyb:
cld
ret
+ENTRY(bcopy)
+ MEXITCOUNT
+ jmp *_bcopy_vector
+
+ENTRY(ovbcopy)
+ MEXITCOUNT
+ jmp *_ovbcopy_vector
+
/*
- * (ov)bcopy(src, dst, cnt)
+ * generic_bcopy(src, dst, cnt)
* ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
*/
-ALTENTRY(ovbcopy)
-ENTRY(bcopy)
-bcopy:
+ENTRY(generic_bcopy)
pushl %esi
pushl %edi
movl 12(%esp),%esi
@@ -294,7 +390,7 @@ bcopy:
movl %edi,%eax
subl %esi,%eax
- cmpl %ecx,%eax /* overlapping? */
+ cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
shrl $2,%ecx /* copy by 32-bit words */
@@ -330,6 +426,141 @@ bcopy:
cld
ret
+ENTRY(i586_bcopy)
+ pushl %esi
+ pushl %edi
+ movl 12(%esp),%esi
+ movl 16(%esp),%edi
+ movl 20(%esp),%ecx
+
+ movl %edi,%eax
+ subl %esi,%eax
+ cmpl %ecx,%eax /* overlapping && src < dst? */
+ jb 1f
+
+ cmpl $1024,%ecx
+ jb small_i586_bcopy
+
+ sarb $1,kernel_fpu_lock
+ jc small_i586_bcopy
+ cmpl $0,_npxproc
+ je i586_bc1
+ smsw %dx
+ clts
+ subl $108,%esp
+ fnsave 0(%esp)
+ jmp 4f
+
+i586_bc1:
+ smsw %dx
+ clts
+ fninit /* XXX should avoid needing this */
+
+ ALIGN_TEXT
+4:
+ pushl %ecx
+#define DCACHE_SIZE 8192
+ cmpl $(DCACHE_SIZE-512)/2,%ecx
+ jbe 2f
+ movl $(DCACHE_SIZE-512)/2,%ecx
+2:
+ subl %ecx,0(%esp)
+ cmpl $256,%ecx
+ jb 5f /* XXX should prefetch if %ecx >= 32 */
+ pushl %esi
+ pushl %ecx
+ ALIGN_TEXT
+3:
+ movl 0(%esi),%eax
+ movl 32(%esi),%eax
+ movl 64(%esi),%eax
+ movl 96(%esi),%eax
+ movl 128(%esi),%eax
+ movl 160(%esi),%eax
+ movl 192(%esi),%eax
+ movl 224(%esi),%eax
+ addl $256,%esi
+ subl $256,%ecx
+ cmpl $256,%ecx
+ jae 3b
+ popl %ecx
+ popl %esi
+5:
+ ALIGN_TEXT
+large_i586_bcopy_loop:
+ fildq 0(%esi)
+ fildq 8(%esi)
+ fildq 16(%esi)
+ fildq 24(%esi)
+ fildq 32(%esi)
+ fildq 40(%esi)
+ fildq 48(%esi)
+ fildq 56(%esi)
+ fistpq 56(%edi)
+ fistpq 48(%edi)
+ fistpq 40(%edi)
+ fistpq 32(%edi)
+ fistpq 24(%edi)
+ fistpq 16(%edi)
+ fistpq 8(%edi)
+ fistpq 0(%edi)
+ addl $64,%esi
+ addl $64,%edi
+ subl $64,%ecx
+ cmpl $64,%ecx
+ jae large_i586_bcopy_loop
+ popl %eax
+ addl %eax,%ecx
+ cmpl $64,%ecx
+ jae 4b
+
+ cmpl $0,_npxproc
+ je i586_bc2
+ frstor 0(%esp)
+ addl $108,%esp
+i586_bc2:
+ lmsw %dx
+ movb $0xfe,kernel_fpu_lock
+
+/*
+ * This is a duplicate of the main part of generic_bcopy. See the comments
+ * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and
+ * would mess up high resolution profiling.
+ */
+ ALIGN_TEXT
+small_i586_bcopy:
+ shrl $2,%ecx
+ cld
+ rep
+ movsl
+ movl 20(%esp),%ecx
+ andl $3,%ecx
+ rep
+ movsb
+ popl %edi
+ popl %esi
+ ret
+
+ ALIGN_TEXT
+1:
+ addl %ecx,%edi
+ addl %ecx,%esi
+ decl %edi
+ decl %esi
+ andl $3,%ecx
+ std
+ rep
+ movsb
+ movl 20(%esp),%ecx
+ shrl $2,%ecx
+ subl $3,%esi
+ subl $3,%edi
+ rep
+ movsl
+ popl %edi
+ popl %esi
+ cld
+ ret
/*
* Note: memcpy does not support overlapping copies
diff --git a/sys/i386/i386/support.s b/sys/i386/i386/support.s
index e583aee..8a4d66e 100644
--- a/sys/i386/i386/support.s
+++ b/sys/i386/i386/support.s
@@ -30,10 +30,10 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $Id: support.s,v 1.38 1996/09/10 08:31:57 bde Exp $
+ * $Id: support.s,v 1.39 1996/09/20 16:52:09 bde Exp $
*/
-#include <sys/errno.h>
+#include "opt_temporary.h" /* for I586_*_B* */
#include <machine/asmacros.h>
#include <machine/cputypes.h>
@@ -44,10 +44,19 @@
#define KDSEL 0x10 /* kernel data selector */
#define IDXSHIFT 10
-
.data
+ .globl _bcopy_vector
+_bcopy_vector:
+ .long _generic_bcopy
.globl _bzero
-_bzero: .long _generic_bzero
+_bzero:
+ .long _generic_bzero
+ .globl _ovbcopy_vector
+_ovbcopy_vector:
+ .long _generic_bcopy
+kernel_fpu_lock:
+ .byte 0xfe
+ .space 3
.text
@@ -174,66 +183,147 @@ do0:
ret
#endif
-#if 0 /* Actually lowers performance in real-world cases */
#if defined(I586_CPU) || defined(I686_CPU)
-ALTENTRY(i586_bzero)
-ENTRY(i686_bzero)
- pushl %edi
- movl 8(%esp),%edi /* destination pointer */
- movl 12(%esp),%edx /* size (in 8-bit words) */
+ENTRY(i586_bzero)
+ movl 4(%esp),%edx
+ movl 8(%esp),%ecx
- xorl %eax,%eax /* store data */
- cld
+ /*
+ * The FPU register method is twice as fast as the integer register
+ * method unless the target is in the L1 cache and we pre-allocate a
+ * cache line for it (then the integer register method is 4-5 times
+ * faster). However, we never pre-allocate cache lines, since that
+ * would make the integer method 25% or more slower for the common
+ * case when the target isn't in either the L1 cache or the L2 cache.
+ * Thus we normally use the FPU register method unless the overhead
+ * would be too large.
+ */
+ cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */
+ jb intreg_i586_bzero
-/* If less than 100 bytes to write, skip tricky code. */
- cmpl $100,%edx
- movl %edx,%ecx /* needed when branch is taken! */
- jl 2f
+ /*
+ * The FPU registers may belong to an application or to fastmove()
+ * or to another invocation of bcopy() or ourself in a higher level
+ * interrupt or trap handler. Preserving the registers is
+ * complicated since we avoid it if possible at all levels. We
+ * want to localize the complications even when that increases them.
+ * Here the extra work involves preserving CR0_TS in TS.
+ * `npxproc != NULL' is supposed to be the condition that all the
+ * FPU resources belong to an application, but npxproc and CR0_TS
+ * aren't set atomically enough for this condition to work in
+ * interrupt handlers.
+ *
+ * Case 1: FPU registers belong to the application: we must preserve
+ * the registers if we use them, so we only use the FPU register
+ * method if the target size is large enough to amortize the extra
+ * overhead for preserving them. CR0_TS must be preserved although
+ * it is very likely to end up as set.
+ *
+ * Case 2: FPU registers belong to fastmove(): fastmove() currently
+ * makes the registers look like they belong to an application so
+ * that cpu_switch() and savectx() don't have to know about it, so
+ * this case reduces to case 1.
+ *
+ * Case 3: FPU registers belong to the kernel: don't use the FPU
+ * register method. This case is unlikely, and supporting it would
+ * be more complicated and might take too much stack.
+ *
+ * Case 4: FPU registers don't belong to anyone: the FPU registers
+ * don't need to be preserved, so we always use the FPU register
+ * method. CR0_TS must be preserved although it is very likely to
+ * always end up as clear.
+ */
+ cmpl $0,_npxproc
+ je i586_bz1
+ cmpl $256+184,%ecx /* empirical; not quite 2*108 more */
+ jb intreg_i586_bzero
+ sarb $1,kernel_fpu_lock
+ jc intreg_i586_bzero
+ smsw %ax
+ clts
+ subl $108,%esp
+ fnsave 0(%esp)
+ jmp i586_bz2
-/* First write 0-3 bytes to make the pointer 32-bit aligned. */
- movl %edi,%ecx /* Copy ptr to ecx... */
- negl %ecx /* ...and negate that and... */
- andl $3,%ecx /* ...mask to get byte count. */
- subl %ecx,%edx /* adjust global byte count */
- rep
- stosb
+i586_bz1:
+ sarb $1,kernel_fpu_lock
+ jc intreg_i586_bzero
+ smsw %ax
+ clts
+ fninit /* XXX should avoid needing this */
+i586_bz2:
+ fldz
- subl $32,%edx /* offset count for unrolled loop */
- movl (%edi),%ecx /* Fetch destination cache line */
+ /*
+ * Align to an 8 byte boundary (misalignment in the main loop would
+ * cost a factor of >= 2). Avoid jumps (at little cost if it is
+ * already aligned) by always zeroing 8 bytes and using the part up
+ * to the _next_ alignment position.
+ */
+ fstl 0(%edx)
+ addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */
+ addl $8,%edx
+ andl $~7,%edx
+ subl %edx,%ecx
- .align 2,0x90 /* supply 0x90 for broken assemblers */
-1:
- movl 28(%edi),%ecx /* allocate cache line for destination */
- subl $32,%edx /* decr loop count */
- movl %eax,0(%edi) /* store words pairwise */
- movl %eax,4(%edi)
- movl %eax,8(%edi)
- movl %eax,12(%edi)
- movl %eax,16(%edi)
- movl %eax,20(%edi)
- movl %eax,24(%edi)
- movl %eax,28(%edi)
-
- leal 32(%edi),%edi /* update destination pointer */
- jge 1b
- leal 32(%edx),%ecx
-
-/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */
-2:
+ /*
+ * Similarly align `len' to a multiple of 8.
+ */
+ fstl -8(%edx,%ecx)
+ decl %ecx
+ andl $~7,%ecx
+
+ /*
+ * This wouldn't be any faster if it were unrolled, since the loop
+ * control instructions are much faster than the fstl and/or done
+ * in parallel with it so their overhead is insignificant.
+ */
+fpureg_i586_bzero_loop:
+ fstl 0(%edx)
+ addl $8,%edx
+ subl $8,%ecx
+ cmpl $8,%ecx
+ jae fpureg_i586_bzero_loop
+
+ cmpl $0,_npxproc
+ je i586_bz3
+ frstor 0(%esp)
+ addl $108,%esp
+ lmsw %ax
+ movb $0xfe,kernel_fpu_lock
+ ret
+
+i586_bz3:
+ fstpl %st(0)
+ lmsw %ax
+ movb $0xfe,kernel_fpu_lock
+ ret
+
+intreg_i586_bzero:
+ /*
+ * `rep stos' seems to be the best method in practice for small
+ * counts. Fancy methods usually take too long to start up due
+ * to cache and BTB misses.
+ */
+ pushl %edi
+ movl %edx,%edi
+ xorl %eax,%eax
shrl $2,%ecx
+ cld
rep
stosl
-
-/* Finally write the last 0-3 bytes. */
- movl %edx,%ecx
+ movl 12(%esp),%ecx
andl $3,%ecx
+ jne 1f
+ popl %edi
+ ret
+
+1:
rep
stosb
-
popl %edi
ret
-#endif
-#endif
+#endif /* I586_CPU || I686_CPU */
/* fillw(pat, base, cnt) */
ENTRY(fillw)
@@ -256,7 +346,7 @@ bcopyb:
movl 20(%esp),%ecx
movl %edi,%eax
subl %esi,%eax
- cmpl %ecx,%eax /* overlapping? */
+ cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
cld /* nope, copy forwards */
rep
@@ -279,13 +369,19 @@ bcopyb:
cld
ret
+ENTRY(bcopy)
+ MEXITCOUNT
+ jmp *_bcopy_vector
+
+ENTRY(ovbcopy)
+ MEXITCOUNT
+ jmp *_ovbcopy_vector
+
/*
- * (ov)bcopy(src, dst, cnt)
+ * generic_bcopy(src, dst, cnt)
* ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
*/
-ALTENTRY(ovbcopy)
-ENTRY(bcopy)
-bcopy:
+ENTRY(generic_bcopy)
pushl %esi
pushl %edi
movl 12(%esp),%esi
@@ -294,7 +390,7 @@ bcopy:
movl %edi,%eax
subl %esi,%eax
- cmpl %ecx,%eax /* overlapping? */
+ cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
shrl $2,%ecx /* copy by 32-bit words */
@@ -330,6 +426,141 @@ bcopy:
cld
ret
+ENTRY(i586_bcopy)
+ pushl %esi
+ pushl %edi
+ movl 12(%esp),%esi
+ movl 16(%esp),%edi
+ movl 20(%esp),%ecx
+
+ movl %edi,%eax
+ subl %esi,%eax
+ cmpl %ecx,%eax /* overlapping && src < dst? */
+ jb 1f
+
+ cmpl $1024,%ecx
+ jb small_i586_bcopy
+
+ sarb $1,kernel_fpu_lock
+ jc small_i586_bcopy
+ cmpl $0,_npxproc
+ je i586_bc1
+ smsw %dx
+ clts
+ subl $108,%esp
+ fnsave 0(%esp)
+ jmp 4f
+
+i586_bc1:
+ smsw %dx
+ clts
+ fninit /* XXX should avoid needing this */
+
+ ALIGN_TEXT
+4:
+ pushl %ecx
+#define DCACHE_SIZE 8192
+ cmpl $(DCACHE_SIZE-512)/2,%ecx
+ jbe 2f
+ movl $(DCACHE_SIZE-512)/2,%ecx
+2:
+ subl %ecx,0(%esp)
+ cmpl $256,%ecx
+ jb 5f /* XXX should prefetch if %ecx >= 32 */
+ pushl %esi
+ pushl %ecx
+ ALIGN_TEXT
+3:
+ movl 0(%esi),%eax
+ movl 32(%esi),%eax
+ movl 64(%esi),%eax
+ movl 96(%esi),%eax
+ movl 128(%esi),%eax
+ movl 160(%esi),%eax
+ movl 192(%esi),%eax
+ movl 224(%esi),%eax
+ addl $256,%esi
+ subl $256,%ecx
+ cmpl $256,%ecx
+ jae 3b
+ popl %ecx
+ popl %esi
+5:
+ ALIGN_TEXT
+large_i586_bcopy_loop:
+ fildq 0(%esi)
+ fildq 8(%esi)
+ fildq 16(%esi)
+ fildq 24(%esi)
+ fildq 32(%esi)
+ fildq 40(%esi)
+ fildq 48(%esi)
+ fildq 56(%esi)
+ fistpq 56(%edi)
+ fistpq 48(%edi)
+ fistpq 40(%edi)
+ fistpq 32(%edi)
+ fistpq 24(%edi)
+ fistpq 16(%edi)
+ fistpq 8(%edi)
+ fistpq 0(%edi)
+ addl $64,%esi
+ addl $64,%edi
+ subl $64,%ecx
+ cmpl $64,%ecx
+ jae large_i586_bcopy_loop
+ popl %eax
+ addl %eax,%ecx
+ cmpl $64,%ecx
+ jae 4b
+
+ cmpl $0,_npxproc
+ je i586_bc2
+ frstor 0(%esp)
+ addl $108,%esp
+i586_bc2:
+ lmsw %dx
+ movb $0xfe,kernel_fpu_lock
+
+/*
+ * This is a duplicate of the main part of generic_bcopy. See the comments
+ * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and
+ * would mess up high resolution profiling.
+ */
+ ALIGN_TEXT
+small_i586_bcopy:
+ shrl $2,%ecx
+ cld
+ rep
+ movsl
+ movl 20(%esp),%ecx
+ andl $3,%ecx
+ rep
+ movsb
+ popl %edi
+ popl %esi
+ ret
+
+ ALIGN_TEXT
+1:
+ addl %ecx,%edi
+ addl %ecx,%esi
+ decl %edi
+ decl %esi
+ andl $3,%ecx
+ std
+ rep
+ movsb
+ movl 20(%esp),%ecx
+ shrl $2,%ecx
+ subl $3,%esi
+ subl $3,%edi
+ rep
+ movsl
+ popl %edi
+ popl %esi
+ cld
+ ret
/*
* Note: memcpy does not support overlapping copies
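The `overlapping && src < dst?' comments added throughout the patch all describe one unsigned compare. A minimal C sketch of the same test follows; sketch_ovbcopy is a hypothetical name for illustration, not code from the tree.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static void
sketch_ovbcopy(const void *src, void *dst, size_t len)
{
        const unsigned char *s = src;
        unsigned char *d = dst;

        /*
         * One unsigned compare answers both questions at once:
         * (uintptr_t)dst - (uintptr_t)src is smaller than len only when the
         * regions overlap and src < dst, which is exactly the case where a
         * forward copy would overwrite source bytes it has not read yet.
         * If dst < src, the subtraction wraps to a huge unsigned value and
         * the forward copy is (correctly) chosen.
         */
        if ((uintptr_t)d - (uintptr_t)s < len) {
                while (len--)                   /* copy backwards */
                        d[len] = s[len];
        } else {
                size_t i;

                for (i = 0; i < len; i++)       /* copy forwards */
                        d[i] = s[i];
        }
}

int
main(void)
{
        char buf[] = "abcdefgh";

        sketch_ovbcopy(buf, buf + 2, 6);        /* overlapping, src < dst */
        puts(buf);                              /* prints "ababcdef" */
        return (0);
}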