author     jake <jake@FreeBSD.org>  2003-04-03 18:43:40 +0000
committer  jake <jake@FreeBSD.org>  2003-04-03 18:43:40 +0000
commit     aca72a7a77a602e7e160b91c98fd6d8e40eaf6f7 (patch)
tree       70a23fb5f3c2600b3419c28b73852f077c6a483a /sys
parent     9fc72a0ca0c33bbf04fe0c339088174a153cfaa3 (diff)
download   FreeBSD-src-aca72a7a77a602e7e160b91c98fd6d8e40eaf6f7.zip
           FreeBSD-src-aca72a7a77a602e7e160b91c98fd6d8e40eaf6f7.tar.gz
Add optimized block copy and zero functions using vis instructions, which
can do 64 bytes at a time and don't allocate lines in the L2 cache. These
assume that everything is 64-byte aligned and that there's more than 128
bytes of data (best for whole pages). The block load and store instructions
don't follow normal memory ordering rules and require either a memory
barrier or a move between registers before the data can actually be used.
This implementation rotates the data through 3 of the 4 register sets in
order to avoid memory barriers except for the last 2 blocks.
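
(Not part of the commit: below is a rough C-level model of the pipelining
described above, assuming 64-byte-aligned buffers and a length that is a
whole number of 128-byte block pairs, e.g. a page. The real routine cycles
the data through the VIS register sets %f0-%f14, %f16-%f30 and %f32-%f46;
here plain C buffers stand in for them.)

/*
 * Illustrative C model only, not the committed code.  Each memcpy()
 * into cur/next stands in for a block load (ldda), the memcpy() into
 * staging stands in for the fsrc1 register moves that make a block
 * load's data usable without a memory barrier, and each memcpy() to
 * dst stands in for a block store (stda).  Assumes src, dst and len
 * are 64-byte aligned and len is a multiple of 128.
 */
#include <stddef.h>
#include <string.h>

#define BLK	64

static void
model_block_copy(const char *src, char *dst, size_t len)
{
	char cur[BLK], next[BLK], staging[BLK];

	memcpy(cur, src, BLK);			/* ldda [%o0] %asi, %f0 */
	src += BLK;
	len -= BLK;
	for (;;) {
		memcpy(next, src, BLK);		/* ldda [%o0] %asi, %f16 */
		memcpy(staging, cur, BLK);	/* fsrc1 %f0..%f14 -> %f32..%f46 */
		memcpy(dst, staging, BLK);	/* stda %f32, [%o1] %asi */
		src += BLK;
		dst += BLK;
		len -= BLK;
		if (len == 0)
			break;
		memcpy(cur, src, BLK);		/* ldda [%o0] %asi, %f0 */
		memcpy(staging, next, BLK);	/* fsrc1 %f16..%f30 -> %f32..%f46 */
		memcpy(dst, staging, BLK);	/* stda %f32, [%o1] %asi */
		src += BLK;
		dst += BLK;
		len -= BLK;
	}
	/*
	 * The assembly issues membar #Sync here; explicit barriers are
	 * only needed for the tail of the copy.
	 */
	memcpy(dst, next, BLK);			/* stda %f16, [%o1] %asi */
	/* membar #Sync */
}

The staging copy plays the same role as the fsrc1 moves: data produced by a
block load becomes usable through a register move instead of a membar #Sync,
so the barrier cost is only paid at the very end.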
Diffstat (limited to 'sys')
-rw-r--r--  sys/sparc64/include/cpufunc.h |   3
-rw-r--r--  sys/sparc64/sparc64/support.S | 137
2 files changed, 140 insertions, 0 deletions
diff --git a/sys/sparc64/include/cpufunc.h b/sys/sparc64/include/cpufunc.h
index 86e7ae5..8a1ae4c 100644
--- a/sys/sparc64/include/cpufunc.h
+++ b/sys/sparc64/include/cpufunc.h
@@ -196,6 +196,9 @@ void ascopyfrom(u_long sasi, vm_offset_t src, caddr_t dst, size_t len);
void ascopyto(caddr_t src, u_long dasi, vm_offset_t dst, size_t len);
void aszero(u_long asi, vm_offset_t dst, size_t len);

+void spitfire_block_copy(void *src, void *dst, size_t len);
+void spitfire_block_zero(void *dst, size_t len);
+
/*
* Ultrasparc II doesn't implement popc in hardware. Suck.
*/
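
(Illustration only, not part of the commit: a minimal sketch of how a caller
might use the prototypes added above, dispatching to the VIS routines only
when the constraints stated in the commit message hold and falling back to
the generic kernel routines otherwise. The pagecopy()/pagezero() wrapper
names are made up.)

/*
 * Hypothetical wrappers, for illustration only.  They use the VIS
 * block routines only when src, dst and len satisfy the stated
 * preconditions (64-byte aligned, more than 128 bytes of data) and
 * fall back to bcopy()/bzero() otherwise.
 */
#include <sys/param.h>
#include <sys/systm.h>		/* bcopy(), bzero() */

#include <machine/cpufunc.h>	/* spitfire_block_copy(), spitfire_block_zero() */

static __inline void
pagecopy(void *src, void *dst, size_t len)
{
	if ((((uintptr_t)src | (uintptr_t)dst | len) & 63) == 0 && len > 128)
		spitfire_block_copy(src, dst, len);
	else
		bcopy(src, dst, len);
}

static __inline void
pagezero(void *dst, size_t len)
{
	if ((((uintptr_t)dst | len) & 63) == 0 && len > 128)
		spitfire_block_zero(dst, len);
	else
		bzero(dst, len);
}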
diff --git a/sys/sparc64/sparc64/support.S b/sys/sparc64/sparc64/support.S
index b701f04..c2e68a7 100644
--- a/sys/sparc64/sparc64/support.S
+++ b/sys/sparc64/sparc64/support.S
@@ -527,6 +527,143 @@ ENTRY(fs_fault)
mov -1, %o0
END(fs_fault)

+ .globl fpu_fault_begin
+fpu_fault_begin:
+ nop
+
+/*
+ * void spitfire_block_copy(void *src, void *dst, size_t len)
+ */
+ENTRY(spitfire_block_copy)
+ rdpr %pil, %o3
+ wrpr %g0, PIL_TICK, %pil
+
+ wr %g0, ASI_BLK_S, %asi
+ wr %g0, FPRS_FEF, %fprs
+
+ sub PCB_REG, TF_SIZEOF, %o4
+ ldx [%o4 + TF_FPRS], %o5
+ andcc %o5, FPRS_FEF, %g0
+ bz,a,pt %xcc, 1f
+ nop
+ stda %f0, [PCB_REG + PCB_UFP + (0 * 64)] %asi
+ stda %f16, [PCB_REG + PCB_UFP + (1 * 64)] %asi
+ stda %f32, [PCB_REG + PCB_UFP + (2 * 64)] %asi
+ stda %f48, [PCB_REG + PCB_UFP + (3 * 64)] %asi
+ membar #Sync
+
+ andn %o5, FPRS_FEF, %o5
+ stx %o5, [%o4 + TF_FPRS]
+ ldx [PCB_REG + PCB_FLAGS], %o4
+ or %o4, PCB_FEF, %o4
+ stx %o4, [PCB_REG + PCB_FLAGS]
+
+1: wrpr %o3, 0, %pil
+
+ ldda [%o0] %asi, %f0
+ add %o0, 64, %o0
+ sub %o2, 64, %o2
+
+2: ldda [%o0] %asi, %f16
+ fsrc1 %f0, %f32
+ fsrc1 %f2, %f34
+ fsrc1 %f4, %f36
+ fsrc1 %f6, %f38
+ fsrc1 %f8, %f40
+ fsrc1 %f10, %f42
+ fsrc1 %f12, %f44
+ fsrc1 %f14, %f46
+ stda %f32, [%o1] %asi
+ add %o0, 64, %o0
+ subcc %o2, 64, %o2
+ bz,pn %xcc, 3f
+ add %o1, 64, %o1
+ ldda [%o0] %asi, %f0
+ fsrc1 %f16, %f32
+ fsrc1 %f18, %f34
+ fsrc1 %f20, %f36
+ fsrc1 %f22, %f38
+ fsrc1 %f24, %f40
+ fsrc1 %f26, %f42
+ fsrc1 %f28, %f44
+ fsrc1 %f30, %f46
+ stda %f32, [%o1] %asi
+ add %o0, 64, %o0
+ sub %o2, 64, %o2
+ ba %xcc, 2b
+ add %o1, 64, %o1
+
+3: membar #Sync
+
+ stda %f16, [%o1] %asi
+ membar #Sync
+
+ wr %g0, 0, %fprs
+
+ retl
+ nop
+END(spitfire_block_copy)
+
+/*
+ * void spitfire_block_zero(void *dst, size_t len)
+ */
+ENTRY(spitfire_block_zero)
+ rdpr %pil, %o3
+ wrpr %g0, PIL_TICK, %pil
+
+ wr %g0, ASI_BLK_S, %asi
+ wr %g0, FPRS_FEF, %fprs
+
+ sub PCB_REG, TF_SIZEOF, %o4
+ ldx [%o4 + TF_FPRS], %o5
+ andcc %o5, FPRS_FEF, %g0
+ bz,a,pt %xcc, 1f
+ nop
+ stda %f0, [PCB_REG + PCB_UFP + (0 * 64)] %asi
+ stda %f16, [PCB_REG + PCB_UFP + (1 * 64)] %asi
+ stda %f32, [PCB_REG + PCB_UFP + (2 * 64)] %asi
+ stda %f48, [PCB_REG + PCB_UFP + (3 * 64)] %asi
+ membar #Sync
+
+ andn %o5, FPRS_FEF, %o5
+ stx %o5, [%o4 + TF_FPRS]
+ ldx [PCB_REG + PCB_FLAGS], %o4
+ or %o4, PCB_FEF, %o4
+ stx %o4, [PCB_REG + PCB_FLAGS]
+
+1: wrpr %o3, 0, %pil
+
+ fzero %f0
+ fzero %f2
+ fzero %f4
+ fzero %f6
+ fzero %f8
+ fzero %f10
+ fzero %f12
+ fzero %f14
+
+1: stda %f0, [%o0] %asi
+ stda %f0, [%o0 + 64] %asi
+ stda %f0, [%o0 + 128] %asi
+ stda %f0, [%o0 + 192] %asi
+ sub %o1, 256, %o1
+ brnz %o1, 1b
+ add %o0, 256, %o0
+ membar #Sync
+
+ wr %g0, 0, %fprs
+
+ retl
+ nop
+END(spitfire_block_zero)
+
+ .globl fpu_fault_end
+fpu_fault_end:
+ nop
+
+ .globl fpu_fault_size
+ .set fpu_fault_size, fpu_fault_end - fpu_fault_begin
+
ENTRY(longjmp)
set 1, %g3
movrz %o1, %o1, %g3
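
(Illustration only, not part of the commit: the fpu_fault_begin,
fpu_fault_end and fpu_fault_size symbols added above bracket the new
routines, presumably so that a fault taken while they have the FPU live can
be recognized. A hypothetical range check, with a made-up helper name:)

/*
 * Illustration only: how a trap handler could test whether a faulting
 * program counter lies inside the bracketed region.  The extern
 * symbols correspond to the .globl labels in the assembly above; the
 * helper name is invented for this sketch.
 */
#include <sys/types.h>

extern char fpu_fault_begin[];
extern char fpu_fault_end[];

static __inline int
pc_in_fpu_fault_region(uintptr_t pc)
{

	return (pc >= (uintptr_t)fpu_fault_begin &&
	    pc < (uintptr_t)fpu_fault_end);
}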