summaryrefslogtreecommitdiffstats
path: root/sys/sparc64
diff options
context:
space:
mode:
authormarius <marius@FreeBSD.org>2010-09-15 21:44:31 +0000
committermarius <marius@FreeBSD.org>2010-09-15 21:44:31 +0000
commit6c236115f4b69ebb0a466ee25701d1092d1b900d (patch)
tree62c5f97ba461f8e0ad29a727836ebf6b175438e3 /sys/sparc64
parent51d5293d0c39c6e91cbb9d7ef18f8d109dfaf939 (diff)
downloadFreeBSD-src-6c236115f4b69ebb0a466ee25701d1092d1b900d.zip
FreeBSD-src-6c236115f4b69ebb0a466ee25701d1092d1b900d.tar.gz
Add a VIS-based block copy function for SPARC64 V and later, which
additionally takes advantage of the prefetch cache of these CPUs. Unlike the uncommitted US-III version, which provided no measurable speedup or even resulted in a slight slowdown on certain CPU models compared to using the US-I version with these, the SPARC64 version actually results in a slight improvement.
Diffstat (limited to 'sys/sparc64')
-rw-r--r--sys/sparc64/include/md_var.h2
-rw-r--r--sys/sparc64/sparc64/machdep.c7
-rw-r--r--sys/sparc64/sparc64/support.S113
3 files changed, 121 insertions, 1 deletions
diff --git a/sys/sparc64/include/md_var.h b/sys/sparc64/include/md_var.h
index 8f064e7..8503b43 100644
--- a/sys/sparc64/include/md_var.h
+++ b/sys/sparc64/include/md_var.h
@@ -58,6 +58,8 @@ struct md_utrap *utrap_hold(struct md_utrap *ut);
cpu_block_copy_t spitfire_block_copy;
cpu_block_zero_t spitfire_block_zero;
+cpu_block_copy_t zeus_block_copy; /* VIS-based block copy for SPARC64 V and later */
+cpu_block_zero_t zeus_block_zero; /* alternate entry name for spitfire_block_zero in support.S */
extern cpu_block_copy_t *cpu_block_copy;
extern cpu_block_zero_t *cpu_block_zero;
diff --git a/sys/sparc64/sparc64/machdep.c b/sys/sparc64/sparc64/machdep.c
index 193da2f..a4ed381 100644
--- a/sys/sparc64/sparc64/machdep.c
+++ b/sys/sparc64/sparc64/machdep.c
@@ -495,7 +495,6 @@ sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec)
if (cpu_use_vis) {
switch (cpu_impl) {
case CPU_IMPL_SPARC64:
- case CPU_IMPL_SPARC64V:
case CPU_IMPL_ULTRASPARCI:
case CPU_IMPL_ULTRASPARCII:
case CPU_IMPL_ULTRASPARCIIi:
@@ -509,6 +508,12 @@ sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec)
cpu_block_copy = spitfire_block_copy;
cpu_block_zero = spitfire_block_zero;
break;
+ case CPU_IMPL_SPARC64V:
+ /*
+ * SPARC64 V and later get the VIS block copy routine that
+ * takes advantage of the prefetch cache of these CPUs.
+ * Do not reassign the spitfire routines afterwards; doing so
+ * would silently discard the selection made here.
+ */
+ cpu_block_copy = zeus_block_copy;
+ cpu_block_zero = zeus_block_zero;
+ break;
}
}
diff --git a/sys/sparc64/sparc64/support.S b/sys/sparc64/sparc64/support.S
index d4ca5d3..70241d8 100644
--- a/sys/sparc64/sparc64/support.S
+++ b/sys/sparc64/sparc64/support.S
@@ -661,8 +661,121 @@ ENTRY(spitfire_block_copy)
END(spitfire_block_copy)
/*
+ * void zeus_block_copy(void *src, void *dst, size_t len)
+ */
+ENTRY(zeus_block_copy) /* %o0 = src, %o1 = dst, %o2 = len; data moves in 64-byte blocks */
+ prefetch [%o0 + (0 * 64)], 0 /* start fetching the first 64-byte source block */
+
+ rdpr %pil, %o3 /* remember the current PIL in %o3 */
+ wrpr %g0, PIL_TICK, %pil /* raise PIL to PIL_TICK while the FP state is dealt with */
+
+ wr %g0, ASI_BLK_S, %asi /* the %asi-relative stda instructions below use ASI_BLK_S */
+ wr %g0, FPRS_FEF, %fprs /* set FPRS_FEF in %fprs before touching FP registers */
+
+ sub PCB_REG, TF_SIZEOF, %o4 /* %o4 = PCB_REG - TF_SIZEOF (presumably the trapframe; confirm) */
+ ldx [%o4 + TF_FPRS], %o5 /* %o5 = FPRS value stored at TF_FPRS */
+ andcc %o5, FPRS_FEF, %g0 /* is FPRS_FEF set in that stored value? */
+ bz,a,pt %xcc, 1f /* no: skip saving the FP registers */
+ nop /* delay slot */
+ stda %f0, [PCB_REG + PCB_UFP + (0 * 64)] %asi /* save the FP registers to the PCB_UFP area, ... */
+ stda %f16, [PCB_REG + PCB_UFP + (1 * 64)] %asi /* ... one 64-byte block store per group ... */
+ stda %f32, [PCB_REG + PCB_UFP + (2 * 64)] %asi /* ... (%f0, %f16, %f32, %f48) ... */
+ stda %f48, [PCB_REG + PCB_UFP + (3 * 64)] %asi /* ... at consecutive 64-byte offsets */
+ membar #Sync /* synchronize after the block stores */
+
+ andn %o5, FPRS_FEF, %o5 /* clear FPRS_FEF in the stored FPRS value ... */
+ stx %o5, [%o4 + TF_FPRS] /* ... and write it back to TF_FPRS */
+ ldx [PCB_REG + PCB_FLAGS], %o4 /* %o4 is reused here for the PCB flags word */
+ or %o4, PCB_FEF, %o4 /* set PCB_FEF (presumably marks FP state as saved in the PCB; confirm) */
+ stx %o4, [PCB_REG + PCB_FLAGS]
+
+1: wrpr %o3, 0, %pil /* restore the PIL remembered in %o3 */
+
+ ldd [%o0 + (0 * 8)], %f0 /* prologue: load the first 64 bytes of src into %f0-%f14 ... */
+ prefetch [%o0 + (1 * 64)], 0 /* ... interleaved with prefetches of the following source blocks */
+ ldd [%o0 + (1 * 8)], %f2
+ prefetch [%o0 + (2 * 64)], 0
+ fmovd %f0, %f32 /* stage each loaded doubleword into %f32-%f46 for the block store */
+ ldd [%o0 + (2 * 8)], %f4
+ prefetch [%o0 + (3 * 64)], 0
+ fmovd %f2, %f34
+ ldd [%o0 + (3 * 8)], %f6
+ prefetch [%o0 + (4 * 64)], 1 /* from here the prefetches use function code 1 and reach further ahead */
+ fmovd %f4, %f36
+ ldd [%o0 + (4 * 8)], %f8
+ prefetch [%o0 + (8 * 64)], 1
+ fmovd %f6, %f38
+ ldd [%o0 + (5 * 8)], %f10
+ prefetch [%o0 + (12 * 64)], 1
+ fmovd %f8, %f40
+ ldd [%o0 + (6 * 8)], %f12
+ prefetch [%o0 + (16 * 64)], 1
+ fmovd %f10, %f42
+ ldd [%o0 + (7 * 8)], %f14 /* %f12/%f14 are staged at the top of the loop */
+ ldd [%o0 + (8 * 8)], %f0 /* first doubleword of the next source block */
+ sub %o2, 64, %o2 /* one 64-byte block of len consumed */
+ add %o0, 64, %o0 /* advance src by one block */
+ prefetch [%o0 + (19 * 64)], 1
+ ba,pt %xcc, 2f /* enter the main loop */
+ prefetch [%o0 + (23 * 64)], 1 /* delay slot */
+ .align 32 /* align the loop head that follows */

+2: ldd [%o0 + (1 * 8)], %f2 /* main loop: load doublewords 1-7 of the current block (%f0 is already loaded) */
+ fmovd %f12, %f44 /* finish staging the previous block: %f12/%f14 -> %f44/%f46 */
+ ldd [%o0 + (2 * 8)], %f4
+ fmovd %f14, %f46
+ stda %f32, [%o1] %asi /* block-store the staged 64 bytes (%f32-%f46) to dst */
+ ldd [%o0 + (3 * 8)], %f6
+ fmovd %f0, %f32 /* start staging the block currently being loaded */
+ ldd [%o0 + (4 * 8)], %f8
+ fmovd %f2, %f34
+ ldd [%o0 + (5 * 8)], %f10
+ fmovd %f4, %f36
+ ldd [%o0 + (6 * 8)], %f12
+ fmovd %f6, %f38
+ ldd [%o0 + (7 * 8)], %f14
+ fmovd %f8, %f40
+ ldd [%o0 + (8 * 8)], %f0 /* first doubleword of the following block */
+ fmovd %f10, %f42
+ sub %o2, 64, %o2 /* another 64 bytes of len consumed */
+ prefetch [%o0 + (3 * 64)], 0
+ add %o1, 64, %o1 /* advance dst by one block */
+ prefetch [%o0 + (24 * 64)], 1
+ add %o0, 64, %o0 /* advance src by one block */
+ cmp %o2, 64 + 8 /* keep looping while %o2 > 64 + 8 (unsigned compare) */
+ bgu,pt %xcc, 2b
+ prefetch [%o0 + (12 * 64)], 1 /* delay slot */
+ ldd [%o0 + (1 * 8)], %f2 /* tail: same load/stage/store pattern without prefetches, and ... */
+ fsrc1 %f12, %f44 /* ... using fsrc1 where the loop uses fmovd */
+ ldd [%o0 + (2 * 8)], %f4
+ fsrc1 %f14, %f46
+ stda %f32, [%o1] %asi /* store the second-to-last block */
+ ldd [%o0 + (3 * 8)], %f6
+ fsrc1 %f0, %f32
+ ldd [%o0 + (4 * 8)], %f8
+ fsrc1 %f2, %f34
+ ldd [%o0 + (5 * 8)], %f10
+ fsrc1 %f4, %f36
+ ldd [%o0 + (6 * 8)], %f12
+ fsrc1 %f6, %f38
+ ldd [%o0 + (7 * 8)], %f14 /* last load: no doubleword 8 is fetched in the tail */
+ fsrc1 %f8, %f40
+ add %o1, 64, %o1 /* advance dst to the final block */
+ fsrc1 %f10, %f42
+ fsrc1 %f12, %f44 /* stage the last two doublewords of the final block */
+ fsrc1 %f14, %f46
+ stda %f32, [%o1] %asi /* store the final block */
+ membar #Sync /* synchronize after the block stores */
+
+ retl /* return to the caller */
+ wr %g0, 0, %fprs /* delay slot: write 0 to %fprs, clearing FPRS_FEF again */
+END(zeus_block_copy)
+
+/*
* void spitfire_block_zero(void *dst, size_t len)
+ * void zeus_block_zero(void *dst, size_t len)
*/
+ALTENTRY(zeus_block_zero)
ENTRY(spitfire_block_zero)
rdpr %pil, %o3
wrpr %g0, PIL_TICK, %pil
OpenPOWER on IntegriCloud