2 files changed, 474 insertions, 232 deletions
diff --git a/sys/sparc64/sparc64/support.S b/sys/sparc64/sparc64/support.S
index 41469f5..bbd2d13 100644
--- a/sys/sparc64/sparc64/support.S
+++ b/sys/sparc64/sparc64/support.S
@@ -33,62 +33,184 @@
 
 #include "assym.s"
 
-#define	E
+#define	E	/* empty */
 
+/*
+ * Generate load and store instructions for the corresponding width and asi
+ * (or not).  Note that we want to evaluate the macro args before
+ * concatenating, so that E really turns into nothing.
+ */
 #define	_LD(w, a)	ld ## w ## a
 #define	_ST(w, a)	st ## w ## a
 
 #define	LD(w, a)	_LD(w, a)
 #define	ST(w, a)	_ST(w, a)
 
-#define	_BCOPY(src, dst, len, sa, sasi, da, dasi) \
-	brz,pn	len, 2f ; \
-	 mov	len, %o3 ; \
-1:	LD(ub, sa) [src] sasi, %o4 ; \
-	ST(b, da) %o4, [dst] dasi ; \
-	dec	%o3 ; \
-	inc	src ; \
-	brnz,pt	%o3, 1b ; \
-	 inc	dst ; \
-2:
-
-#define	BCOPY(src, dst, len) \
-	_BCOPY(src, dst, len, E, E, E, E)
-
-#define	COPYIN(uaddr, kaddr, len) \
-	wr	%g0, ASI_AIUP, %asi ; \
-	_BCOPY(uaddr, kaddr, len, a, %asi, E, E)
-
-#define	COPYOUT(kaddr, uaddr, len) \
-	wr	%g0, ASI_AIUP, %asi ; \
-	_BCOPY(kaddr, uaddr, len, E, E, a, %asi)
+/*
+ * Common code for copy routines.
+ *
+ * We use large macros to generate functions for each of the copy routines.
+ * This allows the load and store instructions to be generated for the right
+ * operation, asi or not.  It is possible to write an asi independent function
+ * but this would require 2 expensive wrs in the main loop to switch %asi.
+ * It would also screw up profiling (if we ever get it), but may save some I$.
+ * We assume that either one of dasi and sasi is empty, or that they are both
+ * the same (empty or non-empty).  It is up to the caller to set %asi.
+ */
 
+/*
+ * ASI independent implementation of copystr(9) for copyinstr()/copystr().
+ * Copies up to len bytes, stopping after a NUL.  %g1 is 0 if the NUL was
+ * copied, or ENAMETOOLONG if len ran out first (including len == 0).  If
+ * done is non-NULL the byte count is stored there.  Clobbers %g1 and %g2.
+ */
 #define	_COPYSTR(src, dst, len, done, sa, sasi, da, dasi) \
-	clr	%o4 ; \
-	clr	%o5 ; \
-1:	LD(ub, sa) [src] sasi, %g1 ; \
+	brz	len, 2f ; /* len == 0: nothing fits, report ENAMETOOLONG */ \
+	 mov	src, %g2 ; \
+1:	deccc	1, len ; \
+	bl,a,pn	%xcc, 2f ; /* len exhausted before a NUL was copied */ \
+	 nop ; \
+	LD(ub, sa) [src] sasi, %g1 ; \
 	ST(b, da) %g1, [dst] dasi ; \
-	brz,pn	%g1, 2f ; \
-	 inc	%o4 ; \
-	dec	len ; \
-	inc	src ; \
-	brgz,pt	len, 1b ; \
+	brz,pn	%g1, 3f ; \
+	 inc	src ; \
+	b	%xcc, 1b ; \
 	 inc	dst ; \
-	mov	ENAMETOOLONG, %o5 ; \
-2:	brnz,a	done, 3f ; \
-	 stx	%o4, [done] ; \
-3:
+2:	mov	ENAMETOOLONG, %g1 ; \
+3:	sub	src, %g2, %g2 ; \
+	brnz,a	done, 4f ; \
+	 stx	%g2, [done] ; \
+4:
 
-#define	COPYSTR(dst, src, len, done) \
-	_COPYSTR(dst, src, len, done, E, E, E, E)
+/*
+ * ASI independent implementation of memset(3).
+ * Used to implement bzero(), memset() and physzero().
+ *
+ * The low byte of pat is replicated to fill 64 bits (skipped when zero).
+ * Bytes are stored until dst is 8-byte aligned, then 32 and 8 bytes at a
+ * time, then the tail bytewise.  Modifies dst, pat and len; clobbers %g1.
+ * It has yet to be determined how much unrolling is beneficial.
+ * Could also read and compare before writing to minimize snoop traffic.
+ * XXX bzero() should be implemented as
+ * #define bzero(dst, len) (void)memset((dst), 0, (len))
+ * if at all.
+ */
+#define	_MEMSET(dst, pat, len, da, dasi) \
+	brlez,pn len, 5f ; /* nothing to do if len <= 0 */ \
+	 and	pat, 0xff, pat ; \
+	brz,pt	pat, 1f ; /* a zero pattern needs no replication */ \
+	 sllx	pat, 8, %g1 ; \
+	or	pat, %g1, pat ; \
+	sllx	pat, 16, %g1 ; \
+	or	pat, %g1, pat ; \
+	sllx	pat, 32, %g1 ; \
+	or	pat, %g1, pat ; \
+	.align	16 ; \
+1:	deccc	1, len ; /* 1: store bytes until dst is 8-byte aligned */ \
+	bl,pn	%xcc, 5f ; \
+	 btst	7, dst ; \
+	bz,a,pt	%xcc, 2f ; \
+	 inc	1, len ; /* aligned: undo the decrement above */ \
+	ST(b, da) pat, [dst] dasi ; \
+	b	%xcc, 1b ; \
+	 inc	dst ; \
+	.align	16 ; \
+2:	deccc	32, len ; /* 2: 32 bytes per iteration */ \
+	bl,a,pn	%xcc, 3f ; \
+	 inc	32, len ; /* fewer than 32 left: undo the decrement */ \
+	ST(x, da) pat, [dst] dasi ; \
+	ST(x, da) pat, [dst + 8] dasi ; \
+	ST(x, da) pat, [dst + 16] dasi ; \
+	ST(x, da) pat, [dst + 24] dasi ; \
+	b	%xcc, 2b ; \
+	 inc	32, dst ; \
+	.align	16 ; \
+3:	deccc	8, len ; /* 3: 8 bytes per iteration */ \
+	bl,a,pn	%xcc, 4f ; \
+	 inc	8, len ; /* fewer than 8 left: undo the decrement */ \
+	ST(x, da) pat, [dst] dasi ; \
+	b	%xcc, 3b ; \
+	 inc	8, dst ; \
+	.align	16 ; \
+4:	deccc	1, len ; /* 4: remaining bytes one at a time */ \
+	bl,a,pn	%xcc, 5f ; \
+	 nop ; \
+	ST(b, da) pat, [dst] dasi ; \
+	b	%xcc, 4b ; \
+	 inc	1, dst ; \
+5:
 
-#define	COPYINSTR(uaddr, kaddr, len, done) \
-	wr	%g0, ASI_AIUP, %asi ; \
-	_COPYSTR(uaddr, kaddr, len, done, a, %asi, E, E)
+/*
+ * ASI independent implementation of memcpy(3).
+ * Used to implement bcopy(), copyin(), copyout(), memcpy(), and physcopy().
+ *
+ * Transfer bytes until dst is 8-byte aligned.  If src is then also 8-byte
+ * aligned, transfer 32 and then 8 bytes at a time, otherwise finish with
+ * bytes.  The unaligned case could be optimized, but it is expected that
+ * this is the uncommon case and of questionable value.  The code to do so
+ * is also rather large and ugly.  Modifies dst, src and len and clobbers
+ * %g1-%g4.  It has yet to be determined how much unrolling is beneficial.
+ *
+ * XXX bcopy() must also check for overlap.  This is stupid.
+ * XXX bcopy() should be implemented as
+ * #define bcopy(src, dst, len) (void)memcpy((dst), (src), (len))
+ * if at all.
+ */
+#define	_MEMCPY(dst, src, len, da, dasi, sa, sasi) \
+1:	deccc	1, len ; /* 1: copy bytes until dst is 8-byte aligned */ \
+	bl,pn	%xcc, 6f ; \
+	 btst	7, dst ; \
+	bz,a,pt	%xcc, 2f ; \
+	 inc	1, len ; /* aligned: undo the decrement above */ \
+	LD(ub, sa) [src] sasi, %g1 ; \
+	ST(b, da) %g1, [dst] dasi ; \
+	inc	1, src ; \
+	b	%xcc, 1b ; \
+	 inc	1, dst ; \
+	.align	16 ; \
+2:	btst	7, src ; /* 2: is src 8-byte aligned as well? */ \
+	bz,a,pt	%xcc, 3f ; \
+	 nop ; \
+	b,a	%xcc, 5f ; /* no: finish with the byte loop */ \
+	.align	16 ; \
+3:	deccc	32, len ; /* 3: 32 bytes per iteration */ \
+	bl,a,pn	%xcc, 4f ; \
+	 inc	32, len ; /* fewer than 32 left: undo the decrement */ \
+	LD(x, sa) [src] sasi, %g1 ; \
+	LD(x, sa) [src + 8] sasi, %g2 ; \
+	LD(x, sa) [src + 16] sasi, %g3 ; \
+	LD(x, sa) [src + 24] sasi, %g4 ; \
+	ST(x, da) %g1, [dst] dasi ; \
+	ST(x, da) %g2, [dst + 8] dasi ; \
+	ST(x, da) %g3, [dst + 16] dasi ; \
+	ST(x, da) %g4, [dst + 24] dasi ; \
+	inc	32, src ; \
+	b	%xcc, 3b ; \
+	 inc	32, dst ; \
+	.align	16 ; \
+4:	deccc	8, len ; /* 4: 8 bytes per iteration */ \
+	bl,a,pn	%xcc, 5f ; \
+	 inc	8, len ; /* fewer than 8 left: undo the decrement */ \
+	LD(x, sa) [src] sasi, %g1 ; \
+	ST(x, da) %g1, [dst] dasi ; \
+	inc	8, src ; \
+	b	%xcc, 4b ; \
+	 inc	8, dst ; \
+	.align	16 ; \
+5:	deccc	1, len ; /* 5: remaining bytes one at a time */ \
+	bl,a,pn	%xcc, 6f ; \
+	 nop ; \
+	LD(ub, sa) [src] sasi, %g1 ; \
+	ST(b, da) %g1, [dst] dasi ; \
+	inc	src ; \
+	b	%xcc, 5b ; \
+	 inc	dst ; \
+6:
 
 #define	CATCH_SETUP(label) \
 	setx	label, %g2, %g1 ; \
-	ldx	[PCPU(CURPCB)], %g6 ; \
+	ldx	[PCPU(CURTHREAD)], %g6 ; /* %g6 = PCPU CURTHREAD field */ \
+	ldx	[%g6 + TD_PCB], %g6 ; /* %g6 = TD_PCB field of that object */ \
 	stx	%g1, [%g6 + PCB_ONFAULT] ;
 #define	CATCH_END() \
@@ -119,7 +241,7 @@
 	SU_ALIGNED(storer, label)
 
 /*
- * void bcmp(void *b, size_t len)
+ * int bcmp(const void *b1, const void *b2, size_t len)
  */
 ENTRY(bcmp)
 	brz,pn	%o2, 2f
@@ -127,7 +249,7 @@ ENTRY(bcmp)
 1:	ldub	[%o0 + %o3], %o4
 	ldub	[%o1 + %o3], %o5
 	cmp	%o4, %o5
-	bne,pn	%xcc, 1f
+	bne,pn	%xcc, 2f
 	 inc	%o3
 	deccc	%o2
 	bne,pt	%xcc, 1b
@@ -139,46 +261,90 @@ END(bcmp)
 /*
  * void bcopy(const void *src, void *dst, size_t len)
  */
+ENTRY(ovbcopy)
 ENTRY(bcopy)
-	BCOPY(%o0, %o1, %o2)
+	/*
+	 * Copy backwards if dst lies within [src, src + len), else forwards.
+	 */
+	sub	%o1, %o0, %g1			/* %g1 = dst - src */
+	cmp	%g1, %o2
+	bgeu,a,pt %xcc, 3f			/* unsigned (dst - src) >= len */
+	 nop
+
+	/*
+	 * Copy backwards, one byte at a time, from the end of both buffers.
+	 */
+	add	%o0, %o2, %o0
+	add	%o1, %o2, %o1
+1:	deccc	1, %o2
+	bl,a,pn	%xcc, 2f			/* len went negative: done */
+	 nop
+	dec	1, %o0
+	ldub	[%o0], %g1
+	dec	1, %o1
+	b	%xcc, 1b
+	 stb	%g1, [%o1]			/* delay slot: store the byte */
+2:	retl
+	 nop
+
+	/*
+	 * Do the fast version.
+	 */
+3:	_MEMCPY(%o1, %o0, %o2, E, E, E, E)
 	retl
 	 nop
 END(bcopy)
 
 /*
- * void ovbcopy(const void *src, void *dst, size_t len)
- * XXX handle overlap...
+ * void bzero(void *b, size_t len)
  */
-ENTRY(ovbcopy)
-	BCOPY(%o0, %o1, %o2)
+ENTRY(bzero)
+	_MEMSET(%o0, %g0, %o1, E, E)		/* dst = %o0, pat = %g0 (zero), len = %o1 */
 	retl
 	 nop
-END(ovbcopy)
+END(bzero)
 
 /*
- * void bzero(void *b, size_t len)
+ * void physzero(vm_offset_t pa, size_t len)
  */
-ENTRY(bzero)
-	brz,pn	%o1, 1f
+ENTRY(physzero)
+	wr	%g0, ASI_PHYS_USE_EC, %asi	/* the stores below go through this ASI */
+	_MEMSET(%o0, %g0, %o1, a, %asi)		/* dst = %o0, pat = %g0 (zero), len = %o1 */
+	retl
 	 nop
-1:	deccc	%o1
-	stb	%g0, [%o0]
-	bne,pt	%xcc, 1b
-	 inc	%o0
-2:	retl
+END(physzero)
+
+/*
+ * void physcopy(vm_offset_t src, vm_offset_t dst, size_t len)
+ */
+ENTRY(physcopy)
+	wr	%g0, ASI_PHYS_USE_EC, %asi	/* loads and stores below use this ASI */
+	_MEMCPY(%o1, %o0, %o2, a, %asi, a, %asi) /* dst = %o1, src = %o0, len = %o2 */
+	retl
 	 nop
-END(bzero)
+END(physcopy)
 
 /*
  * void *memcpy(void *dst, const void *src, size_t len)
  */
 ENTRY(memcpy)
-	BCOPY(%o1, %o0, %o2)
+	mov	%o0, %o3			/* copy through %o3; %o0 is left untouched */
+	_MEMCPY(%o3, %o1, %o2, E, E, E, E)	/* dst = %o3, src = %o1, len = %o2 */
 	retl
 	 nop
 END(memcpy)
 
 /*
+ * void *memset(void *b, int c, size_t len)
+ */
+ENTRY(memset)
+	mov	%o0, %o3			/* fill through %o3; %o0 is left untouched */
+	_MEMSET(%o3, %o1, %o2, E, E)		/* dst = %o3, pat = %o1, len = %o2 */
+	retl
+	 nop
+END(memset)
+
+/*
  * int copyin(const void *uaddr, void *kaddr, size_t len)
  */
 ENTRY(copyin)
@@ -191,7 +357,8 @@ ENTRY(copyin)
 	stx	%o2, [%o3 + KTR_PARM3]
 9:
 #endif
-	COPYIN(%o0, %o1, %o2)
+	wr	%g0, ASI_AIUP, %asi
+	_MEMCPY(%o1, %o0, %o2, E, E, a, %asi)
 	CATCH_END()
 	retl
 	 clr	%o0
@@ -211,10 +378,11 @@ ENTRY(copyinstr)
 	stx	%o3, [%g1 + KTR_PARM4]
 9:
 #endif
-	COPYINSTR(%o0, %o1, %o2, %o3)
+	wr	%g0, ASI_AIUP, %asi
+	_COPYSTR(%o0, %o1, %o2, %o3, a, %asi, E, E)
 	CATCH_END()
 	retl
-	 mov	%o5, %o0
+	 mov	%g1, %o0
 END(copyinstr)
 
 /*
@@ -230,7 +398,8 @@ ENTRY(copyout)
 	stx	%o2, [%o3 + KTR_PARM3]
 9:
 #endif
-	COPYOUT(%o0, %o1, %o2)
+	wr	%g0, ASI_AIUP, %asi
+	_MEMCPY(%o1, %o0, %o2, a, %asi, E, E)
 	CATCH_END()
 	retl
 	 clr	%o0
@@ -250,9 +419,9 @@ END(copyout)
  * int copystr(const void *src, void *dst, size_t len, size_t *done)
  */
 ENTRY(copystr)
-	COPYSTR(%o0, %o1, %o2, %o3)
+	_COPYSTR(%o0, %o1, %o2, %o3, E, E, E, E)	/* src, dst, len, done; plain ld/st */
 	retl
-	 mov	%o5, %o0
+	 mov	%g1, %o0			/* return the status _COPYSTR left in %g1 */
 END(copystr)
 
 /*
@@ -325,7 +494,6 @@ ENTRY(fsbail)
 .Lfsalign:
 	retl
 	 mov	-1, %o0
-END(fsbail)
 
 ENTRY(longjmp)
 	set	1, %g3
@@ -355,64 +523,17 @@ ENTRY(setjmp)
 END(setjmp)
 
 /*
- * Temporary stack for calling into the firmware. We need to setup one, because
- * the MMU mapping for our stack page may be lost. When the firmware tries to
- * spill the last window (the others are flushed before), this results in an
- * DMMU miss trap, which is fatal with the firmware trap handlers installed.
- * Additionally, it seems that the firmware does not immediately switch to an
- * own stack (or maybe never?), therefore more space needs to be reserved.
- * I hope this is sufficient now.
- */
-	.align	4
-DATA(ofwstack)
-	.rept	CCFSZ * 8
-	.byte	0
-	.endr
-ofwstack_last:
-	.rept	CCFSZ
-	.byte	0
-	.endr
-END(ofwstack)
-
-/*
  * void openfirmware(cell_t args[])
  */
 ENTRY(openfirmware)
-	/*
-	 * Disable interrupts. The firmware should not deal with our interrupts
-	 * anyway, and the temporary stack is not large enough to hold the stack
-	 * footprint of the interrrupt handling.
-	 */
-	rdpr	%pstate, %o3
-	andn	%o3, PSTATE_IE, %o1
-	wrpr	%o1, 0, %pstate
-	setx	ofwstack_last - SPOFF, %o1, %o2
-	save	%o2, 0, %sp
-	flushw
-	rdpr	%tl, %l1
-	rdpr	%tba, %l2
-	mov	AA_DMMU_PCXR, %l3
-	ldxa	[%l3] ASI_DMMU, %l4
-	stxa	%g0, [%l3] ASI_DMMU
-	membar	#Sync
-	flush	%sp
-	setx	ofw_tba, %l7, %l5
-	ldx	[%l5], %l5
+	save	%sp, -CCFSZ, %sp		/* new register window, CCFSZ-byte frame */
 	setx	ofw_vec, %l7, %l6
 	ldx	[%l6], %l6
 	rdpr	%pil, %l7
-	wrpr	%g0, 14, %pil
-	wrpr	%l5, 0, %tba
-	wrpr	%g0, 0, %tl
+	wrpr	%g0, PIL_TICK, %pil		/* set %pil to PIL_TICK; old value is in %l7 */
 	call	%l6
 	 mov	%i0, %o0
-	wrpr	%l1, 0, %tl
-	wrpr	%l2, 0, %tba
-	stxa	%l4, [%l3] ASI_DMMU
 	wrpr	%l7, 0, %pil
-	membar	#Sync
-	flush	%sp
-	restore
-	retl
-	 wrpr	%o3, 0, %pstate
+	ret
+	 restore %o0, %g0, %o0			/* hand %o0 from the call back to our caller */
 END(openfirmware)
diff --git a/sys/sparc64/sparc64/support.s b/sys/sparc64/sparc64/support.s
index 41469f5..bbd2d13 100644
--- a/sys/sparc64/sparc64/support.s
+++ b/sys/sparc64/sparc64/support.s
@@ -33,62 +33,184 @@
 
 #include "assym.s"
 
-#define	E
+#define	E	/* empty */
 
+/*
+ * Generate load and store instructions for the corresponding width and asi
+ * (or not).  Note that we want to evaluate the macro args before
+ * concatenating, so that E really turns into nothing.
+ */
 #define	_LD(w, a)	ld ## w ## a
 #define	_ST(w, a)	st ## w ## a
 
 #define	LD(w, a)	_LD(w, a)
 #define	ST(w, a)	_ST(w, a)
 
-#define	_BCOPY(src, dst, len, sa, sasi, da, dasi) \
-	brz,pn	len, 2f ; \
-	 mov	len, %o3 ; \
-1:	LD(ub, sa) [src] sasi, %o4 ; \
-	ST(b, da) %o4, [dst] dasi ; \
-	dec	%o3 ; \
-	inc	src ; \
-	brnz,pt	%o3, 1b ; \
-	 inc	dst ; \
-2:
-
-#define	BCOPY(src, dst, len) \
-	_BCOPY(src, dst, len, E, E, E, E)
-
-#define	COPYIN(uaddr, kaddr, len) \
-	wr	%g0, ASI_AIUP, %asi ; \
-	_BCOPY(uaddr, kaddr, len, a, %asi, E, E)
-
-#define	COPYOUT(kaddr, uaddr, len) \
-	wr	%g0, ASI_AIUP, %asi ; \
-	_BCOPY(kaddr, uaddr, len, E, E, a, %asi)
+/*
+ * Common code for copy routines.
+ *
+ * We use large macros to generate functions for each of the copy routines.
+ * This allows the load and store instructions to be generated for the right
+ * operation, asi or not.  It is possible to write an asi independent function
+ * but this would require 2 expensive wrs in the main loop to switch %asi.
+ * It would also screw up profiling (if we ever get it), but may save some I$.
+ * We assume that either one of dasi and sasi is empty, or that they are both
+ * the same (empty or non-empty).  It is up to the caller to set %asi.
+ */
 
+/*
+ * ASI independent implementation of copystr(9) for copyinstr()/copystr().
+ * Copies up to len bytes, stopping after a NUL.  %g1 is 0 if the NUL was
+ * copied, or ENAMETOOLONG if len ran out first (including len == 0).  If
+ * done is non-NULL the byte count is stored there.  Clobbers %g1 and %g2.
+ */
 #define	_COPYSTR(src, dst, len, done, sa, sasi, da, dasi) \
-	clr	%o4 ; \
-	clr	%o5 ; \
-1:	LD(ub, sa) [src] sasi, %g1 ; \
+	brz	len, 2f ; /* len == 0: nothing fits, report ENAMETOOLONG */ \
+	 mov	src, %g2 ; \
+1:	deccc	1, len ; \
+	bl,a,pn	%xcc, 2f ; /* len exhausted before a NUL was copied */ \
+	 nop ; \
+	LD(ub, sa) [src] sasi, %g1 ; \
 	ST(b, da) %g1, [dst] dasi ; \
-	brz,pn	%g1, 2f ; \
-	 inc	%o4 ; \
-	dec	len ; \
-	inc	src ; \
-	brgz,pt	len, 1b ; \
+	brz,pn	%g1, 3f ; \
+	 inc	src ; \
+	b	%xcc, 1b ; \
 	 inc	dst ; \
-	mov	ENAMETOOLONG, %o5 ; \
-2:	brnz,a	done, 3f ; \
-	 stx	%o4, [done] ; \
-3:
+2:	mov	ENAMETOOLONG, %g1 ; \
+3:	sub	src, %g2, %g2 ; \
+	brnz,a	done, 4f ; \
+	 stx	%g2, [done] ; \
+4:
 
-#define	COPYSTR(dst, src, len, done) \
-	_COPYSTR(dst, src, len, done, E, E, E, E)
+/*
+ * ASI independent implementation of memset(3).
+ * Used to implement bzero(), memset() and physzero().
+ *
+ * The low byte of pat is replicated to fill 64 bits (skipped when zero).
+ * Bytes are stored until dst is 8-byte aligned, then 32 and 8 bytes at a
+ * time, then the tail bytewise.  Modifies dst, pat and len; clobbers %g1.
+ * It has yet to be determined how much unrolling is beneficial.
+ * Could also read and compare before writing to minimize snoop traffic.
+ * XXX bzero() should be implemented as
+ * #define bzero(dst, len) (void)memset((dst), 0, (len))
+ * if at all.
+ */
+#define	_MEMSET(dst, pat, len, da, dasi) \
+	brlez,pn len, 5f ; /* nothing to do if len <= 0 */ \
+	 and	pat, 0xff, pat ; \
+	brz,pt	pat, 1f ; /* a zero pattern needs no replication */ \
+	 sllx	pat, 8, %g1 ; \
+	or	pat, %g1, pat ; \
+	sllx	pat, 16, %g1 ; \
+	or	pat, %g1, pat ; \
+	sllx	pat, 32, %g1 ; \
+	or	pat, %g1, pat ; \
+	.align	16 ; \
+1:	deccc	1, len ; /* 1: store bytes until dst is 8-byte aligned */ \
+	bl,pn	%xcc, 5f ; \
+	 btst	7, dst ; \
+	bz,a,pt	%xcc, 2f ; \
+	 inc	1, len ; /* aligned: undo the decrement above */ \
+	ST(b, da) pat, [dst] dasi ; \
+	b	%xcc, 1b ; \
+	 inc	dst ; \
+	.align	16 ; \
+2:	deccc	32, len ; /* 2: 32 bytes per iteration */ \
+	bl,a,pn	%xcc, 3f ; \
+	 inc	32, len ; /* fewer than 32 left: undo the decrement */ \
+	ST(x, da) pat, [dst] dasi ; \
+	ST(x, da) pat, [dst + 8] dasi ; \
+	ST(x, da) pat, [dst + 16] dasi ; \
+	ST(x, da) pat, [dst + 24] dasi ; \
+	b	%xcc, 2b ; \
+	 inc	32, dst ; \
+	.align	16 ; \
+3:	deccc	8, len ; /* 3: 8 bytes per iteration */ \
+	bl,a,pn	%xcc, 4f ; \
+	 inc	8, len ; /* fewer than 8 left: undo the decrement */ \
+	ST(x, da) pat, [dst] dasi ; \
+	b	%xcc, 3b ; \
+	 inc	8, dst ; \
+	.align	16 ; \
+4:	deccc	1, len ; /* 4: remaining bytes one at a time */ \
+	bl,a,pn	%xcc, 5f ; \
+	 nop ; \
+	ST(b, da) pat, [dst] dasi ; \
+	b	%xcc, 4b ; \
+	 inc	1, dst ; \
+5:
 
-#define	COPYINSTR(uaddr, kaddr, len, done) \
-	wr	%g0, ASI_AIUP, %asi ; \
-	_COPYSTR(uaddr, kaddr, len, done, a, %asi, E, E)
+/*
+ * ASI independent implementation of memcpy(3).
+ * Used to implement bcopy(), copyin(), copyout(), memcpy(), and physcopy().
+ *
+ * Transfer bytes until dst is 8-byte aligned.  If src is then also 8-byte
+ * aligned, transfer 32 and then 8 bytes at a time, otherwise finish with
+ * bytes.  The unaligned case could be optimized, but it is expected that
+ * this is the uncommon case and of questionable value.  The code to do so
+ * is also rather large and ugly.  Modifies dst, src and len and clobbers
+ * %g1-%g4.  It has yet to be determined how much unrolling is beneficial.
+ *
+ * XXX bcopy() must also check for overlap.  This is stupid.
+ * XXX bcopy() should be implemented as
+ * #define bcopy(src, dst, len) (void)memcpy((dst), (src), (len))
+ * if at all.
+ */
+#define	_MEMCPY(dst, src, len, da, dasi, sa, sasi) \
+1:	deccc	1, len ; /* 1: copy bytes until dst is 8-byte aligned */ \
+	bl,pn	%xcc, 6f ; \
+	 btst	7, dst ; \
+	bz,a,pt	%xcc, 2f ; \
+	 inc	1, len ; /* aligned: undo the decrement above */ \
+	LD(ub, sa) [src] sasi, %g1 ; \
+	ST(b, da) %g1, [dst] dasi ; \
+	inc	1, src ; \
+	b	%xcc, 1b ; \
+	 inc	1, dst ; \
+	.align	16 ; \
+2:	btst	7, src ; /* 2: is src 8-byte aligned as well? */ \
+	bz,a,pt	%xcc, 3f ; \
+	 nop ; \
+	b,a	%xcc, 5f ; /* no: finish with the byte loop */ \
+	.align	16 ; \
+3:	deccc	32, len ; /* 3: 32 bytes per iteration */ \
+	bl,a,pn	%xcc, 4f ; \
+	 inc	32, len ; /* fewer than 32 left: undo the decrement */ \
+	LD(x, sa) [src] sasi, %g1 ; \
+	LD(x, sa) [src + 8] sasi, %g2 ; \
+	LD(x, sa) [src + 16] sasi, %g3 ; \
+	LD(x, sa) [src + 24] sasi, %g4 ; \
+	ST(x, da) %g1, [dst] dasi ; \
+	ST(x, da) %g2, [dst + 8] dasi ; \
+	ST(x, da) %g3, [dst + 16] dasi ; \
+	ST(x, da) %g4, [dst + 24] dasi ; \
+	inc	32, src ; \
+	b	%xcc, 3b ; \
+	 inc	32, dst ; \
+	.align	16 ; \
+4:	deccc	8, len ; /* 4: 8 bytes per iteration */ \
+	bl,a,pn	%xcc, 5f ; \
+	 inc	8, len ; /* fewer than 8 left: undo the decrement */ \
+	LD(x, sa) [src] sasi, %g1 ; \
+	ST(x, da) %g1, [dst] dasi ; \
+	inc	8, src ; \
+	b	%xcc, 4b ; \
+	 inc	8, dst ; \
+	.align	16 ; \
+5:	deccc	1, len ; /* 5: remaining bytes one at a time */ \
+	bl,a,pn	%xcc, 6f ; \
+	 nop ; \
+	LD(ub, sa) [src] sasi, %g1 ; \
+	ST(b, da) %g1, [dst] dasi ; \
+	inc	src ; \
+	b	%xcc, 5b ; \
+	 inc	dst ; \
+6:
 
 #define	CATCH_SETUP(label) \
 	setx	label, %g2, %g1 ; \
-	ldx	[PCPU(CURPCB)], %g6 ; \
+	ldx	[PCPU(CURTHREAD)], %g6 ; /* %g6 = PCPU CURTHREAD field */ \
+	ldx	[%g6 + TD_PCB], %g6 ; /* %g6 = TD_PCB field of that object */ \
 	stx	%g1, [%g6 + PCB_ONFAULT] ;
 #define	CATCH_END() \
@@ -119,7 +241,7 @@
 	SU_ALIGNED(storer, label)
 
 /*
- * void bcmp(void *b, size_t len)
+ * int bcmp(const void *b1, const void *b2, size_t len)
  */
 ENTRY(bcmp)
 	brz,pn	%o2, 2f
@@ -127,7 +249,7 @@ ENTRY(bcmp)
 1:	ldub	[%o0 + %o3], %o4
 	ldub	[%o1 + %o3], %o5
 	cmp	%o4, %o5
-	bne,pn	%xcc, 1f
+	bne,pn	%xcc, 2f
 	 inc	%o3
 	deccc	%o2
 	bne,pt	%xcc, 1b
@@ -139,46 +261,90 @@ END(bcmp)
 /*
  * void bcopy(const void *src, void *dst, size_t len)
  */
+ENTRY(ovbcopy)
 ENTRY(bcopy)
-	BCOPY(%o0, %o1, %o2)
+	/*
+	 * Copy backwards if dst lies within [src, src + len), else forwards.
+	 */
+	sub	%o1, %o0, %g1			/* %g1 = dst - src */
+	cmp	%g1, %o2
+	bgeu,a,pt %xcc, 3f			/* unsigned (dst - src) >= len */
+	 nop
+
+	/*
+	 * Copy backwards, one byte at a time, from the end of both buffers.
+	 */
+	add	%o0, %o2, %o0
+	add	%o1, %o2, %o1
+1:	deccc	1, %o2
+	bl,a,pn	%xcc, 2f			/* len went negative: done */
+	 nop
+	dec	1, %o0
+	ldub	[%o0], %g1
+	dec	1, %o1
+	b	%xcc, 1b
+	 stb	%g1, [%o1]			/* delay slot: store the byte */
+2:	retl
+	 nop
+
+	/*
+	 * Do the fast version.
+	 */
+3:	_MEMCPY(%o1, %o0, %o2, E, E, E, E)
 	retl
 	 nop
 END(bcopy)
 
 /*
- * void ovbcopy(const void *src, void *dst, size_t len)
- * XXX handle overlap...
+ * void bzero(void *b, size_t len)
  */
-ENTRY(ovbcopy)
-	BCOPY(%o0, %o1, %o2)
+ENTRY(bzero)
+	_MEMSET(%o0, %g0, %o1, E, E)		/* dst = %o0, pat = %g0 (zero), len = %o1 */
 	retl
 	 nop
-END(ovbcopy)
+END(bzero)
 
 /*
- * void bzero(void *b, size_t len)
+ * void physzero(vm_offset_t pa, size_t len)
  */
-ENTRY(bzero)
-	brz,pn	%o1, 1f
+ENTRY(physzero)
+	wr	%g0, ASI_PHYS_USE_EC, %asi	/* the stores below go through this ASI */
+	_MEMSET(%o0, %g0, %o1, a, %asi)		/* dst = %o0, pat = %g0 (zero), len = %o1 */
+	retl
 	 nop
-1:	deccc	%o1
-	stb	%g0, [%o0]
-	bne,pt	%xcc, 1b
-	 inc	%o0
-2:	retl
+END(physzero)
+
+/*
+ * void physcopy(vm_offset_t src, vm_offset_t dst, size_t len)
+ */
+ENTRY(physcopy)
+	wr	%g0, ASI_PHYS_USE_EC, %asi	/* loads and stores below use this ASI */
+	_MEMCPY(%o1, %o0, %o2, a, %asi, a, %asi) /* dst = %o1, src = %o0, len = %o2 */
+	retl
 	 nop
-END(bzero)
+END(physcopy)
 
 /*
  * void *memcpy(void *dst, const void *src, size_t len)
  */
 ENTRY(memcpy)
-	BCOPY(%o1, %o0, %o2)
+	mov	%o0, %o3			/* copy through %o3; %o0 is left untouched */
+	_MEMCPY(%o3, %o1, %o2, E, E, E, E)	/* dst = %o3, src = %o1, len = %o2 */
 	retl
 	 nop
 END(memcpy)
 
 /*
+ * void *memset(void *b, int c, size_t len)
+ */
+ENTRY(memset)
+	mov	%o0, %o3			/* fill through %o3; %o0 is left untouched */
+	_MEMSET(%o3, %o1, %o2, E, E)		/* dst = %o3, pat = %o1, len = %o2 */
+	retl
+	 nop
+END(memset)
+
+/*
  * int copyin(const void *uaddr, void *kaddr, size_t len)
  */
 ENTRY(copyin)
@@ -191,7 +357,8 @@ ENTRY(copyin)
 	stx	%o2, [%o3 + KTR_PARM3]
 9:
 #endif
-	COPYIN(%o0, %o1, %o2)
+	wr	%g0, ASI_AIUP, %asi
+	_MEMCPY(%o1, %o0, %o2, E, E, a, %asi)
 	CATCH_END()
 	retl
 	 clr	%o0
@@ -211,10 +378,11 @@ ENTRY(copyinstr)
 	stx	%o3, [%g1 + KTR_PARM4]
 9:
 #endif
-	COPYINSTR(%o0, %o1, %o2, %o3)
+	wr	%g0, ASI_AIUP, %asi
+	_COPYSTR(%o0, %o1, %o2, %o3, a, %asi, E, E)
 	CATCH_END()
 	retl
-	 mov	%o5, %o0
+	 mov	%g1, %o0
 END(copyinstr)
 
 /*
@@ -230,7 +398,8 @@ ENTRY(copyout)
 	stx	%o2, [%o3 + KTR_PARM3]
 9:
 #endif
-	COPYOUT(%o0, %o1, %o2)
+	wr	%g0, ASI_AIUP, %asi
+	_MEMCPY(%o1, %o0, %o2, a, %asi, E, E)
 	CATCH_END()
 	retl
 	 clr	%o0
@@ -250,9 +419,9 @@ END(copyout)
  * int copystr(const void *src, void *dst, size_t len, size_t *done)
  */
 ENTRY(copystr)
-	COPYSTR(%o0, %o1, %o2, %o3)
+	_COPYSTR(%o0, %o1, %o2, %o3, E, E, E, E)	/* src, dst, len, done; plain ld/st */
 	retl
-	 mov	%o5, %o0
+	 mov	%g1, %o0			/* return the status _COPYSTR left in %g1 */
 END(copystr)
 
 /*
@@ -325,7 +494,6 @@ ENTRY(fsbail)
 .Lfsalign:
 	retl
 	 mov	-1, %o0
-END(fsbail)
 
 ENTRY(longjmp)
 	set	1, %g3
@@ -355,64 +523,17 @@ ENTRY(setjmp)
 END(setjmp)
 
 /*
- * Temporary stack for calling into the firmware. We need to setup one, because
- * the MMU mapping for our stack page may be lost. When the firmware tries to
- * spill the last window (the others are flushed before), this results in an
- * DMMU miss trap, which is fatal with the firmware trap handlers installed.
- * Additionally, it seems that the firmware does not immediately switch to an
- * own stack (or maybe never?), therefore more space needs to be reserved.
- * I hope this is sufficient now.
- */
-	.align	4
-DATA(ofwstack)
-	.rept	CCFSZ * 8
-	.byte	0
-	.endr
-ofwstack_last:
-	.rept	CCFSZ
-	.byte	0
-	.endr
-END(ofwstack)
-
-/*
  * void openfirmware(cell_t args[])
  */
 ENTRY(openfirmware)
-	/*
-	 * Disable interrupts. The firmware should not deal with our interrupts
-	 * anyway, and the temporary stack is not large enough to hold the stack
-	 * footprint of the interrrupt handling.
-	 */
-	rdpr	%pstate, %o3
-	andn	%o3, PSTATE_IE, %o1
-	wrpr	%o1, 0, %pstate
-	setx	ofwstack_last - SPOFF, %o1, %o2
-	save	%o2, 0, %sp
-	flushw
-	rdpr	%tl, %l1
-	rdpr	%tba, %l2
-	mov	AA_DMMU_PCXR, %l3
-	ldxa	[%l3] ASI_DMMU, %l4
-	stxa	%g0, [%l3] ASI_DMMU
-	membar	#Sync
-	flush	%sp
-	setx	ofw_tba, %l7, %l5
-	ldx	[%l5], %l5
+	save	%sp, -CCFSZ, %sp		/* new register window, CCFSZ-byte frame */
 	setx	ofw_vec, %l7, %l6
 	ldx	[%l6], %l6
 	rdpr	%pil, %l7
-	wrpr	%g0, 14, %pil
-	wrpr	%l5, 0, %tba
-	wrpr	%g0, 0, %tl
+	wrpr	%g0, PIL_TICK, %pil		/* set %pil to PIL_TICK; old value is in %l7 */
 	call	%l6
 	 mov	%i0, %o0
-	wrpr	%l1, 0, %tl
-	wrpr	%l2, 0, %tba
-	stxa	%l4, [%l3] ASI_DMMU
 	wrpr	%l7, 0, %pil
-	membar	#Sync
-	flush	%sp
-	restore
-	retl
-	 wrpr	%o3, 0, %pstate
+	ret
+	 restore %o0, %g0, %o0			/* hand %o0 from the call back to our caller */
 END(openfirmware)