summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- sys/sparc64/sparc64/support.S 353
-rw-r--r-- sys/sparc64/sparc64/support.s 353
2 files changed, 474 insertions, 232 deletions
diff --git a/sys/sparc64/sparc64/support.S b/sys/sparc64/sparc64/support.S
index 41469f5..bbd2d13 100644
--- a/sys/sparc64/sparc64/support.S
+++ b/sys/sparc64/sparc64/support.S
@@ -33,62 +33,184 @@
#include "assym.s"
-#define E
+#define E /* empty */
+/*
+ * Generate load and store instructions for the corresponding width and asi
+ * (or not). Note that we want to evaluate the macro args before
+ * concatenating, so that E really turns into nothing.
+ */
#define _LD(w, a) ld ## w ## a
#define _ST(w, a) st ## w ## a
#define LD(w, a) _LD(w, a)
#define ST(w, a) _ST(w, a)
-#define _BCOPY(src, dst, len, sa, sasi, da, dasi) \
- brz,pn len, 2f ; \
- mov len, %o3 ; \
-1: LD(ub, sa) [src] sasi, %o4 ; \
- ST(b, da) %o4, [dst] dasi ; \
- dec %o3 ; \
- inc src ; \
- brnz,pt %o3, 1b ; \
- inc dst ; \
-2:
-
-#define BCOPY(src, dst, len) \
- _BCOPY(src, dst, len, E, E, E, E)
-
-#define COPYIN(uaddr, kaddr, len) \
- wr %g0, ASI_AIUP, %asi ; \
- _BCOPY(uaddr, kaddr, len, a, %asi, E, E)
-
-#define COPYOUT(kaddr, uaddr, len) \
- wr %g0, ASI_AIUP, %asi ; \
- _BCOPY(kaddr, uaddr, len, E, E, a, %asi)
+/*
+ * Common code for copy routines.
+ *
+ * We use large macros to generate functions for each of the copy routines.
+ * This allows the load and store instructions to be generated for the right
+ * operation, asi or not. It is possible to write an asi independent function
+ * but this would require 2 expensive wrs in the main loop to switch %asi.
+ * It would also screw up profiling (if we ever get it), but may save some I$.
+ * We assume that either one of dasi and sasi is empty, or that they are both
+ * the same (empty or non-empty). It is up to the caller to set %asi.
+ */
+/*
+ * ASI independent implementation of copystr(9).
+ * Used to implement copyinstr() and copystr().
+ *
+ * Copies bytes from src to dst until a NUL byte has been copied or len
+ * bytes have been transferred. If done is non-zero, the number of bytes
+ * copied (including the NUL, if one was copied) is stored through it.
+ *
+ * Return value is in %g1: 0 if the terminating NUL was copied, or
+ * ENAMETOOLONG if len was exhausted first (this includes len == 0).
+ * The macro modifies src, dst and len and clobbers %g1 and %g2.
+ *
+ * Local labels:
+ *   1: copy loop, one byte per iteration
+ *   2: out of space, set the ENAMETOOLONG return value
+ *   3: compute the number of bytes copied and store it if requested
+ *   4: done
+ */
#define _COPYSTR(src, dst, len, done, sa, sasi, da, dasi) \
- clr %o4 ; \
- clr %o5 ; \
-1: LD(ub, sa) [src] sasi, %g1 ; \
+ brz len, 2f ; \
+ mov src, %g2 ; \
+1: deccc 1, len ; \
+ bl,a,pn %xcc, 2f ; \
+ nop ; \
+ LD(ub, sa) [src] sasi, %g1 ; \
ST(b, da) %g1, [dst] dasi ; \
- brz,pn %g1, 2f ; \
- inc %o4 ; \
- dec len ; \
- inc src ; \
- brgz,pt len, 1b ; \
+ brz,pn %g1, 3f ; \
+ inc src ; \
+ b %xcc, 1b ; \
inc dst ; \
- mov ENAMETOOLONG, %o5 ; \
-2: brnz,a done, 3f ; \
- stx %o4, [done] ; \
-3:
+2: mov ENAMETOOLONG, %g1 ; \
+3: sub src, %g2, %g2 ; \
+ brnz,a done, 4f ; \
+ stx %g2, [done] ; \
+4:
-#define COPYSTR(dst, src, len, done) \
- _COPYSTR(dst, src, len, done, E, E, E, E)
+/*
+ * ASI independent implementation of memset(3).
+ * Used to implement bzero(), memset() and physzero().
+ *
+ * If the pattern is non-zero, duplicate it to fill 64 bits.
+ * Store bytes until dst is 8-byte aligned, then store 8 bytes.
+ * It has yet to be determined how much unrolling is beneficial.
+ * Could also read and compare before writing to minimize snoop traffic.
+ *
+ * Nothing is stored if len is zero or negative. Only the low 8 bits of
+ * pat are used. The macro modifies dst, pat and len and uses %g1 as
+ * scratch while replicating the pattern.
+ *
+ * Local labels:
+ *   1: byte stores until dst is 8-byte aligned
+ *   2: 32 bytes per iteration
+ *   3: 8 bytes per iteration
+ *   4: remaining bytes, one at a time
+ *   5: done
+ *
+ * XXX bzero() should be implemented as
+ * #define bzero(dst, len) (void)memset((dst), 0, (len))
+ * if at all.
+ */
+#define _MEMSET(dst, pat, len, da, dasi) \
+ brlez,pn len, 5f ; \
+ and pat, 0xff, pat ; \
+ brz,pt pat, 1f ; \
+ sllx pat, 8, %g1 ; \
+ or pat, %g1, pat ; \
+ sllx pat, 16, %g1 ; \
+ or pat, %g1, pat ; \
+ sllx pat, 32, %g1 ; \
+ or pat, %g1, pat ; \
+ .align 16 ; \
+1: deccc 1, len ; \
+ bl,pn %xcc, 5f ; \
+ btst 7, dst ; \
+ bz,a,pt %xcc, 2f ; \
+ inc 1, len ; \
+ ST(b, da) pat, [dst] dasi ; \
+ b %xcc, 1b ; \
+ inc dst ; \
+ .align 16 ; \
+2: deccc 32, len ; \
+ bl,a,pn %xcc, 3f ; \
+ inc 32, len ; \
+ ST(x, da) pat, [dst] dasi ; \
+ ST(x, da) pat, [dst + 8] dasi ; \
+ ST(x, da) pat, [dst + 16] dasi ; \
+ ST(x, da) pat, [dst + 24] dasi ; \
+ b %xcc, 2b ; \
+ inc 32, dst ; \
+ .align 16 ; \
+3: deccc 8, len ; \
+ bl,a,pn %xcc, 4f ; \
+ inc 8, len ; \
+ ST(x, da) pat, [dst] dasi ; \
+ b %xcc, 3b ; \
+ inc 8, dst ; \
+ .align 16 ; \
+4: deccc 1, len ; \
+ bl,a,pn %xcc, 5f ; \
+ nop ; \
+ ST(b, da) pat, [dst] dasi ; \
+ b %xcc, 4b ; \
+ inc 1, dst ; \
+5:
-#define COPYINSTR(uaddr, kaddr, len, done) \
- wr %g0, ASI_AIUP, %asi ; \
- _COPYSTR(uaddr, kaddr, len, done, a, %asi, E, E)
+/*
+ * ASI independent implementation of memcpy(3).
+ * Used to implement bcopy(), copyin(), copyout(), memcpy(), and physcopy().
+ *
+ * Transfer bytes until dst is 8-byte aligned. If src is then also 8 byte
+ * aligned, transfer 8 bytes, otherwise finish with bytes. The unaligned
+ * case could be optimized, but it is expected that this is the uncommon
+ * case and of questionable value. The code to do so is also rather large
+ * and ugly.
+ * It has yet to be determined how much unrolling is beneficial.
+ *
+ * Bytes are always copied in ascending address order. The macro
+ * modifies dst, src and len and clobbers %g1 through %g4.
+ *
+ * Local labels:
+ *   1: byte copies until dst is 8-byte aligned
+ *   2: check whether src is 8-byte aligned as well (if not, go to 5)
+ *   3: 32 bytes per iteration
+ *   4: 8 bytes per iteration
+ *   5: remaining bytes, one at a time
+ *   6: done
+ *
+ * XXX bcopy() must also check for overlap. This is stupid.
+ * XXX bcopy() should be implemented as
+ * #define bcopy(src, dst, len) (void)memcpy((dst), (src), (len))
+ * if at all.
+ */
+#define _MEMCPY(dst, src, len, da, dasi, sa, sasi) \
+1: deccc 1, len ; \
+ bl,pn %xcc, 6f ; \
+ btst 7, dst ; \
+ bz,a,pt %xcc, 2f ; \
+ inc 1, len ; \
+ LD(ub, sa) [src] sasi, %g1 ; \
+ ST(b, da) %g1, [dst] dasi ; \
+ inc 1, src ; \
+ b %xcc, 1b ; \
+ inc 1, dst ; \
+ .align 16 ; \
+2: btst 7, src ; \
+ bz,a,pt %xcc, 3f ; \
+ nop ; \
+ b,a %xcc, 5f ; \
+ .align 16 ; \
+3: deccc 32, len ; \
+ bl,a,pn %xcc, 4f ; \
+ inc 32, len ; \
+ LD(x, sa) [src] sasi, %g1 ; \
+ LD(x, sa) [src + 8] sasi, %g2 ; \
+ LD(x, sa) [src + 16] sasi, %g3 ; \
+ LD(x, sa) [src + 24] sasi, %g4 ; \
+ ST(x, da) %g1, [dst] dasi ; \
+ ST(x, da) %g2, [dst + 8] dasi ; \
+ ST(x, da) %g3, [dst + 16] dasi ; \
+ ST(x, da) %g4, [dst + 24] dasi ; \
+ inc 32, src ; \
+ b %xcc, 3b ; \
+ inc 32, dst ; \
+ .align 16 ; \
+4: deccc 8, len ; \
+ bl,a,pn %xcc, 5f ; \
+ inc 8, len ; \
+ LD(x, sa) [src] sasi, %g1 ; \
+ ST(x, da) %g1, [dst] dasi ; \
+ inc 8, src ; \
+ b %xcc, 4b ; \
+ inc 8, dst ; \
+ .align 16 ; \
+5: deccc 1, len ; \
+ bl,a,pn %xcc, 6f ; \
+ nop ; \
+ LD(ub, sa) [src] sasi, %g1 ; \
+ ST(b, da) %g1, [dst] dasi ; \
+ inc src ; \
+ b %xcc, 5b ; \
+ inc dst ; \
+6:
#define CATCH_SETUP(label) \
setx label, %g2, %g1 ; \
- ldx [PCPU(CURPCB)], %g6 ; \
+ ldx [PCPU(CURTHREAD)], %g6 ; \
+ ldx [%g6 + TD_PCB], %g6 ; \
stx %g1, [%g6 + PCB_ONFAULT] ;
#define CATCH_END() \
@@ -119,7 +241,7 @@
SU_ALIGNED(storer, label)
/*
- * void bcmp(void *b, size_t len)
+ * int bcmp(const void *b1, const void *b2, size_t len)
*/
ENTRY(bcmp)
brz,pn %o2, 2f
@@ -127,7 +249,7 @@ ENTRY(bcmp)
1: ldub [%o0 + %o3], %o4
ldub [%o1 + %o3], %o5
cmp %o4, %o5
- bne,pn %xcc, 1f
+ bne,pn %xcc, 2f
inc %o3
deccc %o2
bne,pt %xcc, 1b
@@ -139,46 +261,90 @@ END(bcmp)
/*
* void bcopy(const void *src, void *dst, size_t len)
+ *
+ * ovbcopy() shares this entry point. On entry %o0 = src, %o1 = dst and
+ * %o2 = len.
*/
+ENTRY(ovbcopy)
ENTRY(bcopy)
- BCOPY(%o0, %o1, %o2)
+ /*
+ * Check for overlap, and copy backwards if so.
+ * The unsigned test (dst - src) >= len selects the forward copy at
+ * label 3; it fails only when dst lies within [src, src + len).
+ */
+ sub %o1, %o0, %g1
+ cmp %g1, %o2
+ bgeu,a,pt %xcc, 3f
+ nop
+
+ /*
+ * Copy backwards.
+ * Both pointers are first advanced to the end of their buffers and
+ * then pre-decremented, one byte per iteration.
+ */
+ add %o0, %o2, %o0
+ add %o1, %o2, %o1
+1: deccc 1, %o2
+ bl,a,pn %xcc, 2f
+ nop
+ dec 1, %o0
+ ldub [%o0], %g1
+ dec 1, %o1
+ b %xcc, 1b
+ stb %g1, [%o1]
+2: retl
+ nop
+
+ /*
+ * Do the fast version.
+ */
+3: _MEMCPY(%o1, %o0, %o2, E, E, E, E)
retl
nop
END(bcopy)
/*
- * void ovbcopy(const void *src, void *dst, size_t len)
- * XXX handle overlap...
+ * void bzero(void *b, size_t len)
*/
-ENTRY(ovbcopy)
- BCOPY(%o0, %o1, %o2)
+ENTRY(bzero)
+ _MEMSET(%o0, %g0, %o1, E, E)
retl
nop
-END(ovbcopy)
+END(bzero)
/*
- * void bzero(void *b, size_t len)
+ * void physzero(vm_offset_t pa, size_t len)
*/
-ENTRY(bzero)
- brz,pn %o1, 1f
+ENTRY(physzero)
+ wr %g0, ASI_PHYS_USE_EC, %asi
+ _MEMSET(%o0, %g0, %o1, a, %asi)
+ retl
nop
-1: deccc %o1
- stb %g0, [%o0]
- bne,pt %xcc, 1b
- inc %o0
-2: retl
+END(physzero)
+
+/*
+ * void physcopy(vm_offset_t src, vm_offset_t dst, size_t len)
+ */
+ENTRY(physcopy)
+ wr %g0, ASI_PHYS_USE_EC, %asi
+ _MEMCPY(%o1, %o0, %o2, a, %asi, a, %asi)
+ retl
nop
-END(bzero)
+END(physcopy)
/*
* void *memcpy(void *dst, const void *src, size_t len)
+ *
+ * _MEMCPY advances its dst argument, so the copy runs on a duplicate of
+ * dst in %o3 and %o0 is left untouched for the return value.
*/
ENTRY(memcpy)
- BCOPY(%o1, %o0, %o2)
+ mov %o0, %o3
+ _MEMCPY(%o3, %o1, %o2, E, E, E, E)
retl
nop
END(memcpy)
/*
+ * void *memset(void *b, int c, size_t len)
+ *
+ * _MEMSET advances its dst argument, so the fill runs on a duplicate of
+ * b in %o3 and %o0 is left untouched for the return value.
+ */
+ENTRY(memset)
+ mov %o0, %o3
+ _MEMSET(%o3, %o1, %o2, E, E)
+ retl
+ nop
+END(memset)
+
+/*
* int copyin(const void *uaddr, void *kaddr, size_t len)
*/
ENTRY(copyin)
@@ -191,7 +357,8 @@ ENTRY(copyin)
stx %o2, [%o3 + KTR_PARM3]
9:
#endif
- COPYIN(%o0, %o1, %o2)
+ wr %g0, ASI_AIUP, %asi
+ _MEMCPY(%o1, %o0, %o2, E, E, a, %asi)
CATCH_END()
retl
clr %o0
@@ -211,10 +378,11 @@ ENTRY(copyinstr)
stx %o3, [%g1 + KTR_PARM4]
9:
#endif
- COPYINSTR(%o0, %o1, %o2, %o3)
+ wr %g0, ASI_AIUP, %asi
+ _COPYSTR(%o0, %o1, %o2, %o3, a, %asi, E, E)
CATCH_END()
retl
- mov %o5, %o0
+ mov %g1, %o0
END(copyinstr)
/*
@@ -230,7 +398,8 @@ ENTRY(copyout)
stx %o2, [%o3 + KTR_PARM3]
9:
#endif
- COPYOUT(%o0, %o1, %o2)
+ wr %g0, ASI_AIUP, %asi
+ _MEMCPY(%o1, %o0, %o2, a, %asi, E, E)
CATCH_END()
retl
clr %o0
@@ -250,9 +419,9 @@ END(copyout)
* int copystr(const void *src, void *dst, size_t len, size_t *done)
*/
ENTRY(copystr)
- COPYSTR(%o0, %o1, %o2, %o3)
+ _COPYSTR(%o0, %o1, %o2, %o3, E, E, E, E)
retl
- mov %o5, %o0
+ mov %g1, %o0
END(copystr)
/*
@@ -325,7 +494,6 @@ ENTRY(fsbail)
.Lfsalign:
retl
mov -1, %o0
-END(fsbail)
ENTRY(longjmp)
set 1, %g3
@@ -355,64 +523,17 @@ ENTRY(setjmp)
END(setjmp)
/*
- * Temporary stack for calling into the firmware. We need to setup one, because
- * the MMU mapping for our stack page may be lost. When the firmware tries to
- * spill the last window (the others are flushed before), this results in an
- * DMMU miss trap, which is fatal with the firmware trap handlers installed.
- * Additionally, it seems that the firmware does not immediately switch to an
- * own stack (or maybe never?), therefore more space needs to be reserved.
- * I hope this is sufficient now.
- */
- .align 4
-DATA(ofwstack)
- .rept CCFSZ * 8
- .byte 0
- .endr
-ofwstack_last:
- .rept CCFSZ
- .byte 0
- .endr
-END(ofwstack)
-
-/*
* void openfirmware(cell_t args[])
*/
ENTRY(openfirmware)
- /*
- * Disable interrupts. The firmware should not deal with our interrupts
- * anyway, and the temporary stack is not large enough to hold the stack
- * footprint of the interrrupt handling.
- */
- rdpr %pstate, %o3
- andn %o3, PSTATE_IE, %o1
- wrpr %o1, 0, %pstate
- setx ofwstack_last - SPOFF, %o1, %o2
- save %o2, 0, %sp
- flushw
- rdpr %tl, %l1
- rdpr %tba, %l2
- mov AA_DMMU_PCXR, %l3
- ldxa [%l3] ASI_DMMU, %l4
- stxa %g0, [%l3] ASI_DMMU
- membar #Sync
- flush %sp
- setx ofw_tba, %l7, %l5
- ldx [%l5], %l5
+ save %sp, -CCFSZ, %sp
setx ofw_vec, %l7, %l6
ldx [%l6], %l6
rdpr %pil, %l7
- wrpr %g0, 14, %pil
- wrpr %l5, 0, %tba
- wrpr %g0, 0, %tl
+ wrpr %g0, PIL_TICK, %pil
call %l6
mov %i0, %o0
- wrpr %l1, 0, %tl
- wrpr %l2, 0, %tba
- stxa %l4, [%l3] ASI_DMMU
wrpr %l7, 0, %pil
- membar #Sync
- flush %sp
- restore
- retl
- wrpr %o3, 0, %pstate
+ ret
+ restore %o0, %g0, %o0
END(openfirmware)
diff --git a/sys/sparc64/sparc64/support.s b/sys/sparc64/sparc64/support.s
index 41469f5..bbd2d13 100644
--- a/sys/sparc64/sparc64/support.s
+++ b/sys/sparc64/sparc64/support.s
@@ -33,62 +33,184 @@
#include "assym.s"
-#define E
+#define E /* empty */
+/*
+ * Generate load and store instructions for the corresponding width and asi
+ * (or not). Note that we want to evaluate the macro args before
+ * concatenating, so that E really turns into nothing.
+ */
#define _LD(w, a) ld ## w ## a
#define _ST(w, a) st ## w ## a
#define LD(w, a) _LD(w, a)
#define ST(w, a) _ST(w, a)
-#define _BCOPY(src, dst, len, sa, sasi, da, dasi) \
- brz,pn len, 2f ; \
- mov len, %o3 ; \
-1: LD(ub, sa) [src] sasi, %o4 ; \
- ST(b, da) %o4, [dst] dasi ; \
- dec %o3 ; \
- inc src ; \
- brnz,pt %o3, 1b ; \
- inc dst ; \
-2:
-
-#define BCOPY(src, dst, len) \
- _BCOPY(src, dst, len, E, E, E, E)
-
-#define COPYIN(uaddr, kaddr, len) \
- wr %g0, ASI_AIUP, %asi ; \
- _BCOPY(uaddr, kaddr, len, a, %asi, E, E)
-
-#define COPYOUT(kaddr, uaddr, len) \
- wr %g0, ASI_AIUP, %asi ; \
- _BCOPY(kaddr, uaddr, len, E, E, a, %asi)
+/*
+ * Common code for copy routines.
+ *
+ * We use large macros to generate functions for each of the copy routines.
+ * This allows the load and store instructions to be generated for the right
+ * operation, asi or not. It is possible to write an asi independent function
+ * but this would require 2 expensive wrs in the main loop to switch %asi.
+ * It would also screw up profiling (if we ever get it), but may save some I$.
+ * We assume that either one of dasi and sasi is empty, or that they are both
+ * the same (empty or non-empty). It is up to the caller to set %asi.
+ */
+/*
+ * ASI independent implementation of copystr(9).
+ * Used to implement copyinstr() and copystr().
+ *
+ * Copies bytes from src to dst until a NUL byte has been copied or len
+ * bytes have been transferred. If done is non-zero, the number of bytes
+ * copied (including the NUL, if one was copied) is stored through it.
+ *
+ * Return value is in %g1: 0 if the terminating NUL was copied, or
+ * ENAMETOOLONG if len was exhausted first (this includes len == 0).
+ * The macro modifies src, dst and len and clobbers %g1 and %g2.
+ *
+ * Local labels:
+ *   1: copy loop, one byte per iteration
+ *   2: out of space, set the ENAMETOOLONG return value
+ *   3: compute the number of bytes copied and store it if requested
+ *   4: done
+ */
#define _COPYSTR(src, dst, len, done, sa, sasi, da, dasi) \
- clr %o4 ; \
- clr %o5 ; \
-1: LD(ub, sa) [src] sasi, %g1 ; \
+ brz len, 2f ; \
+ mov src, %g2 ; \
+1: deccc 1, len ; \
+ bl,a,pn %xcc, 2f ; \
+ nop ; \
+ LD(ub, sa) [src] sasi, %g1 ; \
ST(b, da) %g1, [dst] dasi ; \
- brz,pn %g1, 2f ; \
- inc %o4 ; \
- dec len ; \
- inc src ; \
- brgz,pt len, 1b ; \
+ brz,pn %g1, 3f ; \
+ inc src ; \
+ b %xcc, 1b ; \
inc dst ; \
- mov ENAMETOOLONG, %o5 ; \
-2: brnz,a done, 3f ; \
- stx %o4, [done] ; \
-3:
+2: mov ENAMETOOLONG, %g1 ; \
+3: sub src, %g2, %g2 ; \
+ brnz,a done, 4f ; \
+ stx %g2, [done] ; \
+4:
-#define COPYSTR(dst, src, len, done) \
- _COPYSTR(dst, src, len, done, E, E, E, E)
+/*
+ * ASI independent implementation of memset(3).
+ * Used to implement bzero(), memset() and physzero().
+ *
+ * If the pattern is non-zero, duplicate it to fill 64 bits.
+ * Store bytes until dst is 8-byte aligned, then store 8 bytes.
+ * It has yet to be determined how much unrolling is beneficial.
+ * Could also read and compare before writing to minimize snoop traffic.
+ *
+ * Nothing is stored if len is zero or negative. Only the low 8 bits of
+ * pat are used. The macro modifies dst, pat and len and uses %g1 as
+ * scratch while replicating the pattern.
+ *
+ * Local labels:
+ *   1: byte stores until dst is 8-byte aligned
+ *   2: 32 bytes per iteration
+ *   3: 8 bytes per iteration
+ *   4: remaining bytes, one at a time
+ *   5: done
+ *
+ * XXX bzero() should be implemented as
+ * #define bzero(dst, len) (void)memset((dst), 0, (len))
+ * if at all.
+ */
+#define _MEMSET(dst, pat, len, da, dasi) \
+ brlez,pn len, 5f ; \
+ and pat, 0xff, pat ; \
+ brz,pt pat, 1f ; \
+ sllx pat, 8, %g1 ; \
+ or pat, %g1, pat ; \
+ sllx pat, 16, %g1 ; \
+ or pat, %g1, pat ; \
+ sllx pat, 32, %g1 ; \
+ or pat, %g1, pat ; \
+ .align 16 ; \
+1: deccc 1, len ; \
+ bl,pn %xcc, 5f ; \
+ btst 7, dst ; \
+ bz,a,pt %xcc, 2f ; \
+ inc 1, len ; \
+ ST(b, da) pat, [dst] dasi ; \
+ b %xcc, 1b ; \
+ inc dst ; \
+ .align 16 ; \
+2: deccc 32, len ; \
+ bl,a,pn %xcc, 3f ; \
+ inc 32, len ; \
+ ST(x, da) pat, [dst] dasi ; \
+ ST(x, da) pat, [dst + 8] dasi ; \
+ ST(x, da) pat, [dst + 16] dasi ; \
+ ST(x, da) pat, [dst + 24] dasi ; \
+ b %xcc, 2b ; \
+ inc 32, dst ; \
+ .align 16 ; \
+3: deccc 8, len ; \
+ bl,a,pn %xcc, 4f ; \
+ inc 8, len ; \
+ ST(x, da) pat, [dst] dasi ; \
+ b %xcc, 3b ; \
+ inc 8, dst ; \
+ .align 16 ; \
+4: deccc 1, len ; \
+ bl,a,pn %xcc, 5f ; \
+ nop ; \
+ ST(b, da) pat, [dst] dasi ; \
+ b %xcc, 4b ; \
+ inc 1, dst ; \
+5:
-#define COPYINSTR(uaddr, kaddr, len, done) \
- wr %g0, ASI_AIUP, %asi ; \
- _COPYSTR(uaddr, kaddr, len, done, a, %asi, E, E)
+/*
+ * ASI independent implementation of memcpy(3).
+ * Used to implement bcopy(), copyin(), copyout(), memcpy(), and physcopy().
+ *
+ * Transfer bytes until dst is 8-byte aligned. If src is then also 8 byte
+ * aligned, transfer 8 bytes, otherwise finish with bytes. The unaligned
+ * case could be optimized, but it is expected that this is the uncommon
+ * case and of questionable value. The code to do so is also rather large
+ * and ugly.
+ * It has yet to be determined how much unrolling is beneficial.
+ *
+ * Bytes are always copied in ascending address order. The macro
+ * modifies dst, src and len and clobbers %g1 through %g4.
+ *
+ * Local labels:
+ *   1: byte copies until dst is 8-byte aligned
+ *   2: check whether src is 8-byte aligned as well (if not, go to 5)
+ *   3: 32 bytes per iteration
+ *   4: 8 bytes per iteration
+ *   5: remaining bytes, one at a time
+ *   6: done
+ *
+ * XXX bcopy() must also check for overlap. This is stupid.
+ * XXX bcopy() should be implemented as
+ * #define bcopy(src, dst, len) (void)memcpy((dst), (src), (len))
+ * if at all.
+ */
+#define _MEMCPY(dst, src, len, da, dasi, sa, sasi) \
+1: deccc 1, len ; \
+ bl,pn %xcc, 6f ; \
+ btst 7, dst ; \
+ bz,a,pt %xcc, 2f ; \
+ inc 1, len ; \
+ LD(ub, sa) [src] sasi, %g1 ; \
+ ST(b, da) %g1, [dst] dasi ; \
+ inc 1, src ; \
+ b %xcc, 1b ; \
+ inc 1, dst ; \
+ .align 16 ; \
+2: btst 7, src ; \
+ bz,a,pt %xcc, 3f ; \
+ nop ; \
+ b,a %xcc, 5f ; \
+ .align 16 ; \
+3: deccc 32, len ; \
+ bl,a,pn %xcc, 4f ; \
+ inc 32, len ; \
+ LD(x, sa) [src] sasi, %g1 ; \
+ LD(x, sa) [src + 8] sasi, %g2 ; \
+ LD(x, sa) [src + 16] sasi, %g3 ; \
+ LD(x, sa) [src + 24] sasi, %g4 ; \
+ ST(x, da) %g1, [dst] dasi ; \
+ ST(x, da) %g2, [dst + 8] dasi ; \
+ ST(x, da) %g3, [dst + 16] dasi ; \
+ ST(x, da) %g4, [dst + 24] dasi ; \
+ inc 32, src ; \
+ b %xcc, 3b ; \
+ inc 32, dst ; \
+ .align 16 ; \
+4: deccc 8, len ; \
+ bl,a,pn %xcc, 5f ; \
+ inc 8, len ; \
+ LD(x, sa) [src] sasi, %g1 ; \
+ ST(x, da) %g1, [dst] dasi ; \
+ inc 8, src ; \
+ b %xcc, 4b ; \
+ inc 8, dst ; \
+ .align 16 ; \
+5: deccc 1, len ; \
+ bl,a,pn %xcc, 6f ; \
+ nop ; \
+ LD(ub, sa) [src] sasi, %g1 ; \
+ ST(b, da) %g1, [dst] dasi ; \
+ inc src ; \
+ b %xcc, 5b ; \
+ inc dst ; \
+6:
#define CATCH_SETUP(label) \
setx label, %g2, %g1 ; \
- ldx [PCPU(CURPCB)], %g6 ; \
+ ldx [PCPU(CURTHREAD)], %g6 ; \
+ ldx [%g6 + TD_PCB], %g6 ; \
stx %g1, [%g6 + PCB_ONFAULT] ;
#define CATCH_END() \
@@ -119,7 +241,7 @@
SU_ALIGNED(storer, label)
/*
- * void bcmp(void *b, size_t len)
+ * int bcmp(const void *b1, const void *b2, size_t len)
*/
ENTRY(bcmp)
brz,pn %o2, 2f
@@ -127,7 +249,7 @@ ENTRY(bcmp)
1: ldub [%o0 + %o3], %o4
ldub [%o1 + %o3], %o5
cmp %o4, %o5
- bne,pn %xcc, 1f
+ bne,pn %xcc, 2f
inc %o3
deccc %o2
bne,pt %xcc, 1b
@@ -139,46 +261,90 @@ END(bcmp)
/*
* void bcopy(const void *src, void *dst, size_t len)
+ *
+ * ovbcopy() shares this entry point. On entry %o0 = src, %o1 = dst and
+ * %o2 = len.
*/
+ENTRY(ovbcopy)
ENTRY(bcopy)
- BCOPY(%o0, %o1, %o2)
+ /*
+ * Check for overlap, and copy backwards if so.
+ * The unsigned test (dst - src) >= len selects the forward copy at
+ * label 3; it fails only when dst lies within [src, src + len).
+ */
+ sub %o1, %o0, %g1
+ cmp %g1, %o2
+ bgeu,a,pt %xcc, 3f
+ nop
+
+ /*
+ * Copy backwards.
+ * Both pointers are first advanced to the end of their buffers and
+ * then pre-decremented, one byte per iteration.
+ */
+ add %o0, %o2, %o0
+ add %o1, %o2, %o1
+1: deccc 1, %o2
+ bl,a,pn %xcc, 2f
+ nop
+ dec 1, %o0
+ ldub [%o0], %g1
+ dec 1, %o1
+ b %xcc, 1b
+ stb %g1, [%o1]
+2: retl
+ nop
+
+ /*
+ * Do the fast version.
+ */
+3: _MEMCPY(%o1, %o0, %o2, E, E, E, E)
retl
nop
END(bcopy)
/*
- * void ovbcopy(const void *src, void *dst, size_t len)
- * XXX handle overlap...
+ * void bzero(void *b, size_t len)
*/
-ENTRY(ovbcopy)
- BCOPY(%o0, %o1, %o2)
+ENTRY(bzero)
+ _MEMSET(%o0, %g0, %o1, E, E)
retl
nop
-END(ovbcopy)
+END(bzero)
/*
- * void bzero(void *b, size_t len)
+ * void physzero(vm_offset_t pa, size_t len)
*/
-ENTRY(bzero)
- brz,pn %o1, 1f
+ENTRY(physzero)
+ wr %g0, ASI_PHYS_USE_EC, %asi
+ _MEMSET(%o0, %g0, %o1, a, %asi)
+ retl
nop
-1: deccc %o1
- stb %g0, [%o0]
- bne,pt %xcc, 1b
- inc %o0
-2: retl
+END(physzero)
+
+/*
+ * void physcopy(vm_offset_t src, vm_offset_t dst, size_t len)
+ */
+ENTRY(physcopy)
+ wr %g0, ASI_PHYS_USE_EC, %asi
+ _MEMCPY(%o1, %o0, %o2, a, %asi, a, %asi)
+ retl
nop
-END(bzero)
+END(physcopy)
/*
* void *memcpy(void *dst, const void *src, size_t len)
+ *
+ * _MEMCPY advances its dst argument, so the copy runs on a duplicate of
+ * dst in %o3 and %o0 is left untouched for the return value.
*/
ENTRY(memcpy)
- BCOPY(%o1, %o0, %o2)
+ mov %o0, %o3
+ _MEMCPY(%o3, %o1, %o2, E, E, E, E)
retl
nop
END(memcpy)
/*
+ * void *memset(void *b, int c, size_t len)
+ *
+ * _MEMSET advances its dst argument, so the fill runs on a duplicate of
+ * b in %o3 and %o0 is left untouched for the return value.
+ */
+ENTRY(memset)
+ mov %o0, %o3
+ _MEMSET(%o3, %o1, %o2, E, E)
+ retl
+ nop
+END(memset)
+
+/*
* int copyin(const void *uaddr, void *kaddr, size_t len)
*/
ENTRY(copyin)
@@ -191,7 +357,8 @@ ENTRY(copyin)
stx %o2, [%o3 + KTR_PARM3]
9:
#endif
- COPYIN(%o0, %o1, %o2)
+ wr %g0, ASI_AIUP, %asi
+ _MEMCPY(%o1, %o0, %o2, E, E, a, %asi)
CATCH_END()
retl
clr %o0
@@ -211,10 +378,11 @@ ENTRY(copyinstr)
stx %o3, [%g1 + KTR_PARM4]
9:
#endif
- COPYINSTR(%o0, %o1, %o2, %o3)
+ wr %g0, ASI_AIUP, %asi
+ _COPYSTR(%o0, %o1, %o2, %o3, a, %asi, E, E)
CATCH_END()
retl
- mov %o5, %o0
+ mov %g1, %o0
END(copyinstr)
/*
@@ -230,7 +398,8 @@ ENTRY(copyout)
stx %o2, [%o3 + KTR_PARM3]
9:
#endif
- COPYOUT(%o0, %o1, %o2)
+ wr %g0, ASI_AIUP, %asi
+ _MEMCPY(%o1, %o0, %o2, a, %asi, E, E)
CATCH_END()
retl
clr %o0
@@ -250,9 +419,9 @@ END(copyout)
* int copystr(const void *src, void *dst, size_t len, size_t *done)
*/
ENTRY(copystr)
- COPYSTR(%o0, %o1, %o2, %o3)
+ _COPYSTR(%o0, %o1, %o2, %o3, E, E, E, E)
retl
- mov %o5, %o0
+ mov %g1, %o0
END(copystr)
/*
@@ -325,7 +494,6 @@ ENTRY(fsbail)
.Lfsalign:
retl
mov -1, %o0
-END(fsbail)
ENTRY(longjmp)
set 1, %g3
@@ -355,64 +523,17 @@ ENTRY(setjmp)
END(setjmp)
/*
- * Temporary stack for calling into the firmware. We need to setup one, because
- * the MMU mapping for our stack page may be lost. When the firmware tries to
- * spill the last window (the others are flushed before), this results in an
- * DMMU miss trap, which is fatal with the firmware trap handlers installed.
- * Additionally, it seems that the firmware does not immediately switch to an
- * own stack (or maybe never?), therefore more space needs to be reserved.
- * I hope this is sufficient now.
- */
- .align 4
-DATA(ofwstack)
- .rept CCFSZ * 8
- .byte 0
- .endr
-ofwstack_last:
- .rept CCFSZ
- .byte 0
- .endr
-END(ofwstack)
-
-/*
* void openfirmware(cell_t args[])
*/
ENTRY(openfirmware)
- /*
- * Disable interrupts. The firmware should not deal with our interrupts
- * anyway, and the temporary stack is not large enough to hold the stack
- * footprint of the interrrupt handling.
- */
- rdpr %pstate, %o3
- andn %o3, PSTATE_IE, %o1
- wrpr %o1, 0, %pstate
- setx ofwstack_last - SPOFF, %o1, %o2
- save %o2, 0, %sp
- flushw
- rdpr %tl, %l1
- rdpr %tba, %l2
- mov AA_DMMU_PCXR, %l3
- ldxa [%l3] ASI_DMMU, %l4
- stxa %g0, [%l3] ASI_DMMU
- membar #Sync
- flush %sp
- setx ofw_tba, %l7, %l5
- ldx [%l5], %l5
+ save %sp, -CCFSZ, %sp
setx ofw_vec, %l7, %l6
ldx [%l6], %l6
rdpr %pil, %l7
- wrpr %g0, 14, %pil
- wrpr %l5, 0, %tba
- wrpr %g0, 0, %tl
+ wrpr %g0, PIL_TICK, %pil
call %l6
mov %i0, %o0
- wrpr %l1, 0, %tl
- wrpr %l2, 0, %tba
- stxa %l4, [%l3] ASI_DMMU
wrpr %l7, 0, %pil
- membar #Sync
- flush %sp
- restore
- retl
- wrpr %o3, 0, %pstate
+ ret
+ restore %o0, %g0, %o0
END(openfirmware)
OpenPOWER on IntegriCloud