diff options
-rw-r--r-- | sys/sparc64/sparc64/support.S | 353 | ||||
-rw-r--r-- | sys/sparc64/sparc64/support.s | 353 |
2 files changed, 474 insertions, 232 deletions
diff --git a/sys/sparc64/sparc64/support.S b/sys/sparc64/sparc64/support.S index 41469f5..bbd2d13 100644 --- a/sys/sparc64/sparc64/support.S +++ b/sys/sparc64/sparc64/support.S @@ -33,62 +33,184 @@ #include "assym.s" -#define E +#define E /* empty */ +/* + * Generate load and store instructions for the corresponding width and asi + * (or not). Note that we want to evaluate the macro args before + * concatenating, so that E really turns into nothing. + */ #define _LD(w, a) ld ## w ## a #define _ST(w, a) st ## w ## a #define LD(w, a) _LD(w, a) #define ST(w, a) _ST(w, a) -#define _BCOPY(src, dst, len, sa, sasi, da, dasi) \ - brz,pn len, 2f ; \ - mov len, %o3 ; \ -1: LD(ub, sa) [src] sasi, %o4 ; \ - ST(b, da) %o4, [dst] dasi ; \ - dec %o3 ; \ - inc src ; \ - brnz,pt %o3, 1b ; \ - inc dst ; \ -2: - -#define BCOPY(src, dst, len) \ - _BCOPY(src, dst, len, E, E, E, E) - -#define COPYIN(uaddr, kaddr, len) \ - wr %g0, ASI_AIUP, %asi ; \ - _BCOPY(uaddr, kaddr, len, a, %asi, E, E) - -#define COPYOUT(kaddr, uaddr, len) \ - wr %g0, ASI_AIUP, %asi ; \ - _BCOPY(kaddr, uaddr, len, E, E, a, %asi) +/* + * Common code for copy routines. + * + * We use large macros to generate functions for each of the copy routines. + * This allows the load and store instructions to be generated for the right + * operation, asi or not. It is possible to write an asi independent function + * but this would require 2 expensive wrs in the main loop to switch %asi. + * It would also screw up profiling (if we ever get it), but may save some I$. + * We assume that either one of dasi and sasi is empty, or that they are both + * the same (empty or non-empty). It is up to the caller to set %asi. + */ +/* + * ASI independent implementation of copystr(9). + * Used to implement copyinstr() and copystr(). + * + * Return value is in %g1. + */ #define _COPYSTR(src, dst, len, done, sa, sasi, da, dasi) \ - clr %o4 ; \ - clr %o5 ; \ -1: LD(ub, sa) [src] sasi, %g1 ; \ + brz len, 4f ; \ + mov src, %g2 ; \ +1: deccc 1, len ; \ + bl,a,pn %xcc, 3f ; \ + nop ; \ + LD(ub, sa) [src] sasi, %g1 ; \ ST(b, da) %g1, [dst] dasi ; \ - brz,pn %g1, 2f ; \ - inc %o4 ; \ - dec len ; \ - inc src ; \ - brgz,pt len, 1b ; \ + brz,pn %g1, 3f ; \ + inc src ; \ + b %xcc, 1b ; \ inc dst ; \ - mov ENAMETOOLONG, %o5 ; \ -2: brnz,a done, 3f ; \ - stx %o4, [done] ; \ -3: +2: mov ENAMETOOLONG, %g1 ; \ +3: sub src, %g2, %g2 ; \ + brnz,a done, 4f ; \ + stx %g2, [done] ; \ +4: -#define COPYSTR(dst, src, len, done) \ - _COPYSTR(dst, src, len, done, E, E, E, E) +/* + * ASI independent implementation of memset(3). + * Used to implement bzero(), memset() and physzero(). + * + * If the pattern is non-zero, duplicate it to fill 64 bits. + * Store bytes until dst is 8-byte aligned, then store 8 bytes. + * It has yet to be determined how much unrolling is beneficial. + * Could also read and compare before writing to minimize snoop traffic. + * + * XXX bzero() should be implemented as + * #define bzero(dst, len) (void)memset((dst), 0, (len)) + * if at all. + */ +#define _MEMSET(dst, pat, len, da, dasi) \ + brlez,pn len, 5f ; \ + and pat, 0xff, pat ; \ + brz,pt pat, 1f ; \ + sllx pat, 8, %g1 ; \ + or pat, %g1, pat ; \ + sllx pat, 16, %g1 ; \ + or pat, %g1, pat ; \ + sllx pat, 32, %g1 ; \ + or pat, %g1, pat ; \ + .align 16 ; \ +1: deccc 1, len ; \ + bl,pn %xcc, 5f ; \ + btst 7, dst ; \ + bz,a,pt %xcc, 2f ; \ + inc 1, len ; \ + ST(b, da) pat, [dst] dasi ; \ + b %xcc, 1b ; \ + inc dst ; \ + .align 16 ; \ +2: deccc 32, len ; \ + bl,a,pn %xcc, 3f ; \ + inc 32, len ; \ + ST(x, da) pat, [dst] dasi ; \ + ST(x, da) pat, [dst + 8] dasi ; \ + ST(x, da) pat, [dst + 16] dasi ; \ + ST(x, da) pat, [dst + 24] dasi ; \ + b %xcc, 2b ; \ + inc 32, dst ; \ + .align 16 ; \ +3: deccc 8, len ; \ + bl,a,pn %xcc, 4f ; \ + inc 8, len ; \ + ST(x, da) pat, [dst] dasi ; \ + b %xcc, 3b ; \ + inc 8, dst ; \ + .align 16 ; \ +4: deccc 1, len ; \ + bl,a,pn %xcc, 5f ; \ + nop ; \ + ST(b, da) pat, [dst] dasi ; \ + b %xcc, 4b ; \ + inc 1, dst ; \ +5: -#define COPYINSTR(uaddr, kaddr, len, done) \ - wr %g0, ASI_AIUP, %asi ; \ - _COPYSTR(uaddr, kaddr, len, done, a, %asi, E, E) +/* + * ASI independent implementation of memcpy(3). + * Used to implement bcopy(), copyin(), copyout(), memcpy(), and physcopy(). + * + * Transfer bytes until dst is 8-byte aligned. If src is then also 8 byte + * aligned, transfer 8 bytes, otherwise finish with bytes. The unaligned + * case could be optimized, but it is expected that this is the uncommon + * case and of questionable value. The code to do so is also rather large + * and ugly. + * It has yet to be determined how much unrolling is beneficial. + * + * XXX bcopy() must also check for overlap. This is stupid. + * XXX bcopy() should be implemented as + * #define bcopy(src, dst, len) (void)memcpy((dst), (src), (len)) + * if at all. + */ +#define _MEMCPY(dst, src, len, da, dasi, sa, sasi) \ +1: deccc 1, len ; \ + bl,pn %xcc, 6f ; \ + btst 7, dst ; \ + bz,a,pt %xcc, 2f ; \ + inc 1, len ; \ + LD(ub, sa) [src] sasi, %g1 ; \ + ST(b, da) %g1, [dst] dasi ; \ + inc 1, src ; \ + b %xcc, 1b ; \ + inc 1, dst ; \ + .align 16 ; \ +2: btst 7, src ; \ + bz,a,pt %xcc, 3f ; \ + nop ; \ + b,a %xcc, 5f ; \ + .align 16 ; \ +3: deccc 32, len ; \ + bl,a,pn %xcc, 4f ; \ + inc 32, len ; \ + LD(x, sa) [src] sasi, %g1 ; \ + LD(x, sa) [src + 8] sasi, %g2 ; \ + LD(x, sa) [src + 16] sasi, %g3 ; \ + LD(x, sa) [src + 24] sasi, %g4 ; \ + ST(x, da) %g1, [dst] dasi ; \ + ST(x, da) %g2, [dst + 8] dasi ; \ + ST(x, da) %g3, [dst + 16] dasi ; \ + ST(x, da) %g4, [dst + 24] dasi ; \ + inc 32, src ; \ + b %xcc, 3b ; \ + inc 32, dst ; \ + .align 16 ; \ +4: deccc 8, len ; \ + bl,a,pn %xcc, 5f ; \ + inc 8, len ; \ + LD(x, sa) [src] sasi, %g1 ; \ + ST(x, da) %g1, [dst] dasi ; \ + inc 8, src ; \ + b %xcc, 4b ; \ + inc 8, dst ; \ + .align 16 ; \ +5: deccc 1, len ; \ + bl,a,pn %xcc, 6f ; \ + nop ; \ + LD(ub, sa) [src] sasi, %g1 ; \ + ST(b, da) %g1, [dst] dasi ; \ + inc src ; \ + b %xcc, 5b ; \ + inc dst ; \ +6: #define CATCH_SETUP(label) \ setx label, %g2, %g1 ; \ - ldx [PCPU(CURPCB)], %g6 ; \ + ldx [PCPU(CURTHREAD)], %g6 ; \ + ldx [%g6 + TD_PCB], %g6 ; \ stx %g1, [%g6 + PCB_ONFAULT] ; #define CATCH_END() \ @@ -119,7 +241,7 @@ SU_ALIGNED(storer, label) /* - * void bcmp(void *b, size_t len) + * int bcmp(const void *b1, const void *b2, size_t len) */ ENTRY(bcmp) brz,pn %o2, 2f @@ -127,7 +249,7 @@ ENTRY(bcmp) 1: ldub [%o0 + %o3], %o4 ldub [%o1 + %o3], %o5 cmp %o4, %o5 - bne,pn %xcc, 1f + bne,pn %xcc, 2f inc %o3 deccc %o2 bne,pt %xcc, 1b @@ -139,46 +261,90 @@ END(bcmp) /* * void bcopy(const void *src, void *dst, size_t len) */ +ENTRY(ovbcopy) ENTRY(bcopy) - BCOPY(%o0, %o1, %o2) + /* + * Check for overlap, and copy backwards if so. + */ + sub %o1, %o0, %g1 + cmp %g1, %o2 + bgeu,a,pt %xcc, 3f + nop + + /* + * Copy backwards. + */ + add %o0, %o2, %o0 + add %o1, %o2, %o1 +1: deccc 1, %o2 + bl,a,pn %xcc, 2f + nop + dec 1, %o0 + ldub [%o0], %g1 + dec 1, %o1 + b %xcc, 1b + stb %g1, [%o1] +2: retl + nop + + /* + * Do the fast version. + */ +3: _MEMCPY(%o1, %o0, %o2, E, E, E, E) retl nop END(bcopy) /* - * void ovbcopy(const void *src, void *dst, size_t len) - * XXX handle overlap... + * void bzero(void *b, size_t len) */ -ENTRY(ovbcopy) - BCOPY(%o0, %o1, %o2) +ENTRY(bzero) + _MEMSET(%o0, %g0, %o1, E, E) retl nop -END(ovbcopy) +END(bzero) /* - * void bzero(void *b, size_t len) + * void physzero(vm_offset_t pa, size_t len) */ -ENTRY(bzero) - brz,pn %o1, 1f +ENTRY(physzero) + wr %g0, ASI_PHYS_USE_EC, %asi + _MEMSET(%o0, %g0, %o1, a, %asi) + retl nop -1: deccc %o1 - stb %g0, [%o0] - bne,pt %xcc, 1b - inc %o0 -2: retl +END(physzero) + +/* + * void physcopy(vm_offset_t src, vm_offset_t dst, size_t len) + */ +ENTRY(physcopy) + wr %g0, ASI_PHYS_USE_EC, %asi + _MEMCPY(%o1, %o0, %o2, a, %asi, a, %asi) + retl nop -END(bzero) +END(physcopy) /* * void *memcpy(void *dst, const void *src, size_t len) */ ENTRY(memcpy) - BCOPY(%o1, %o0, %o2) + mov %o0, %o3 + _MEMCPY(%o3, %o1, %o2, E, E, E, E) retl nop END(memcpy) /* + * void *memset(void *b, int c, size_t len) + */ +ENTRY(memset) + mov %o0, %o3 + _MEMSET(%o3, %o1, %o2, E, E) + retl + nop +END(memset) + +/* * int copyin(const void *uaddr, void *kaddr, size_t len) */ ENTRY(copyin) @@ -191,7 +357,8 @@ ENTRY(copyin) stx %o2, [%o3 + KTR_PARM3] 9: #endif - COPYIN(%o0, %o1, %o2) + wr %g0, ASI_AIUP, %asi + _MEMCPY(%o1, %o0, %o2, E, E, a, %asi) CATCH_END() retl clr %o0 @@ -211,10 +378,11 @@ ENTRY(copyinstr) stx %o3, [%g1 + KTR_PARM4] 9: #endif - COPYINSTR(%o0, %o1, %o2, %o3) + wr %g0, ASI_AIUP, %asi + _COPYSTR(%o0, %o1, %o2, %o3, a, %asi, E, E) CATCH_END() retl - mov %o5, %o0 + mov %g1, %o0 END(copyinstr) /* @@ -230,7 +398,8 @@ ENTRY(copyout) stx %o2, [%o3 + KTR_PARM3] 9: #endif - COPYOUT(%o0, %o1, %o2) + wr %g0, ASI_AIUP, %asi + _MEMCPY(%o1, %o0, %o2, a, %asi, E, E) CATCH_END() retl clr %o0 @@ -250,9 +419,9 @@ END(copyout) * int copystr(const void *src, void *dst, size_t len, size_t *done) */ ENTRY(copystr) - COPYSTR(%o0, %o1, %o2, %o3) + _COPYSTR(%o0, %o1, %o2, %o3, E, E, E, E) retl - mov %o5, %o0 + mov %g1, %o0 END(copystr) /* @@ -325,7 +494,6 @@ ENTRY(fsbail) .Lfsalign: retl mov -1, %o0 -END(fsbail) ENTRY(longjmp) set 1, %g3 @@ -355,64 +523,17 @@ ENTRY(setjmp) END(setjmp) /* - * Temporary stack for calling into the firmware. We need to setup one, because - * the MMU mapping for our stack page may be lost. When the firmware tries to - * spill the last window (the others are flushed before), this results in an - * DMMU miss trap, which is fatal with the firmware trap handlers installed. - * Additionally, it seems that the firmware does not immediately switch to an - * own stack (or maybe never?), therefore more space needs to be reserved. - * I hope this is sufficient now. - */ - .align 4 -DATA(ofwstack) - .rept CCFSZ * 8 - .byte 0 - .endr -ofwstack_last: - .rept CCFSZ - .byte 0 - .endr -END(ofwstack) - -/* * void openfirmware(cell_t args[]) */ ENTRY(openfirmware) - /* - * Disable interrupts. The firmware should not deal with our interrupts - * anyway, and the temporary stack is not large enough to hold the stack - * footprint of the interrrupt handling. - */ - rdpr %pstate, %o3 - andn %o3, PSTATE_IE, %o1 - wrpr %o1, 0, %pstate - setx ofwstack_last - SPOFF, %o1, %o2 - save %o2, 0, %sp - flushw - rdpr %tl, %l1 - rdpr %tba, %l2 - mov AA_DMMU_PCXR, %l3 - ldxa [%l3] ASI_DMMU, %l4 - stxa %g0, [%l3] ASI_DMMU - membar #Sync - flush %sp - setx ofw_tba, %l7, %l5 - ldx [%l5], %l5 + save %sp, -CCFSZ, %sp setx ofw_vec, %l7, %l6 ldx [%l6], %l6 rdpr %pil, %l7 - wrpr %g0, 14, %pil - wrpr %l5, 0, %tba - wrpr %g0, 0, %tl + wrpr %g0, PIL_TICK, %pil call %l6 mov %i0, %o0 - wrpr %l1, 0, %tl - wrpr %l2, 0, %tba - stxa %l4, [%l3] ASI_DMMU wrpr %l7, 0, %pil - membar #Sync - flush %sp - restore - retl - wrpr %o3, 0, %pstate + ret + restore %o0, %g0, %o0 END(openfirmware) diff --git a/sys/sparc64/sparc64/support.s b/sys/sparc64/sparc64/support.s index 41469f5..bbd2d13 100644 --- a/sys/sparc64/sparc64/support.s +++ b/sys/sparc64/sparc64/support.s @@ -33,62 +33,184 @@ #include "assym.s" -#define E +#define E /* empty */ +/* + * Generate load and store instructions for the corresponding width and asi + * (or not). Note that we want to evaluate the macro args before + * concatenating, so that E really turns into nothing. + */ #define _LD(w, a) ld ## w ## a #define _ST(w, a) st ## w ## a #define LD(w, a) _LD(w, a) #define ST(w, a) _ST(w, a) -#define _BCOPY(src, dst, len, sa, sasi, da, dasi) \ - brz,pn len, 2f ; \ - mov len, %o3 ; \ -1: LD(ub, sa) [src] sasi, %o4 ; \ - ST(b, da) %o4, [dst] dasi ; \ - dec %o3 ; \ - inc src ; \ - brnz,pt %o3, 1b ; \ - inc dst ; \ -2: - -#define BCOPY(src, dst, len) \ - _BCOPY(src, dst, len, E, E, E, E) - -#define COPYIN(uaddr, kaddr, len) \ - wr %g0, ASI_AIUP, %asi ; \ - _BCOPY(uaddr, kaddr, len, a, %asi, E, E) - -#define COPYOUT(kaddr, uaddr, len) \ - wr %g0, ASI_AIUP, %asi ; \ - _BCOPY(kaddr, uaddr, len, E, E, a, %asi) +/* + * Common code for copy routines. + * + * We use large macros to generate functions for each of the copy routines. + * This allows the load and store instructions to be generated for the right + * operation, asi or not. It is possible to write an asi independent function + * but this would require 2 expensive wrs in the main loop to switch %asi. + * It would also screw up profiling (if we ever get it), but may save some I$. + * We assume that either one of dasi and sasi is empty, or that they are both + * the same (empty or non-empty). It is up to the caller to set %asi. + */ +/* + * ASI independent implementation of copystr(9). + * Used to implement copyinstr() and copystr(). + * + * Return value is in %g1. + */ #define _COPYSTR(src, dst, len, done, sa, sasi, da, dasi) \ - clr %o4 ; \ - clr %o5 ; \ -1: LD(ub, sa) [src] sasi, %g1 ; \ + brz len, 4f ; \ + mov src, %g2 ; \ +1: deccc 1, len ; \ + bl,a,pn %xcc, 3f ; \ + nop ; \ + LD(ub, sa) [src] sasi, %g1 ; \ ST(b, da) %g1, [dst] dasi ; \ - brz,pn %g1, 2f ; \ - inc %o4 ; \ - dec len ; \ - inc src ; \ - brgz,pt len, 1b ; \ + brz,pn %g1, 3f ; \ + inc src ; \ + b %xcc, 1b ; \ inc dst ; \ - mov ENAMETOOLONG, %o5 ; \ -2: brnz,a done, 3f ; \ - stx %o4, [done] ; \ -3: +2: mov ENAMETOOLONG, %g1 ; \ +3: sub src, %g2, %g2 ; \ + brnz,a done, 4f ; \ + stx %g2, [done] ; \ +4: -#define COPYSTR(dst, src, len, done) \ - _COPYSTR(dst, src, len, done, E, E, E, E) +/* + * ASI independent implementation of memset(3). + * Used to implement bzero(), memset() and physzero(). + * + * If the pattern is non-zero, duplicate it to fill 64 bits. + * Store bytes until dst is 8-byte aligned, then store 8 bytes. + * It has yet to be determined how much unrolling is beneficial. + * Could also read and compare before writing to minimize snoop traffic. + * + * XXX bzero() should be implemented as + * #define bzero(dst, len) (void)memset((dst), 0, (len)) + * if at all. + */ +#define _MEMSET(dst, pat, len, da, dasi) \ + brlez,pn len, 5f ; \ + and pat, 0xff, pat ; \ + brz,pt pat, 1f ; \ + sllx pat, 8, %g1 ; \ + or pat, %g1, pat ; \ + sllx pat, 16, %g1 ; \ + or pat, %g1, pat ; \ + sllx pat, 32, %g1 ; \ + or pat, %g1, pat ; \ + .align 16 ; \ +1: deccc 1, len ; \ + bl,pn %xcc, 5f ; \ + btst 7, dst ; \ + bz,a,pt %xcc, 2f ; \ + inc 1, len ; \ + ST(b, da) pat, [dst] dasi ; \ + b %xcc, 1b ; \ + inc dst ; \ + .align 16 ; \ +2: deccc 32, len ; \ + bl,a,pn %xcc, 3f ; \ + inc 32, len ; \ + ST(x, da) pat, [dst] dasi ; \ + ST(x, da) pat, [dst + 8] dasi ; \ + ST(x, da) pat, [dst + 16] dasi ; \ + ST(x, da) pat, [dst + 24] dasi ; \ + b %xcc, 2b ; \ + inc 32, dst ; \ + .align 16 ; \ +3: deccc 8, len ; \ + bl,a,pn %xcc, 4f ; \ + inc 8, len ; \ + ST(x, da) pat, [dst] dasi ; \ + b %xcc, 3b ; \ + inc 8, dst ; \ + .align 16 ; \ +4: deccc 1, len ; \ + bl,a,pn %xcc, 5f ; \ + nop ; \ + ST(b, da) pat, [dst] dasi ; \ + b %xcc, 4b ; \ + inc 1, dst ; \ +5: -#define COPYINSTR(uaddr, kaddr, len, done) \ - wr %g0, ASI_AIUP, %asi ; \ - _COPYSTR(uaddr, kaddr, len, done, a, %asi, E, E) +/* + * ASI independent implementation of memcpy(3). + * Used to implement bcopy(), copyin(), copyout(), memcpy(), and physcopy(). + * + * Transfer bytes until dst is 8-byte aligned. If src is then also 8 byte + * aligned, transfer 8 bytes, otherwise finish with bytes. The unaligned + * case could be optimized, but it is expected that this is the uncommon + * case and of questionable value. The code to do so is also rather large + * and ugly. + * It has yet to be determined how much unrolling is beneficial. + * + * XXX bcopy() must also check for overlap. This is stupid. + * XXX bcopy() should be implemented as + * #define bcopy(src, dst, len) (void)memcpy((dst), (src), (len)) + * if at all. + */ +#define _MEMCPY(dst, src, len, da, dasi, sa, sasi) \ +1: deccc 1, len ; \ + bl,pn %xcc, 6f ; \ + btst 7, dst ; \ + bz,a,pt %xcc, 2f ; \ + inc 1, len ; \ + LD(ub, sa) [src] sasi, %g1 ; \ + ST(b, da) %g1, [dst] dasi ; \ + inc 1, src ; \ + b %xcc, 1b ; \ + inc 1, dst ; \ + .align 16 ; \ +2: btst 7, src ; \ + bz,a,pt %xcc, 3f ; \ + nop ; \ + b,a %xcc, 5f ; \ + .align 16 ; \ +3: deccc 32, len ; \ + bl,a,pn %xcc, 4f ; \ + inc 32, len ; \ + LD(x, sa) [src] sasi, %g1 ; \ + LD(x, sa) [src + 8] sasi, %g2 ; \ + LD(x, sa) [src + 16] sasi, %g3 ; \ + LD(x, sa) [src + 24] sasi, %g4 ; \ + ST(x, da) %g1, [dst] dasi ; \ + ST(x, da) %g2, [dst + 8] dasi ; \ + ST(x, da) %g3, [dst + 16] dasi ; \ + ST(x, da) %g4, [dst + 24] dasi ; \ + inc 32, src ; \ + b %xcc, 3b ; \ + inc 32, dst ; \ + .align 16 ; \ +4: deccc 8, len ; \ + bl,a,pn %xcc, 5f ; \ + inc 8, len ; \ + LD(x, sa) [src] sasi, %g1 ; \ + ST(x, da) %g1, [dst] dasi ; \ + inc 8, src ; \ + b %xcc, 4b ; \ + inc 8, dst ; \ + .align 16 ; \ +5: deccc 1, len ; \ + bl,a,pn %xcc, 6f ; \ + nop ; \ + LD(ub, sa) [src] sasi, %g1 ; \ + ST(b, da) %g1, [dst] dasi ; \ + inc src ; \ + b %xcc, 5b ; \ + inc dst ; \ +6: #define CATCH_SETUP(label) \ setx label, %g2, %g1 ; \ - ldx [PCPU(CURPCB)], %g6 ; \ + ldx [PCPU(CURTHREAD)], %g6 ; \ + ldx [%g6 + TD_PCB], %g6 ; \ stx %g1, [%g6 + PCB_ONFAULT] ; #define CATCH_END() \ @@ -119,7 +241,7 @@ SU_ALIGNED(storer, label) /* - * void bcmp(void *b, size_t len) + * int bcmp(const void *b1, const void *b2, size_t len) */ ENTRY(bcmp) brz,pn %o2, 2f @@ -127,7 +249,7 @@ ENTRY(bcmp) 1: ldub [%o0 + %o3], %o4 ldub [%o1 + %o3], %o5 cmp %o4, %o5 - bne,pn %xcc, 1f + bne,pn %xcc, 2f inc %o3 deccc %o2 bne,pt %xcc, 1b @@ -139,46 +261,90 @@ END(bcmp) /* * void bcopy(const void *src, void *dst, size_t len) */ +ENTRY(ovbcopy) ENTRY(bcopy) - BCOPY(%o0, %o1, %o2) + /* + * Check for overlap, and copy backwards if so. + */ + sub %o1, %o0, %g1 + cmp %g1, %o2 + bgeu,a,pt %xcc, 3f + nop + + /* + * Copy backwards. + */ + add %o0, %o2, %o0 + add %o1, %o2, %o1 +1: deccc 1, %o2 + bl,a,pn %xcc, 2f + nop + dec 1, %o0 + ldub [%o0], %g1 + dec 1, %o1 + b %xcc, 1b + stb %g1, [%o1] +2: retl + nop + + /* + * Do the fast version. + */ +3: _MEMCPY(%o1, %o0, %o2, E, E, E, E) retl nop END(bcopy) /* - * void ovbcopy(const void *src, void *dst, size_t len) - * XXX handle overlap... + * void bzero(void *b, size_t len) */ -ENTRY(ovbcopy) - BCOPY(%o0, %o1, %o2) +ENTRY(bzero) + _MEMSET(%o0, %g0, %o1, E, E) retl nop -END(ovbcopy) +END(bzero) /* - * void bzero(void *b, size_t len) + * void physzero(vm_offset_t pa, size_t len) */ -ENTRY(bzero) - brz,pn %o1, 1f +ENTRY(physzero) + wr %g0, ASI_PHYS_USE_EC, %asi + _MEMSET(%o0, %g0, %o1, a, %asi) + retl nop -1: deccc %o1 - stb %g0, [%o0] - bne,pt %xcc, 1b - inc %o0 -2: retl +END(physzero) + +/* + * void physcopy(vm_offset_t src, vm_offset_t dst, size_t len) + */ +ENTRY(physcopy) + wr %g0, ASI_PHYS_USE_EC, %asi + _MEMCPY(%o1, %o0, %o2, a, %asi, a, %asi) + retl nop -END(bzero) +END(physcopy) /* * void *memcpy(void *dst, const void *src, size_t len) */ ENTRY(memcpy) - BCOPY(%o1, %o0, %o2) + mov %o0, %o3 + _MEMCPY(%o3, %o1, %o2, E, E, E, E) retl nop END(memcpy) /* + * void *memset(void *b, int c, size_t len) + */ +ENTRY(memset) + mov %o0, %o3 + _MEMSET(%o3, %o1, %o2, E, E) + retl + nop +END(memset) + +/* * int copyin(const void *uaddr, void *kaddr, size_t len) */ ENTRY(copyin) @@ -191,7 +357,8 @@ ENTRY(copyin) stx %o2, [%o3 + KTR_PARM3] 9: #endif - COPYIN(%o0, %o1, %o2) + wr %g0, ASI_AIUP, %asi + _MEMCPY(%o1, %o0, %o2, E, E, a, %asi) CATCH_END() retl clr %o0 @@ -211,10 +378,11 @@ ENTRY(copyinstr) stx %o3, [%g1 + KTR_PARM4] 9: #endif - COPYINSTR(%o0, %o1, %o2, %o3) + wr %g0, ASI_AIUP, %asi + _COPYSTR(%o0, %o1, %o2, %o3, a, %asi, E, E) CATCH_END() retl - mov %o5, %o0 + mov %g1, %o0 END(copyinstr) /* @@ -230,7 +398,8 @@ ENTRY(copyout) stx %o2, [%o3 + KTR_PARM3] 9: #endif - COPYOUT(%o0, %o1, %o2) + wr %g0, ASI_AIUP, %asi + _MEMCPY(%o1, %o0, %o2, a, %asi, E, E) CATCH_END() retl clr %o0 @@ -250,9 +419,9 @@ END(copyout) * int copystr(const void *src, void *dst, size_t len, size_t *done) */ ENTRY(copystr) - COPYSTR(%o0, %o1, %o2, %o3) + _COPYSTR(%o0, %o1, %o2, %o3, E, E, E, E) retl - mov %o5, %o0 + mov %g1, %o0 END(copystr) /* @@ -325,7 +494,6 @@ ENTRY(fsbail) .Lfsalign: retl mov -1, %o0 -END(fsbail) ENTRY(longjmp) set 1, %g3 @@ -355,64 +523,17 @@ ENTRY(setjmp) END(setjmp) /* - * Temporary stack for calling into the firmware. We need to setup one, because - * the MMU mapping for our stack page may be lost. When the firmware tries to - * spill the last window (the others are flushed before), this results in an - * DMMU miss trap, which is fatal with the firmware trap handlers installed. - * Additionally, it seems that the firmware does not immediately switch to an - * own stack (or maybe never?), therefore more space needs to be reserved. - * I hope this is sufficient now. - */ - .align 4 -DATA(ofwstack) - .rept CCFSZ * 8 - .byte 0 - .endr -ofwstack_last: - .rept CCFSZ - .byte 0 - .endr -END(ofwstack) - -/* * void openfirmware(cell_t args[]) */ ENTRY(openfirmware) - /* - * Disable interrupts. The firmware should not deal with our interrupts - * anyway, and the temporary stack is not large enough to hold the stack - * footprint of the interrrupt handling. - */ - rdpr %pstate, %o3 - andn %o3, PSTATE_IE, %o1 - wrpr %o1, 0, %pstate - setx ofwstack_last - SPOFF, %o1, %o2 - save %o2, 0, %sp - flushw - rdpr %tl, %l1 - rdpr %tba, %l2 - mov AA_DMMU_PCXR, %l3 - ldxa [%l3] ASI_DMMU, %l4 - stxa %g0, [%l3] ASI_DMMU - membar #Sync - flush %sp - setx ofw_tba, %l7, %l5 - ldx [%l5], %l5 + save %sp, -CCFSZ, %sp setx ofw_vec, %l7, %l6 ldx [%l6], %l6 rdpr %pil, %l7 - wrpr %g0, 14, %pil - wrpr %l5, 0, %tba - wrpr %g0, 0, %tl + wrpr %g0, PIL_TICK, %pil call %l6 mov %i0, %o0 - wrpr %l1, 0, %tl - wrpr %l2, 0, %tba - stxa %l4, [%l3] ASI_DMMU wrpr %l7, 0, %pil - membar #Sync - flush %sp - restore - retl - wrpr %o3, 0, %pstate + ret + restore %o0, %g0, %o0 END(openfirmware) |