author     kmacy <kmacy@FreeBSD.org>                 2006-11-23 19:58:06 +0000
committer  kmacy <kmacy@FreeBSD.org>                 2006-11-23 19:58:06 +0000
commit     c784eb28fd564165eec1c920c5ea555324910901 (patch)
tree       bd9650d04ee7cc53bb5869647cfe9a7209e77afe /sys/sun4v/cddl
parent     d47577789b1b7d11b294d9305019db910d35efe0 (diff)
separate out legitimately CDDL code - optimized routines taken from
OpenSolaris
Diffstat (limited to 'sys/sun4v/cddl')
-rw-r--r--  sys/sun4v/cddl/t1_copy.S  1598
1 file changed, 1598 insertions, 0 deletions
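For orientation before the diff itself: a rough C model (not part of the
commit; the helper structure, comments and the name model_bcopy are
illustrative only) of the dispatch policy that the bcopy()/novbcopy()
assembly below implements. The thresholds (12, 128, 256) are the ones the
assembly uses.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void
    model_bcopy(const void *from, void *to, size_t count)
    {
            const char *s = from;
            char *d = to;
            size_t diff = (s > d) ? (size_t)(s - d) : (size_t)(d - s);

            if (count == 0)
                    return;
            if (count > diff) {                     /* buffers overlap */
                    if (s < d)                      /* ov_bkwd: copy backwards */
                            while (count--)
                                    d[count] = s[count];
                    else                            /* ov_fwd: copy forwards */
                            while (count--)
                                    *d++ = *s++;
            } else if (count < 12) {                /* bytecp: tiny copies */
                    while (count--)
                            *d++ = *s++;
            } else if (count < 128 || diff < 256) {
                    /* bcb_punt/aldoubcp: aligned word/doubleword moves,
                       no 64-byte block operations */
                    memcpy(d, s, count);
            } else {
                    /* do_blockcopy: byte-copy until dst is 64-byte aligned,
                       move 64-byte blocks (ldda/stxa via ASI_LDSTBI_P in the
                       assembly), then copy the residue byte by byte */
                    while (((uintptr_t)d & 63) != 0) {
                            *d++ = *s++;
                            count--;
                    }
                    while (count >= 64) {
                            memcpy(d, s, 64);
                            d += 64;
                            s += 64;
                            count -= 64;
                    }
                    while (count--)
                            *d++ = *s++;
            }
    }

    int
    main(void)
    {
            char buf[32] = "overlap test string";

            model_bcopy(buf, buf + 4, 16);  /* overlapping shift within buf */
            return (0);
    }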
diff --git a/sys/sun4v/cddl/t1_copy.S b/sys/sun4v/cddl/t1_copy.S
new file mode 100644
index 0000000..6fe5a3d
--- /dev/null
+++ b/sys/sun4v/cddl/t1_copy.S
@@ -0,0 +1,1598 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$")
+
+#include <machine/asi.h>
+#include <machine/asmacros.h>
+#include <machine/ktr.h>
+#include <machine/pstate.h>
+#include <machine/trap.h>
+#include <machine/tstate.h>
+#include <machine/wstate.h>
+#include <machine/hypervisorvar.h>
+
+	.register %g2,#ignore
+	.register %g3,#ignore
+	.register %g6,#ignore
+	.register %g7,#ignore
+
+
+/*
+ * This define is to align data for the unaligned source cases.
+ * data1, data2 and data3 are merged into data1 and data2.
+ * data3 is preserved for the next merge.
+ */
+#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
+	sllx	data1, lshift, data1	;\
+	srlx	data2, rshift, tmp	;\
+	or	data1, tmp, data1	;\
+	sllx	data2, lshift, data2	;\
+	srlx	data3, rshift, tmp	;\
+	or	data2, tmp, data2
+/*
+ * This macro is to align the data. Basically it merges
+ * data1 and data2 to form a double word.
+ */
+#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
+	sllx	data1, lshift, data1	;\
+	srlx	data2, rshift, tmp	;\
+	or	data1, tmp, data1
+
+
+
+
+
+/*
+ * DGDEF and DGDEF2 provide global data declarations.
+ *
+ * DGDEF provides a word aligned word of storage.
+ *
+ * DGDEF2 allocates "sz" bytes of storage with **NO** alignment. This
+ * implies this macro is best used for byte arrays.
+ *
+ * DGDEF3 allocates "sz" bytes of storage with "algn" alignment.
+ */
+#define	DGDEF2(name, sz) \
+	.section ".data" ; \
+	.global name ; \
+	.type name, @object ; \
+	.size name, sz; \
+name:
+
+#define	DGDEF3(name, sz, algn) \
+	.section ".data" ; \
+	.align	algn ; \
+	.global name ; \
+	.type name, @object ; \
+	.size name, sz; \
+name:
+
+#define	DGDEF(name)	DGDEF3(name, 4, 4)
+
+.align 4
+DGDEF(hw_copy_limit_1)
+.word 0x100
+DGDEF(hw_copy_limit_2)
+.word 0x200
+DGDEF(hw_copy_limit_4)
+.word 0x400
+DGDEF(hw_copy_limit_8)
+.word 0x400
+.align 64
+.section ".text"
+
+
+#if defined(lint)
+
+/*ARGSUSED*/
+void
+ovbcopy(const void *from, void *to, size_t count)
+{}
+
+#else	/* lint */
+
+ENTRY(bcopy)
+	tst	%o2			! check count
+	bgu,a	%xcc, 1f		! nothing to do or bad arguments
+	subcc	%o0, %o1, %o3		! difference of from and to address
+
+	retl				! return
+	nop
+1:
+	bneg,a	%xcc, 2f
+	neg	%o3			! if < 0, make it positive
+2:	cmp	%o2, %o3		! cmp size and abs(from - to)
+	bleu	%xcc, novbcopy		! if size <= abs(diff): use novbcopy,
+	nop
+	cmp	%o0, %o1		! compare from and to addresses
+	blu	%xcc, ov_bkwd		! if from < to, copy backwards
+	nop
+	!
+	! Copy forwards.
+	!
+ov_fwd:
+	ldub	[%o0], %o3		! read from address
+	inc	%o0			! inc from address
+	stb	%o3, [%o1]		! write to address
+	deccc	%o2			! dec count
+	bgu	%xcc, ov_fwd		! loop till done
+	inc	%o1			! inc to address
+
+	retl				! return
+	nop
+	!
+	! Copy backwards.
+	!
+ov_bkwd:
+	deccc	%o2			! dec count
+	ldub	[%o0 + %o2], %o3	! get byte at end of src
+	bgu	%xcc, ov_bkwd		! loop till done
+	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
+
+	retl				! return
+	nop
+END(bcopy)
+
+#endif	/* lint */
+
+
+
+/*
+ * Copy a block of storage - must not overlap (from + len <= to).
+ */
+ENTRY(novbcopy)
+
+	save	%sp, -SA(MINFRAME), %sp
+
+do_copy:
+	cmp	%i2, 12			! for small counts
+	blu	%xcc, bytecp		! just copy bytes
+	nop
+
+	cmp	%i2, 128		! for less than 128 bytes
+	blu,pn	%xcc, bcb_punt		! no block st/quad ld
+	nop
+#if 0
+	set	use_hw_bcopy, %o2
+	ld	[%o2], %o2
+	tst	%o2
+	bz	bcb_punt
+	nop
+#endif
+	subcc	%i1, %i0, %i3
+	bneg,a,pn %xcc, 1f
+	neg	%i3
+1:
+	/*
+	 * Compare against 256 since we should be checking block addresses
+	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
+	 * src = dest + (64 * 3) + 63.
+	 */
+	cmp	%i3, 256
+	blu,pn	%xcc, bcb_punt
+	nop
+
+	/*
+	 * Copies that reach here have at least 2 blocks of data to copy.
+	 */
+do_blockcopy:
+	! Swap src/dst since the code below is memcpy code
+	! and memcpy/bcopy have different calling sequences
+	mov	%i1, %i5
+	mov	%i0, %i1
+	mov	%i5, %i0
+
+	andcc	%i0, 0x3f, %i3		! is dst aligned on 64 bytes
+	bz	%xcc, chksrc		! dst is already double aligned
+	sub	%i3, 0x40, %i3
+	neg	%i3			! bytes till dst 64 bytes aligned
+	sub	%i2, %i3, %i2		! update i2 with new count
+
+1:	ldub	[%i1], %i4
+	stb	%i4, [%i0]
+	inc	%i1
+	deccc	%i3
+	bgu	%xcc, 1b
+	inc	%i0
+
+	! Now Destination is block (64 bytes) aligned
+chksrc:
+	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
+	sub	%i2, %i3, %i2		! Residue bytes in %i2
+
+	wr	%g0, ASI_LDSTBI_P, %asi
+
+	andcc	%i1, 0xf, %o2		! is src quadword aligned
+	bz,pn	%xcc, blkcpy		! src offset in %o2
+	nop
+	cmp	%o2, 0x8
+	bg	cpy_upper_double
+	nop
+	bl	cpy_lower_double
+	nop
+
+	! Falls through when source offset is equal to 8 i.e.
+	! source is double word aligned.
+	! In this case no shift/merge of data is required
+	sub	%i1, %o2, %i1		! align the src at 16 bytes.
+	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
+	prefetch [%l0+0x0], #one_read
+	ldda	[%i1+0x0]%asi, %l2
+loop0:
+	ldda	[%i1+0x10]%asi, %l4
+	prefetch [%l0+0x40], #one_read
+
+	stxa	%l3, [%i0+0x0]%asi
+	stxa	%l4, [%i0+0x8]%asi
+
+	ldda	[%i1+0x20]%asi, %l2
+	stxa	%l5, [%i0+0x10]%asi
+	stxa	%l2, [%i0+0x18]%asi
+
+	ldda	[%i1+0x30]%asi, %l4
+	stxa	%l3, [%i0+0x20]%asi
+	stxa	%l4, [%i0+0x28]%asi
+
+	ldda	[%i1+0x40]%asi, %l2
+	stxa	%l5, [%i0+0x30]%asi
+	stxa	%l2, [%i0+0x38]%asi
+
+	add	%l0, 0x40, %l0
+	add	%i1, 0x40, %i1
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, loop0
+	add	%i0, 0x40, %i0
+	ba	blkdone
+	add	%i1, %o2, %i1		! increment the source by src offset
+					! the src offset was stored in %o2
+
+cpy_lower_double:
+	sub	%i1, %o2, %i1		! align the src at 16 bytes.
+	sll	%o2, 3, %o0		! %o0 left shift
+	mov	0x40, %o1
+	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
+	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
+	prefetch [%l0+0x0], #one_read
+	ldda	[%i1+0x0]%asi, %l2	! partial data in %l2 and %l3 has
+					! complete data
+loop1:
+	ldda	[%i1+0x10]%asi, %l4	! %l4 has partial data for this read.
+	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
+							! into %l2 and %l3
+	prefetch [%l0+0x40], #one_read
+	stxa	%l2, [%i0+0x0]%asi
+	stxa	%l3, [%i0+0x8]%asi
+
+	ldda	[%i1+0x20]%asi, %l2
+	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
+	stxa	%l4, [%i0+0x10]%asi			! %l4 from previous read
+	stxa	%l5, [%i0+0x18]%asi			! into %l4 and %l5
+
+	! Repeat the same for next 32 bytes.
+
+	ldda	[%i1+0x30]%asi, %l4
+	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
+	stxa	%l2, [%i0+0x20]%asi
+	stxa	%l3, [%i0+0x28]%asi
+
+	ldda	[%i1+0x40]%asi, %l2
+	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
+	stxa	%l4, [%i0+0x30]%asi
+	stxa	%l5, [%i0+0x38]%asi
+
+	add	%l0, 0x40, %l0
+	add	%i1, 0x40, %i1
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, loop1
+	add	%i0, 0x40, %i0
+	ba	blkdone
+	add	%i1, %o2, %i1		! increment the source by src offset
+					! the src offset was stored in %o2
+
+cpy_upper_double:
+	sub	%i1, %o2, %i1		! align the src at 16 bytes.
+	mov	0x8, %o0
+	sub	%o2, %o0, %o0
+	sll	%o0, 3, %o0		! %o0 left shift
+	mov	0x40, %o1
+	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
+	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
+	prefetch [%l0+0x0], #one_read
+	ldda	[%i1+0x0]%asi, %l2	! partial data in %l3 for this read and
+					! no data in %l2
+loop2:
+	ldda	[%i1+0x10]%asi, %l4	! %l4 has complete data and %l5 has
+					! partial
+	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
+							! into %l3 and %l4
+	prefetch [%l0+0x40], #one_read
+	stxa	%l3, [%i0+0x0]%asi
+	stxa	%l4, [%i0+0x8]%asi
+
+	ldda	[%i1+0x20]%asi, %l2
+	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
+	stxa	%l5, [%i0+0x10]%asi			! %l5 from previous read
+	stxa	%l2, [%i0+0x18]%asi			! into %l5 and %l2
+
+	! Repeat the same for next 32 bytes.
+
+	ldda	[%i1+0x30]%asi, %l4
+	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
+	stxa	%l3, [%i0+0x20]%asi
+	stxa	%l4, [%i0+0x28]%asi
+
+	ldda	[%i1+0x40]%asi, %l2
+	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
+	stxa	%l5, [%i0+0x30]%asi
+	stxa	%l2, [%i0+0x38]%asi
+
+	add	%l0, 0x40, %l0
+	add	%i1, 0x40, %i1
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, loop2
+	add	%i0, 0x40, %i0
+	ba	blkdone
+	add	%i1, %o2, %i1		! increment the source by src offset
+					! the src offset was stored in %o2
+
+
+	! Both Source and Destination are block aligned.
+	! Do fast copy using ASI_LDSTBI_P
+blkcpy:
+	prefetch [%i1+0x0], #one_read
+1:
+	ldda	[%i1+0x0]%asi, %l0
+	ldda	[%i1+0x10]%asi, %l2
+	prefetch [%i1+0x40], #one_read
+
+	stxa	%l0, [%i0+0x0]%asi
+	ldda	[%i1+0x20]%asi, %l4
+	ldda	[%i1+0x30]%asi, %l6
+
+	stxa	%l1, [%i0+0x8]%asi
+	stxa	%l2, [%i0+0x10]%asi
+	stxa	%l3, [%i0+0x18]%asi
+	stxa	%l4, [%i0+0x20]%asi
+	stxa	%l5, [%i0+0x28]%asi
+	stxa	%l6, [%i0+0x30]%asi
+	stxa	%l7, [%i0+0x38]%asi
+
+	add	%i1, 0x40, %i1
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, 1b
+	add	%i0, 0x40, %i0
+
+blkdone:
+	tst	%i2
+	bz,pt	%xcc, blkexit
+	nop
+
+residue:
+	ldub	[%i1], %i4
+	stb	%i4, [%i0]
+	inc	%i1
+	deccc	%i2
+	bgu	%xcc, residue
+	inc	%i0
+
+blkexit:
+	membar	#Sync			! sync error barrier
+	ret
+	restore	%g0, 0, %o0
+
+bcb_punt:
+	!
+	! use aligned transfers where possible
+	!
+	xor	%i0, %i1, %o4		! xor from and to address
+	btst	7, %o4			! if lower three bits zero
+	bz	aldoubcp		! can align on double boundary
+	nop				! assembler complains about label
+
+	xor	%i0, %i1, %o4		! xor from and to address
+	btst	3, %o4			! if lower two bits zero
+	bz	alwordcp		! can align on word boundary
+	btst	3, %i0			! delay slot, from address unaligned?
+	!
+	! use aligned reads and writes where possible
+	! this differs from wordcp in that it copes
+	! with odd alignment between source and destination
+	! using word reads and writes with the proper shifts
+	! in between to align transfers to and from memory
+	! i0 - src address, i1 - dest address, i2 - count
+	! i3, i4 - tmps used for generating complete word
+	! i5 (word to write)
+	! l0 size in bits of upper part of source word (US)
+	! l1 size in bits of lower part of source word (LS = 32 - US)
+	! l2 size in bits of upper part of destination word (UD)
+	! l3 size in bits of lower part of destination word (LD = 32 - UD)
+	! l4 number of bytes leftover after aligned transfers complete
+	! l5 the number 32
+	!
+	mov	32, %l5			! load an oft-needed constant
+	bz	align_dst_only
+	btst	3, %i1			! is destination address aligned?
+	clr	%i4			! clear registers used in either case
+	bz	align_src_only
+	clr	%l0
+	!
+	! both source and destination addresses are unaligned
+	!
+1:					! align source
+	ldub	[%i0], %i3		! read a byte from source address
+	add	%i0, 1, %i0		! increment source address
+	or	%i4, %i3, %i4		! or in with previous bytes (if any)
+	btst	3, %i0			! is source aligned?
+	add	%l0, 8, %l0		! increment size of upper source (US)
+	bnz,a	1b
+	sll	%i4, 8, %i4		! make room for next byte
+
+	sub	%l5, %l0, %l1		! generate shift left count (LS)
+	sll	%i4, %l1, %i4		! prepare to get rest
+	ld	[%i0], %i3		! read a word
+	add	%i0, 4, %i0		! increment source address
+	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
+	or	%i4, %i5, %i5		! merge
+	mov	24, %l3			! align destination
+1:
+	srl	%i5, %l3, %i4		! prepare to write a single byte
+	stb	%i4, [%i1]		! write a byte
+	add	%i1, 1, %i1		! increment destination address
+	sub	%i2, 1, %i2		! decrement count
+	btst	3, %i1			! is destination aligned?
+	bnz,a	1b
+	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
+	sub	%l5, %l3, %l2		! generate shift left count (UD)
+	sll	%i5, %l2, %i5		! move leftover into upper bytes
+	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
+	bgu	%xcc, more_needed	! need more to fill than we have
+	nop
+
+	sll	%i3, %l1, %i3		! clear upper used byte(s)
+	srl	%i3, %l1, %i3
+	! get the odd bytes between alignments
+	sub	%l0, %l2, %l0		! regenerate shift count
+	sub	%l5, %l0, %l1		! generate new shift left count (LS)
+	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
+	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
+	srl	%i3, %l0, %i4
+	or	%i5, %i4, %i5
+	st	%i5, [%i1]		! write a word
+	subcc	%i2, 4, %i2		! decrement count
+	bz	%xcc, unalign_out
+	add	%i1, 4, %i1		! increment destination address
+
+	b	2f
+	sll	%i3, %l1, %i5		! get leftover into upper bits
+more_needed:
+	sll	%i3, %l0, %i3		! save remaining byte(s)
+	srl	%i3, %l0, %i3
+	sub	%l2, %l0, %l1		! regenerate shift count
+	sub	%l5, %l1, %l0		! generate new shift left count
+	sll	%i3, %l1, %i4		! move to fill empty space
+	b	3f
+	or	%i5, %i4, %i5		! merge to complete word
+	!
+	! the source address is aligned and destination is not
+	!
+align_dst_only:
+	ld	[%i0], %i4		! read a word
+	add	%i0, 4, %i0		! increment source address
+	mov	24, %l0			! initial shift alignment count
+1:
+	srl	%i4, %l0, %i3		! prepare to write a single byte
+	stb	%i3, [%i1]		! write a byte
+	add	%i1, 1, %i1		! increment destination address
+	sub	%i2, 1, %i2		! decrement count
+	btst	3, %i1			! is destination aligned?
+	bnz,a	1b
+	sub	%l0, 8, %l0		! delay slot, decrement shift count
+xfer:
+	sub	%l5, %l0, %l1		! generate shift left count
+	sll	%i4, %l1, %i5		! get leftover
+3:
+	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
+	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
+2:
+	ld	[%i0], %i3		! read a source word
+	add	%i0, 4, %i0		! increment source address
+	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
+	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
+	st	%i5, [%i1]		! write a destination word
+	subcc	%i2, 4, %i2		! decrement count
+	bz	%xcc, unalign_out	! check if done
+	add	%i1, 4, %i1		! increment destination address
+	b	2b			! loop
+	sll	%i3, %l1, %i5		! get leftover
+unalign_out:
+	tst	%l4			! any bytes leftover?
+	bz	%xcc, cpdone
+	nop
+1:
+	sub	%l0, 8, %l0		! decrement shift
+	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
+	stb	%i4, [%i1]		! write a byte
+	subcc	%l4, 1, %l4		! decrement count
+	bz	%xcc, cpdone		! done?
+	add	%i1, 1, %i1		! increment destination
+	tst	%l0			! any more previously read bytes
+	bnz	%xcc, 1b		! we have leftover bytes
+	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
+	b	dbytecp			! let dbytecp do the rest
+	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
+	!
+	! the destination address is aligned and the source is not
+	!
+align_src_only:
+	ldub	[%i0], %i3		! read a byte from source address
+	add	%i0, 1, %i0		! increment source address
+	or	%i4, %i3, %i4		! or in with previous bytes (if any)
+	btst	3, %i0			! is source aligned?
+	add	%l0, 8, %l0		! increment shift count (US)
+	bnz,a	align_src_only
+	sll	%i4, 8, %i4		! make room for next byte
+	b,a	xfer
+	!
+	! if from address unaligned for double-word moves,
+	! move bytes till it is, if count is < 56 it could take
+	! longer to align the thing than to do the transfer
+	! in word size chunks right away
+	!
+aldoubcp:
+	cmp	%i2, 56			! if count < 56, use wordcp, it takes
+	blu,a	%xcc, alwordcp		! longer to align doubles than words
+	mov	3, %o0			! mask for word alignment
+	call	alignit			! copy bytes until aligned
+	mov	7, %o0			! mask for double alignment
+	!
+	! source and destination are now double-word aligned
+	! i3 has aligned count returned by alignit
+	!
+	and	%i2, 7, %i2		! unaligned leftover count
+	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
+5:
+	ldx	[%i0+%i1], %o4		! read from address
+	stx	%o4, [%i1]		! write at destination address
+	subcc	%i3, 8, %i3		! dec count
+	bgu	%xcc, 5b
+	add	%i1, 8, %i1		! delay slot, inc to address
+	cmp	%i2, 4			! see if we can copy a word
+	blu	%xcc, dbytecp		! if 3 or less bytes use bytecp
+	nop
+	!
+	! for leftover bytes we fall into wordcp, if needed
+	!
+wordcp:
+	and	%i2, 3, %i2		! unaligned leftover count
+5:
+	ld	[%i0+%i1], %o4		! read from address
+	st	%o4, [%i1]		! write at destination address
+	subcc	%i3, 4, %i3		! dec count
+	bgu	%xcc, 5b
+	add	%i1, 4, %i1		! delay slot, inc to address
+	b,a	dbytecp
+
+	! we come here to align copies on word boundaries
+alwordcp:
+	call	alignit			! go word-align it
+	mov	3, %o0			! bits that must be zero to be aligned
+	b	wordcp
+	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
+
+	!
+	! byte copy, works with any alignment
+	!
+bytecp:
+	b	dbytecp
+	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
+
+	!
+	! differenced byte copy, works with any alignment
+	! assumes dest in %i1 and (source - dest) in %i0
+	!
+1:
+	stb	%o4, [%i1]		! write to address
+	inc	%i1			! inc to address
+dbytecp:
+	deccc	%i2			! dec count
+	bgeu,a	%xcc, 1b		! loop till done
+	ldub	[%i0+%i1], %o4		! read from address
+cpdone:
+	membar	#Sync			! sync error barrier
+	ret
+	restore	%g0, 0, %o0		! return (0)
+
+/*
+ * Common code used to align transfers on word and doubleword
+ * boundaries. Aligns source and destination and returns a count
+ * of aligned bytes to transfer in %i3
+ */
+1:
+	inc	%i0			! inc from
+	stb	%o4, [%i1]		! write a byte
+	inc	%i1			! inc to
+	dec	%i2			! dec count
+alignit:
+	btst	%o0, %i0		! %o0 is bit mask to check for alignment
+	bnz,a	1b
+	ldub	[%i0], %o4		! read next byte
+
+	retl
+	andn	%i2, %o0, %i3		! return size of aligned bytes
+END(novbcopy)
+
+
+/*
+ * hwblkclr - clears block-aligned, block-multiple-sized regions that are
+ * at least 256 bytes long using Niagara's block stores/quad store.
+ * If the criteria for using this routine are not met then it calls bzero
+ * and returns 1. Otherwise 0 is returned indicating success.
+ * Caller is responsible for ensuring use_hw_bzero is true and that
+ * kpreempt_disable() has been called.
+ */
+#ifdef lint
+/*ARGSUSED*/
+int
+hwblkclr(void *addr, size_t len)
+{
+	return(0);
+}
+#else	/* lint */
+	! %i0 - start address
+	! %i1 - length of region (multiple of 64)
+
+ENTRY(hwblkclr)
+	save	%sp, -SA(MINFRAME), %sp
+
+	! Must be block-aligned
+	andcc	%i0, 0x3f, %g0
+	bnz,pn	%xcc, 1f
+	nop
+
+	! ... and must be 256 bytes or more
+	cmp	%i1, 0x100
+	blu,pn	%xcc, 1f
+	nop
+
+	! ... and length must be a multiple of 64
+	andcc	%i1, 0x3f, %g0
+	bz,pn	%xcc, pz_doblock
+	wr	%g0, ASI_LDSTBI_P, %asi
+
+1:	! punt, call bzero but notify the caller that bzero was used
+	mov	%i0, %o0
+	call	bzero
+	mov	%i1, %o1
+	ret
+	restore	%g0, 1, %o0	! return (1) - did not use block operations
+
+	! Already verified that there are at least 256 bytes to set
+pz_doblock:
+	stxa	%g0, [%i0+0x0]%asi
+	stxa	%g0, [%i0+0x40]%asi
+	stxa	%g0, [%i0+0x80]%asi
+	stxa	%g0, [%i0+0xc0]%asi
+
+	stxa	%g0, [%i0+0x8]%asi
+	stxa	%g0, [%i0+0x10]%asi
+	stxa	%g0, [%i0+0x18]%asi
+	stxa	%g0, [%i0+0x20]%asi
+	stxa	%g0, [%i0+0x28]%asi
+	stxa	%g0, [%i0+0x30]%asi
+	stxa	%g0, [%i0+0x38]%asi
+
+	stxa	%g0, [%i0+0x48]%asi
+	stxa	%g0, [%i0+0x50]%asi
+	stxa	%g0, [%i0+0x58]%asi
+	stxa	%g0, [%i0+0x60]%asi
+	stxa	%g0, [%i0+0x68]%asi
+	stxa	%g0, [%i0+0x70]%asi
+	stxa	%g0, [%i0+0x78]%asi
+
+	stxa	%g0, [%i0+0x88]%asi
+	stxa	%g0, [%i0+0x90]%asi
+	stxa	%g0, [%i0+0x98]%asi
+	stxa	%g0, [%i0+0xa0]%asi
+	stxa	%g0, [%i0+0xa8]%asi
+	stxa	%g0, [%i0+0xb0]%asi
+	stxa	%g0, [%i0+0xb8]%asi
+
+	stxa	%g0, [%i0+0xc8]%asi
+	stxa	%g0, [%i0+0xd0]%asi
+	stxa	%g0, [%i0+0xd8]%asi
+	stxa	%g0, [%i0+0xe0]%asi
+	stxa	%g0, [%i0+0xe8]%asi
+	stxa	%g0, [%i0+0xf0]%asi
+	stxa	%g0, [%i0+0xf8]%asi
+
+	sub	%i1, 0x100, %i1
+	cmp	%i1, 0x100
+	bgu,pt	%xcc, pz_doblock
+	add	%i0, 0x100, %i0
+
+2:
+	! Check if more than 64 bytes to set
+	cmp	%i1, 0x40
+	blu	%xcc, pz_finish
+	nop
+
+3:
+	stxa	%g0, [%i0+0x0]%asi
+	stxa	%g0, [%i0+0x8]%asi
+	stxa	%g0, [%i0+0x10]%asi
+	stxa	%g0, [%i0+0x18]%asi
+	stxa	%g0, [%i0+0x20]%asi
+	stxa	%g0, [%i0+0x28]%asi
+	stxa	%g0, [%i0+0x30]%asi
+	stxa	%g0, [%i0+0x38]%asi
+
+	subcc	%i1, 0x40, %i1
+	bgu,pt	%xcc, 3b
+	add	%i0, 0x40, %i0
+
+pz_finish:
+	membar	#Sync
+	ret
+	restore	%g0, 0, %o0		! return (bzero or not)
+END(hwblkclr)
+#endif	/* lint */
+
+#if defined(lint)
+
+/* ARGSUSED */
+void
+bzero(void *addr, size_t count)
+{}
+
+#else	/* lint */
+
+ENTRY(bzero)
+	wr	%g0, ASI_P, %asi
+
+	cmp	%o1, 7
+	blu,pn	%xcc, byteclr
+	nop
+
+	cmp	%o1, 15
+	blu,pn	%xcc, wdalign
+	nop
+
+	andcc	%o0, 7, %o3		! is addr aligned on an 8 byte boundary
+	bz,pt	%xcc, blkalign		! already double aligned
+	sub	%o3, 8, %o3		! -(bytes till double aligned)
+	add	%o1, %o3, %o1		! update o1 with new count
+
+1:
+	stba	%g0, [%o0]%asi
+	inccc	%o3
+	bl,pt	%xcc, 1b
+	inc	%o0
+
+	! Now address is double aligned
+blkalign:
+	cmp	%o1, 0x80		! check if there are 128 bytes to set
+	blu,pn	%xcc, bzero_small
+	mov	%o1, %o3
+#if 0
+	sethi	%hi(use_hw_bzero), %o2
+	ld	[%o2 + %lo(use_hw_bzero)], %o2
+	tst	%o2
+	bz	%xcc, bzero_small
+	mov	%o1, %o3
+#endif
+	rd	%asi, %o3
+	wr	%g0, ASI_LDSTBI_P, %asi
+	cmp	%o3, ASI_P
+	bne,a	%xcc, algnblk
+	wr	%g0, ASI_LDSTBI_AIUS, %asi
+
+algnblk:
+	andcc	%o0, 0x3f, %o3		! is block aligned?
+	bz,pt	%xcc, bzero_blk
+	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
+	add	%o1, %o3, %o1		! o1 is the remainder
+
+	! Clear -(%o3) bytes till block aligned
+1:
+	stxa	%g0, [%o0]%asi
+	addcc	%o3, 8, %o3
+	bl,pt	%xcc, 1b
+	add	%o0, 8, %o0
+
+bzero_blk:
+	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
+	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes
+
+	cmp	%o4, 0x100		! 256 bytes or more
+	blu,pn	%xcc, 3f
+	nop
+
+2:
+	stxa	%g0, [%o0+0x0]%asi
+	stxa	%g0, [%o0+0x40]%asi
+	stxa	%g0, [%o0+0x80]%asi
+	stxa	%g0, [%o0+0xc0]%asi
+
+	stxa	%g0, [%o0+0x8]%asi
+	stxa	%g0, [%o0+0x10]%asi
+	stxa	%g0, [%o0+0x18]%asi
+	stxa	%g0, [%o0+0x20]%asi
+	stxa	%g0, [%o0+0x28]%asi
+	stxa	%g0, [%o0+0x30]%asi
+	stxa	%g0, [%o0+0x38]%asi
+
+	stxa	%g0, [%o0+0x48]%asi
+	stxa	%g0, [%o0+0x50]%asi
+	stxa	%g0, [%o0+0x58]%asi
+	stxa	%g0, [%o0+0x60]%asi
+	stxa	%g0, [%o0+0x68]%asi
+	stxa	%g0, [%o0+0x70]%asi
+	stxa	%g0, [%o0+0x78]%asi
+
+	stxa	%g0, [%o0+0x88]%asi
+	stxa	%g0, [%o0+0x90]%asi
+	stxa	%g0, [%o0+0x98]%asi
+	stxa	%g0, [%o0+0xa0]%asi
+	stxa	%g0, [%o0+0xa8]%asi
+	stxa	%g0, [%o0+0xb0]%asi
+	stxa	%g0, [%o0+0xb8]%asi
+
+	stxa	%g0, [%o0+0xc8]%asi
+	stxa	%g0, [%o0+0xd0]%asi
+	stxa	%g0, [%o0+0xd8]%asi
+	stxa	%g0, [%o0+0xe0]%asi
+	stxa	%g0, [%o0+0xe8]%asi
+	stxa	%g0, [%o0+0xf0]%asi
+	stxa	%g0, [%o0+0xf8]%asi
+
+	sub	%o4, 0x100, %o4
+	cmp	%o4, 0x100
+	bgu,pt	%xcc, 2b
+	add	%o0, 0x100, %o0
+
+3:
+	! ... check if 64 bytes to set
+	cmp	%o4, 0x40
+	blu	%xcc, bzero_blk_done
+	nop
+
+4:
+	stxa	%g0, [%o0+0x0]%asi
+	stxa	%g0, [%o0+0x8]%asi
+	stxa	%g0, [%o0+0x10]%asi
+	stxa	%g0, [%o0+0x18]%asi
+	stxa	%g0, [%o0+0x20]%asi
+	stxa	%g0, [%o0+0x28]%asi
+	stxa	%g0, [%o0+0x30]%asi
+	stxa	%g0, [%o0+0x38]%asi
+
+	subcc	%o4, 0x40, %o4
+	bgu,pt	%xcc, 3b
+	add	%o0, 0x40, %o0
+
+bzero_blk_done:
+	membar	#Sync
+	!
+	! Undo asi register setting.
+	!
+	rd	%asi, %o4
+	wr	%g0, ASI_P, %asi
+	cmp	%o4, ASI_LDSTBI_P
+	bne,a	%xcc, bzero_small
+	wr	%g0, ASI_AIUS, %asi
+
+bzero_small:
+	! Set the remaining doubles
+	subcc	%o3, 8, %o3		! Can we store any doubles?
+	blu,pn	%xcc, byteclr
+	and	%o1, 7, %o1		! calc bytes left after doubles
+
+dbclr:
+	stxa	%g0, [%o0]%asi		! Clear the doubles
+	subcc	%o3, 8, %o3
+	bgeu,pt	%xcc, dbclr
+	add	%o0, 8, %o0
+
+	ba	byteclr
+	nop
+
+wdalign:
+	andcc	%o0, 3, %o3		! is addr aligned on a word boundary
+	bz,pn	%xcc, wdclr
+	andn	%o1, 3, %o3		! create word sized count in %o3
+
+	dec	%o1			! decrement count
+	stba	%g0, [%o0]%asi		! clear a byte
+	ba	wdalign
+	inc	%o0			! next byte
+
+wdclr:
+	sta	%g0, [%o0]%asi		! 4-byte clearing loop
+	subcc	%o3, 4, %o3
+	bnz,pt	%xcc, wdclr
+	inc	4, %o0
+
+	and	%o1, 3, %o1		! leftover count, if any
+
+byteclr:
+	! Set the leftover bytes
+	brz	%o1, bzero_exit
+	nop
+
+7:
+	deccc	%o1			! byte clearing loop
+	stba	%g0, [%o0]%asi
+	bgu,pt	%xcc, 7b
+	inc	%o0
+
+bzero_exit:
+	retl
+	clr	%o0			! return (0)
+
+END(bzero)
+#endif	/* lint */
+
+
+#if 0
+#define	SMALL_LIMIT 7
+#if defined(lint)
+
+/*ARGSUSED*/
+int
+copyin(const void *uaddr, void *kaddr, size_t count)
+{ return (0); }
+
+#else	/* lint */
+
+ENTRY(copyin)
+	!
+	! Check the length and bail if zero.
+	!
+	tst	%o2
+	bnz,pt	%xcc, 1f
+	nop
+	retl
+	clr	%o0
+#if 0
+1:
+	sethi	%hi(copyio_fault), %o4
+	or	%o4, %lo(copyio_fault), %o4
+	sethi	%hi(copyio_fault_nowindow), %o3
+	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
+	or	%o3, %lo(copyio_fault_nowindow), %o3
+	membar	#Sync
+	stn	%o3, [THREAD_REG + T_LOFAULT]
+
+	mov	%o0, SAVE_SRC
+	mov	%o1, SAVE_DST
+	mov	%o2, SAVE_COUNT
+#endif
+	!
+	! Check to see if we're more than SMALL_LIMIT.
+	!
+	subcc	%o2, SMALL_LIMIT, %o3
+	bgu,a,pt %xcc, dci_ns
+	or	%o0, %o1, %o3
+	!
+	! What was previously ".small_copyin"
+	!
+dcibcp:
+	sub	%g0, %o2, %o3		! setup for copy loop
+	add	%o0, %o2, %o0
+	add	%o1, %o2, %o1
+	ba,pt	%xcc, dcicl
+	lduba	[%o0 + %o3]ASI_AIUS, %o4
+	!
+	! %o0 and %o1 point at the end and remain pointing at the end
+	! of their buffers. We pull things out by adding %o3 (which is
+	! the negation of the length) to the buffer end which gives us
+	! the current location in the buffers. By incrementing %o3 we walk
+	! through both buffers without having to bump each buffer's
+	! pointer. A very fast 4 instruction loop.
+	!
+	.align 16
+dcicl:
+	stb	%o4, [%o1 + %o3]
+	inccc	%o3
+	bl,a,pt %xcc, dcicl
+	lduba	[%o0 + %o3]ASI_AIUS, %o4
+	!
+	! We're done. Go home.
+	!
+	membar	#Sync
+	retl
+	clr	%o0
+	!
+	! Try aligned copies from here.
+	!
+dci_ns:
+	!
+	! See if we're single byte aligned. If we are, check the
+	! limit for single byte copies. If we're smaller, or equal,
+	! bounce to the byte for byte copy loop. Otherwise do it in
+	! HW (if enabled).
+	!
+	btst	1, %o3
+	bz,a,pt	%icc, dcih8
+	btst	7, %o3
+	!
+	! We're single byte aligned.
+	!
+	sethi	%hi(hw_copy_limit_1), %o3
+	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
+	!
+	! Is HW copy on? If not do everything byte for byte.
+	!
+	tst	%o3
+	bz,pn	%icc, dcibcp
+	subcc	%o3, %o2, %o3
+	!
+	! Are we bigger than the HW limit? If not
+	! go to byte for byte.
+	!
+	bge,pt	%xcc, dcibcp
+	nop
+	!
+	! We're big enough and copy is on. Do it with HW.
+	!
+	ba,pt	%xcc, big_copyin
+	nop
+dcih8:
+	!
+	! 8 byte aligned?
+	!
+	bnz,a	%xcc, dcih4
+	btst	3, %o3
+	!
+	! We're eight byte aligned.
+	!
+	sethi	%hi(hw_copy_limit_8), %o3
+	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
+	!
+	! Is HW assist on? If not, do it with the aligned copy.
+	!
+	tst	%o3
+	bz,pn	%icc, dcis8
+	subcc	%o3, %o2, %o3
+	bge	%xcc, dcis8
+	nop
+	ba,pt	%xcc, big_copyin
+	nop
+dcis8:
+	!
+	! Housekeeping for copy loops. Uses same idea as in the byte for
+	! byte copy loop above.
+	!
+	add	%o0, %o2, %o0
+	add	%o1, %o2, %o1
+	sub	%g0, %o2, %o3
+	ba,pt	%xcc, didebc
+	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
+	!
+	! 4 byte aligned?
+	!
+dcih4:
+	bnz	%xcc, dcih2
+	sethi	%hi(hw_copy_limit_4), %o3
+	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
+	!
+	! Is HW assist on? If not, do it with the aligned copy.
+	!
+	tst	%o3
+	bz,pn	%icc, dcis4
+	subcc	%o3, %o2, %o3
+	!
+	! We're negative if our size is less than or equal to hw_copy_limit_4.
+	!
+	bge	%xcc, dcis4
+	nop
+	ba,pt	%xcc, big_copyin
+	nop
+dcis4:
+	!
+	! Housekeeping for copy loops. Uses same idea as in the byte
+	! for byte copy loop above.
+	!
+	add	%o0, %o2, %o0
+	add	%o1, %o2, %o1
+	sub	%g0, %o2, %o3
+	ba,pt	%xcc, didfbc
+	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
+dcih2:
+	!
+	! We're two byte aligned. Check for "smallness"
+	! done in delay at dcih4
+	!
+	bleu,pt	%xcc, dcis2
+	sethi	%hi(hw_copy_limit_2), %o3
+	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
+	!
+	! Is HW assist on? If not, do it with the aligned copy.
+	!
+	tst	%o3
+	bz,pn	%icc, dcis2
+	subcc	%o3, %o2, %o3
+	!
+	! Are we larger than the HW limit?
+	!
+	bge	%xcc, dcis2
+	nop
+	!
+	! HW assist is on and we're large enough to use it.
+	!
+	ba,pt	%xcc, big_copyin
+	nop
+	!
+	! Housekeeping for copy loops. Uses same idea as in the byte
+	! for byte copy loop above.
+	!
+dcis2:
+	add	%o0, %o2, %o0
+	add	%o1, %o2, %o1
+	sub	%g0, %o2, %o3
+	ba,pt	%xcc, didtbc
+	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
+	!
+small_copyin:
+	!
+	! Why are we doing this AGAIN? There are certain conditions in
+	! big copyin that will cause us to forgo the HW assisted copies
+	! and bounce back to a non-hw assisted copy. This dispatches
+	! those copies. Note that we branch around this in the main line
+	! code.
+	!
+	! We make no check for limits or HW enablement here. We've
+	! already been told that we're a poster child so just go off
+	! and do it.
+	!
+	or	%o0, %o1, %o3
+	btst	1, %o3
+	bnz	%icc, dcibcp		! Most likely
+	btst	7, %o3
+	bz	%icc, dcis8
+	btst	3, %o3
+	bz	%icc, dcis4
+	nop
+	ba,pt	%xcc, dcis2
+	nop
+	!
+	! Eight byte aligned copies. A steal from the original .small_copyin
+	! with modifications. %o2 is number of 8 byte chunks to copy. When
+	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
+	! to copy.
+	!
+	.align 32
+didebc:
+	ldxa	[%o0 + %o3]ASI_AIUS, %o4
+	deccc	%o2
+	stx	%o4, [%o1 + %o3]
+	bg,pt	%xcc, didebc
+	addcc	%o3, 8, %o3
+	!
+	! End of copy loop. Most 8 byte aligned copies end here.
+	!
+	bz,pt	%xcc, dcifh
+	nop
+	!
+	! Something is left. Do it byte for byte.
+	!
+	ba,pt	%xcc, dcicl
+	lduba	[%o0 + %o3]ASI_AIUS, %o4
+	!
+	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
+	!
+	.align 32
+didfbc:
+	lduwa	[%o0 + %o3]ASI_AIUS, %o4
+	deccc	%o2
+	st	%o4, [%o1 + %o3]
+	bg,pt	%xcc, didfbc
+	addcc	%o3, 4, %o3
+	!
+	! End of copy loop. Most 4 byte aligned copies end here.
+	!
+	bz,pt	%xcc, dcifh
+	nop
+	!
+	! Something is left. Do it byte for byte.
+	!
+	ba,pt	%xcc, dcicl
+	lduba	[%o0 + %o3]ASI_AIUS, %o4
+	!
+	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
+	! copy.
+	!
+	.align 32
+didtbc:
+	lduha	[%o0 + %o3]ASI_AIUS, %o4
+	deccc	%o2
+	sth	%o4, [%o1 + %o3]
+	bg,pt	%xcc, didtbc
+	addcc	%o3, 2, %o3
+	!
+	! End of copy loop. Most 2 byte aligned copies end here.
+	!
+	bz,pt	%xcc, dcifh
+	nop
+	!
+	! Deal with the last byte
+	!
+	lduba	[%o0 + %o3]ASI_AIUS, %o4
+	stb	%o4, [%o1 + %o3]
+dcifh:
+	membar	#Sync
+	retl
+	clr	%o0
+
+big_copyin:
+	!
+	! We're going off to do a block copy.
+	! Switch fault handlers and grab a window. We
+	! don't do a membar #Sync since we've done only
+	! kernel data to this point.
+	!
+	save	%sp, -SA(MINFRAME), %sp
+
+	! Copies that reach here are larger than 256 bytes. The
+	! hw_copy_limit_1 is set to 256. Never set this limit to less
+	! than 128 bytes.
+do_blockcopyin:
+
+	! Swap src/dst since the code below is memcpy code
+	! and memcpy/bcopy have different calling sequences
+	mov	%i1, %i5
+	mov	%i0, %i1
+	mov	%i5, %i0
+
+	andcc	%i0, 7, %i3		! is dst double aligned
+	bz	%xcc, copyin_blkcpy
+	sub	%i3, 8, %i3
+	neg	%i3			! bytes till double aligned
+	sub	%i2, %i3, %i2		! update %i2 with new count
+
+	! Align Destination on double-word boundary
+
+1:	lduba	[%i1]ASI_AIUS, %i4
+	inc	%i1
+	stb	%i4, [%i0]
+	deccc	%i3
+	bgu	%xcc, 1b
+	inc	%i0
+
+copyin_blkcpy:
+	andcc	%i0, 63, %i3
+	bz,pn	%xcc, copyin_blalign	! now block aligned
+	sub	%i3, 64, %i3
+	neg	%i3			! bytes till block aligned
+	sub	%i2, %i3, %i2		! update %i2 with new count
+
+	! Copy %i3 bytes till dst is block (64 byte) aligned. Use
+	! double word copies.
+
+	andcc	%i1, 7, %g1		! is src aligned on 8 bytes
+	bz	%xcc, ci_dbcopy		! %g1 has source offset (last 3-bits)
+	sll	%g1, 3, %l1		! left shift
+	mov	0x40, %l2
+	sub	%l2, %l1, %l2		! right shift = (64 - left shift)
+
+	! Now use double word copies to align destination.
+ci_double:
+	sub	%i1, %g1, %i1		! align the src at 8 bytes.
+	ldxa	[%i1]ASI_AIUS, %o2
+2:
+	add	%i1, 0x8, %i1
+	ldxa	[%i1]ASI_AIUS, %o4
+	ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3)
+	stx	%o2, [%i0]
+	mov	%o4, %o2
+	subcc	%i3, 0x8, %i3
+	bgu,pt	%xcc, 2b
+	add	%i0, 0x8, %i0
+	ba	copyin_blalign
+	add	%i1, %g1, %i1
+
+	! Both source and destination are double aligned.
+	! No shift and merge of data required in this case.
+ci_dbcopy:
+	ldxa	[%i1]ASI_AIUS, %o2
+	stx	%o2, [%i0]
+	add	%i1, 0x8, %i1
+	subcc	%i3, 0x8, %i3
+	bgu,pt	%xcc, ci_dbcopy
+	add	%i0, 0x8, %i0
+
+copyin_blalign:
+	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
+	sub	%i2, %i3, %i2		! Residue bytes in %i2
+
+	wr	%g0, ASI_LDSTBI_P, %asi
+
+	andcc	%i1, 0xf, %o2		! is src quadword aligned
+	bz,pn	%xcc, ci_blkcpy		! src offset in %o2 (last 4-bits)
+	nop
+	cmp	%o2, 0x8
+	bg	ci_upper_double
+	nop
+	bl	ci_lower_double
+	nop
+
+	! Falls through when source offset is equal to 8 i.e.
+	! source is double word aligned.
+	! In this case no shift/merge of data is required
+
+	sub	%i1, %o2, %i1		! align the src at 16 bytes.
+	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
+	prefetch [%l0+0x0], #one_read
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+ci_loop0:
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l4
+
+	prefetch [%l0+0x40], #one_read
+
+	stxa	%l3, [%i0+0x0]%asi
+	stxa	%l4, [%i0+0x8]%asi
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+
+	stxa	%l5, [%i0+0x10]%asi
+	stxa	%l2, [%i0+0x18]%asi
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l4
+
+	stxa	%l3, [%i0+0x20]%asi
+	stxa	%l4, [%i0+0x28]%asi
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+
+	stxa	%l5, [%i0+0x30]%asi
+	stxa	%l2, [%i0+0x38]%asi
+
+	add	%l0, 0x40, %l0
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, ci_loop0
+	add	%i0, 0x40, %i0
+	ba	ci_blkdone
+	add	%i1, %o2, %i1		! increment the source by src offset
+					! the src offset was stored in %o2
+
+ci_lower_double:
+
+	sub	%i1, %o2, %i1		! align the src at 16 bytes.
+	sll	%o2, 3, %o0		! %o0 left shift
+	mov	0x40, %o1
+	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
+	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
+	prefetch [%l0+0x0], #one_read
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2	! partial data in %l2
+						! and %l3 has complete
+						! data
+ci_loop1:
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l4	! %l4 has partial data
+						! for this read.
+	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
+							! into %l2 and %l3
+
+	prefetch [%l0+0x40], #one_read
+
+	stxa	%l2, [%i0+0x0]%asi
+	stxa	%l3, [%i0+0x8]%asi
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
+							! %l4 from previous read
+							! into %l4 and %l5
+	stxa	%l4, [%i0+0x10]%asi
+	stxa	%l5, [%i0+0x18]%asi
+
+	! Repeat the same for next 32 bytes.
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l4
+	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
+
+	stxa	%l2, [%i0+0x20]%asi
+	stxa	%l3, [%i0+0x28]%asi
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
+
+	stxa	%l4, [%i0+0x30]%asi
+	stxa	%l5, [%i0+0x38]%asi
+
+	add	%l0, 0x40, %l0
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, ci_loop1
+	add	%i0, 0x40, %i0
+	ba	ci_blkdone
+	add	%i1, %o2, %i1		! increment the source by src offset
+					! the src offset was stored in %o2
+
+ci_upper_double:
+
+	sub	%i1, %o2, %i1		! align the src at 16 bytes.
+	sub	%o2, 0x8, %o0
+	sll	%o0, 3, %o0		! %o0 left shift
+	mov	0x40, %o1
+	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
+	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
+	prefetch [%l0+0x0], #one_read
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2	! partial data in %l3
+						! for this read and
+						! no data in %l2
+ci_loop2:
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l4	! %l4 has complete data
+						! and %l5 has partial
+	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
+							! into %l3 and %l4
+	prefetch [%l0+0x40], #one_read
+
+	stxa	%l3, [%i0+0x0]%asi
+	stxa	%l4, [%i0+0x8]%asi
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
+							! %l5 from previous read
+							! into %l5 and %l2
+
+	stxa	%l5, [%i0+0x10]%asi
+	stxa	%l2, [%i0+0x18]%asi
+
+	! Repeat the same for next 32 bytes.
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l4
+	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
+
+	stxa	%l3, [%i0+0x20]%asi
+	stxa	%l4, [%i0+0x28]%asi
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
+
+	stxa	%l5, [%i0+0x30]%asi
+	stxa	%l2, [%i0+0x38]%asi
+
+	add	%l0, 0x40, %l0
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, ci_loop2
+	add	%i0, 0x40, %i0
+	ba	ci_blkdone
+	add	%i1, %o2, %i1		! increment the source by src offset
+					! the src offset was stored in %o2
+
+
+	! Do fast copy using ASI_LDSTBI_P
+ci_blkcpy:
+
+	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
+	prefetch [%o0+0x0], #one_read
+1:
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l0
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+	add	%i1, 0x10, %i1
+
+	prefetch [%o0+0x40], #one_read
+
+	stxa	%l0, [%i0+0x0]%asi
+
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l4
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l6
+	add	%i1, 0x10, %i1
+
+	stxa	%l1, [%i0+0x8]%asi
+	stxa	%l2, [%i0+0x10]%asi
+	stxa	%l3, [%i0+0x18]%asi
+	stxa	%l4, [%i0+0x20]%asi
+	stxa	%l5, [%i0+0x28]%asi
+	stxa	%l6, [%i0+0x30]%asi
+	stxa	%l7, [%i0+0x38]%asi
+
+	add	%o0, 0x40, %o0
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, 1b
+	add	%i0, 0x40, %i0
+
+ci_blkdone:
+	membar	#Sync
+
+	! Copy as much of the rest of the data as possible
+	! using doubleword copies.
+ci_dwcp:
+	cmp	%i2, 0x8		! Not enough bytes to copy as double
+	blu	%xcc, ci_dbdone
+	nop
+
+	andn	%i2, 0x7, %i3		! %i3 count is multiple of 8 bytes size
+	sub	%i2, %i3, %i2		! Residue bytes in %i2
+
+	andcc	%i1, 7, %g1		! is src aligned on 8 bytes
+	bz	%xcc, ci_cpy_db
+	nop
+
+	sll	%g1, 3, %l0		! left shift
+	mov	0x40, %l1
+	sub	%l1, %l0, %l1		! right shift = (64 - left shift)
+
+ci_cpy_dbwd:
+	sub	%i1, %g1, %i1		! align the src at 8 bytes.
+	ldxa	[%i1]ASI_AIUS, %o2
+3:
+	add	%i1, 0x8, %i1
+	ldxa	[%i1]ASI_AIUS, %o4
+	ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3)
+	stx	%o2, [%i0]
+	mov	%o4, %o2
+	subcc	%i3, 0x8, %i3
+	bgu,pt	%xcc, 3b
+	add	%i0, 0x8, %i0
+	ba	ci_dbdone
+	add	%i1, %g1, %i1
+
+ci_cpy_db:
+	ldxa	[%i1]ASI_AIUS, %o2
+	stx	%o2, [%i0]
+	add	%i1, 0x8, %i1
+	subcc	%i3, 0x8, %i3
+	bgu,pt	%xcc, ci_cpy_db
+	add	%i0, 0x8, %i0
+
+ci_dbdone:
+	tst	%i2
+	bz,pt	%xcc, copyin_exit
+	nop
+
+	! Copy the residue as byte copy
+ci_residue:
+	lduba	[%i1]ASI_AIUS, %i4
+	stb	%i4, [%i0]
+	inc	%i1
+	deccc	%i2
+	bgu	%xcc, ci_residue
+	inc	%i0
+
+copyin_exit:
+	membar	#Sync
+	ret
+	restore	%g0, 0, %o0
+END(copyin)
+
+#endif	/* lint */
+#endif
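The heart of the unaligned-source loops above (the ALIGN_DATA and
ALIGN_DATA_EW macros and their ci_* counterparts) is a shift-and-merge of
adjacent aligned doublewords. A minimal, self-contained C sketch of that
technique follows; it assumes a big-endian layout as on SPARC and a nonzero
source offset (the assembly falls through to a no-merge path when the
source is already doubleword aligned). The names merge_stream etc. are
illustrative, not taken from the file:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * merge_stream: given aligned doublewords src[0..n] and a source byte
     * offset off in 1..7, emit the n doublewords of the unaligned stream.
     * This mirrors what ALIGN_DATA does with sllx/srlx/or: the trailing
     * input of one merge (data3) carries over as the leading input
     * (data1) of the next.
     */
    static void
    merge_stream(const uint64_t *src, uint64_t *dst, size_t n, unsigned off)
    {
            unsigned lshift = off * 8, rshift = 64 - lshift;
            size_t i;

            for (i = 0; i < n; i++)
                    dst[i] = (src[i] << lshift) | (src[i + 1] >> rshift);
    }

    int
    main(void)
    {
            uint64_t src[2] = { 0x0011223344556677ULL, 0x8899aabbccddeeffULL };
            uint64_t out;

            merge_stream(src, &out, 1, 3);  /* offset 3: lshift 24, rshift 40 */
            printf("%016llx\n", (unsigned long long)out);
            /* prints 33445566778899aa: bytes 3..7 of src[0], 0..2 of src[1] */
            return (0);
    }

In the same hedged spirit, a sketch of the gating logic in hwblkclr(),
which punts to bzero() and returns 1 unless the region is 64-byte aligned,
at least 256 bytes, and a multiple of 64 bytes long (model_hwblkclr is a
hypothetical stand-in; the real routine uses ASI_LDSTBI_P block-init
stores, 256 bytes and then 64 bytes per iteration, rather than memset):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static int
    model_hwblkclr(void *addr, size_t len)
    {
            if (((uintptr_t)addr & 0x3f) != 0 ||    /* must be block-aligned */
                len < 0x100 ||                      /* ... at least 256 bytes */
                (len & 0x3f) != 0) {                /* ... a multiple of 64 */
                    memset(addr, 0, len);           /* punt path: call bzero */
                    return (1);                     /* block ops not used */
            }
            memset(addr, 0, len);                   /* block-store fast path */
            return (0);
    }

    int
    main(void)
    {
            char buf[256] __attribute__((aligned(64)));

            return (model_hwblkclr(buf, sizeof(buf)));
    }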