path: root/sys/sun4v/cddl
author     kmacy <kmacy@FreeBSD.org>  2006-11-23 19:58:06 +0000
committer  kmacy <kmacy@FreeBSD.org>  2006-11-23 19:58:06 +0000
commit     c784eb28fd564165eec1c920c5ea555324910901 (patch)
tree       bd9650d04ee7cc53bb5869647cfe9a7209e77afe /sys/sun4v/cddl
parent     d47577789b1b7d11b294d9305019db910d35efe0 (diff)
download   FreeBSD-src-c784eb28fd564165eec1c920c5ea555324910901.zip
           FreeBSD-src-c784eb28fd564165eec1c920c5ea555324910901.tar.gz
separate out legitimately CDDL code - optimized routines taken from
opensolaris
Diffstat (limited to 'sys/sun4v/cddl')
-rw-r--r--  sys/sun4v/cddl/t1_copy.S  1598
1 file changed, 1598 insertions, 0 deletions
diff --git a/sys/sun4v/cddl/t1_copy.S b/sys/sun4v/cddl/t1_copy.S
new file mode 100644
index 0000000..6fe5a3d
--- /dev/null
+++ b/sys/sun4v/cddl/t1_copy.S
@@ -0,0 +1,1598 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$")
+
+#include <machine/asi.h>
+#include <machine/asmacros.h>
+#include <machine/ktr.h>
+#include <machine/pstate.h>
+#include <machine/trap.h>
+#include <machine/tstate.h>
+#include <machine/wstate.h>
+#include <machine/hypervisorvar.h>
+
+ .register %g2,#ignore
+ .register %g3,#ignore
+ .register %g6,#ignore
+ .register %g7,#ignore
+
+
+/*
+ * This define is to align data for the unaligned source cases.
+ * data1, data2 and data3 are merged into data1 and data2.
+ * data3 is preserved for the next merge.
+ */
+#define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp) \
+ sllx data1, lshift, data1 ;\
+ srlx data2, rshift, tmp ;\
+ or data1, tmp, data1 ;\
+ sllx data2, lshift, data2 ;\
+ srlx data3, rshift, tmp ;\
+ or data2, tmp, data2
+/*
+ * This macro is used to align the data. Basically it merges
+ * data1 and data2 to form a double word.
+ */
+#define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp) \
+ sllx data1, lshift, data1 ;\
+ srlx data2, rshift, tmp ;\
+ or data1, tmp, data1
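+
+/*
+ * For reference, a minimal C sketch of what ALIGN_DATA_EW computes (the
+ * function name below is illustrative only):
+ *
+ *	#include <stdint.h>
+ *
+ *	uint64_t
+ *	align_data_ew(uint64_t data1, uint64_t data2, int lshift, int rshift)
+ *	{
+ *		return ((data1 << lshift) | (data2 >> rshift));
+ *	}
+ *
+ * With a source that is 3 bytes past an 8 byte boundary, lshift is 24 and
+ * rshift is 64 - 24 = 40, so the last 5 bytes held in data1 and the first
+ * 3 held in data2 form one aligned double word.  ALIGN_DATA applies the
+ * same shift/merge to a pair of double words and keeps data3 around for
+ * the next merge.
+ */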
+
+
+
+
+
+/*
+ * DGDEF and DGDEF2 provide global data declarations.
+ *
+ * DGDEF provides a word aligned word of storage.
+ *
+ * DGDEF2 allocates "sz" bytes of storage with **NO** alignment. This
+ * implies this macro is best used for byte arrays.
+ *
+ * DGDEF3 allocates "sz" bytes of storage with "algn" alignment.
+ */
+#define DGDEF2(name, sz) \
+ .section ".data" ; \
+ .global name ; \
+ .type name, @object ; \
+ .size name, sz; \
+name:
+
+#define DGDEF3(name, sz, algn) \
+ .section ".data" ; \
+ .align algn ; \
+ .global name ; \
+ .type name, @object ; \
+ .size name, sz; \
+name:
+
+#define DGDEF(name) DGDEF3(name, 4, 4)
+
+.align 4
+DGDEF(hw_copy_limit_1)
+.word 0x100
+DGDEF(hw_copy_limit_2)
+.word 0x200
+DGDEF(hw_copy_limit_4)
+.word 0x400
+DGDEF(hw_copy_limit_8)
+.word 0x400
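+/*
+ * The hw_copy_limit_N words above are per-alignment size thresholds
+ * (256, 512, 1024 and 1024 bytes for 1, 2, 4 and 8 byte aligned copies).
+ * The copyin code further down compares the copy length against the limit
+ * matching the alignment of (src | dst) and only takes the hardware
+ * block-copy path for larger copies; a limit of zero disables that path.
+ */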
+.align 64
+.section ".text"
+
+
+#if defined(lint)
+
+/*ARGSUSED*/
+void
+ovbcopy(const void *from, void *to, size_t count)
+{}
+
+#else /* lint */
+
+ENTRY(bcopy)
+ tst %o2 ! check count
+ bgu,a %xcc, 1f ! nothing to do or bad arguments
+ subcc %o0, %o1, %o3 ! difference of from and to address
+
+ retl ! return
+ nop
+1:
+ bneg,a %xcc, 2f
+ neg %o3 ! if < 0, make it positive
+2: cmp %o2, %o3 ! cmp size and abs(from - to)
+	bleu	%xcc, novbcopy	! if size <= abs(diff): use novbcopy,
+ nop
+ cmp %o0, %o1 ! compare from and to addresses
+ blu %xcc, ov_bkwd ! if from < to, copy backwards
+ nop
+ !
+ ! Copy forwards.
+ !
+ov_fwd:
+ ldub [%o0], %o3 ! read from address
+ inc %o0 ! inc from address
+ stb %o3, [%o1] ! write to address
+ deccc %o2 ! dec count
+ bgu %xcc, ov_fwd ! loop till done
+ inc %o1 ! inc to address
+
+ retl ! return
+ nop
+ !
+ ! Copy backwards.
+ !
+ov_bkwd:
+ deccc %o2 ! dec count
+ ldub [%o0 + %o2], %o3 ! get byte at end of src
+ bgu %xcc, ov_bkwd ! loop till done
+ stb %o3, [%o1 + %o2] ! delay slot, store at end of dst
+
+ retl ! return
+ nop
+END(bcopy)
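+
+/*
+ * For reference, the overlap handling above is roughly the following C
+ * (a sketch only; the loops mirror ov_fwd and ov_bkwd):
+ *
+ *	void
+ *	bcopy_sketch(const char *from, char *to, size_t count)
+ *	{
+ *		size_t diff = (from > to) ? (size_t)(from - to) :
+ *		    (size_t)(to - from);
+ *
+ *		if (count == 0)
+ *			return;
+ *		if (count <= diff)
+ *			novbcopy(from, to, count);
+ *		else if (from < to)
+ *			while (count-- > 0)
+ *				to[count] = from[count];
+ *		else
+ *			while (count-- > 0)
+ *				*to++ = *from++;
+ *	}
+ *
+ * The count <= diff case cannot overlap and uses the plain copy; otherwise
+ * the direction of the byte loop depends on whether from is below to.
+ */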
+
+#endif /* lint */
+
+
+
+/*
+ * Copy a block of storage - must not overlap (from + len <= to).
+ */
+ENTRY(novbcopy)
+
+ save %sp, -SA(MINFRAME), %sp
+
+do_copy:
+ cmp %i2, 12 ! for small counts
+ blu %xcc, bytecp ! just copy bytes
+ nop
+
+ cmp %i2, 128 ! for less than 128 bytes
+ blu,pn %xcc, bcb_punt ! no block st/quad ld
+ nop
+#if 0
+ set use_hw_bcopy, %o2
+ ld [%o2], %o2
+ tst %o2
+ bz bcb_punt
+ nop
+#endif
+ subcc %i1, %i0, %i3
+ bneg,a,pn %xcc, 1f
+ neg %i3
+1:
+ /*
+ * Compare against 256 since we should be checking block addresses
+ * and (dest & ~63) - (src & ~63) can be 3 blocks even if
+ * src = dest + (64 * 3) + 63.
+ */
+ cmp %i3, 256
+ blu,pn %xcc, bcb_punt
+ nop
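+	/*
+	 * Worked example of the comment above: with dest = 0x1000 and
+	 * src = dest + (64 * 3) + 63 = 0x10ff, abs(src - dest) is only 255,
+	 * yet the block addresses (dest & ~63) = 0x1000 and
+	 * (src & ~63) = 0x10c0 already differ by three 64-byte blocks.
+	 */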
+
+ /*
+ * Copies that reach here have at least 2 blocks of data to copy.
+ */
+do_blockcopy:
+ ! Swap src/dst since the code below is memcpy code
+ ! and memcpy/bcopy have different calling sequences
+ mov %i1, %i5
+ mov %i0, %i1
+ mov %i5, %i0
+
+	andcc	%i0, 0x3f, %i3	! is dst aligned on a 64 byte boundary
+ bz %xcc, chksrc ! dst is already double aligned
+ sub %i3, 0x40, %i3
+ neg %i3 ! bytes till dst 64 bytes aligned
+ sub %i2, %i3, %i2 ! update i2 with new count
+
+1: ldub [%i1], %i4
+ stb %i4, [%i0]
+ inc %i1
+ deccc %i3
+ bgu %xcc, 1b
+ inc %i0
+
+ ! Now Destination is block (64 bytes) aligned
+chksrc:
+ andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
+ sub %i2, %i3, %i2 ! Residue bytes in %i2
+
+ wr %g0, ASI_LDSTBI_P, %asi
+
+ andcc %i1, 0xf, %o2 ! is src quadword aligned
+ bz,pn %xcc, blkcpy ! src offset in %o2
+ nop
+ cmp %o2, 0x8
+ bg cpy_upper_double
+ nop
+ bl cpy_lower_double
+ nop
+
+ ! Falls through when source offset is equal to 8 i.e.
+ ! source is double word aligned.
+ ! In this case no shift/merge of data is required
+ sub %i1, %o2, %i1 ! align the src at 16 bytes.
+ andn %i1, 0x3f, %l0 ! %l0 has block aligned source
+ prefetch [%l0+0x0], #one_read
+ ldda [%i1+0x0]%asi, %l2
+loop0:
+ ldda [%i1+0x10]%asi, %l4
+ prefetch [%l0+0x40], #one_read
+
+ stxa %l3, [%i0+0x0]%asi
+ stxa %l4, [%i0+0x8]%asi
+
+ ldda [%i1+0x20]%asi, %l2
+ stxa %l5, [%i0+0x10]%asi
+ stxa %l2, [%i0+0x18]%asi
+
+ ldda [%i1+0x30]%asi, %l4
+ stxa %l3, [%i0+0x20]%asi
+ stxa %l4, [%i0+0x28]%asi
+
+ ldda [%i1+0x40]%asi, %l2
+ stxa %l5, [%i0+0x30]%asi
+ stxa %l2, [%i0+0x38]%asi
+
+ add %l0, 0x40, %l0
+ add %i1, 0x40, %i1
+ subcc %i3, 0x40, %i3
+ bgu,pt %xcc, loop0
+ add %i0, 0x40, %i0
+ ba blkdone
+ add %i1, %o2, %i1 ! increment the source by src offset
+ ! the src offset was stored in %o2
+
+cpy_lower_double:
+ sub %i1, %o2, %i1 ! align the src at 16 bytes.
+ sll %o2, 3, %o0 ! %o0 left shift
+ mov 0x40, %o1
+ sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
+ andn %i1, 0x3f, %l0 ! %l0 has block aligned source
+ prefetch [%l0+0x0], #one_read
+ ldda [%i1+0x0]%asi, %l2 ! partial data in %l2 and %l3 has
+ ! complete data
+loop1:
+ ldda [%i1+0x10]%asi, %l4 ! %l4 has partial data for this read.
+ ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
+ ! into %l2 and %l3
+ prefetch [%l0+0x40], #one_read
+ stxa %l2, [%i0+0x0]%asi
+ stxa %l3, [%i0+0x8]%asi
+
+ ldda [%i1+0x20]%asi, %l2
+ ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
+ stxa %l4, [%i0+0x10]%asi ! %l4 from previous read
+ stxa %l5, [%i0+0x18]%asi ! into %l4 and %l5
+
+ ! Repeat the same for next 32 bytes.
+
+ ldda [%i1+0x30]%asi, %l4
+ ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
+ stxa %l2, [%i0+0x20]%asi
+ stxa %l3, [%i0+0x28]%asi
+
+ ldda [%i1+0x40]%asi, %l2
+ ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
+ stxa %l4, [%i0+0x30]%asi
+ stxa %l5, [%i0+0x38]%asi
+
+ add %l0, 0x40, %l0
+ add %i1, 0x40, %i1
+ subcc %i3, 0x40, %i3
+ bgu,pt %xcc, loop1
+ add %i0, 0x40, %i0
+ ba blkdone
+ add %i1, %o2, %i1 ! increment the source by src offset
+ ! the src offset was stored in %o2
+
+cpy_upper_double:
+ sub %i1, %o2, %i1 ! align the src at 16 bytes.
+ mov 0x8, %o0
+ sub %o2, %o0, %o0
+ sll %o0, 3, %o0 ! %o0 left shift
+ mov 0x40, %o1
+ sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
+ andn %i1, 0x3f, %l0 ! %l0 has block aligned source
+ prefetch [%l0+0x0], #one_read
+ ldda [%i1+0x0]%asi, %l2 ! partial data in %l3 for this read and
+ ! no data in %l2
+loop2:
+ ldda [%i1+0x10]%asi, %l4 ! %l4 has complete data and %l5 has
+ ! partial
+ ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
+ ! into %l3 and %l4
+ prefetch [%l0+0x40], #one_read
+ stxa %l3, [%i0+0x0]%asi
+ stxa %l4, [%i0+0x8]%asi
+
+ ldda [%i1+0x20]%asi, %l2
+ ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
+ stxa %l5, [%i0+0x10]%asi ! %l5 from previous read
+ stxa %l2, [%i0+0x18]%asi ! into %l5 and %l2
+
+ ! Repeat the same for next 32 bytes.
+
+ ldda [%i1+0x30]%asi, %l4
+ ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
+ stxa %l3, [%i0+0x20]%asi
+ stxa %l4, [%i0+0x28]%asi
+
+ ldda [%i1+0x40]%asi, %l2
+ ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
+ stxa %l5, [%i0+0x30]%asi
+ stxa %l2, [%i0+0x38]%asi
+
+ add %l0, 0x40, %l0
+ add %i1, 0x40, %i1
+ subcc %i3, 0x40, %i3
+ bgu,pt %xcc, loop2
+ add %i0, 0x40, %i0
+ ba blkdone
+ add %i1, %o2, %i1 ! increment the source by src offset
+ ! the src offset was stored in %o2
+
+
+ ! Both Source and Destination are block aligned.
+ ! Do fast copy using ASI_LDSTBI_P
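+	! In the loop below each ldda through this ASI fills a register pair
+	! (%l0/%l1, %l2/%l3, ...) with 16 bytes, and the eight stxa stores
+	! write out one full 64-byte block per iteration.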
+blkcpy:
+ prefetch [%i1+0x0], #one_read
+1:
+ ldda [%i1+0x0]%asi, %l0
+ ldda [%i1+0x10]%asi, %l2
+ prefetch [%i1+0x40], #one_read
+
+ stxa %l0, [%i0+0x0]%asi
+ ldda [%i1+0x20]%asi, %l4
+ ldda [%i1+0x30]%asi, %l6
+
+ stxa %l1, [%i0+0x8]%asi
+ stxa %l2, [%i0+0x10]%asi
+ stxa %l3, [%i0+0x18]%asi
+ stxa %l4, [%i0+0x20]%asi
+ stxa %l5, [%i0+0x28]%asi
+ stxa %l6, [%i0+0x30]%asi
+ stxa %l7, [%i0+0x38]%asi
+
+ add %i1, 0x40, %i1
+ subcc %i3, 0x40, %i3
+ bgu,pt %xcc, 1b
+ add %i0, 0x40, %i0
+
+blkdone:
+ tst %i2
+ bz,pt %xcc, blkexit
+ nop
+
+residue:
+ ldub [%i1], %i4
+ stb %i4, [%i0]
+ inc %i1
+ deccc %i2
+ bgu %xcc, residue
+ inc %i0
+
+blkexit:
+ membar #Sync ! sync error barrier
+ ret
+ restore %g0, 0, %o0
+
+bcb_punt:
+ !
+ ! use aligned transfers where possible
+ !
+ xor %i0, %i1, %o4 ! xor from and to address
+ btst 7, %o4 ! if lower three bits zero
+ bz aldoubcp ! can align on double boundary
+	nop				! assembler complains about label
+
+ xor %i0, %i1, %o4 ! xor from and to address
+ btst 3, %o4 ! if lower two bits zero
+ bz alwordcp ! can align on word boundary
+ btst 3, %i0 ! delay slot, from address unaligned?
+ !
+ ! use aligned reads and writes where possible
+ ! this differs from wordcp in that it copes
+	! with odd alignment between source and destination
+ ! using word reads and writes with the proper shifts
+ ! in between to align transfers to and from memory
+ ! i0 - src address, i1 - dest address, i2 - count
+	! i3, i4 - tmps used for generating complete word
+ ! i5 (word to write)
+ ! l0 size in bits of upper part of source word (US)
+ ! l1 size in bits of lower part of source word (LS = 32 - US)
+ ! l2 size in bits of upper part of destination word (UD)
+ ! l3 size in bits of lower part of destination word (LD = 32 - UD)
+ ! l4 number of bytes leftover after aligned transfers complete
+ ! l5 the number 32
+ !
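+	! Example: if the source address is 1 byte past a word boundary, the
+	! align loop below reads 3 bytes before the source is word aligned,
+	! leaving US (%l0) = 24 and LS (%l1) = 32 - 24 = 8.
+	!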
+ mov 32, %l5 ! load an oft-needed constant
+ bz align_dst_only
+	btst	3, %i1			! is destination address aligned?
+ clr %i4 ! clear registers used in either case
+ bz align_src_only
+ clr %l0
+ !
+ ! both source and destination addresses are unaligned
+ !
+1: ! align source
+ ldub [%i0], %i3 ! read a byte from source address
+ add %i0, 1, %i0 ! increment source address
+ or %i4, %i3, %i4 ! or in with previous bytes (if any)
+ btst 3, %i0 ! is source aligned?
+ add %l0, 8, %l0 ! increment size of upper source (US)
+ bnz,a 1b
+ sll %i4, 8, %i4 ! make room for next byte
+
+ sub %l5, %l0, %l1 ! generate shift left count (LS)
+ sll %i4, %l1, %i4 ! prepare to get rest
+ ld [%i0], %i3 ! read a word
+ add %i0, 4, %i0 ! increment source address
+ srl %i3, %l0, %i5 ! upper src bits into lower dst bits
+ or %i4, %i5, %i5 ! merge
+ mov 24, %l3 ! align destination
+1:
+ srl %i5, %l3, %i4 ! prepare to write a single byte
+ stb %i4, [%i1] ! write a byte
+ add %i1, 1, %i1 ! increment destination address
+ sub %i2, 1, %i2 ! decrement count
+ btst 3, %i1 ! is destination aligned?
+ bnz,a 1b
+ sub %l3, 8, %l3 ! delay slot, decrement shift count (LD)
+ sub %l5, %l3, %l2 ! generate shift left count (UD)
+ sll %i5, %l2, %i5 ! move leftover into upper bytes
+ cmp %l2, %l0 ! cmp # reqd to fill dst w old src left
+ bgu %xcc, more_needed ! need more to fill than we have
+ nop
+
+ sll %i3, %l1, %i3 ! clear upper used byte(s)
+ srl %i3, %l1, %i3
+ ! get the odd bytes between alignments
+ sub %l0, %l2, %l0 ! regenerate shift count
+ sub %l5, %l0, %l1 ! generate new shift left count (LS)
+ and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0
+ andn %i2, 3, %i2 ! # of aligned bytes that can be moved
+ srl %i3, %l0, %i4
+ or %i5, %i4, %i5
+ st %i5, [%i1] ! write a word
+ subcc %i2, 4, %i2 ! decrement count
+ bz %xcc, unalign_out
+ add %i1, 4, %i1 ! increment destination address
+
+ b 2f
+ sll %i3, %l1, %i5 ! get leftover into upper bits
+more_needed:
+ sll %i3, %l0, %i3 ! save remaining byte(s)
+ srl %i3, %l0, %i3
+ sub %l2, %l0, %l1 ! regenerate shift count
+ sub %l5, %l1, %l0 ! generate new shift left count
+ sll %i3, %l1, %i4 ! move to fill empty space
+ b 3f
+ or %i5, %i4, %i5 ! merge to complete word
+ !
+ ! the source address is aligned and destination is not
+ !
+align_dst_only:
+ ld [%i0], %i4 ! read a word
+ add %i0, 4, %i0 ! increment source address
+ mov 24, %l0 ! initial shift alignment count
+1:
+ srl %i4, %l0, %i3 ! prepare to write a single byte
+ stb %i3, [%i1] ! write a byte
+ add %i1, 1, %i1 ! increment destination address
+ sub %i2, 1, %i2 ! decrement count
+ btst 3, %i1 ! is destination aligned?
+ bnz,a 1b
+ sub %l0, 8, %l0 ! delay slot, decrement shift count
+xfer:
+ sub %l5, %l0, %l1 ! generate shift left count
+ sll %i4, %l1, %i5 ! get leftover
+3:
+ and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0
+ andn %i2, 3, %i2 ! # of aligned bytes that can be moved
+2:
+ ld [%i0], %i3 ! read a source word
+ add %i0, 4, %i0 ! increment source address
+ srl %i3, %l0, %i4 ! upper src bits into lower dst bits
+ or %i5, %i4, %i5 ! merge with upper dest bits (leftover)
+ st %i5, [%i1] ! write a destination word
+ subcc %i2, 4, %i2 ! decrement count
+ bz %xcc, unalign_out ! check if done
+ add %i1, 4, %i1 ! increment destination address
+ b 2b ! loop
+ sll %i3, %l1, %i5 ! get leftover
+unalign_out:
+ tst %l4 ! any bytes leftover?
+ bz %xcc, cpdone
+ nop
+1:
+ sub %l0, 8, %l0 ! decrement shift
+ srl %i3, %l0, %i4 ! upper src byte into lower dst byte
+ stb %i4, [%i1] ! write a byte
+ subcc %l4, 1, %l4 ! decrement count
+ bz %xcc, cpdone ! done?
+ add %i1, 1, %i1 ! increment destination
+ tst %l0 ! any more previously read bytes
+ bnz %xcc, 1b ! we have leftover bytes
+ mov %l4, %i2 ! delay slot, mv cnt where dbytecp wants
+ b dbytecp ! let dbytecp do the rest
+ sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
+ !
+ ! the destination address is aligned and the source is not
+ !
+align_src_only:
+ ldub [%i0], %i3 ! read a byte from source address
+ add %i0, 1, %i0 ! increment source address
+ or %i4, %i3, %i4 ! or in with previous bytes (if any)
+ btst 3, %i0 ! is source aligned?
+ add %l0, 8, %l0 ! increment shift count (US)
+ bnz,a align_src_only
+ sll %i4, 8, %i4 ! make room for next byte
+ b,a xfer
+ !
+ ! if from address unaligned for double-word moves,
+ ! move bytes till it is, if count is < 56 it could take
+ ! longer to align the thing than to do the transfer
+ ! in word size chunks right away
+ !
+aldoubcp:
+ cmp %i2, 56 ! if count < 56, use wordcp, it takes
+ blu,a %xcc, alwordcp ! longer to align doubles than words
+ mov 3, %o0 ! mask for word alignment
+ call alignit ! copy bytes until aligned
+ mov 7, %o0 ! mask for double alignment
+ !
+ ! source and destination are now double-word aligned
+ ! i3 has aligned count returned by alignit
+ !
+ and %i2, 7, %i2 ! unaligned leftover count
+ sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
+5:
+ ldx [%i0+%i1], %o4 ! read from address
+ stx %o4, [%i1] ! write at destination address
+ subcc %i3, 8, %i3 ! dec count
+ bgu %xcc, 5b
+ add %i1, 8, %i1 ! delay slot, inc to address
+ cmp %i2, 4 ! see if we can copy a word
+	blu	%xcc, dbytecp		! if 3 or fewer bytes use dbytecp
+ nop
+ !
+ ! for leftover bytes we fall into wordcp, if needed
+ !
+wordcp:
+ and %i2, 3, %i2 ! unaligned leftover count
+5:
+ ld [%i0+%i1], %o4 ! read from address
+ st %o4, [%i1] ! write at destination address
+ subcc %i3, 4, %i3 ! dec count
+ bgu %xcc, 5b
+ add %i1, 4, %i1 ! delay slot, inc to address
+ b,a dbytecp
+
+ ! we come here to align copies on word boundaries
+alwordcp:
+ call alignit ! go word-align it
+ mov 3, %o0 ! bits that must be zero to be aligned
+ b wordcp
+ sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
+
+ !
+ ! byte copy, works with any alignment
+ !
+bytecp:
+ b dbytecp
+ sub %i0, %i1, %i0 ! i0 gets difference of src and dst
+
+ !
+ ! differenced byte copy, works with any alignment
+ ! assumes dest in %i1 and (source - dest) in %i0
+ !
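+	! Because %i0 holds (source - dest), the ldub below at [%i0 + %i1]
+	! always addresses the source byte matching the current destination
+	! %i1, so only one pointer has to be advanced.
+	!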
+1:
+ stb %o4, [%i1] ! write to address
+ inc %i1 ! inc to address
+dbytecp:
+ deccc %i2 ! dec count
+ bgeu,a %xcc, 1b ! loop till done
+ ldub [%i0+%i1], %o4 ! read from address
+cpdone:
+ membar #Sync ! sync error barrier
+ ret
+ restore %g0, 0, %o0 ! return (0)
+
+/*
+ * Common code used to align transfers on word and doubleword
+ * boundaries. Aligns source and destination and returns a count
+ * of aligned bytes to transfer in %i3
+ */
+1:
+ inc %i0 ! inc from
+ stb %o4, [%i1] ! write a byte
+ inc %i1 ! inc to
+ dec %i2 ! dec count
+alignit:
+ btst %o0, %i0 ! %o0 is bit mask to check for alignment
+ bnz,a 1b
+ ldub [%i0], %o4 ! read next byte
+
+ retl
+ andn %i2, %o0, %i3 ! return size of aligned bytes
+END(novbcopy)
+
+
+/*
+ * hwblkclr - clears block-aligned, block-multiple-sized regions that are
+ * at least 256 bytes long, using Niagara's block stores/quad store.
+ * If the criteria for using this routine are not met then it calls bzero
+ * and returns 1. Otherwise 0 is returned indicating success.
+ * Caller is responsible for ensuring use_hw_bzero is true and that
+ * kpreempt_disable() has been called.
+ */
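+
+/*
+ * A minimal C sketch of the contract described above (names are
+ * illustrative only):
+ *
+ *	int
+ *	hwblkclr_sketch(void *addr, size_t len)
+ *	{
+ *		if (((uintptr_t)addr & 63) != 0 || len < 256 ||
+ *		    (len & 63) != 0) {
+ *			bzero(addr, len);
+ *			return (1);
+ *		}
+ *		clear_with_block_init_stores(addr, len);
+ *		return (0);
+ *	}
+ */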
+#ifdef lint
+/*ARGSUSED*/
+int
+hwblkclr(void *addr, size_t len)
+{
+ return(0);
+}
+#else /* lint */
+ ! %i0 - start address
+ ! %i1 - length of region (multiple of 64)
+
+ENTRY(hwblkclr)
+ save %sp, -SA(MINFRAME), %sp
+
+ ! Must be block-aligned
+ andcc %i0, 0x3f, %g0
+ bnz,pn %xcc, 1f
+ nop
+
+ ! ... and must be 256 bytes or more
+ cmp %i1, 0x100
+ blu,pn %xcc, 1f
+ nop
+
+ ! ... and length must be a multiple of 64
+ andcc %i1, 0x3f, %g0
+ bz,pn %xcc, pz_doblock
+ wr %g0, ASI_LDSTBI_P, %asi
+
+1: ! punt, call bzero but notify the caller that bzero was used
+ mov %i0, %o0
+ call bzero
+ mov %i1, %o1
+ ret
+ restore %g0, 1, %o0 ! return (1) - did not use block operations
+
+ ! Already verified that there are at least 256 bytes to set
+pz_doblock:
+ stxa %g0, [%i0+0x0]%asi
+ stxa %g0, [%i0+0x40]%asi
+ stxa %g0, [%i0+0x80]%asi
+ stxa %g0, [%i0+0xc0]%asi
+
+ stxa %g0, [%i0+0x8]%asi
+ stxa %g0, [%i0+0x10]%asi
+ stxa %g0, [%i0+0x18]%asi
+ stxa %g0, [%i0+0x20]%asi
+ stxa %g0, [%i0+0x28]%asi
+ stxa %g0, [%i0+0x30]%asi
+ stxa %g0, [%i0+0x38]%asi
+
+ stxa %g0, [%i0+0x48]%asi
+ stxa %g0, [%i0+0x50]%asi
+ stxa %g0, [%i0+0x58]%asi
+ stxa %g0, [%i0+0x60]%asi
+ stxa %g0, [%i0+0x68]%asi
+ stxa %g0, [%i0+0x70]%asi
+ stxa %g0, [%i0+0x78]%asi
+
+ stxa %g0, [%i0+0x88]%asi
+ stxa %g0, [%i0+0x90]%asi
+ stxa %g0, [%i0+0x98]%asi
+ stxa %g0, [%i0+0xa0]%asi
+ stxa %g0, [%i0+0xa8]%asi
+ stxa %g0, [%i0+0xb0]%asi
+ stxa %g0, [%i0+0xb8]%asi
+
+ stxa %g0, [%i0+0xc8]%asi
+ stxa %g0, [%i0+0xd0]%asi
+ stxa %g0, [%i0+0xd8]%asi
+ stxa %g0, [%i0+0xe0]%asi
+ stxa %g0, [%i0+0xe8]%asi
+ stxa %g0, [%i0+0xf0]%asi
+ stxa %g0, [%i0+0xf8]%asi
+
+ sub %i1, 0x100, %i1
+ cmp %i1, 0x100
+ bgu,pt %xcc, pz_doblock
+ add %i0, 0x100, %i0
+
+2:
+ ! Check if more than 64 bytes to set
+ cmp %i1,0x40
+ blu %xcc, pz_finish
+ nop
+
+3:
+ stxa %g0, [%i0+0x0]%asi
+ stxa %g0, [%i0+0x8]%asi
+ stxa %g0, [%i0+0x10]%asi
+ stxa %g0, [%i0+0x18]%asi
+ stxa %g0, [%i0+0x20]%asi
+ stxa %g0, [%i0+0x28]%asi
+ stxa %g0, [%i0+0x30]%asi
+ stxa %g0, [%i0+0x38]%asi
+
+ subcc %i1, 0x40, %i1
+ bgu,pt %xcc, 3b
+ add %i0, 0x40, %i0
+
+pz_finish:
+ membar #Sync
+ ret
+ restore %g0, 0, %o0 ! return (bzero or not)
+END(hwblkclr)
+#endif /* lint */
+
+#if defined(lint)
+
+/* ARGSUSED */
+void
+bzero(void *addr, size_t count)
+{}
+
+#else /* lint */
+
+ENTRY(bzero)
+ wr %g0, ASI_P, %asi
+
+ cmp %o1, 7
+ blu,pn %xcc, byteclr
+ nop
+
+ cmp %o1, 15
+ blu,pn %xcc, wdalign
+ nop
+
+	andcc	%o0, 7, %o3		! is addr aligned on an 8 byte boundary
+ bz,pt %xcc, blkalign ! already double aligned
+ sub %o3, 8, %o3 ! -(bytes till double aligned)
+ add %o1, %o3, %o1 ! update o1 with new count
+
+1:
+ stba %g0, [%o0]%asi
+ inccc %o3
+ bl,pt %xcc, 1b
+ inc %o0
+
+ ! Now address is double aligned
+blkalign:
+ cmp %o1, 0x80 ! check if there are 128 bytes to set
+ blu,pn %xcc, bzero_small
+ mov %o1, %o3
+#if 0
+ sethi %hi(use_hw_bzero), %o2
+ ld [%o2 + %lo(use_hw_bzero)], %o2
+ tst %o2
+ bz %xcc, bzero_small
+ mov %o1, %o3
+#endif
+ rd %asi, %o3
+ wr %g0, ASI_LDSTBI_P, %asi
+ cmp %o3, ASI_P
+ bne,a %xcc, algnblk
+ wr %g0, ASI_LDSTBI_AIUS, %asi
+
+algnblk:
+ andcc %o0, 0x3f, %o3 ! is block aligned?
+ bz,pt %xcc, bzero_blk
+ sub %o3, 0x40, %o3 ! -(bytes till block aligned)
+ add %o1, %o3, %o1 ! o1 is the remainder
+
+ ! Clear -(%o3) bytes till block aligned
+1:
+ stxa %g0, [%o0]%asi
+ addcc %o3, 8, %o3
+ bl,pt %xcc, 1b
+ add %o0, 8, %o0
+
+bzero_blk:
+ and %o1, 0x3f, %o3 ! calc bytes left after blk clear
+ andn %o1, 0x3f, %o4 ! calc size of blocks in bytes
+
+ cmp %o4, 0x100 ! 256 bytes or more
+ blu,pn %xcc, 3f
+ nop
+
+2:
+ stxa %g0, [%o0+0x0]%asi
+ stxa %g0, [%o0+0x40]%asi
+ stxa %g0, [%o0+0x80]%asi
+ stxa %g0, [%o0+0xc0]%asi
+
+ stxa %g0, [%o0+0x8]%asi
+ stxa %g0, [%o0+0x10]%asi
+ stxa %g0, [%o0+0x18]%asi
+ stxa %g0, [%o0+0x20]%asi
+ stxa %g0, [%o0+0x28]%asi
+ stxa %g0, [%o0+0x30]%asi
+ stxa %g0, [%o0+0x38]%asi
+
+ stxa %g0, [%o0+0x48]%asi
+ stxa %g0, [%o0+0x50]%asi
+ stxa %g0, [%o0+0x58]%asi
+ stxa %g0, [%o0+0x60]%asi
+ stxa %g0, [%o0+0x68]%asi
+ stxa %g0, [%o0+0x70]%asi
+ stxa %g0, [%o0+0x78]%asi
+
+ stxa %g0, [%o0+0x88]%asi
+ stxa %g0, [%o0+0x90]%asi
+ stxa %g0, [%o0+0x98]%asi
+ stxa %g0, [%o0+0xa0]%asi
+ stxa %g0, [%o0+0xa8]%asi
+ stxa %g0, [%o0+0xb0]%asi
+ stxa %g0, [%o0+0xb8]%asi
+
+ stxa %g0, [%o0+0xc8]%asi
+ stxa %g0, [%o0+0xd0]%asi
+ stxa %g0, [%o0+0xd8]%asi
+ stxa %g0, [%o0+0xe0]%asi
+ stxa %g0, [%o0+0xe8]%asi
+ stxa %g0, [%o0+0xf0]%asi
+ stxa %g0, [%o0+0xf8]%asi
+
+ sub %o4, 0x100, %o4
+ cmp %o4, 0x100
+ bgu,pt %xcc, 2b
+ add %o0, 0x100, %o0
+
+3:
+ ! ... check if 64 bytes to set
+ cmp %o4, 0x40
+ blu %xcc, bzero_blk_done
+ nop
+
+4:
+ stxa %g0, [%o0+0x0]%asi
+ stxa %g0, [%o0+0x8]%asi
+ stxa %g0, [%o0+0x10]%asi
+ stxa %g0, [%o0+0x18]%asi
+ stxa %g0, [%o0+0x20]%asi
+ stxa %g0, [%o0+0x28]%asi
+ stxa %g0, [%o0+0x30]%asi
+ stxa %g0, [%o0+0x38]%asi
+
+ subcc %o4, 0x40, %o4
+ bgu,pt %xcc, 3b
+ add %o0, 0x40, %o0
+
+bzero_blk_done:
+ membar #Sync
+ !
+ ! Undo asi register setting.
+ !
+ rd %asi, %o4
+ wr %g0, ASI_P, %asi
+ cmp %o4, ASI_LDSTBI_P
+ bne,a %xcc, bzero_small
+ wr %g0, ASI_AIUS, %asi
+
+bzero_small:
+ ! Set the remaining doubles
+ subcc %o3, 8, %o3 ! Can we store any doubles?
+ blu,pn %xcc, byteclr
+ and %o1, 7, %o1 ! calc bytes left after doubles
+
+dbclr:
+ stxa %g0, [%o0]%asi ! Clear the doubles
+ subcc %o3, 8, %o3
+ bgeu,pt %xcc, dbclr
+ add %o0, 8, %o0
+
+ ba byteclr
+ nop
+
+wdalign:
+	andcc	%o0, 3, %o3		! is addr aligned on a word boundary
+ bz,pn %xcc, wdclr
+ andn %o1, 3, %o3 ! create word sized count in %o3
+
+ dec %o1 ! decrement count
+ stba %g0, [%o0]%asi ! clear a byte
+ ba wdalign
+ inc %o0 ! next byte
+
+wdclr:
+ sta %g0, [%o0]%asi ! 4-byte clearing loop
+ subcc %o3, 4, %o3
+ bnz,pt %xcc, wdclr
+ inc 4, %o0
+
+ and %o1, 3, %o1 ! leftover count, if any
+
+byteclr:
+ ! Set the leftover bytes
+ brz %o1, bzero_exit
+ nop
+
+7:
+ deccc %o1 ! byte clearing loop
+ stba %g0, [%o0]%asi
+ bgu,pt %xcc, 7b
+ inc %o0
+
+bzero_exit:
+ retl
+ clr %o0 ! return (0)
+
+END(bzero)
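+
+/*
+ * Size strategy used by bzero above, for reference: counts below 7 are
+ * cleared a byte at a time, counts below 15 go through the word clearing
+ * path (wdalign/wdclr), and everything else is aligned to an 8 byte
+ * boundary and then, when at least 128 bytes remain, cleared with 64 byte
+ * block-init stores (unrolled to 256 bytes per pass where possible) before
+ * the leftover doubles and bytes are cleared.
+ */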
+#endif /* lint */
+
+
+#if 0
+#define SMALL_LIMIT 7
+#if defined(lint)
+
+/*ARGSUSED*/
+int
+copyin(const void *uaddr, void *kaddr, size_t count)
+{ return (0); }
+
+#else /* lint */
+
+ENTRY(copyin)
+ !
+ ! Check the length and bail if zero.
+ !
+ tst %o2
+ bnz,pt %xcc, 1f
+ nop
+ retl
+ clr %o0
+#if 0
+1:
+ sethi %hi(copyio_fault), %o4
+ or %o4, %lo(copyio_fault), %o4
+ sethi %hi(copyio_fault_nowindow), %o3
+ ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
+ or %o3, %lo(copyio_fault_nowindow), %o3
+ membar #Sync
+ stn %o3, [THREAD_REG + T_LOFAULT]
+
+ mov %o0, SAVE_SRC
+ mov %o1, SAVE_DST
+ mov %o2, SAVE_COUNT
+#endif
+ !
+ ! Check to see if we're more than SMALL_LIMIT.
+ !
+ subcc %o2, SMALL_LIMIT, %o3
+ bgu,a,pt %xcc, dci_ns
+ or %o0, %o1, %o3
+ !
+ ! What was previously ".small_copyin"
+ !
+dcibcp:
+ sub %g0, %o2, %o3 ! setup for copy loop
+ add %o0, %o2, %o0
+ add %o1, %o2, %o1
+ ba,pt %xcc, dcicl
+ lduba [%o0 + %o3]ASI_AIUS, %o4
+ !
+ ! %o0 and %o1 point at the end and remain pointing at the end
+ ! of their buffers. We pull things out by adding %o3 (which is
+ ! the negation of the length) to the buffer end which gives us
+	! the current location in the buffers. By incrementing %o3 we walk
+ ! through both buffers without having to bump each buffer's
+ ! pointer. A very fast 4 instruction loop.
+ !
+ .align 16
+dcicl:
+ stb %o4, [%o1 + %o3]
+ inccc %o3
+ bl,a,pt %xcc, dcicl
+ lduba [%o0 + %o3]ASI_AIUS, %o4
+ !
+ ! We're done. Go home.
+ !
+ membar #Sync
+ retl
+ clr %o0
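+	!
+	! For reference, the loop above is roughly the following C (a sketch;
+	! the real loads go through ASI_AIUS to reach the user address space):
+	!
+	!	from += count; to += count;
+	!	for (off = -(ssize_t)count; off < 0; off++)
+	!		to[off] = from[off];
+	!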
+ !
+ ! Try aligned copies from here.
+ !
+dci_ns:
+ !
+ ! See if we're single byte aligned. If we are, check the
+ ! limit for single byte copies. If we're smaller, or equal,
+ ! bounce to the byte for byte copy loop. Otherwise do it in
+ ! HW (if enabled).
+ !
+ btst 1, %o3
+ bz,a,pt %icc, dcih8
+ btst 7, %o3
+ !
+ ! We're single byte aligned.
+ !
+ sethi %hi(hw_copy_limit_1), %o3
+ ld [%o3 + %lo(hw_copy_limit_1)], %o3
+ !
+ ! Is HW copy on? If not do everything byte for byte.
+ !
+ tst %o3
+ bz,pn %icc, dcibcp
+ subcc %o3, %o2, %o3
+ !
+ ! Are we bigger than the HW limit? If not
+ ! go to byte for byte.
+ !
+ bge,pt %xcc, dcibcp
+ nop
+ !
+ ! We're big enough and copy is on. Do it with HW.
+ !
+ ba,pt %xcc, big_copyin
+ nop
+dcih8:
+ !
+ ! 8 byte aligned?
+ !
+ bnz,a %xcc, dcih4
+ btst 3, %o3
+ !
+ ! We're eight byte aligned.
+ !
+ sethi %hi(hw_copy_limit_8), %o3
+ ld [%o3 + %lo(hw_copy_limit_8)], %o3
+ !
+ ! Is HW assist on? If not, do it with the aligned copy.
+ !
+ tst %o3
+ bz,pn %icc, dcis8
+ subcc %o3, %o2, %o3
+ bge %xcc, dcis8
+ nop
+ ba,pt %xcc, big_copyin
+ nop
+dcis8:
+ !
+ ! Housekeeping for copy loops. Uses same idea as in the byte for
+ ! byte copy loop above.
+ !
+ add %o0, %o2, %o0
+ add %o1, %o2, %o1
+ sub %g0, %o2, %o3
+ ba,pt %xcc, didebc
+ srl %o2, 3, %o2 ! Number of 8 byte chunks to copy
+ !
+ ! 4 byte aligned?
+ !
+dcih4:
+ bnz %xcc, dcih2
+ sethi %hi(hw_copy_limit_4), %o3
+ ld [%o3 + %lo(hw_copy_limit_4)], %o3
+ !
+ ! Is HW assist on? If not, do it with the aligned copy.
+ !
+ tst %o3
+ bz,pn %icc, dcis4
+ subcc %o3, %o2, %o3
+ !
+	! We're negative if our size is bigger than hw_copy_limit_4.
+ !
+ bge %xcc, dcis4
+ nop
+ ba,pt %xcc, big_copyin
+ nop
+dcis4:
+ !
+ ! Housekeeping for copy loops. Uses same idea as in the byte
+ ! for byte copy loop above.
+ !
+ add %o0, %o2, %o0
+ add %o1, %o2, %o1
+ sub %g0, %o2, %o3
+ ba,pt %xcc, didfbc
+ srl %o2, 2, %o2 ! Number of 4 byte chunks to copy
+dcih2:
+ !
+ ! We're two byte aligned. Check for "smallness"
+	! done in delay at dcih4
+ !
+ bleu,pt %xcc, dcis2
+ sethi %hi(hw_copy_limit_2), %o3
+ ld [%o3 + %lo(hw_copy_limit_2)], %o3
+ !
+ ! Is HW assist on? If not, do it with the aligned copy.
+ !
+ tst %o3
+ bz,pn %icc, dcis2
+ subcc %o3, %o2, %o3
+ !
+ ! Are we larger than the HW limit?
+ !
+ bge %xcc, dcis2
+ nop
+ !
+ ! HW assist is on and we're large enough to use it.
+ !
+ ba,pt %xcc, big_copyin
+ nop
+ !
+ ! Housekeeping for copy loops. Uses same idea as in the byte
+ ! for byte copy loop above.
+ !
+dcis2:
+ add %o0, %o2, %o0
+ add %o1, %o2, %o1
+ sub %g0, %o2, %o3
+ ba,pt %xcc, didtbc
+ srl %o2, 1, %o2 ! Number of 2 byte chunks to copy
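+	!
+	! Summary of the dispatch above as a rough sketch (limit selection
+	! follows the alignment of (src | dst)):
+	!
+	!	if ((src | dst) & 1)			limit = hw_copy_limit_1;
+	!	else if (((src | dst) & 7) == 0)	limit = hw_copy_limit_8;
+	!	else if (((src | dst) & 3) == 0)	limit = hw_copy_limit_4;
+	!	else					limit = hw_copy_limit_2;
+	!
+	!	if (limit == 0 || count <= limit)
+	!		use the matching aligned copy loop;
+	!	else
+	!		take the big_copyin (hardware block copy) path;
+	!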
+ !
+small_copyin:
+ !
+ ! Why are we doing this AGAIN? There are certain conditions in
+	! big copyin that will cause us to forgo the HW assisted copies
+ ! and bounce back to a non-hw assisted copy. This dispatches
+ ! those copies. Note that we branch around this in the main line
+ ! code.
+ !
+ ! We make no check for limits or HW enablement here. We've
+ ! already been told that we're a poster child so just go off
+ ! and do it.
+ !
+ or %o0, %o1, %o3
+ btst 1, %o3
+ bnz %icc, dcibcp ! Most likely
+ btst 7, %o3
+ bz %icc, dcis8
+ btst 3, %o3
+ bz %icc, dcis4
+ nop
+ ba,pt %xcc, dcis2
+ nop
+ !
+ ! Eight byte aligned copies. A steal from the original .small_copyin
+ ! with modifications. %o2 is number of 8 byte chunks to copy. When
+ ! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
+ ! to copy.
+ !
+ .align 32
+didebc:
+ ldxa [%o0 + %o3]ASI_AIUS, %o4
+ deccc %o2
+ stx %o4, [%o1 + %o3]
+ bg,pt %xcc, didebc
+ addcc %o3, 8, %o3
+ !
+ ! End of copy loop. Most 8 byte aligned copies end here.
+ !
+ bz,pt %xcc, dcifh
+ nop
+ !
+ ! Something is left. Do it byte for byte.
+ !
+ ba,pt %xcc, dcicl
+ lduba [%o0 + %o3]ASI_AIUS, %o4
+ !
+ ! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
+ !
+ .align 32
+didfbc:
+ lduwa [%o0 + %o3]ASI_AIUS, %o4
+ deccc %o2
+ st %o4, [%o1 + %o3]
+ bg,pt %xcc, didfbc
+ addcc %o3, 4, %o3
+ !
+ ! End of copy loop. Most 4 byte aligned copies end here.
+ !
+ bz,pt %xcc, dcifh
+ nop
+ !
+ ! Something is left. Do it byte for byte.
+ !
+ ba,pt %xcc, dcicl
+ lduba [%o0 + %o3]ASI_AIUS, %o4
+ !
+ ! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
+ ! copy.
+ !
+ .align 32
+didtbc:
+ lduha [%o0 + %o3]ASI_AIUS, %o4
+ deccc %o2
+ sth %o4, [%o1 + %o3]
+ bg,pt %xcc, didtbc
+ addcc %o3, 2, %o3
+ !
+ ! End of copy loop. Most 2 byte aligned copies end here.
+ !
+ bz,pt %xcc, dcifh
+ nop
+ !
+ ! Deal with the last byte
+ !
+ lduba [%o0 + %o3]ASI_AIUS, %o4
+ stb %o4, [%o1 + %o3]
+dcifh:
+ membar #Sync
+ retl
+ clr %o0
+
+big_copyin:
+ !
+ ! We're going off to do a block copy.
+ ! Switch fault hendlers and grab a window. We
+	! Switch fault handlers and grab a window. We
+ ! kernel data to this point.
+ !
+ save %sp, -SA(MINFRAME), %sp
+
+	! Copyins that reach here are larger than 256 bytes. The
+	! hw_copy_limit_1 is set to 256. Never set this limit to less
+	! than 128 bytes.
+do_blockcopyin:
+
+ ! Swap src/dst since the code below is memcpy code
+ ! and memcpy/bcopy have different calling sequences
+ mov %i1, %i5
+ mov %i0, %i1
+ mov %i5, %i0
+
+ andcc %i0, 7, %i3 ! is dst double aligned
+ bz %xcc, copyin_blkcpy
+ sub %i3, 8, %i3
+ neg %i3 ! bytes till double aligned
+ sub %i2, %i3, %i2 ! update %i2 with new count
+
+ ! Align Destination on double-word boundary
+
+1: lduba [%i1]ASI_AIUS, %i4
+ inc %i1
+ stb %i4, [%i0]
+ deccc %i3
+ bgu %xcc, 1b
+ inc %i0
+
+copyin_blkcpy:
+ andcc %i0, 63, %i3
+ bz,pn %xcc, copyin_blalign ! now block aligned
+ sub %i3, 64, %i3
+ neg %i3 ! bytes till block aligned
+ sub %i2, %i3, %i2 ! update %i2 with new count
+
+ ! Copy %i3 bytes till dst is block (64 byte) aligned. use
+ ! double word copies.
+
+	andcc	%i1, 7, %g1		! is src aligned on an 8 byte boundary
+ bz %xcc, ci_dbcopy ! %g1 has source offset (last 3-bits)
+ sll %g1, 3, %l1 ! left shift
+ mov 0x40, %l2
+ sub %l2, %l1, %l2 ! right shift = (64 - left shift)
+
+ ! Now use double word copies to align destination.
+ci_double:
+ sub %i1, %g1, %i1 ! align the src at 8 bytes.
+ ldxa [%i1]ASI_AIUS, %o2
+2:
+ add %i1, 0x8, %i1
+ ldxa [%i1]ASI_AIUS, %o4
+ ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3)
+ stx %o2, [%i0]
+ mov %o4, %o2
+ subcc %i3, 0x8, %i3
+ bgu,pt %xcc, 2b
+ add %i0, 0x8, %i0
+ ba copyin_blalign
+ add %i1, %g1, %i1
+
+ ! Both source and destination are double aligned.
+ ! No shift and merge of data required in this case.
+ci_dbcopy:
+ ldxa [%i1]ASI_AIUS, %o2
+ stx %o2, [%i0]
+ add %i1, 0x8, %i1
+ subcc %i3, 0x8, %i3
+ bgu,pt %xcc, ci_dbcopy
+ add %i0, 0x8, %i0
+
+copyin_blalign:
+ andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
+ sub %i2, %i3, %i2 ! Residue bytes in %i2
+
+ wr %g0, ASI_LDSTBI_P, %asi
+
+ andcc %i1, 0xf, %o2 ! is src quadword aligned
+ bz,pn %xcc, ci_blkcpy ! src offset in %o2 (last 4-bits)
+ nop
+ cmp %o2, 0x8
+ bg ci_upper_double
+ nop
+ bl ci_lower_double
+ nop
+
+ ! Falls through when source offset is equal to 8 i.e.
+ ! source is double word aligned.
+ ! In this case no shift/merge of data is required
+
+ sub %i1, %o2, %i1 ! align the src at 16 bytes.
+ andn %i1, 0x3f, %l0 ! %l0 has block aligned source
+ prefetch [%l0+0x0], #one_read
+ ldda [%i1]ASI_LDSTBI_AIUS, %l2
+ci_loop0:
+ add %i1, 0x10, %i1
+ ldda [%i1]ASI_LDSTBI_AIUS, %l4
+
+ prefetch [%l0+0x40], #one_read
+
+ stxa %l3, [%i0+0x0]%asi
+ stxa %l4, [%i0+0x8]%asi
+
+ add %i1, 0x10, %i1
+ ldda [%i1]ASI_LDSTBI_AIUS, %l2
+
+ stxa %l5, [%i0+0x10]%asi
+ stxa %l2, [%i0+0x18]%asi
+
+ add %i1, 0x10, %i1
+ ldda [%i1]ASI_LDSTBI_AIUS, %l4
+
+ stxa %l3, [%i0+0x20]%asi
+ stxa %l4, [%i0+0x28]%asi
+
+ add %i1, 0x10, %i1
+ ldda [%i1]ASI_LDSTBI_AIUS, %l2
+
+ stxa %l5, [%i0+0x30]%asi
+ stxa %l2, [%i0+0x38]%asi
+
+ add %l0, 0x40, %l0
+ subcc %i3, 0x40, %i3
+ bgu,pt %xcc, ci_loop0
+ add %i0, 0x40, %i0
+ ba ci_blkdone
+ add %i1, %o2, %i1 ! increment the source by src offset
+ ! the src offset was stored in %o2
+
+ci_lower_double:
+
+ sub %i1, %o2, %i1 ! align the src at 16 bytes.
+ sll %o2, 3, %o0 ! %o0 left shift
+ mov 0x40, %o1
+ sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
+ andn %i1, 0x3f, %l0 ! %l0 has block aligned source
+ prefetch [%l0+0x0], #one_read
+ ldda [%i1]ASI_LDSTBI_AIUS, %l2 ! partial data in %l2
+ ! and %l3 has complete
+ ! data
+ci_loop1:
+ add %i1, 0x10, %i1
+ ldda [%i1]ASI_LDSTBI_AIUS, %l4 ! %l4 has partial data
+ ! for this read.
+ ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
+ ! into %l2 and %l3
+
+ prefetch [%l0+0x40], #one_read
+
+ stxa %l2, [%i0+0x0]%asi
+ stxa %l3, [%i0+0x8]%asi
+
+ add %i1, 0x10, %i1
+ ldda [%i1]ASI_LDSTBI_AIUS, %l2
+ ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
+ ! %l4 from previous read
+ ! into %l4 and %l5
+ stxa %l4, [%i0+0x10]%asi
+ stxa %l5, [%i0+0x18]%asi
+
+ ! Repeat the same for next 32 bytes.
+
+ add %i1, 0x10, %i1
+ ldda [%i1]ASI_LDSTBI_AIUS, %l4
+ ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
+
+ stxa %l2, [%i0+0x20]%asi
+ stxa %l3, [%i0+0x28]%asi
+
+ add %i1, 0x10, %i1
+ ldda [%i1]ASI_LDSTBI_AIUS, %l2
+ ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
+
+ stxa %l4, [%i0+0x30]%asi
+ stxa %l5, [%i0+0x38]%asi
+
+ add %l0, 0x40, %l0
+ subcc %i3, 0x40, %i3
+ bgu,pt %xcc, ci_loop1
+ add %i0, 0x40, %i0
+ ba ci_blkdone
+ add %i1, %o2, %i1 ! increment the source by src offset
+ ! the src offset was stored in %o2
+
+ci_upper_double:
+
+ sub %i1, %o2, %i1 ! align the src at 16 bytes.
+ sub %o2, 0x8, %o0
+ sll %o0, 3, %o0 ! %o0 left shift
+ mov 0x40, %o1
+ sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
+ andn %i1, 0x3f, %l0 ! %l0 has block aligned source
+ prefetch [%l0+0x0], #one_read
+ ldda [%i1]ASI_LDSTBI_AIUS, %l2 ! partial data in %l3
+ ! for this read and
+ ! no data in %l2
+ci_loop2:
+ add %i1, 0x10, %i1
+ ldda [%i1]ASI_LDSTBI_AIUS, %l4 ! %l4 has complete data
+ ! and %l5 has partial
+ ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
+ ! into %l3 and %l4
+ prefetch [%l0+0x40], #one_read
+
+ stxa %l3, [%i0+0x0]%asi
+ stxa %l4, [%i0+0x8]%asi
+
+ add %i1, 0x10, %i1
+ ldda [%i1]ASI_LDSTBI_AIUS, %l2
+ ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
+ ! %l5 from previous read
+ ! into %l5 and %l2
+
+ stxa %l5, [%i0+0x10]%asi
+ stxa %l2, [%i0+0x18]%asi
+
+ ! Repeat the same for next 32 bytes.
+
+ add %i1, 0x10, %i1
+ ldda [%i1]ASI_LDSTBI_AIUS, %l4
+ ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
+
+ stxa %l3, [%i0+0x20]%asi
+ stxa %l4, [%i0+0x28]%asi
+
+ add %i1, 0x10, %i1
+ ldda [%i1]ASI_LDSTBI_AIUS, %l2
+ ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
+
+ stxa %l5, [%i0+0x30]%asi
+ stxa %l2, [%i0+0x38]%asi
+
+ add %l0, 0x40, %l0
+ subcc %i3, 0x40, %i3
+ bgu,pt %xcc, ci_loop2
+ add %i0, 0x40, %i0
+ ba ci_blkdone
+ add %i1, %o2, %i1 ! increment the source by src offset
+ ! the src offset was stored in %o2
+
+
+ ! Do fast copy using ASI_LDSTBI_P
+ci_blkcpy:
+
+ andn %i1, 0x3f, %o0 ! %o0 has block aligned source
+ prefetch [%o0+0x0], #one_read
+1:
+ ldda [%i1]ASI_LDSTBI_AIUS, %l0
+ add %i1, 0x10, %i1
+ ldda [%i1]ASI_LDSTBI_AIUS, %l2
+ add %i1, 0x10, %i1
+
+ prefetch [%o0+0x40], #one_read
+
+ stxa %l0, [%i0+0x0]%asi
+
+ ldda [%i1]ASI_LDSTBI_AIUS, %l4
+ add %i1, 0x10, %i1
+ ldda [%i1]ASI_LDSTBI_AIUS, %l6
+ add %i1, 0x10, %i1
+
+ stxa %l1, [%i0+0x8]%asi
+ stxa %l2, [%i0+0x10]%asi
+ stxa %l3, [%i0+0x18]%asi
+ stxa %l4, [%i0+0x20]%asi
+ stxa %l5, [%i0+0x28]%asi
+ stxa %l6, [%i0+0x30]%asi
+ stxa %l7, [%i0+0x38]%asi
+
+ add %o0, 0x40, %o0
+ subcc %i3, 0x40, %i3
+ bgu,pt %xcc, 1b
+ add %i0, 0x40, %i0
+
+ci_blkdone:
+ membar #Sync
+
+	! Copy as much of the remaining data as possible using double word copies.
+ci_dwcp:
+ cmp %i2, 0x8 ! Not enough bytes to copy as double
+ blu %xcc, ci_dbdone
+ nop
+
+ andn %i2, 0x7, %i3 ! %i3 count is multiple of 8 bytes size
+ sub %i2, %i3, %i2 ! Residue bytes in %i2
+
+	andcc	%i1, 7, %g1		! is src aligned on an 8 byte boundary
+ bz %xcc, ci_cpy_db
+ nop
+
+ sll %g1, 3, %l0 ! left shift
+ mov 0x40, %l1
+ sub %l1, %l0, %l1 ! right shift = (64 - left shift)
+
+ci_cpy_dbwd:
+ sub %i1, %g1, %i1 ! align the src at 8 bytes.
+ ldxa [%i1]ASI_AIUS, %o2
+3:
+ add %i1, 0x8, %i1
+ ldxa [%i1]ASI_AIUS, %o4
+ ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3)
+ stx %o2, [%i0]
+ mov %o4, %o2
+ subcc %i3, 0x8, %i3
+ bgu,pt %xcc, 3b
+ add %i0, 0x8, %i0
+ ba ci_dbdone
+ add %i1, %g1, %i1
+
+ci_cpy_db:
+ ldxa [%i1]ASI_AIUS, %o2
+ stx %o2, [%i0]
+ add %i1, 0x8, %i1
+ subcc %i3, 0x8, %i3
+ bgu,pt %xcc, ci_cpy_db
+ add %i0, 0x8, %i0
+
+ci_dbdone:
+ tst %i2
+ bz,pt %xcc, copyin_exit
+ nop
+
+ ! Copy the residue as byte copy
+ci_residue:
+ lduba [%i1]ASI_AIUS, %i4
+ stb %i4, [%i0]
+ inc %i1
+ deccc %i2
+ bgu %xcc, ci_residue
+ inc %i0
+
+copyin_exit:
+ membar #Sync
+ ret
+ restore %g0, 0, %o0
+END(copyin)
+
+#endif /* lint */
+#endif
+