author     kmacy <kmacy@FreeBSD.org>                 2006-11-23 19:58:06 +0000
committer  kmacy <kmacy@FreeBSD.org>                 2006-11-23 19:58:06 +0000
commit     c784eb28fd564165eec1c920c5ea555324910901 (patch)
tree       bd9650d04ee7cc53bb5869647cfe9a7209e77afe /sys/sun4v/cddl
parent     d47577789b1b7d11b294d9305019db910d35efe0 (diff)
separate out legitimately CDDL code - optimized routines taken from
OpenSolaris
Diffstat (limited to 'sys/sun4v/cddl')
-rw-r--r--  sys/sun4v/cddl/t1_copy.S  1598
1 file changed, 1598 insertions, 0 deletions
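For orientation before the diff itself: a rough C model (not part of the
commit; the helper structure, comments and the name model_bcopy are
illustrative only) of the dispatch policy that the bcopy()/novbcopy()
assembly below implements. The thresholds (12, 128, 256) are the ones the
assembly uses.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void
    model_bcopy(const void *from, void *to, size_t count)
    {
            const char *s = from;
            char *d = to;
            size_t diff = (s > d) ? (size_t)(s - d) : (size_t)(d - s);

            if (count == 0)
                    return;
            if (count > diff) {                     /* buffers overlap */
                    if (s < d)                      /* ov_bkwd: copy backwards */
                            while (count--)
                                    d[count] = s[count];
                    else                            /* ov_fwd: copy forwards */
                            while (count--)
                                    *d++ = *s++;
            } else if (count < 12) {                /* bytecp: tiny copies */
                    while (count--)
                            *d++ = *s++;
            } else if (count < 128 || diff < 256) {
                    /* bcb_punt/aldoubcp: aligned word/doubleword moves,
                       no 64-byte block operations */
                    memcpy(d, s, count);
            } else {
                    /* do_blockcopy: byte-copy until dst is 64-byte aligned,
                       move 64-byte blocks (ldda/stxa via ASI_LDSTBI_P in the
                       assembly), then copy the residue byte by byte */
                    while (((uintptr_t)d & 63) != 0) {
                            *d++ = *s++;
                            count--;
                    }
                    while (count >= 64) {
                            memcpy(d, s, 64);
                            d += 64;
                            s += 64;
                            count -= 64;
                    }
                    while (count--)
                            *d++ = *s++;
            }
    }

    int
    main(void)
    {
            char buf[32] = "overlap test string";

            model_bcopy(buf, buf + 4, 16);  /* overlapping shift within buf */
            return (0);
    }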
diff --git a/sys/sun4v/cddl/t1_copy.S b/sys/sun4v/cddl/t1_copy.S
new file mode 100644
index 0000000..6fe5a3d
--- /dev/null
+++ b/sys/sun4v/cddl/t1_copy.S
@@ -0,0 +1,1598 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$")
+
+#include <machine/asi.h>
+#include <machine/asmacros.h>
+#include <machine/ktr.h>
+#include <machine/pstate.h>
+#include <machine/trap.h>
+#include <machine/tstate.h>
+#include <machine/wstate.h>
+#include <machine/hypervisorvar.h>
+
+	.register %g2,#ignore
+	.register %g3,#ignore
+	.register %g6,#ignore
+	.register %g7,#ignore
+
+
+/*
+ * This define is to align data for the unaligned source cases.
+ * data1, data2 and data3 are merged into data1 and data2.
+ * data3 is preserved for the next merge.
+ */
+#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
+	sllx	data1, lshift, data1	;\
+	srlx	data2, rshift, tmp	;\
+	or	data1, tmp, data1	;\
+	sllx	data2, lshift, data2	;\
+	srlx	data3, rshift, tmp	;\
+	or	data2, tmp, data2
+/*
+ * This macro is to align the data. Basically it merges
+ * data1 and data2 to form a double word.
+ */
+#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
+	sllx	data1, lshift, data1	;\
+	srlx	data2, rshift, tmp	;\
+	or	data1, tmp, data1
+
+
+
+
+
+/*
+ * DGDEF and DGDEF2 provide global data declarations.
+ *
+ * DGDEF provides a word aligned word of storage.
+ *
+ * DGDEF2 allocates "sz" bytes of storage with **NO** alignment. This
+ * implies this macro is best used for byte arrays.
+ *
+ * DGDEF3 allocates "sz" bytes of storage with "algn" alignment.
+ */
+#define	DGDEF2(name, sz) \
+	.section ".data" ; \
+	.global name ; \
+	.type name, @object ; \
+	.size name, sz; \
+name:
+
+#define	DGDEF3(name, sz, algn) \
+	.section ".data" ; \
+	.align	algn ; \
+	.global name ; \
+	.type name, @object ; \
+	.size name, sz; \
+name:
+
+#define	DGDEF(name)	DGDEF3(name, 4, 4)
+
+.align 4
+DGDEF(hw_copy_limit_1)
+.word 0x100
+DGDEF(hw_copy_limit_2)
+.word 0x200
+DGDEF(hw_copy_limit_4)
+.word 0x400
+DGDEF(hw_copy_limit_8)
+.word 0x400
+.align 64
+.section ".text"
+
+
+#if defined(lint)
+
+/*ARGSUSED*/
+void
+ovbcopy(const void *from, void *to, size_t count)
+{}
+
+#else	/* lint */
+
+ENTRY(bcopy)
+	tst	%o2			! check count
+	bgu,a	%xcc, 1f		! nothing to do or bad arguments
+	subcc	%o0, %o1, %o3		! difference of from and to address
+
+	retl				! return
+	nop
+1:
+	bneg,a	%xcc, 2f
+	neg	%o3			! if < 0, make it positive
+2:	cmp	%o2, %o3		! cmp size and abs(from - to)
+	bleu	%xcc, novbcopy		! if size <= abs(diff): use novbcopy,
+	nop
+	cmp	%o0, %o1		! compare from and to addresses
+	blu	%xcc, ov_bkwd		! if from < to, copy backwards
+	nop
+	!
+	! Copy forwards.
+	!
+ov_fwd:
+	ldub	[%o0], %o3		! read from address
+	inc	%o0			! inc from address
+	stb	%o3, [%o1]		! write to address
+	deccc	%o2			! dec count
+	bgu	%xcc, ov_fwd		! loop till done
+	inc	%o1			! inc to address
+
+	retl				! return
+	nop
+	!
+	! Copy backwards.
+	!
+ov_bkwd:
+	deccc	%o2			! dec count
+	ldub	[%o0 + %o2], %o3	! get byte at end of src
+	bgu	%xcc, ov_bkwd		! loop till done
+	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
+
+	retl				! return
+	nop
+END(bcopy)
+
+#endif	/* lint */
+
+
+
+/*
+ * Copy a block of storage - must not overlap (from + len <= to).
+ */
+ENTRY(novbcopy)
+
+	save	%sp, -SA(MINFRAME), %sp
+
+do_copy:
+	cmp	%i2, 12			! for small counts
+	blu	%xcc, bytecp		! just copy bytes
+	nop
+
+	cmp	%i2, 128		! for less than 128 bytes
+	blu,pn	%xcc, bcb_punt		! no block st/quad ld
+	nop
+#if 0
+	set	use_hw_bcopy, %o2
+	ld	[%o2], %o2
+	tst	%o2
+	bz	bcb_punt
+	nop
+#endif
+	subcc	%i1, %i0, %i3
+	bneg,a,pn %xcc, 1f
+	neg	%i3
+1:
+	/*
+	 * Compare against 256 since we should be checking block addresses
+	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
+	 * src = dest + (64 * 3) + 63.
+	 */
+	cmp	%i3, 256
+	blu,pn	%xcc, bcb_punt
+	nop
+
+	/*
+	 * Copies that reach here have at least 2 blocks of data to copy.
+	 */
+do_blockcopy:
+	! Swap src/dst since the code below is memcpy code
+	! and memcpy/bcopy have different calling sequences
+	mov	%i1, %i5
+	mov	%i0, %i1
+	mov	%i5, %i0
+
+	andcc	%i0, 0x3f, %i3		! is dst aligned on 64 bytes
+	bz	%xcc, chksrc		! dst is already double aligned
+	sub	%i3, 0x40, %i3
+	neg	%i3			! bytes till dst 64 bytes aligned
+	sub	%i2, %i3, %i2		! update i2 with new count
+
+1:	ldub	[%i1], %i4
+	stb	%i4, [%i0]
+	inc	%i1
+	deccc	%i3
+	bgu	%xcc, 1b
+	inc	%i0
+
+	! Now Destination is block (64 bytes) aligned
+chksrc:
+	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
+	sub	%i2, %i3, %i2		! Residue bytes in %i2
+
+	wr	%g0, ASI_LDSTBI_P, %asi
+
+	andcc	%i1, 0xf, %o2		! is src quadword aligned
+	bz,pn	%xcc, blkcpy		! src offset in %o2
+	nop
+	cmp	%o2, 0x8
+	bg	cpy_upper_double
+	nop
+	bl	cpy_lower_double
+	nop
+
+	! Falls through when source offset is equal to 8 i.e.
+	! source is double word aligned.
+	! In this case no shift/merge of data is required
+	sub	%i1, %o2, %i1		! align the src at 16 bytes.
+	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
+	prefetch [%l0+0x0], #one_read
+	ldda	[%i1+0x0]%asi, %l2
+loop0:
+	ldda	[%i1+0x10]%asi, %l4
+	prefetch [%l0+0x40], #one_read
+
+	stxa	%l3, [%i0+0x0]%asi
+	stxa	%l4, [%i0+0x8]%asi
+
+	ldda	[%i1+0x20]%asi, %l2
+	stxa	%l5, [%i0+0x10]%asi
+	stxa	%l2, [%i0+0x18]%asi
+
+	ldda	[%i1+0x30]%asi, %l4
+	stxa	%l3, [%i0+0x20]%asi
+	stxa	%l4, [%i0+0x28]%asi
+
+	ldda	[%i1+0x40]%asi, %l2
+	stxa	%l5, [%i0+0x30]%asi
+	stxa	%l2, [%i0+0x38]%asi
+
+	add	%l0, 0x40, %l0
+	add	%i1, 0x40, %i1
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, loop0
+	add	%i0, 0x40, %i0
+	ba	blkdone
+	add	%i1, %o2, %i1		! increment the source by src offset
+					! the src offset was stored in %o2
+
+cpy_lower_double:
+	sub	%i1, %o2, %i1		! align the src at 16 bytes.
+	sll	%o2, 3, %o0		! %o0 left shift
+	mov	0x40, %o1
+	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
+	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
+	prefetch [%l0+0x0], #one_read
+	ldda	[%i1+0x0]%asi, %l2	! partial data in %l2 and %l3 has
+					! complete data
+loop1:
+	ldda	[%i1+0x10]%asi, %l4	! %l4 has partial data for this read.
+	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
+							! into %l2 and %l3
+	prefetch [%l0+0x40], #one_read
+	stxa	%l2, [%i0+0x0]%asi
+	stxa	%l3, [%i0+0x8]%asi
+
+	ldda	[%i1+0x20]%asi, %l2
+	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
+	stxa	%l4, [%i0+0x10]%asi			! %l4 from previous read
+	stxa	%l5, [%i0+0x18]%asi			! into %l4 and %l5
+
+	! Repeat the same for next 32 bytes.
+
+	ldda	[%i1+0x30]%asi, %l4
+	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
+	stxa	%l2, [%i0+0x20]%asi
+	stxa	%l3, [%i0+0x28]%asi
+
+	ldda	[%i1+0x40]%asi, %l2
+	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
+	stxa	%l4, [%i0+0x30]%asi
+	stxa	%l5, [%i0+0x38]%asi
+
+	add	%l0, 0x40, %l0
+	add	%i1, 0x40, %i1
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, loop1
+	add	%i0, 0x40, %i0
+	ba	blkdone
+	add	%i1, %o2, %i1		! increment the source by src offset
+					! the src offset was stored in %o2
+
+cpy_upper_double:
+	sub	%i1, %o2, %i1		! align the src at 16 bytes.
+	mov	0x8, %o0
+	sub	%o2, %o0, %o0
+	sll	%o0, 3, %o0		! %o0 left shift
+	mov	0x40, %o1
+	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
+	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
+	prefetch [%l0+0x0], #one_read
+	ldda	[%i1+0x0]%asi, %l2	! partial data in %l3 for this read and
+					! no data in %l2
+loop2:
+	ldda	[%i1+0x10]%asi, %l4	! %l4 has complete data and %l5 has
+					! partial
+	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
+							! into %l3 and %l4
+	prefetch [%l0+0x40], #one_read
+	stxa	%l3, [%i0+0x0]%asi
+	stxa	%l4, [%i0+0x8]%asi
+
+	ldda	[%i1+0x20]%asi, %l2
+	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
+	stxa	%l5, [%i0+0x10]%asi			! %l5 from previous read
+	stxa	%l2, [%i0+0x18]%asi			! into %l5 and %l2
+
+	! Repeat the same for next 32 bytes.
+
+	ldda	[%i1+0x30]%asi, %l4
+	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
+	stxa	%l3, [%i0+0x20]%asi
+	stxa	%l4, [%i0+0x28]%asi
+
+	ldda	[%i1+0x40]%asi, %l2
+	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
+	stxa	%l5, [%i0+0x30]%asi
+	stxa	%l2, [%i0+0x38]%asi
+
+	add	%l0, 0x40, %l0
+	add	%i1, 0x40, %i1
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, loop2
+	add	%i0, 0x40, %i0
+	ba	blkdone
+	add	%i1, %o2, %i1		! increment the source by src offset
+					! the src offset was stored in %o2
+
+
+	! Both Source and Destination are block aligned.
+	! Do fast copy using ASI_LDSTBI_P
+blkcpy:
+	prefetch [%i1+0x0], #one_read
+1:
+	ldda	[%i1+0x0]%asi, %l0
+	ldda	[%i1+0x10]%asi, %l2
+	prefetch [%i1+0x40], #one_read
+
+	stxa	%l0, [%i0+0x0]%asi
+	ldda	[%i1+0x20]%asi, %l4
+	ldda	[%i1+0x30]%asi, %l6
+
+	stxa	%l1, [%i0+0x8]%asi
+	stxa	%l2, [%i0+0x10]%asi
+	stxa	%l3, [%i0+0x18]%asi
+	stxa	%l4, [%i0+0x20]%asi
+	stxa	%l5, [%i0+0x28]%asi
+	stxa	%l6, [%i0+0x30]%asi
+	stxa	%l7, [%i0+0x38]%asi
+
+	add	%i1, 0x40, %i1
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, 1b
+	add	%i0, 0x40, %i0
+
+blkdone:
+	tst	%i2
+	bz,pt	%xcc, blkexit
+	nop
+
+residue:
+	ldub	[%i1], %i4
+	stb	%i4, [%i0]
+	inc	%i1
+	deccc	%i2
+	bgu	%xcc, residue
+	inc	%i0
+
+blkexit:
+	membar	#Sync			! sync error barrier
+	ret
+	restore	%g0, 0, %o0
+
+bcb_punt:
+	!
+	! use aligned transfers where possible
+	!
+	xor	%i0, %i1, %o4		! xor from and to address
+	btst	7, %o4			! if lower three bits zero
+	bz	aldoubcp		! can align on double boundary
+	nop				! assembler complains about label
+
+	xor	%i0, %i1, %o4		! xor from and to address
+	btst	3, %o4			! if lower two bits zero
+	bz	alwordcp		! can align on word boundary
+	btst	3, %i0			! delay slot, from address unaligned?
+	!
+	! use aligned reads and writes where possible
+	! this differs from wordcp in that it copes
+	! with odd alignment between source and destination
+	! using word reads and writes with the proper shifts
+	! in between to align transfers to and from memory
+	! i0 - src address, i1 - dest address, i2 - count
+	! i3, i4 - tmps used for generating complete word
+	! i5 (word to write)
+	! l0 size in bits of upper part of source word (US)
+	! l1 size in bits of lower part of source word (LS = 32 - US)
+	! l2 size in bits of upper part of destination word (UD)
+	! l3 size in bits of lower part of destination word (LD = 32 - UD)
+	! l4 number of bytes leftover after aligned transfers complete
+	! l5 the number 32
+	!
+	mov	32, %l5			! load an oft-needed constant
+	bz	align_dst_only
+	btst	3, %i1			! is destination address aligned?
+	clr	%i4			! clear registers used in either case
+	bz	align_src_only
+	clr	%l0
+	!
+	! both source and destination addresses are unaligned
+	!
+1:					! align source
+	ldub	[%i0], %i3		! read a byte from source address
+	add	%i0, 1, %i0		! increment source address
+	or	%i4, %i3, %i4		! or in with previous bytes (if any)
+	btst	3, %i0			! is source aligned?
+	add	%l0, 8, %l0		! increment size of upper source (US)
+	bnz,a	1b
+	sll	%i4, 8, %i4		! make room for next byte
+
+	sub	%l5, %l0, %l1		! generate shift left count (LS)
+	sll	%i4, %l1, %i4		! prepare to get rest
+	ld	[%i0], %i3		! read a word
+	add	%i0, 4, %i0		! increment source address
+	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
+	or	%i4, %i5, %i5		! merge
+	mov	24, %l3			! align destination
+1:
+	srl	%i5, %l3, %i4		! prepare to write a single byte
+	stb	%i4, [%i1]		! write a byte
+	add	%i1, 1, %i1		! increment destination address
+	sub	%i2, 1, %i2		! decrement count
+	btst	3, %i1			! is destination aligned?
+	bnz,a	1b
+	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
+	sub	%l5, %l3, %l2		! generate shift left count (UD)
+	sll	%i5, %l2, %i5		! move leftover into upper bytes
+	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
+	bgu	%xcc, more_needed	! need more to fill than we have
+	nop
+
+	sll	%i3, %l1, %i3		! clear upper used byte(s)
+	srl	%i3, %l1, %i3
+	! get the odd bytes between alignments
+	sub	%l0, %l2, %l0		! regenerate shift count
+	sub	%l5, %l0, %l1		! generate new shift left count (LS)
+	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
+	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
+	srl	%i3, %l0, %i4
+	or	%i5, %i4, %i5
+	st	%i5, [%i1]		! write a word
+	subcc	%i2, 4, %i2		! decrement count
+	bz	%xcc, unalign_out
+	add	%i1, 4, %i1		! increment destination address
+
+	b	2f
+	sll	%i3, %l1, %i5		! get leftover into upper bits
+more_needed:
+	sll	%i3, %l0, %i3		! save remaining byte(s)
+	srl	%i3, %l0, %i3
+	sub	%l2, %l0, %l1		! regenerate shift count
+	sub	%l5, %l1, %l0		! generate new shift left count
+	sll	%i3, %l1, %i4		! move to fill empty space
+	b	3f
+	or	%i5, %i4, %i5		! merge to complete word
+	!
+	! the source address is aligned and destination is not
+	!
+align_dst_only:
+	ld	[%i0], %i4		! read a word
+	add	%i0, 4, %i0		! increment source address
+	mov	24, %l0			! initial shift alignment count
+1:
+	srl	%i4, %l0, %i3		! prepare to write a single byte
+	stb	%i3, [%i1]		! write a byte
+	add	%i1, 1, %i1		! increment destination address
+	sub	%i2, 1, %i2		! decrement count
+	btst	3, %i1			! is destination aligned?
+	bnz,a	1b
+	sub	%l0, 8, %l0		! delay slot, decrement shift count
+xfer:
+	sub	%l5, %l0, %l1		! generate shift left count
+	sll	%i4, %l1, %i5		! get leftover
+3:
+	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
+	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
+2:
+	ld	[%i0], %i3		! read a source word
+	add	%i0, 4, %i0		! increment source address
+	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
+	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
+	st	%i5, [%i1]		! write a destination word
+	subcc	%i2, 4, %i2		! decrement count
+	bz	%xcc, unalign_out	! check if done
+	add	%i1, 4, %i1		! increment destination address
+	b	2b			! loop
+	sll	%i3, %l1, %i5		! get leftover
+unalign_out:
+	tst	%l4			! any bytes leftover?
+	bz	%xcc, cpdone
+	nop
+1:
+	sub	%l0, 8, %l0		! decrement shift
+	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
+	stb	%i4, [%i1]		! write a byte
+	subcc	%l4, 1, %l4		! decrement count
+	bz	%xcc, cpdone		! done?
+	add	%i1, 1, %i1		! increment destination
+	tst	%l0			! any more previously read bytes
+	bnz	%xcc, 1b		! we have leftover bytes
+	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
+	b	dbytecp			! let dbytecp do the rest
+	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
+	!
+	! the destination address is aligned and the source is not
+	!
+align_src_only:
+	ldub	[%i0], %i3		! read a byte from source address
+	add	%i0, 1, %i0		! increment source address
+	or	%i4, %i3, %i4		! or in with previous bytes (if any)
+	btst	3, %i0			! is source aligned?
+	add	%l0, 8, %l0		! increment shift count (US)
+	bnz,a	align_src_only
+	sll	%i4, 8, %i4		! make room for next byte
+	b,a	xfer
+	!
+	! if from address unaligned for double-word moves,
+	! move bytes till it is, if count is < 56 it could take
+	! longer to align the thing than to do the transfer
+	! in word size chunks right away
+	!
+aldoubcp:
+	cmp	%i2, 56			! if count < 56, use wordcp, it takes
+	blu,a	%xcc, alwordcp		! longer to align doubles than words
+	mov	3, %o0			! mask for word alignment
+	call	alignit			! copy bytes until aligned
+	mov	7, %o0			! mask for double alignment
+	!
+	! source and destination are now double-word aligned
+	! i3 has aligned count returned by alignit
+	!
+	and	%i2, 7, %i2		! unaligned leftover count
+	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
+5:
+	ldx	[%i0+%i1], %o4		! read from address
+	stx	%o4, [%i1]		! write at destination address
+	subcc	%i3, 8, %i3		! dec count
+	bgu	%xcc, 5b
+	add	%i1, 8, %i1		! delay slot, inc to address
+	cmp	%i2, 4			! see if we can copy a word
+	blu	%xcc, dbytecp		! if 3 or less bytes use bytecp
+	nop
+	!
+	! for leftover bytes we fall into wordcp, if needed
+	!
+wordcp:
+	and	%i2, 3, %i2		! unaligned leftover count
+5:
+	ld	[%i0+%i1], %o4		! read from address
+	st	%o4, [%i1]		! write at destination address
+	subcc	%i3, 4, %i3		! dec count
+	bgu	%xcc, 5b
+	add	%i1, 4, %i1		! delay slot, inc to address
+	b,a	dbytecp
+
+	! we come here to align copies on word boundaries
+alwordcp:
+	call	alignit			! go word-align it
+	mov	3, %o0			! bits that must be zero to be aligned
+	b	wordcp
+	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
+
+	!
+	! byte copy, works with any alignment
+	!
+bytecp:
+	b	dbytecp
+	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
+
+	!
+	! differenced byte copy, works with any alignment
+	! assumes dest in %i1 and (source - dest) in %i0
+	!
+1:
+	stb	%o4, [%i1]		! write to address
+	inc	%i1			! inc to address
+dbytecp:
+	deccc	%i2			! dec count
+	bgeu,a	%xcc, 1b		! loop till done
+	ldub	[%i0+%i1], %o4		! read from address
+cpdone:
+	membar	#Sync			! sync error barrier
+	ret
+	restore	%g0, 0, %o0		! return (0)
+
+/*
+ * Common code used to align transfers on word and doubleword
+ * boundaries. Aligns source and destination and returns a count
+ * of aligned bytes to transfer in %i3
+ */
+1:
+	inc	%i0			! inc from
+	stb	%o4, [%i1]		! write a byte
+	inc	%i1			! inc to
+	dec	%i2			! dec count
+alignit:
+	btst	%o0, %i0		! %o0 is bit mask to check for alignment
+	bnz,a	1b
+	ldub	[%i0], %o4		! read next byte
+
+	retl
+	andn	%i2, %o0, %i3		! return size of aligned bytes
+END(novbcopy)
+
+
+/*
+ * hwblkclr - clears block-aligned, block-multiple-sized regions that are
+ * at least 256 bytes long using Niagara's block stores/quad store.
+ * If the criteria for using this routine are not met then it calls bzero
+ * and returns 1. Otherwise 0 is returned indicating success.
+ * Caller is responsible for ensuring use_hw_bzero is true and that
+ * kpreempt_disable() has been called.
+ */
+#ifdef lint
+/*ARGSUSED*/
+int
+hwblkclr(void *addr, size_t len)
+{
+	return(0);
+}
+#else	/* lint */
+	! %i0 - start address
+	! %i1 - length of region (multiple of 64)
+
+ENTRY(hwblkclr)
+	save	%sp, -SA(MINFRAME), %sp
+
+	! Must be block-aligned
+	andcc	%i0, 0x3f, %g0
+	bnz,pn	%xcc, 1f
+	nop
+
+	! ... and must be 256 bytes or more
+	cmp	%i1, 0x100
+	blu,pn	%xcc, 1f
+	nop
+
+	! ... and length must be a multiple of 64
+	andcc	%i1, 0x3f, %g0
+	bz,pn	%xcc, pz_doblock
+	wr	%g0, ASI_LDSTBI_P, %asi
+
+1:	! punt, call bzero but notify the caller that bzero was used
+	mov	%i0, %o0
+	call	bzero
+	mov	%i1, %o1
+	ret
+	restore	%g0, 1, %o0	! return (1) - did not use block operations
+
+	! Already verified that there are at least 256 bytes to set
+pz_doblock:
+	stxa	%g0, [%i0+0x0]%asi
+	stxa	%g0, [%i0+0x40]%asi
+	stxa	%g0, [%i0+0x80]%asi
+	stxa	%g0, [%i0+0xc0]%asi
+
+	stxa	%g0, [%i0+0x8]%asi
+	stxa	%g0, [%i0+0x10]%asi
+	stxa	%g0, [%i0+0x18]%asi
+	stxa	%g0, [%i0+0x20]%asi
+	stxa	%g0, [%i0+0x28]%asi
+	stxa	%g0, [%i0+0x30]%asi
+	stxa	%g0, [%i0+0x38]%asi
+
+	stxa	%g0, [%i0+0x48]%asi
+	stxa	%g0, [%i0+0x50]%asi
+	stxa	%g0, [%i0+0x58]%asi
+	stxa	%g0, [%i0+0x60]%asi
+	stxa	%g0, [%i0+0x68]%asi
+	stxa	%g0, [%i0+0x70]%asi
+	stxa	%g0, [%i0+0x78]%asi
+
+	stxa	%g0, [%i0+0x88]%asi
+	stxa	%g0, [%i0+0x90]%asi
+	stxa	%g0, [%i0+0x98]%asi
+	stxa	%g0, [%i0+0xa0]%asi
+	stxa	%g0, [%i0+0xa8]%asi
+	stxa	%g0, [%i0+0xb0]%asi
+	stxa	%g0, [%i0+0xb8]%asi
+
+	stxa	%g0, [%i0+0xc8]%asi
+	stxa	%g0, [%i0+0xd0]%asi
+	stxa	%g0, [%i0+0xd8]%asi
+	stxa	%g0, [%i0+0xe0]%asi
+	stxa	%g0, [%i0+0xe8]%asi
+	stxa	%g0, [%i0+0xf0]%asi
+	stxa	%g0, [%i0+0xf8]%asi
+
+	sub	%i1, 0x100, %i1
+	cmp	%i1, 0x100
+	bgu,pt	%xcc, pz_doblock
+	add	%i0, 0x100, %i0
+
+2:
+	! Check if more than 64 bytes to set
+	cmp	%i1, 0x40
+	blu	%xcc, pz_finish
+	nop
+
+3:
+	stxa	%g0, [%i0+0x0]%asi
+	stxa	%g0, [%i0+0x8]%asi
+	stxa	%g0, [%i0+0x10]%asi
+	stxa	%g0, [%i0+0x18]%asi
+	stxa	%g0, [%i0+0x20]%asi
+	stxa	%g0, [%i0+0x28]%asi
+	stxa	%g0, [%i0+0x30]%asi
+	stxa	%g0, [%i0+0x38]%asi
+
+	subcc	%i1, 0x40, %i1
+	bgu,pt	%xcc, 3b
+	add	%i0, 0x40, %i0
+
+pz_finish:
+	membar	#Sync
+	ret
+	restore	%g0, 0, %o0		! return (bzero or not)
+END(hwblkclr)
+#endif	/* lint */
+
+#if defined(lint)
+
+/* ARGSUSED */
+void
+bzero(void *addr, size_t count)
+{}
+
+#else	/* lint */
+
+ENTRY(bzero)
+	wr	%g0, ASI_P, %asi
+
+	cmp	%o1, 7
+	blu,pn	%xcc, byteclr
+	nop
+
+	cmp	%o1, 15
+	blu,pn	%xcc, wdalign
+	nop
+
+	andcc	%o0, 7, %o3		! is addr aligned on an 8 byte boundary
+	bz,pt	%xcc, blkalign		! already double aligned
+	sub	%o3, 8, %o3		! -(bytes till double aligned)
+	add	%o1, %o3, %o1		! update o1 with new count
+
+1:
+	stba	%g0, [%o0]%asi
+	inccc	%o3
+	bl,pt	%xcc, 1b
+	inc	%o0
+
+	! Now address is double aligned
+blkalign:
+	cmp	%o1, 0x80		! check if there are 128 bytes to set
+	blu,pn	%xcc, bzero_small
+	mov	%o1, %o3
+#if 0
+	sethi	%hi(use_hw_bzero), %o2
+	ld	[%o2 + %lo(use_hw_bzero)], %o2
+	tst	%o2
+	bz	%xcc, bzero_small
+	mov	%o1, %o3
+#endif
+	rd	%asi, %o3
+	wr	%g0, ASI_LDSTBI_P, %asi
+	cmp	%o3, ASI_P
+	bne,a	%xcc, algnblk
+	wr	%g0, ASI_LDSTBI_AIUS, %asi
+
+algnblk:
+	andcc	%o0, 0x3f, %o3		! is block aligned?
+	bz,pt	%xcc, bzero_blk
+	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
+	add	%o1, %o3, %o1		! o1 is the remainder
+
+	! Clear -(%o3) bytes till block aligned
+1:
+	stxa	%g0, [%o0]%asi
+	addcc	%o3, 8, %o3
+	bl,pt	%xcc, 1b
+	add	%o0, 8, %o0
+
+bzero_blk:
+	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
+	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes
+
+	cmp	%o4, 0x100		! 256 bytes or more
+	blu,pn	%xcc, 3f
+	nop
+
+2:
+	stxa	%g0, [%o0+0x0]%asi
+	stxa	%g0, [%o0+0x40]%asi
+	stxa	%g0, [%o0+0x80]%asi
+	stxa	%g0, [%o0+0xc0]%asi
+
+	stxa	%g0, [%o0+0x8]%asi
+	stxa	%g0, [%o0+0x10]%asi
+	stxa	%g0, [%o0+0x18]%asi
+	stxa	%g0, [%o0+0x20]%asi
+	stxa	%g0, [%o0+0x28]%asi
+	stxa	%g0, [%o0+0x30]%asi
+	stxa	%g0, [%o0+0x38]%asi
+
+	stxa	%g0, [%o0+0x48]%asi
+	stxa	%g0, [%o0+0x50]%asi
+	stxa	%g0, [%o0+0x58]%asi
+	stxa	%g0, [%o0+0x60]%asi
+	stxa	%g0, [%o0+0x68]%asi
+	stxa	%g0, [%o0+0x70]%asi
+	stxa	%g0, [%o0+0x78]%asi
+
+	stxa	%g0, [%o0+0x88]%asi
+	stxa	%g0, [%o0+0x90]%asi
+	stxa	%g0, [%o0+0x98]%asi
+	stxa	%g0, [%o0+0xa0]%asi
+	stxa	%g0, [%o0+0xa8]%asi
+	stxa	%g0, [%o0+0xb0]%asi
+	stxa	%g0, [%o0+0xb8]%asi
+
+	stxa	%g0, [%o0+0xc8]%asi
+	stxa	%g0, [%o0+0xd0]%asi
+	stxa	%g0, [%o0+0xd8]%asi
+	stxa	%g0, [%o0+0xe0]%asi
+	stxa	%g0, [%o0+0xe8]%asi
+	stxa	%g0, [%o0+0xf0]%asi
+	stxa	%g0, [%o0+0xf8]%asi
+
+	sub	%o4, 0x100, %o4
+	cmp	%o4, 0x100
+	bgu,pt	%xcc, 2b
+	add	%o0, 0x100, %o0
+
+3:
+	! ... check if 64 bytes to set
+	cmp	%o4, 0x40
+	blu	%xcc, bzero_blk_done
+	nop
+
+4:
+	stxa	%g0, [%o0+0x0]%asi
+	stxa	%g0, [%o0+0x8]%asi
+	stxa	%g0, [%o0+0x10]%asi
+	stxa	%g0, [%o0+0x18]%asi
+	stxa	%g0, [%o0+0x20]%asi
+	stxa	%g0, [%o0+0x28]%asi
+	stxa	%g0, [%o0+0x30]%asi
+	stxa	%g0, [%o0+0x38]%asi
+
+	subcc	%o4, 0x40, %o4
+	bgu,pt	%xcc, 3b
+	add	%o0, 0x40, %o0
+
+bzero_blk_done:
+	membar	#Sync
+	!
+	! Undo asi register setting.
+	!
+	rd	%asi, %o4
+	wr	%g0, ASI_P, %asi
+	cmp	%o4, ASI_LDSTBI_P
+	bne,a	%xcc, bzero_small
+	wr	%g0, ASI_AIUS, %asi
+
+bzero_small:
+	! Set the remaining doubles
+	subcc	%o3, 8, %o3		! Can we store any doubles?
+	blu,pn	%xcc, byteclr
+	and	%o1, 7, %o1		! calc bytes left after doubles
+
+dbclr:
+	stxa	%g0, [%o0]%asi		! Clear the doubles
+	subcc	%o3, 8, %o3
+	bgeu,pt	%xcc, dbclr
+	add	%o0, 8, %o0
+
+	ba	byteclr
+	nop
+
+wdalign:
+	andcc	%o0, 3, %o3		! is addr aligned on a word boundary
+	bz,pn	%xcc, wdclr
+	andn	%o1, 3, %o3		! create word sized count in %o3
+
+	dec	%o1			! decrement count
+	stba	%g0, [%o0]%asi		! clear a byte
+	ba	wdalign
+	inc	%o0			! next byte
+
+wdclr:
+	sta	%g0, [%o0]%asi		! 4-byte clearing loop
+	subcc	%o3, 4, %o3
+	bnz,pt	%xcc, wdclr
+	inc	4, %o0
+
+	and	%o1, 3, %o1		! leftover count, if any
+
+byteclr:
+	! Set the leftover bytes
+	brz	%o1, bzero_exit
+	nop
+
+7:
+	deccc	%o1			! byte clearing loop
+	stba	%g0, [%o0]%asi
+	bgu,pt	%xcc, 7b
+	inc	%o0
+
+bzero_exit:
+	retl
+	clr	%o0			! return (0)
+
+END(bzero)
+#endif	/* lint */
+
+
+#if 0
+#define	SMALL_LIMIT 7
+#if defined(lint)
+
+/*ARGSUSED*/
+int
+copyin(const void *uaddr, void *kaddr, size_t count)
+{ return (0); }
+
+#else	/* lint */
+
+ENTRY(copyin)
+	!
+	! Check the length and bail if zero.
+	!
+	tst	%o2
+	bnz,pt	%xcc, 1f
+	nop
+	retl
+	clr	%o0
+#if 0
+1:
+	sethi	%hi(copyio_fault), %o4
+	or	%o4, %lo(copyio_fault), %o4
+	sethi	%hi(copyio_fault_nowindow), %o3
+	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
+	or	%o3, %lo(copyio_fault_nowindow), %o3
+	membar	#Sync
+	stn	%o3, [THREAD_REG + T_LOFAULT]
+
+	mov	%o0, SAVE_SRC
+	mov	%o1, SAVE_DST
+	mov	%o2, SAVE_COUNT
+#endif
+	!
+	! Check to see if we're more than SMALL_LIMIT.
+	!
+	subcc	%o2, SMALL_LIMIT, %o3
+	bgu,a,pt %xcc, dci_ns
+	or	%o0, %o1, %o3
+	!
+	! What was previously ".small_copyin"
+	!
+dcibcp:
+	sub	%g0, %o2, %o3		! setup for copy loop
+	add	%o0, %o2, %o0
+	add	%o1, %o2, %o1
+	ba,pt	%xcc, dcicl
+	lduba	[%o0 + %o3]ASI_AIUS, %o4
+	!
+	! %o0 and %o1 point at the end and remain pointing at the end
+	! of their buffers. We pull things out by adding %o3 (which is
+	! the negation of the length) to the buffer end which gives us
+	! the current location in the buffers. By incrementing %o3 we walk
+	! through both buffers without having to bump each buffer's
+	! pointer. A very fast 4 instruction loop.
+	!
+	.align 16
+dcicl:
+	stb	%o4, [%o1 + %o3]
+	inccc	%o3
+	bl,a,pt %xcc, dcicl
+	lduba	[%o0 + %o3]ASI_AIUS, %o4
+	!
+	! We're done. Go home.
+	!
+	membar	#Sync
+	retl
+	clr	%o0
+	!
+	! Try aligned copies from here.
+	!
+dci_ns:
+	!
+	! See if we're single byte aligned. If we are, check the
+	! limit for single byte copies. If we're smaller, or equal,
+	! bounce to the byte for byte copy loop. Otherwise do it in
+	! HW (if enabled).
+	!
+	btst	1, %o3
+	bz,a,pt	%icc, dcih8
+	btst	7, %o3
+	!
+	! We're single byte aligned.
+	!
+	sethi	%hi(hw_copy_limit_1), %o3
+	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
+	!
+	! Is HW copy on? If not do everything byte for byte.
+	!
+	tst	%o3
+	bz,pn	%icc, dcibcp
+	subcc	%o3, %o2, %o3
+	!
+	! Are we bigger than the HW limit? If not
+	! go to byte for byte.
+	!
+	bge,pt	%xcc, dcibcp
+	nop
+	!
+	! We're big enough and copy is on. Do it with HW.
+	!
+	ba,pt	%xcc, big_copyin
+	nop
+dcih8:
+	!
+	! 8 byte aligned?
+	!
+	bnz,a	%xcc, dcih4
+	btst	3, %o3
+	!
+	! We're eight byte aligned.
+	!
+	sethi	%hi(hw_copy_limit_8), %o3
+	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
+	!
+	! Is HW assist on? If not, do it with the aligned copy.
+	!
+	tst	%o3
+	bz,pn	%icc, dcis8
+	subcc	%o3, %o2, %o3
+	bge	%xcc, dcis8
+	nop
+	ba,pt	%xcc, big_copyin
+	nop
+dcis8:
+	!
+	! Housekeeping for copy loops. Uses same idea as in the byte for
+	! byte copy loop above.
+	!
+	add	%o0, %o2, %o0
+	add	%o1, %o2, %o1
+	sub	%g0, %o2, %o3
+	ba,pt	%xcc, didebc
+	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
+	!
+	! 4 byte aligned?
+	!
+dcih4:
+	bnz	%xcc, dcih2
+	sethi	%hi(hw_copy_limit_4), %o3
+	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
+	!
+	! Is HW assist on? If not, do it with the aligned copy.
+	!
+	tst	%o3
+	bz,pn	%icc, dcis4
+	subcc	%o3, %o2, %o3
+	!
+	! We're negative if our size is less than or equal to hw_copy_limit_4.
+	!
+	bge	%xcc, dcis4
+	nop
+	ba,pt	%xcc, big_copyin
+	nop
+dcis4:
+	!
+	! Housekeeping for copy loops. Uses same idea as in the byte
+	! for byte copy loop above.
+	!
+	add	%o0, %o2, %o0
+	add	%o1, %o2, %o1
+	sub	%g0, %o2, %o3
+	ba,pt	%xcc, didfbc
+	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
+dcih2:
+	!
+	! We're two byte aligned. Check for "smallness"
+	! done in delay at dcih4
+	!
+	bleu,pt	%xcc, dcis2
+	sethi	%hi(hw_copy_limit_2), %o3
+	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
+	!
+	! Is HW assist on? If not, do it with the aligned copy.
+	!
+	tst	%o3
+	bz,pn	%icc, dcis2
+	subcc	%o3, %o2, %o3
+	!
+	! Are we larger than the HW limit?
+	!
+	bge	%xcc, dcis2
+	nop
+	!
+	! HW assist is on and we're large enough to use it.
+	!
+	ba,pt	%xcc, big_copyin
+	nop
+	!
+	! Housekeeping for copy loops. Uses same idea as in the byte
+	! for byte copy loop above.
+	!
+dcis2:
+	add	%o0, %o2, %o0
+	add	%o1, %o2, %o1
+	sub	%g0, %o2, %o3
+	ba,pt	%xcc, didtbc
+	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
+	!
+small_copyin:
+	!
+	! Why are we doing this AGAIN? There are certain conditions in
+	! big copyin that will cause us to forgo the HW assisted copies
+	! and bounce back to a non-hw assisted copy. This dispatches
+	! those copies. Note that we branch around this in the main line
+	! code.
+	!
+	! We make no check for limits or HW enablement here. We've
+	! already been told that we're a poster child so just go off
+	! and do it.
+	!
+	or	%o0, %o1, %o3
+	btst	1, %o3
+	bnz	%icc, dcibcp		! Most likely
+	btst	7, %o3
+	bz	%icc, dcis8
+	btst	3, %o3
+	bz	%icc, dcis4
+	nop
+	ba,pt	%xcc, dcis2
+	nop
+	!
+	! Eight byte aligned copies. A steal from the original .small_copyin
+	! with modifications. %o2 is number of 8 byte chunks to copy. When
+	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
+	! to copy.
+	!
+	.align 32
+didebc:
+	ldxa	[%o0 + %o3]ASI_AIUS, %o4
+	deccc	%o2
+	stx	%o4, [%o1 + %o3]
+	bg,pt	%xcc, didebc
+	addcc	%o3, 8, %o3
+	!
+	! End of copy loop. Most 8 byte aligned copies end here.
+	!
+	bz,pt	%xcc, dcifh
+	nop
+	!
+	! Something is left. Do it byte for byte.
+	!
+	ba,pt	%xcc, dcicl
+	lduba	[%o0 + %o3]ASI_AIUS, %o4
+	!
+	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
+	!
+	.align 32
+didfbc:
+	lduwa	[%o0 + %o3]ASI_AIUS, %o4
+	deccc	%o2
+	st	%o4, [%o1 + %o3]
+	bg,pt	%xcc, didfbc
+	addcc	%o3, 4, %o3
+	!
+	! End of copy loop. Most 4 byte aligned copies end here.
+	!
+	bz,pt	%xcc, dcifh
+	nop
+	!
+	! Something is left. Do it byte for byte.
+	!
+	ba,pt	%xcc, dcicl
+	lduba	[%o0 + %o3]ASI_AIUS, %o4
+	!
+	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
+	! copy.
+	!
+	.align 32
+didtbc:
+	lduha	[%o0 + %o3]ASI_AIUS, %o4
+	deccc	%o2
+	sth	%o4, [%o1 + %o3]
+	bg,pt	%xcc, didtbc
+	addcc	%o3, 2, %o3
+	!
+	! End of copy loop. Most 2 byte aligned copies end here.
+	!
+	bz,pt	%xcc, dcifh
+	nop
+	!
+	! Deal with the last byte
+	!
+	lduba	[%o0 + %o3]ASI_AIUS, %o4
+	stb	%o4, [%o1 + %o3]
+dcifh:
+	membar	#Sync
+	retl
+	clr	%o0
+
+big_copyin:
+	!
+	! We're going off to do a block copy.
+	! Switch fault handlers and grab a window. We
+	! don't do a membar #Sync since we've done only
+	! kernel data to this point.
+	!
+	save	%sp, -SA(MINFRAME), %sp
+
+	! Copies that reach here are larger than 256 bytes. The
+	! hw_copy_limit_1 is set to 256. Never set this limit to less
+	! than 128 bytes.
+do_blockcopyin:
+
+	! Swap src/dst since the code below is memcpy code
+	! and memcpy/bcopy have different calling sequences
+	mov	%i1, %i5
+	mov	%i0, %i1
+	mov	%i5, %i0
+
+	andcc	%i0, 7, %i3		! is dst double aligned
+	bz	%xcc, copyin_blkcpy
+	sub	%i3, 8, %i3
+	neg	%i3			! bytes till double aligned
+	sub	%i2, %i3, %i2		! update %i2 with new count
+
+	! Align Destination on double-word boundary
+
+1:	lduba	[%i1]ASI_AIUS, %i4
+	inc	%i1
+	stb	%i4, [%i0]
+	deccc	%i3
+	bgu	%xcc, 1b
+	inc	%i0
+
+copyin_blkcpy:
+	andcc	%i0, 63, %i3
+	bz,pn	%xcc, copyin_blalign	! now block aligned
+	sub	%i3, 64, %i3
+	neg	%i3			! bytes till block aligned
+	sub	%i2, %i3, %i2		! update %i2 with new count
+
+	! Copy %i3 bytes till dst is block (64 byte) aligned. Use
+	! double word copies.
+
+	andcc	%i1, 7, %g1		! is src aligned on 8 bytes
+	bz	%xcc, ci_dbcopy		! %g1 has source offset (last 3-bits)
+	sll	%g1, 3, %l1		! left shift
+	mov	0x40, %l2
+	sub	%l2, %l1, %l2		! right shift = (64 - left shift)
+
+	! Now use double word copies to align destination.
+ci_double:
+	sub	%i1, %g1, %i1		! align the src at 8 bytes.
+	ldxa	[%i1]ASI_AIUS, %o2
+2:
+	add	%i1, 0x8, %i1
+	ldxa	[%i1]ASI_AIUS, %o4
+	ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3)
+	stx	%o2, [%i0]
+	mov	%o4, %o2
+	subcc	%i3, 0x8, %i3
+	bgu,pt	%xcc, 2b
+	add	%i0, 0x8, %i0
+	ba	copyin_blalign
+	add	%i1, %g1, %i1
+
+	! Both source and destination are double aligned.
+	! No shift and merge of data required in this case.
+ci_dbcopy:
+	ldxa	[%i1]ASI_AIUS, %o2
+	stx	%o2, [%i0]
+	add	%i1, 0x8, %i1
+	subcc	%i3, 0x8, %i3
+	bgu,pt	%xcc, ci_dbcopy
+	add	%i0, 0x8, %i0
+
+copyin_blalign:
+	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
+	sub	%i2, %i3, %i2		! Residue bytes in %i2
+
+	wr	%g0, ASI_LDSTBI_P, %asi
+
+	andcc	%i1, 0xf, %o2		! is src quadword aligned
+	bz,pn	%xcc, ci_blkcpy		! src offset in %o2 (last 4-bits)
+	nop
+	cmp	%o2, 0x8
+	bg	ci_upper_double
+	nop
+	bl	ci_lower_double
+	nop
+
+	! Falls through when source offset is equal to 8 i.e.
+	! source is double word aligned.
+	! In this case no shift/merge of data is required
+
+	sub	%i1, %o2, %i1		! align the src at 16 bytes.
+	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
+	prefetch [%l0+0x0], #one_read
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+ci_loop0:
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l4
+
+	prefetch [%l0+0x40], #one_read
+
+	stxa	%l3, [%i0+0x0]%asi
+	stxa	%l4, [%i0+0x8]%asi
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+
+	stxa	%l5, [%i0+0x10]%asi
+	stxa	%l2, [%i0+0x18]%asi
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l4
+
+	stxa	%l3, [%i0+0x20]%asi
+	stxa	%l4, [%i0+0x28]%asi
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+
+	stxa	%l5, [%i0+0x30]%asi
+	stxa	%l2, [%i0+0x38]%asi
+
+	add	%l0, 0x40, %l0
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, ci_loop0
+	add	%i0, 0x40, %i0
+	ba	ci_blkdone
+	add	%i1, %o2, %i1		! increment the source by src offset
+					! the src offset was stored in %o2
+
+ci_lower_double:
+
+	sub	%i1, %o2, %i1		! align the src at 16 bytes.
+	sll	%o2, 3, %o0		! %o0 left shift
+	mov	0x40, %o1
+	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
+	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
+	prefetch [%l0+0x0], #one_read
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2	! partial data in %l2
+						! and %l3 has complete
+						! data
+ci_loop1:
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l4	! %l4 has partial data
+						! for this read.
+	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
+							! into %l2 and %l3
+
+	prefetch [%l0+0x40], #one_read
+
+	stxa	%l2, [%i0+0x0]%asi
+	stxa	%l3, [%i0+0x8]%asi
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
+							! %l4 from previous read
+							! into %l4 and %l5
+	stxa	%l4, [%i0+0x10]%asi
+	stxa	%l5, [%i0+0x18]%asi
+
+	! Repeat the same for next 32 bytes.
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l4
+	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
+
+	stxa	%l2, [%i0+0x20]%asi
+	stxa	%l3, [%i0+0x28]%asi
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
+
+	stxa	%l4, [%i0+0x30]%asi
+	stxa	%l5, [%i0+0x38]%asi
+
+	add	%l0, 0x40, %l0
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, ci_loop1
+	add	%i0, 0x40, %i0
+	ba	ci_blkdone
+	add	%i1, %o2, %i1		! increment the source by src offset
+					! the src offset was stored in %o2
+
+ci_upper_double:
+
+	sub	%i1, %o2, %i1		! align the src at 16 bytes.
+	sub	%o2, 0x8, %o0
+	sll	%o0, 3, %o0		! %o0 left shift
+	mov	0x40, %o1
+	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
+	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
+	prefetch [%l0+0x0], #one_read
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2	! partial data in %l3
+						! for this read and
+						! no data in %l2
+ci_loop2:
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l4	! %l4 has complete data
+						! and %l5 has partial
+	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
+							! into %l3 and %l4
+	prefetch [%l0+0x40], #one_read
+
+	stxa	%l3, [%i0+0x0]%asi
+	stxa	%l4, [%i0+0x8]%asi
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
+							! %l5 from previous read
+							! into %l5 and %l2
+
+	stxa	%l5, [%i0+0x10]%asi
+	stxa	%l2, [%i0+0x18]%asi
+
+	! Repeat the same for next 32 bytes.
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l4
+	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
+
+	stxa	%l3, [%i0+0x20]%asi
+	stxa	%l4, [%i0+0x28]%asi
+
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
+
+	stxa	%l5, [%i0+0x30]%asi
+	stxa	%l2, [%i0+0x38]%asi
+
+	add	%l0, 0x40, %l0
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, ci_loop2
+	add	%i0, 0x40, %i0
+	ba	ci_blkdone
+	add	%i1, %o2, %i1		! increment the source by src offset
+					! the src offset was stored in %o2
+
+
+	! Do fast copy using ASI_LDSTBI_P
+ci_blkcpy:
+
+	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
+	prefetch [%o0+0x0], #one_read
+1:
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l0
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l2
+	add	%i1, 0x10, %i1
+
+	prefetch [%o0+0x40], #one_read
+
+	stxa	%l0, [%i0+0x0]%asi
+
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l4
+	add	%i1, 0x10, %i1
+	ldda	[%i1]ASI_LDSTBI_AIUS, %l6
+	add	%i1, 0x10, %i1
+
+	stxa	%l1, [%i0+0x8]%asi
+	stxa	%l2, [%i0+0x10]%asi
+	stxa	%l3, [%i0+0x18]%asi
+	stxa	%l4, [%i0+0x20]%asi
+	stxa	%l5, [%i0+0x28]%asi
+	stxa	%l6, [%i0+0x30]%asi
+	stxa	%l7, [%i0+0x38]%asi
+
+	add	%o0, 0x40, %o0
+	subcc	%i3, 0x40, %i3
+	bgu,pt	%xcc, 1b
+	add	%i0, 0x40, %i0
+
+ci_blkdone:
+	membar	#Sync
+
+	! Copy as much of the rest of the data as possible
+	! using doubleword copies.
+ci_dwcp:
+	cmp	%i2, 0x8		! Not enough bytes to copy as double
+	blu	%xcc, ci_dbdone
+	nop
+
+	andn	%i2, 0x7, %i3		! %i3 count is multiple of 8 bytes size
+	sub	%i2, %i3, %i2		! Residue bytes in %i2
+
+	andcc	%i1, 7, %g1		! is src aligned on 8 bytes
+	bz	%xcc, ci_cpy_db
+	nop
+
+	sll	%g1, 3, %l0		! left shift
+	mov	0x40, %l1
+	sub	%l1, %l0, %l1		! right shift = (64 - left shift)
+
+ci_cpy_dbwd:
+	sub	%i1, %g1, %i1		! align the src at 8 bytes.
+	ldxa	[%i1]ASI_AIUS, %o2
+3:
+	add	%i1, 0x8, %i1
+	ldxa	[%i1]ASI_AIUS, %o4
+	ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3)
+	stx	%o2, [%i0]
+	mov	%o4, %o2
+	subcc	%i3, 0x8, %i3
+	bgu,pt	%xcc, 3b
+	add	%i0, 0x8, %i0
+	ba	ci_dbdone
+	add	%i1, %g1, %i1
+
+ci_cpy_db:
+	ldxa	[%i1]ASI_AIUS, %o2
+	stx	%o2, [%i0]
+	add	%i1, 0x8, %i1
+	subcc	%i3, 0x8, %i3
+	bgu,pt	%xcc, ci_cpy_db
+	add	%i0, 0x8, %i0
+
+ci_dbdone:
+	tst	%i2
+	bz,pt	%xcc, copyin_exit
+	nop
+
+	! Copy the residue as byte copy
+ci_residue:
+	lduba	[%i1]ASI_AIUS, %i4
+	stb	%i4, [%i0]
+	inc	%i1
+	deccc	%i2
+	bgu	%xcc, ci_residue
+	inc	%i0
+
+copyin_exit:
+	membar	#Sync
+	ret
+	restore	%g0, 0, %o0
+END(copyin)
+
+#endif	/* lint */
+#endif
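The heart of the unaligned-source loops above (the ALIGN_DATA and
ALIGN_DATA_EW macros and their ci_* counterparts) is a shift-and-merge of
adjacent aligned doublewords. A minimal, self-contained C sketch of that
technique follows; it assumes a big-endian layout as on SPARC and a nonzero
source offset (the assembly falls through to a no-merge path when the
source is already doubleword aligned). The names merge_stream etc. are
illustrative, not taken from the file:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * merge_stream: given aligned doublewords src[0..n] and a source byte
     * offset off in 1..7, emit the n doublewords of the unaligned stream.
     * This mirrors what ALIGN_DATA does with sllx/srlx/or: the trailing
     * input of one merge (data3) carries over as the leading input
     * (data1) of the next.
     */
    static void
    merge_stream(const uint64_t *src, uint64_t *dst, size_t n, unsigned off)
    {
            unsigned lshift = off * 8, rshift = 64 - lshift;
            size_t i;

            for (i = 0; i < n; i++)
                    dst[i] = (src[i] << lshift) | (src[i + 1] >> rshift);
    }

    int
    main(void)
    {
            uint64_t src[2] = { 0x0011223344556677ULL, 0x8899aabbccddeeffULL };
            uint64_t out;

            merge_stream(src, &out, 1, 3);  /* offset 3: lshift 24, rshift 40 */
            printf("%016llx\n", (unsigned long long)out);
            /* prints 33445566778899aa: bytes 3..7 of src[0], 0..2 of src[1] */
            return (0);
    }

In the same hedged spirit, a sketch of the gating logic in hwblkclr(),
which punts to bzero() and returns 1 unless the region is 64-byte aligned,
at least 256 bytes, and a multiple of 64 bytes long (model_hwblkclr is a
hypothetical stand-in; the real routine uses ASI_LDSTBI_P block-init
stores, 256 bytes and then 64 bytes per iteration, rather than memset):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static int
    model_hwblkclr(void *addr, size_t len)
    {
            if (((uintptr_t)addr & 0x3f) != 0 ||    /* must be block-aligned */
                len < 0x100 ||                      /* ... at least 256 bytes */
                (len & 0x3f) != 0) {                /* ... a multiple of 64 */
                    memset(addr, 0, len);           /* punt path: call bzero */
                    return (1);                     /* block ops not used */
            }
            memset(addr, 0, len);                   /* block-store fast path */
            return (0);
    }

    int
    main(void)
    {
            char buf[256] __attribute__((aligned(64)));

            return (model_hwblkclr(buf, sizeof(buf)));
    }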