1 files changed, 225 insertions, 0 deletions
diff --git a/arch/alpha/lib/ev6-clear_user.S b/arch/alpha/lib/ev6-clear_user.S
new file mode 100644
index 0000000..4f42a16
--- /dev/null
+++ b/arch/alpha/lib/ev6-clear_user.S
@@ -0,0 +1,225 @@
+/*
+ * arch/alpha/lib/ev6-clear_user.S
+ * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
+ *
+ * Zero user space, handling exceptions as we go.
+ *
+ * We have to make sure that $0 is always up-to-date and contains the
+ * right "bytes left to zero" value (and that it is updated only _after_
+ * a successful store).  There is also some rather minor exception setup
+ * stuff.
+ *
+ * NOTE! This is not directly C-callable, because the calling semantics
+ * are different:
+ *
+ * Inputs:
+ *	length in $0
+ *	destination address in $6
+ *	exception pointer in $7
+ *	return address in $28 (exceptions expect it there)
+ *
+ * Outputs:
+ *	bytes left to zero in $0
+ *
+ * Clobbers:
+ *	$1,$2,$3,$4,$5,$6
+ *
+ * Much of the information about 21264 scheduling/coding comes from:
+ *	Compiler Writer's Guide for the Alpha 21264
+ *	abbreviated as 'CWG' in other comments here
+ *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
+ * Scheduling notation:
+ *	E	- either cluster
+ *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
+ *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
+ * Try not to change the actual algorithm if possible for consistency.
+ * Determining actual stalls (other than slotting) doesn't appear to be easy to do.
+ * From perusing the source code context where this routine is called, it is
+ * a fair assumption that significant fractions of entire pages are zeroed, so
+ * it's going to be worth the effort to hand-unroll a big loop, and use wh64.
+ * ASSUMPTION:
+ *	The believed purpose of only updating $0 after a store is that a signal
+ *	may come along during the execution of this chunk of code, and we don't
+ *	want to leave a hole (and we also want to avoid repeating lots of work)
+ */
+
+/* EX(insn): emit insn at local label 99 and add an __ex_table entry holding its location (99b - .) and an lda encoding $exception-99b; exit via $exception if it faults. */
+#define EX(x,y...)			\
+	99: x,##y;			\
+	.section __ex_table,"a";	\
+	.long 99b - .;			\
+	lda $31, $exception-99b($31); 	\
+	.previous
+
+	.set noat
+	.set noreorder
+	.align 4
+
+	.globl __do_clear_user
+	.ent __do_clear_user
+	.frame	$30, 0, $28
+	.prologue 0
+
+				# Pipeline info : Slotting & Comments
+__do_clear_user:
+	and	$6, 7, $4	# .. E  .. ..	: find dest head misalignment
+	beq	$0, $zerolength # U  .. .. ..	:  U L U L
+
+	addq	$0, $4, $1	# .. .. .. E	: bias counter
+	and	$1, 7, $2	# .. .. E  ..	: number of misaligned bytes in tail
+# Note - this tail count in $2 is never used ($2 is overwritten at
+# $headalign), so this is a moot computation and we can rewrite this later...
+	srl	$1, 3, $1	# .. E  .. ..	: number of quadwords to clear
+	beq	$4, $headalign	# U  .. .. ..	: U L U L
+
+/*
+ * Head is not aligned.  Write (8 - $4) bytes to head of destination
+ * This means $6 is known to be misaligned
+ */
+	EX( ldq_u $5, 0($6) )	# .. .. .. L	: load dst word to mask back in
+	beq	$1, $onebyte	# .. .. U  ..	: sub-word store?
+	mskql	$5, $6, $5	# .. U  .. ..	: take care of misaligned head
+	addq	$6, 8, $6	# E  .. .. .. 	: L U U L
+
+	EX( stq_u $5, -8($6) )	# .. .. .. L	:
+	subq	$1, 1, $1	# .. .. E  ..	:
+	addq	$0, $4, $0	# .. E  .. ..	: bytes left -= 8 - misalignment
+	subq	$0, 8, $0	# E  .. .. ..	: U L U L
+
+	.align	4
+/*
+ * (The .align directive ought to be a moot point)
+ * values upon initial entry to the loop
+ * $1 is number of quadwords to clear (zero is a valid value)
+ * $2 is number of trailing bytes (0..7) ($2 never used...)
+ * $6 is known to be aligned 0mod8
+ */
+$headalign:
+	subq	$1, 16, $4	# .. .. .. E	: If < 16, we can not use the huge loop
+	and	$6, 0x3f, $2	# .. .. E  ..	: $2 = dest offset within 64-byte block
+	subq	$2, 0x40, $3	# .. E  .. ..	: $3 = $2 - 64, counts up to 0 in $alignmod64
+	blt	$4, $trailquad	# U  .. .. ..	: U L U L
+
+/*
+ * We know that we're going to do at least 16 quads, which means we are
+ * going to be able to use the large block clear loop at least once.
+ * Figure out how many quads we need to clear before we are 0mod64 aligned
+ * so we can use the wh64 instruction.  Test $2 (offset in the 64-byte block):
+ * $3 = $2 - 64 lies in -64..-8 and can never be zero.
+ */
+	nop			# .. .. .. E
+	nop			# .. .. E  ..
+	nop			# .. E  .. ..
+	beq	$2, $bigalign	# U  .. .. ..	: U L U L : Aligned 0mod64
+
+$alignmod64:
+	EX( stq_u $31, 0($6) )	# .. .. .. L
+	addq	$3, 8, $3	# .. .. E  ..
+	subq	$0, 8, $0	# .. E  .. ..
+	nop			# E  .. .. ..	: U L U L
+
+	nop			# .. .. .. E
+	subq	$1, 1, $1	# .. .. E  ..
+	addq	$6, 8, $6	# .. E  .. ..
+	blt	$3, $alignmod64	# U  .. .. ..	: U L U L
+
+$bigalign:
+/*
+ * $0 is the number of bytes left
+ * $1 is the number of quads left
+ * $6 is aligned 0mod64
+ * we know that we'll be taking a minimum of one trip through
+ * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
+ * We are _not_ going to update $0 after every single store.  That
+ * would be silly, because there will be cross-cluster dependencies
+ * no matter how the code is scheduled.  By doing it in slightly
+ * staggered fashion, we can still do this loop in 5 fetches
+ * The worse case will be doing two extra quads in some future execution,
+ * in the event of an interrupted clear.
+ * Assumes the wh64 needs to be for 2 trips through the loop in the future
+ * The wh64 is issued on for the starting destination address for trip +2
+ * through the loop, and if there are less than two trips left, the target
+ * address will be for the current trip.
+ */
+	nop			# E :
+	nop			# E :
+	nop			# E :
+	bis	$6,$6,$3	# E : U L U L : Initial wh64 address is dest
+	/* This might actually help for the current trip... */
+
+$do_wh64:
+	wh64	($3)		# .. .. .. L1	: memory subsystem hint
+	subq	$1, 16, $4	# .. .. E  ..	: Forward calculation - repeat the loop?
+	EX( stq_u $31, 0($6) )	# .. L  .. ..
+	subq	$0, 8, $0	# E  .. .. ..	: U L U L
+
+	addq	$6, 128, $3	# E : Target address of wh64
+	EX( stq_u $31, 8($6) )	# L :
+	EX( stq_u $31, 16($6) )	# L :
+	subq	$0, 16, $0	# E : U L L U
+
+	nop			# E :
+	EX( stq_u $31, 24($6) )	# L :
+	EX( stq_u $31, 32($6) )	# L :
+	subq	$0, 168, $5	# E : U L L U : two trips through the loop left?
+	/* 168 = 192 - 24, since we've already completed some stores */
+
+	subq	$0, 16, $0	# E :
+	EX( stq_u $31, 40($6) )	# L :
+	EX( stq_u $31, 48($6) )	# L :
+	cmovlt	$5, $6, $3	# E : U L L U : Latency 2, extra mapping cycle
+
+	subq	$1, 8, $1	# E :
+	subq	$0, 16, $0	# E :
+	EX( stq_u $31, 56($6) )	# L :
+	nop			# E : U L U L
+
+	nop			# E :
+	subq	$0, 8, $0	# E :
+	addq	$6, 64, $6	# E :
+	bge	$4, $do_wh64	# U : U L U L
+
+$trailquad:
+	# zero to 15 quadwords left to store, plus any trailing bytes
+	# $1 is the number of quadwords left to go.
+	# $0 is the number of bytes left to go.
+	nop			# .. .. .. E
+	nop			# .. .. E  ..
+	nop			# .. E  .. ..
+	beq	$1, $trailbytes	# U  .. .. ..	: U L U L : Only 0..7 bytes to go
+
+$onequad:
+	EX( stq_u $31, 0($6) )	# .. .. .. L
+	subq	$1, 1, $1	# .. .. E  ..
+	subq	$0, 8, $0	# .. E  .. ..
+	nop			# E  .. .. ..	: U L U L
+
+	nop			# .. .. .. E
+	nop			# .. .. E  ..
+	addq	$6, 8, $6	# .. E  .. ..
+	bgt	$1, $onequad	# U  .. .. ..	: U L U L
+
+	# We have an unknown number of bytes left to go.
+$trailbytes:
+	nop			# .. .. .. E
+	nop			# .. .. E  ..
+	nop			# .. E  .. ..
+	beq	$0, $zerolength	# U  .. .. ..	: U L U L
+
+	# $0 contains the number of bytes left to zero (1..7)
+	# so we will use $0 as the loop counter
+	# We know for a fact that $0 > 0 due to previous context
+$onebyte:
+	EX( stb $31, 0($6) )	# .. .. .. L
+	subq	$0, 1, $0	# .. .. E  ..	:
+	addq	$6, 1, $6	# .. E  .. ..	:
+	bgt	$0, $onebyte	# U  .. .. ..	: U L U L
+
+$zerolength:
+$exception:			# EX() fixup target: return with $0 = bytes left to zero
+	nop			# .. .. .. E	:
+	nop			# .. .. E  ..	:
+	nop			# .. E  .. ..	:
+	ret	$31, ($28), 1	# L0 .. .. ..	: L U L U
+	.end __do_clear_user
+