33 files changed, 1475 insertions, 1102 deletions
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 8725d63..391f3ab 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -7,13 +7,27 @@
 lib-y		:= backtrace.o changebit.o csumipv6.o csumpartial.o   \
 		   csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
 		   copy_page.o delay.o findbit.o memchr.o memcpy.o    \
-		   memset.o memzero.o setbit.o strncpy_from_user.o    \
-		   strnlen_user.o strchr.o strrchr.o testchangebit.o  \
-		   testclearbit.o testsetbit.o uaccess.o getuser.o    \
-		   putuser.o ashldi3.o ashrdi3.o lshrdi3.o muldi3.o   \
-		   ucmpdi2.o lib1funcs.o div64.o	              \
+		   memmove.o memset.o memzero.o setbit.o              \
+		   strncpy_from_user.o strnlen_user.o                 \
+		   strchr.o strrchr.o                                 \
+		   testchangebit.o testclearbit.o testsetbit.o        \
+		   getuser.o putuser.o clear_user.o                   \
+		   ashldi3.o ashrdi3.o lshrdi3.o muldi3.o             \
+		   ucmpdi2.o lib1funcs.o div64.o sha1.o               \
 		   io-readsb.o io-writesb.o io-readsl.o io-writesl.o
 
+# the code in uaccess.S is not preemption safe and
+# probably faster on ARMv3 only
+ifeq ($CONFIG_PREEMPT,y)
+  lib-y	+= copy_from_user.o copy_to_user.o
+else
+ifneq ($(CONFIG_CPU_32v3),y)
+  lib-y	+= copy_from_user.o copy_to_user.o
+else
+  lib-y	+= uaccess.o
+endif
+endif
+
 ifeq ($(CONFIG_CPU_32v3),y)
   lib-y	+= io-readsw-armv3.o io-writesw-armv3.o
 else
diff --git a/arch/arm/lib/ashldi3.S b/arch/arm/lib/ashldi3.S
new file mode 100644
index 0000000..561e207
--- /dev/null
+++ b/arch/arm/lib/ashldi3.S
@@ -0,0 +1,48 @@
+/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+
+#include <linux/linkage.h>
+
+#ifdef __ARMEB__
+#define al r1
+#define ah r0
+#else
+#define al r0
+#define ah r1
+#endif
+
+ENTRY(__ashldi3)
+
+	subs	r3, r2, #32
+	rsb	ip, r2, #32
+	movmi	ah, ah, lsl r2
+	movpl	ah, al, lsl r3
+	orrmi	ah, ah, al, lsr ip
+	mov	al, al, lsl r2
+	mov	pc, lr
+
diff --git a/arch/arm/lib/ashldi3.c b/arch/arm/lib/ashldi3.c
deleted file mode 100644
index b62875c..0000000
--- a/arch/arm/lib/ashldi3.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/* More subroutines needed by GCC output code on some machines.  */
-/* Compile this one with gcc.  */
-/* Copyright (C) 1989, 92-98, 1999 Free Software Foundation, Inc.
-
-This file is part of GNU CC.
-
-GNU CC is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2, or (at your option)
-any later version.
-
-GNU CC is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with GNU CC; see the file COPYING.  If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
-
-/* As a special exception, if you link this library with other files,
-   some of which are compiled with GCC, to produce an executable,
-   this library does not by itself cause the resulting executable
-   to be covered by the GNU General Public License.
-   This exception does not however invalidate any other reasons why
-   the executable file might be covered by the GNU General Public License.
- */
-/* support functions required by the kernel. based on code from gcc-2.95.3 */
-/* I Molton	29/07/01 */
-
-#include "gcclib.h"
-
-s64 __ashldi3(s64 u, int b)
-{
-	DIunion w;
-	int bm;
-	DIunion uu;
-
-	if (b == 0)
-		return u;
-
-	uu.ll = u;
-
-	bm = (sizeof(s32) * BITS_PER_UNIT) - b;
-	if (bm <= 0) {
-		w.s.low = 0;
-		w.s.high = (u32) uu.s.low << -bm;
-	} else {
-		u32 carries = (u32) uu.s.low >> bm;
-		w.s.low = (u32) uu.s.low << b;
-		w.s.high = ((u32) uu.s.high << b) | carries;
-	}
-
-	return w.ll;
-}
diff --git a/arch/arm/lib/ashrdi3.S b/arch/arm/lib/ashrdi3.S
new file mode 100644
index 0000000..86fb2a9
--- /dev/null
+++ b/arch/arm/lib/ashrdi3.S
@@ -0,0 +1,48 @@
+/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+
+#include <linux/linkage.h>
+
+#ifdef __ARMEB__
+#define al r1
+#define ah r0
+#else
+#define al r0
+#define ah r1
+#endif
+
+ENTRY(__ashrdi3)
+
+	subs	r3, r2, #32
+	rsb	ip, r2, #32
+	movmi	al, al, lsr r2
+	movpl	al, ah, asr r3
+	orrmi	al, al, ah, lsl ip
+	mov	ah, ah, asr r2
+	mov	pc, lr
+
diff --git a/arch/arm/lib/ashrdi3.c b/arch/arm/lib/ashrdi3.c
deleted file mode 100644
index 9a8600a..0000000
--- a/arch/arm/lib/ashrdi3.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/* More subroutines needed by GCC output code on some machines.  */
-/* Compile this one with gcc.  */
-/* Copyright (C) 1989, 92-98, 1999 Free Software Foundation, Inc.
-
-This file is part of GNU CC.
-
-GNU CC is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2, or (at your option)
-any later version.
-
-GNU CC is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with GNU CC; see the file COPYING.  If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
-
-/* As a special exception, if you link this library with other files,
-   some of which are compiled with GCC, to produce an executable,
-   this library does not by itself cause the resulting executable
-   to be covered by the GNU General Public License.
-   This exception does not however invalidate any other reasons why
-   the executable file might be covered by the GNU General Public License.
- */
-/* support functions required by the kernel. based on code from gcc-2.95.3 */
-/* I Molton     29/07/01 */
-
-#include "gcclib.h"
-
-s64 __ashrdi3(s64 u, int b)
-{
-	DIunion w;
-	int bm;
-	DIunion uu;
-
-	if (b == 0)
-		return u;
-
-	uu.ll = u;
-
-	bm = (sizeof(s32) * BITS_PER_UNIT) - b;
-	if (bm <= 0) {
-		/* w.s.high = 1..1 or 0..0 */
-		w.s.high = uu.s.high >> (sizeof(s32) * BITS_PER_UNIT - 1);
-		w.s.low = uu.s.high >> -bm;
-	} else {
-		u32 carries = (u32) uu.s.high << bm;
-		w.s.high = uu.s.high >> b;
-		w.s.low = ((u32) uu.s.low >> b) | carries;
-	}
-
-	return w.ll;
-}
diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h
index 64a988c..b8c14e9 100644
--- a/arch/arm/lib/bitops.h
+++ b/arch/arm/lib/bitops.h
@@ -1,6 +1,6 @@
 #include <linux/config.h>
 
-#if __LINUX_ARM_ARCH__ >= 6 && defined(CONFIG_CPU_MPCORE)
+#if __LINUX_ARM_ARCH__ >= 6 && defined(CONFIG_CPU_32v6K)
 	.macro	bitop, instr
 	mov	r2, #1
 	and	r3, r0, #7		@ Get bit offset
@@ -34,7 +34,7 @@
 	and	r2, r0, #7
 	mov	r3, #1
 	mov	r3, r3, lsl r2
-	save_and_disable_irqs ip, r2
+	save_and_disable_irqs ip
 	ldrb	r2, [r1, r0, lsr #3]
 	\instr	r2, r2, r3
 	strb	r2, [r1, r0, lsr #3]
@@ -54,7 +54,7 @@
 	add	r1, r1, r0, lsr #3
 	and	r3, r0, #7
 	mov	r0, #1
-	save_and_disable_irqs ip, r2
+	save_and_disable_irqs ip
 	ldrb	r2, [r1]
 	tst	r2, r0, lsl r3
 	\instr	r2, r2, r0, lsl r3
diff --git a/arch/arm/lib/clear_user.S b/arch/arm/lib/clear_user.S
new file mode 100644
index 0000000..7ff9f83
--- /dev/null
+++ b/arch/arm/lib/clear_user.S
@@ -0,0 +1,52 @@
+/*
+ *  linux/arch/arm/lib/clear_user.S
+ *
+ *  Copyright (C) 1995, 1996,1997,1998 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+		.text
+
+/* Prototype: int __arch_clear_user(void *addr, size_t sz)
+ * Purpose  : clear some user memory
+ * Params   : addr - user memory address to clear
+ *          : sz   - number of bytes to clear
+ * Returns  : number of bytes NOT cleared
+ */
+ENTRY(__arch_clear_user)
+		stmfd	sp!, {r1, lr}
+		mov	r2, #0
+		cmp	r1, #4
+		blt	2f
+		ands	ip, r0, #3
+		beq	1f
+		cmp	ip, #2
+USER(		strbt	r2, [r0], #1)
+USER(		strlebt	r2, [r0], #1)
+USER(		strltbt	r2, [r0], #1)
+		rsb	ip, ip, #4
+		sub	r1, r1, ip		@  7  6  5  4  3  2  1
+1:		subs	r1, r1, #8		@ -1 -2 -3 -4 -5 -6 -7
+USER(		strplt	r2, [r0], #4)
+USER(		strplt	r2, [r0], #4)
+		bpl	1b
+		adds	r1, r1, #4		@  3  2  1  0 -1 -2 -3
+USER(		strplt	r2, [r0], #4)
+2:		tst	r1, #2			@ 1x 1x 0x 0x 1x 1x 0x
+USER(		strnebt	r2, [r0], #1)
+USER(		strnebt	r2, [r0], #1)
+		tst	r1, #1			@ x1 x0 x1 x0 x1 x0 x1
+USER(		strnebt	r2, [r0], #1)
+		mov	r0, #0
+		LOADREGS(fd,sp!, {r1, pc})
+
+		.section .fixup,"ax"
+		.align	0
+9001:		LOADREGS(fd,sp!, {r0, pc})
+		.previous
+
diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
new file mode 100644
index 0000000..7497393
--- /dev/null
+++ b/arch/arm/lib/copy_from_user.S
@@ -0,0 +1,101 @@
+/*
+ *  linux/arch/arm/lib/copy_from_user.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 29, 2005
+ *  Copyright:	MontaVista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Prototype:
+ *
+ *	size_t __arch_copy_from_user(void *to, const void *from, size_t n)
+ *
+ * Purpose:
+ *
+ *	copy a block to kernel memory from user memory
+ *
+ * Params:
+ *
+ *	to = kernel memory
+ *	from = user memory
+ *	n = number of bytes to copy
+ *
+ * Return value:
+ *
+ *	Number of bytes NOT copied.
+ */
+
+	.macro ldr1w ptr reg abort
+100:	ldrt \reg, [\ptr], #4
+	.section __ex_table, "a"
+	.long 100b, \abort
+	.previous
+	.endm
+
+	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	ldr1w \ptr, \reg1, \abort
+	ldr1w \ptr, \reg2, \abort
+	ldr1w \ptr, \reg3, \abort
+	ldr1w \ptr, \reg4, \abort
+	.endm
+
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	ldr4w \ptr, \reg1, \reg2, \reg3, \reg4, \abort
+	ldr4w \ptr, \reg5, \reg6, \reg7, \reg8, \abort
+	.endm
+
+	.macro ldr1b ptr reg cond=al abort
+100:	ldr\cond\()bt \reg, [\ptr], #1
+	.section __ex_table, "a"
+	.long 100b, \abort
+	.previous
+	.endm
+
+	.macro str1w ptr reg abort
+	str \reg, [\ptr], #4
+	.endm
+
+	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+	.endm
+
+	.macro str1b ptr reg cond=al abort
+	str\cond\()b \reg, [\ptr], #1
+	.endm
+
+	.macro enter reg1 reg2
+	mov	r3, #0
+	stmdb	sp!, {r0, r2, r3, \reg1, \reg2}
+	.endm
+
+	.macro exit reg1 reg2
+	add	sp, sp, #8
+	ldmfd	sp!, {r0, \reg1, \reg2}
+	.endm
+
+	.text
+
+ENTRY(__arch_copy_from_user)
+
+#include "copy_template.S"
+
+	.section .fixup,"ax"
+	.align 0
+	copy_abort_preamble
+	ldmfd	sp!, {r1, r2}
+	sub	r3, r0, r1
+	rsb	r1, r3, r2
+	str	r1, [sp]
+	bl	__memzero
+	ldr	r0, [sp], #4
+	copy_abort_end
+	.previous
+
diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
new file mode 100644
index 0000000..838e435
--- /dev/null
+++ b/arch/arm/lib/copy_template.S
@@ -0,0 +1,255 @@
+/*
+ *  linux/arch/arm/lib/copy_template.s
+ *
+ *  Code template for optimized memory copy functions
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 28, 2005
+ *  Copyright:	MontaVista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+/*
+ * This can be used to enable code to cacheline align the source pointer.
+ * Experiments on tested architectures (StrongARM and XScale) didn't show
+ * this a worthwhile thing to do.  That might be different in the future.
+ */
+//#define CALGN(code...)	code
+#define CALGN(code...)
+
+/*
+ * Theory of operation
+ * -------------------
+ *
+ * This file provides the core code for a forward memory copy used in
+ * the implementation of memcopy(), copy_to_user() and copy_from_user().
+ *
+ * The including file must define the following accessor macros
+ * according to the need of the given function:
+ *
+ * ldr1w ptr reg abort
+ *
+ *	This loads one word from 'ptr', stores it in 'reg' and increments
+ *	'ptr' to the next word. The 'abort' argument is used for fixup tables.
+ *
+ * ldr4w ptr reg1 reg2 reg3 reg4 abort
+ * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+ *
+ *	This loads four or eight words starting from 'ptr', stores them
+ *	in provided registers and increments 'ptr' past those words.
+ *	The'abort' argument is used for fixup tables.
+ *
+ * ldr1b ptr reg cond abort
+ *
+ *	Similar to ldr1w, but it loads a byte and increments 'ptr' one byte.
+ *	It also must apply the condition code if provided, otherwise the
+ *	"al" condition is assumed by default.
+ *
+ * str1w ptr reg abort
+ * str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+ * str1b ptr reg cond abort
+ *
+ *	Same as their ldr* counterparts, but data is stored to 'ptr' location
+ *	rather than being loaded.
+ *
+ * enter reg1 reg2
+ *
+ *	Preserve the provided registers on the stack plus any additional
+ *	data as needed by the implementation including this code. Called
+ *	upon code entry.
+ *
+ * exit reg1 reg2
+ *
+ *	Restore registers with the values previously saved with the
+ *	'preserv' macro. Called upon code termination.
+ */
+
+
+		enter	r4, lr
+
+		subs	r2, r2, #4
+		blt	8f
+		ands	ip, r0, #3
+	PLD(	pld	[r1, #0]		)
+		bne	9f
+		ands	ip, r1, #3
+		bne	10f
+
+1:		subs	r2, r2, #(28)
+		stmfd	sp!, {r5 - r8}
+		blt	5f
+
+	CALGN(	ands	ip, r1, #31		)
+	CALGN(	rsb	r3, ip, #32		)
+	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
+	CALGN(	bcs	2f			)
+	CALGN(	adr	r4, 6f			)
+	CALGN(	subs	r2, r2, r3		)  @ C gets set
+	CALGN(	add	pc, r4, ip		)
+
+	PLD(	pld	[r1, #0]		)
+2:	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #28]		)
+	PLD(	blt	4f			)
+	PLD(	pld	[r1, #60]		)
+	PLD(	pld	[r1, #92]		)
+
+3:	PLD(	pld	[r1, #124]		)
+4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		subs	r2, r2, #32
+		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		bge	3b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	4b			)
+
+5:		ands	ip, r2, #28
+		rsb	ip, ip, #32
+		addne	pc, pc, ip		@ C is always clear here
+		b	7f
+6:		nop
+		ldr1w	r1, r3, abort=20f
+		ldr1w	r1, r4, abort=20f
+		ldr1w	r1, r5, abort=20f
+		ldr1w	r1, r6, abort=20f
+		ldr1w	r1, r7, abort=20f
+		ldr1w	r1, r8, abort=20f
+		ldr1w	r1, lr, abort=20f
+
+		add	pc, pc, ip
+		nop
+		nop
+		str1w	r0, r3, abort=20f
+		str1w	r0, r4, abort=20f
+		str1w	r0, r5, abort=20f
+		str1w	r0, r6, abort=20f
+		str1w	r0, r7, abort=20f
+		str1w	r0, r8, abort=20f
+		str1w	r0, lr, abort=20f
+
+	CALGN(	bcs	2b			)
+
+7:		ldmfd	sp!, {r5 - r8}
+
+8:		movs	r2, r2, lsl #31
+		ldr1b	r1, r3, ne, abort=21f
+		ldr1b	r1, r4, cs, abort=21f
+		ldr1b	r1, ip, cs, abort=21f
+		str1b	r0, r3, ne, abort=21f
+		str1b	r0, r4, cs, abort=21f
+		str1b	r0, ip, cs, abort=21f
+
+		exit	r4, pc
+
+9:		rsb	ip, ip, #4
+		cmp	ip, #2
+		ldr1b	r1, r3, gt, abort=21f
+		ldr1b	r1, r4, ge, abort=21f
+		ldr1b	r1, lr, abort=21f
+		str1b	r0, r3, gt, abort=21f
+		str1b	r0, r4, ge, abort=21f
+		subs	r2, r2, ip
+		str1b	r0, lr, abort=21f
+		blt	8b
+		ands	ip, r1, #3
+		beq	1b
+
+10:		bic	r1, r1, #3
+		cmp	ip, #2
+		ldr1w	r1, lr, abort=21f
+		beq	17f
+		bgt	18f
+
+
+		.macro	forward_copy_shift pull push
+
+		subs	r2, r2, #28
+		blt	14f
+
+	CALGN(	ands	ip, r1, #31		)
+	CALGN(	rsb	ip, ip, #32		)
+	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+	CALGN(	subcc	r2, r2, ip		)
+	CALGN(	bcc	15f			)
+
+11:		stmfd	sp!, {r5 - r9}
+
+	PLD(	pld	[r1, #0]		)
+	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #28]		)
+	PLD(	blt	13f			)
+	PLD(	pld	[r1, #60]		)
+	PLD(	pld	[r1, #92]		)
+
+12:	PLD(	pld	[r1, #124]		)
+13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
+		mov	r3, lr, pull #\pull
+		subs	r2, r2, #32
+		ldr4w	r1, r8, r9, ip, lr, abort=19f
+		orr	r3, r3, r4, push #\push
+		mov	r4, r4, pull #\pull
+		orr	r4, r4, r5, push #\push
+		mov	r5, r5, pull #\pull
+		orr	r5, r5, r6, push #\push
+		mov	r6, r6, pull #\pull
+		orr	r6, r6, r7, push #\push
+		mov	r7, r7, pull #\pull
+		orr	r7, r7, r8, push #\push
+		mov	r8, r8, pull #\pull
+		orr	r8, r8, r9, push #\push
+		mov	r9, r9, pull #\pull
+		orr	r9, r9, ip, push #\push
+		mov	ip, ip, pull #\pull
+		orr	ip, ip, lr, push #\push
+		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
+		bge	12b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	13b			)
+
+		ldmfd	sp!, {r5 - r9}
+
+14:		ands	ip, r2, #28
+		beq	16f
+
+15:		mov	r3, lr, pull #\pull
+		ldr1w	r1, lr, abort=21f
+		subs	ip, ip, #4
+		orr	r3, r3, lr, push #\push
+		str1w	r0, r3, abort=21f
+		bgt	15b
+	CALGN(	cmp	r2, #0			)
+	CALGN(	bge	11b			)
+
+16:		sub	r1, r1, #(\push / 8)
+		b	8b
+
+		.endm
+
+
+		forward_copy_shift	pull=8	push=24
+
+17:		forward_copy_shift	pull=16	push=16
+
+18:		forward_copy_shift	pull=24	push=8
+
+
+/*
+ * Abort preanble and completion macros.
+ * If a fixup handler is required then those macros must surround it.
+ * It is assumed that the fixup code will handle the private part of
+ * the exit macro.
+ */
+
+	.macro	copy_abort_preamble
+19:	ldmfd	sp!, {r5 - r9}
+	b	21f
+20:	ldmfd	sp!, {r5 - r8}
+21:
+	.endm
+
+	.macro	copy_abort_end
+	ldmfd	sp!, {r4, pc}
+	.endm
+
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
new file mode 100644
index 0000000..4a6d8ea
--- /dev/null
+++ b/arch/arm/lib/copy_to_user.S
@@ -0,0 +1,101 @@
+/*
+ *  linux/arch/arm/lib/copy_to_user.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 29, 2005
+ *  Copyright:	MontaVista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Prototype:
+ *
+ *	size_t __arch_copy_to_user(void *to, const void *from, size_t n)
+ *
+ * Purpose:
+ *
+ *	copy a block to user memory from kernel memory
+ *
+ * Params:
+ *
+ *	to = user memory
+ *	from = kernel memory
+ *	n = number of bytes to copy
+ *
+ * Return value:
+ *
+ *	Number of bytes NOT copied.
+ */
+
+	.macro ldr1w ptr reg abort
+	ldr \reg, [\ptr], #4
+	.endm
+
+	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
+	.endm
+
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+	.endm
+
+	.macro ldr1b ptr reg cond=al abort
+	ldr\cond\()b \reg, [\ptr], #1
+	.endm
+
+	.macro str1w ptr reg abort
+100:	strt \reg, [\ptr], #4
+	.section __ex_table, "a"
+	.long 100b, \abort
+	.previous
+	.endm
+
+	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	str1w \ptr, \reg1, \abort
+	str1w \ptr, \reg2, \abort
+	str1w \ptr, \reg3, \abort
+	str1w \ptr, \reg4, \abort
+	str1w \ptr, \reg5, \abort
+	str1w \ptr, \reg6, \abort
+	str1w \ptr, \reg7, \abort
+	str1w \ptr, \reg8, \abort
+	.endm
+
+	.macro str1b ptr reg cond=al abort
+100:	str\cond\()bt \reg, [\ptr], #1
+	.section __ex_table, "a"
+	.long 100b, \abort
+	.previous
+	.endm
+
+	.macro enter reg1 reg2
+	mov	r3, #0
+	stmdb	sp!, {r0, r2, r3, \reg1, \reg2}
+	.endm
+
+	.macro exit reg1 reg2
+	add	sp, sp, #8
+	ldmfd	sp!, {r0, \reg1, \reg2}
+	.endm
+
+	.text
+
+ENTRY(__arch_copy_to_user)
+
+#include "copy_template.S"
+
+	.section .fixup,"ax"
+	.align 0
+	copy_abort_preamble
+	ldmfd	sp!, {r1, r2, r3}
+	sub	r0, r0, r1
+	rsb	r0, r0, r2
+	copy_abort_end
+	.previous
+
diff --git a/arch/arm/lib/csumpartial.S b/arch/arm/lib/csumpartial.S
index cb5e370..a78dae5 100644
--- a/arch/arm/lib/csumpartial.S
+++ b/arch/arm/lib/csumpartial.S
@@ -26,7 +26,7 @@ td1	.req	r4	@ save before use
 td2	.req	r5	@ save before use
 td3	.req	lr
 
-.zero:		mov	r0, sum
+.Lzero:		mov	r0, sum
 		add	sp, sp, #4
 		ldr	pc, [sp], #4
 
@@ -34,21 +34,22 @@ td3	.req	lr
 		 * Handle 0 to 7 bytes, with any alignment of source and
 		 * destination pointers.  Note that when we get here, C = 0
 		 */
-.less8:		teq	len, #0			@ check for zero count
-		beq	.zero
+.Lless8:		teq	len, #0			@ check for zero count
+		beq	.Lzero
 
 		/* we must have at least one byte. */
 		tst	buf, #1			@ odd address?
+		movne	sum, sum, ror #8
 		ldrneb	td0, [buf], #1
 		subne	len, len, #1
 		adcnes	sum, sum, td0, put_byte_1
 
-.less4:		tst	len, #6
-		beq	.less8_byte
+.Lless4:		tst	len, #6
+		beq	.Lless8_byte
 
 		/* we are now half-word aligned */
 
-.less8_wordlp:
+.Lless8_wordlp:
 #if __LINUX_ARM_ARCH__ >= 4
 		ldrh	td0, [buf], #2
 		sub	len, len, #2
@@ -64,19 +65,19 @@ td3	.req	lr
 #endif
 		adcs	sum, sum, td0
 		tst	len, #6
-		bne	.less8_wordlp
+		bne	.Lless8_wordlp
 
-.less8_byte:	tst	len, #1			@ odd number of bytes
+.Lless8_byte:	tst	len, #1			@ odd number of bytes
 		ldrneb	td0, [buf], #1		@ include last byte
 		adcnes	sum, sum, td0, put_byte_0	@ update checksum
 
-.done:		adc	r0, sum, #0		@ collect up the last carry
+.Ldone:		adc	r0, sum, #0		@ collect up the last carry
 		ldr	td0, [sp], #4
 		tst	td0, #1			@ check buffer alignment
 		movne	r0, r0, ror #8		@ rotate checksum by 8 bits
 		ldr	pc, [sp], #4		@ return
 
-.not_aligned:	tst	buf, #1			@ odd address
+.Lnot_aligned:	tst	buf, #1			@ odd address
 		ldrneb	td0, [buf], #1		@ make even
 		subne	len, len, #1
 		adcnes	sum, sum, td0, put_byte_1	@ update checksum
@@ -101,11 +102,14 @@ td3	.req	lr
 ENTRY(csum_partial)
 		stmfd	sp!, {buf, lr}
 		cmp	len, #8			@ Ensure that we have at least
-		blo	.less8			@ 8 bytes to copy.
+		blo	.Lless8			@ 8 bytes to copy.
+
+		tst	buf, #1
+		movne	sum, sum, ror #8
 
 		adds	sum, sum, #0		@ C = 0
 		tst	buf, #3			@ Test destination alignment
-		blne	.not_aligned		@ aligh destination, return here
+		blne	.Lnot_aligned		@ align destination, return here
 
 1:		bics	ip, len, #31
 		beq	3f
@@ -127,11 +131,11 @@ ENTRY(csum_partial)
 		ldmfd	sp!, {r4 - r5}
 
 3:		tst	len, #0x1c		@ should not change C
-		beq	.less4
+		beq	.Lless4
 
 4:		ldr	td0, [buf], #4
 		sub	len, len, #4
 		adcs	sum, sum, td0
 		tst	len, #0x1c
 		bne	4b
-		b	.less4
+		b	.Lless4
diff --git a/arch/arm/lib/csumpartialcopygeneric.S b/arch/arm/lib/csumpartialcopygeneric.S
index d3a2f46..4a4609c 100644
--- a/arch/arm/lib/csumpartialcopygeneric.S
+++ b/arch/arm/lib/csumpartialcopygeneric.S
@@ -22,7 +22,7 @@ dst	.req	r1
 len	.req	r2
 sum	.req	r3
 
-.zero:		mov	r0, sum
+.Lzero:		mov	r0, sum
 		load_regs	ea
 
 		/*
@@ -31,8 +31,9 @@ sum	.req	r3
 		 * the length.  Note that the source pointer hasn't been
 		 * aligned yet.
 		 */
-.dst_unaligned:	tst	dst, #1
-		beq	.dst_16bit
+.Ldst_unaligned:
+		tst	dst, #1
+		beq	.Ldst_16bit
 
 		load1b	ip
 		sub	len, len, #1
@@ -41,7 +42,7 @@ sum	.req	r3
 		tst	dst, #2
 		moveq	pc, lr			@ dst is now 32bit aligned
 
-.dst_16bit:	load2b	r8, ip
+.Ldst_16bit:	load2b	r8, ip
 		sub	len, len, #2
 		adcs	sum, sum, r8, put_byte_0
 		strb	r8, [dst], #1
@@ -53,12 +54,12 @@ sum	.req	r3
 		 * Handle 0 to 7 bytes, with any alignment of source and
 		 * destination pointers.  Note that when we get here, C = 0
 		 */
-.less8:		teq	len, #0			@ check for zero count
-		beq	.zero
+.Lless8:	teq	len, #0			@ check for zero count
+		beq	.Lzero
 
 		/* we must have at least one byte. */
 		tst	dst, #1			@ dst 16-bit aligned
-		beq	.less8_aligned
+		beq	.Lless8_aligned
 
 		/* Align dst */
 		load1b	ip
@@ -66,7 +67,7 @@ sum	.req	r3
 		adcs	sum, sum, ip, put_byte_1	@ update checksum
 		strb	ip, [dst], #1
 		tst	len, #6
-		beq	.less8_byteonly
+		beq	.Lless8_byteonly
 
 1:		load2b	r8, ip
 		sub	len, len, #2
@@ -74,15 +75,16 @@ sum	.req	r3
 		strb	r8, [dst], #1
 		adcs	sum, sum, ip, put_byte_1
 		strb	ip, [dst], #1
-.less8_aligned:	tst	len, #6
+.Lless8_aligned:
+		tst	len, #6
 		bne	1b
-.less8_byteonly:
+.Lless8_byteonly:
 		tst	len, #1
-		beq	.done
+		beq	.Ldone
 		load1b	r8
 		adcs	sum, sum, r8, put_byte_0	@ update checksum
 		strb	r8, [dst], #1
-		b	.done
+		b	.Ldone
 
 FN_ENTRY
 		mov	ip, sp
@@ -90,11 +92,11 @@ FN_ENTRY
 		sub	fp, ip, #4
 
 		cmp	len, #8			@ Ensure that we have at least
-		blo	.less8			@ 8 bytes to copy.
+		blo	.Lless8			@ 8 bytes to copy.
 
 		adds	sum, sum, #0		@ C = 0
 		tst	dst, #3			@ Test destination alignment
-		blne	.dst_unaligned		@ align destination, return here
+		blne	.Ldst_unaligned		@ align destination, return here
 
 		/*
 		 * Ok, the dst pointer is now 32bit aligned, and we know
@@ -103,7 +105,7 @@ FN_ENTRY
 		 */
 
 		tst	src, #3			@ Test source alignment
-		bne	.src_not_aligned
+		bne	.Lsrc_not_aligned
 
 		/* Routine for src & dst aligned */
 
@@ -136,17 +138,17 @@ FN_ENTRY
 		adcs	sum, sum, r4
 
 4:		ands	len, len, #3
-		beq	.done
+		beq	.Ldone
 		load1l	r4
 		tst	len, #2
 		mov	r5, r4, get_byte_0
-		beq	.exit
+		beq	.Lexit
 		adcs	sum, sum, r4, push #16
 		strb	r5, [dst], #1
 		mov	r5, r4, get_byte_1
 		strb	r5, [dst], #1
 		mov	r5, r4, get_byte_2
-.exit:		tst	len, #1
+.Lexit:		tst	len, #1
 		strneb	r5, [dst], #1
 		andne	r5, r5, #255
 		adcnes	sum, sum, r5, put_byte_0
@@ -157,20 +159,20 @@ FN_ENTRY
 		 * the inefficient byte manipulations in the
 		 * architecture independent code.
 		 */
-.done:		adc	r0, sum, #0
+.Ldone:		adc	r0, sum, #0
 		ldr	sum, [sp, #0]		@ dst
 		tst	sum, #1
 		movne	r0, r0, ror #8
 		load_regs	ea
 
-.src_not_aligned:
+.Lsrc_not_aligned:
 		adc	sum, sum, #0		@ include C from dst alignment
 		and	ip, src, #3
 		bic	src, src, #3
 		load1l	r5
 		cmp	ip, #2
-		beq	.src2_aligned
-		bhi	.src3_aligned
+		beq	.Lsrc2_aligned
+		bhi	.Lsrc3_aligned
 		mov	r4, r5, pull #8		@ C = 0
 		bics	ip, len, #15
 		beq	2f
@@ -211,18 +213,18 @@ FN_ENTRY
 		adcs	sum, sum, r4
 		mov	r4, r5, pull #8
 4:		ands	len, len, #3
-		beq	.done
+		beq	.Ldone
 		mov	r5, r4, get_byte_0
 		tst	len, #2
-		beq	.exit
+		beq	.Lexit
 		adcs	sum, sum, r4, push #16
 		strb	r5, [dst], #1
 		mov	r5, r4, get_byte_1
 		strb	r5, [dst], #1
 		mov	r5, r4, get_byte_2
-		b	.exit
+		b	.Lexit
 
-.src2_aligned:	mov	r4, r5, pull #16
+.Lsrc2_aligned:	mov	r4, r5, pull #16
 		adds	sum, sum, #0
 		bics	ip, len, #15
 		beq	2f
@@ -263,20 +265,20 @@ FN_ENTRY
 		adcs	sum, sum, r4
 		mov	r4, r5, pull #16
 4:		ands	len, len, #3
-		beq	.done
+		beq	.Ldone
 		mov	r5, r4, get_byte_0
 		tst	len, #2
-		beq	.exit
+		beq	.Lexit
 		adcs	sum, sum, r4
 		strb	r5, [dst], #1
 		mov	r5, r4, get_byte_1
 		strb	r5, [dst], #1
 		tst	len, #1
-		beq	.done
+		beq	.Ldone
 		load1b	r5
-		b	.exit
+		b	.Lexit
 
-.src3_aligned:	mov	r4, r5, pull #24
+.Lsrc3_aligned:	mov	r4, r5, pull #24
 		adds	sum, sum, #0
 		bics	ip, len, #15
 		beq	2f
@@ -317,10 +319,10 @@ FN_ENTRY
 		adcs	sum, sum, r4
 		mov	r4, r5, pull #24
 4:		ands	len, len, #3
-		beq	.done
+		beq	.Ldone
 		mov	r5, r4, get_byte_0
 		tst	len, #2
-		beq	.exit
+		beq	.Lexit
 		strb	r5, [dst], #1
 		adcs	sum, sum, r4
 		load1l	r4
@@ -328,4 +330,4 @@ FN_ENTRY
 		strb	r5, [dst], #1
 		adcs	sum, sum, r4, push #24
 		mov	r5, r4, get_byte_1
-		b	.exit
+		b	.Lexit
diff --git a/arch/arm/lib/delay.S b/arch/arm/lib/delay.S
index 3c7f7e6..b3fb475 100644
--- a/arch/arm/lib/delay.S
+++ b/arch/arm/lib/delay.S
@@ -11,7 +11,7 @@
 #include <asm/assembler.h>
 		.text
 
-LC0:		.word	loops_per_jiffy
+.LC0:		.word	loops_per_jiffy
 
 /*
  * 0 <= r0 <= 2000
@@ -21,7 +21,7 @@ ENTRY(__udelay)
 		orr	r2, r2, #0x00db
 		mul	r0, r2, r0
 ENTRY(__const_udelay)				@ 0 <= r0 <= 0x01ffffff
-		ldr	r2, LC0
+		ldr	r2, .LC0
 		ldr	r2, [r2]		@ max = 0x0fffffff
 		mov	r0, r0, lsr #11		@ max = 0x00003fff
 		mov	r2, r2, lsr #11		@ max = 0x0003ffff
diff --git a/arch/arm/lib/findbit.S b/arch/arm/lib/findbit.S
index f055d56..6f8e27a 100644
--- a/arch/arm/lib/findbit.S
+++ b/arch/arm/lib/findbit.S
@@ -27,7 +27,7 @@ ENTRY(_find_first_zero_bit_le)
 		mov	r2, #0
 1:		ldrb	r3, [r0, r2, lsr #3]
 		eors	r3, r3, #0xff		@ invert bits
-		bne	.found			@ any now set - found zero bit
+		bne	.L_found		@ any now set - found zero bit
 		add	r2, r2, #8		@ next bit pointer
 2:		cmp	r2, r1			@ any more?
 		blo	1b
@@ -46,7 +46,7 @@ ENTRY(_find_next_zero_bit_le)
 		ldrb	r3, [r0, r2, lsr #3]
 		eor	r3, r3, #0xff		@ now looking for a 1 bit
 		movs	r3, r3, lsr ip		@ shift off unused bits
-		bne	.found
+		bne	.L_found
 		orr	r2, r2, #7		@ if zero, then no bits here
 		add	r2, r2, #1		@ align bit pointer
 		b	2b			@ loop for next bit
@@ -61,7 +61,7 @@ ENTRY(_find_first_bit_le)
 		mov	r2, #0
 1:		ldrb	r3, [r0, r2, lsr #3]
 		movs	r3, r3
-		bne	.found			@ any now set - found zero bit
+		bne	.L_found		@ any now set - found zero bit
 		add	r2, r2, #8		@ next bit pointer
 2:		cmp	r2, r1			@ any more?
 		blo	1b
@@ -79,7 +79,7 @@ ENTRY(_find_next_bit_le)
 		beq	1b			@ If new byte, goto old routine
 		ldrb	r3, [r0, r2, lsr #3]
 		movs	r3, r3, lsr ip		@ shift off unused bits
-		bne	.found
+		bne	.L_found
 		orr	r2, r2, #7		@ if zero, then no bits here
 		add	r2, r2, #1		@ align bit pointer
 		b	2b			@ loop for next bit
@@ -93,7 +93,7 @@ ENTRY(_find_first_zero_bit_be)
 1:		eor	r3, r2, #0x18		@ big endian byte ordering
 		ldrb	r3, [r0, r3, lsr #3]
 		eors	r3, r3, #0xff		@ invert bits
-		bne	.found			@ any now set - found zero bit
+		bne	.L_found		@ any now set - found zero bit
 		add	r2, r2, #8		@ next bit pointer
 2:		cmp	r2, r1			@ any more?
 		blo	1b
@@ -109,7 +109,7 @@ ENTRY(_find_next_zero_bit_be)
 		ldrb	r3, [r0, r3, lsr #3]
 		eor	r3, r3, #0xff		@ now looking for a 1 bit
 		movs	r3, r3, lsr ip		@ shift off unused bits
-		bne	.found
+		bne	.L_found
 		orr	r2, r2, #7		@ if zero, then no bits here
 		add	r2, r2, #1		@ align bit pointer
 		b	2b			@ loop for next bit
@@ -121,7 +121,7 @@ ENTRY(_find_first_bit_be)
 1:		eor	r3, r2, #0x18		@ big endian byte ordering
 		ldrb	r3, [r0, r3, lsr #3]
 		movs	r3, r3
-		bne	.found			@ any now set - found zero bit
+		bne	.L_found		@ any now set - found zero bit
 		add	r2, r2, #8		@ next bit pointer
 2:		cmp	r2, r1			@ any more?
 		blo	1b
@@ -136,7 +136,7 @@ ENTRY(_find_next_bit_be)
 		eor	r3, r2, #0x18		@ big endian byte ordering
 		ldrb	r3, [r0, r3, lsr #3]
 		movs	r3, r3, lsr ip		@ shift off unused bits
-		bne	.found
+		bne	.L_found
 		orr	r2, r2, #7		@ if zero, then no bits here
 		add	r2, r2, #1		@ align bit pointer
 		b	2b			@ loop for next bit
@@ -146,7 +146,7 @@ ENTRY(_find_next_bit_be)
 /*
  * One or more bits in the LSB of r3 are assumed to be set.
  */
-.found:
+.L_found:
 #if __LINUX_ARM_ARCH__ >= 5
 		rsb	r1, r3, #0
 		and	r3, r3, r1
diff --git a/arch/arm/lib/gcclib.h b/arch/arm/lib/gcclib.h
deleted file mode 100644
index 8b6dcc6..0000000
--- a/arch/arm/lib/gcclib.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* gcclib.h -- definitions for various functions 'borrowed' from gcc-2.95.3 */
-/* I Molton     29/07/01 */
-
-#include <linux/types.h>
-
-#define BITS_PER_UNIT	8
-#define SI_TYPE_SIZE	(sizeof(s32) * BITS_PER_UNIT)
-
-#ifdef __ARMEB__
-struct DIstruct {
-	s32 high, low;
-};
-#else
-struct DIstruct {
-	s32 low, high;
-};
-#endif
-
-typedef union {
-	struct DIstruct s;
-	s64 ll;
-} DIunion;
diff --git a/arch/arm/lib/getuser.S b/arch/arm/lib/getuser.S
index d204018..c03ea8e 100644
--- a/arch/arm/lib/getuser.S
+++ b/arch/arm/lib/getuser.S
@@ -54,15 +54,6 @@ __get_user_4:
 	mov	r0, #0
 	mov	pc, lr
 
-	.global	__get_user_8
-__get_user_8:
-5:	ldrt	r2, [r0], #4
-6:	ldrt	r3, [r0]
-	mov	r0, #0
-	mov	pc, lr
-
-__get_user_bad_8:
-	mov	r3, #0
 __get_user_bad:
 	mov	r2, #0
 	mov	r0, #-EFAULT
@@ -73,6 +64,4 @@ __get_user_bad:
 	.long	2b, __get_user_bad
 	.long	3b, __get_user_bad
 	.long	4b, __get_user_bad
-	.long	5b, __get_user_bad_8
-	.long	6b, __get_user_bad_8
 .previous
diff --git a/arch/arm/lib/io-acorn.S b/arch/arm/lib/io-acorn.S
index 3aacd01..b153523 100644
--- a/arch/arm/lib/io-acorn.S
+++ b/arch/arm/lib/io-acorn.S
@@ -17,7 +17,7 @@
 		.text
 		.align
 
-.iosl_warning:
+.Liosl_warning:
 		.ascii	"<4>insl/outsl not implemented, called from %08lX\0"
 		.align
 
@@ -27,6 +27,6 @@
  */
 ENTRY(insl)
 ENTRY(outsl)
-		adr	r0, .iosl_warning
+		adr	r0, .Liosl_warning
 		mov	r1, lr
 		b	printk
diff --git a/arch/arm/lib/io-readsb.S b/arch/arm/lib/io-readsb.S
index 081ef74..d3d8de7 100644
--- a/arch/arm/lib/io-readsb.S
+++ b/arch/arm/lib/io-readsb.S
@@ -10,7 +10,7 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-.insb_align:	rsb	ip, ip, #4
+.Linsb_align:	rsb	ip, ip, #4
 		cmp	ip, r2
 		movgt	ip, r2
 		cmp	ip, #2
@@ -21,20 +21,20 @@
 		ldrgtb	r3, [r0]
 		strgtb	r3, [r1], #1
 		subs	r2, r2, ip
-		bne	.insb_aligned
+		bne	.Linsb_aligned
 
 ENTRY(__raw_readsb)
 		teq	r2, #0		@ do we have to check for the zero len?
 		moveq	pc, lr
 		ands	ip, r1, #3
-		bne	.insb_align
+		bne	.Linsb_align
 
-.insb_aligned:	stmfd	sp!, {r4 - r6, lr}
+.Linsb_aligned:	stmfd	sp!, {r4 - r6, lr}
 
 		subs	r2, r2, #16
-		bmi	.insb_no_16
+		bmi	.Linsb_no_16
 
-.insb_16_lp:	ldrb	r3, [r0]
+.Linsb_16_lp:	ldrb	r3, [r0]
 		ldrb	r4, [r0]
 		ldrb	r5, [r0]
 		mov	r3, r3,     put_byte_0
@@ -69,13 +69,13 @@ ENTRY(__raw_readsb)
 		stmia	r1!, {r3 - r6}
 
 		subs	r2, r2, #16
-		bpl	.insb_16_lp
+		bpl	.Linsb_16_lp
 
 		tst	r2, #15
 		LOADREGS(eqfd, sp!, {r4 - r6, pc})
 
-.insb_no_16:	tst	r2, #8
-		beq	.insb_no_8
+.Linsb_no_16:	tst	r2, #8
+		beq	.Linsb_no_8
 
 		ldrb	r3, [r0]
 		ldrb	r4, [r0]
@@ -95,8 +95,8 @@ ENTRY(__raw_readsb)
 		orr	r4, r4, ip, put_byte_3
 		stmia	r1!, {r3, r4}
 
-.insb_no_8:	tst	r2, #4
-		beq	.insb_no_4
+.Linsb_no_8:	tst	r2, #4
+		beq	.Linsb_no_4
 
 		ldrb	r3, [r0]
 		ldrb	r4, [r0]
@@ -108,7 +108,7 @@ ENTRY(__raw_readsb)
 		orr	r3, r3, r6, put_byte_3
 		str	r3, [r1], #4
 
-.insb_no_4:	ands	r2, r2, #3
+.Linsb_no_4:	ands	r2, r2, #3
 		LOADREGS(eqfd, sp!, {r4 - r6, pc})
 
 		cmp	r2, #2
diff --git a/arch/arm/lib/io-readsw-armv3.S b/arch/arm/lib/io-readsw-armv3.S
index 476cf7f..146d47c 100644
--- a/arch/arm/lib/io-readsw-armv3.S
+++ b/arch/arm/lib/io-readsw-armv3.S
@@ -11,16 +11,16 @@
 #include <asm/assembler.h>
 #include <asm/hardware.h>
 
-.insw_bad_alignment:
-		adr	r0, .insw_bad_align_msg
+.Linsw_bad_alignment:
+		adr	r0, .Linsw_bad_align_msg
 		mov	r2, lr
 		b	panic
-.insw_bad_align_msg:
+.Linsw_bad_align_msg:
 		.asciz	"insw: bad buffer alignment (0x%p, lr=0x%08lX)\n"
 		.align
 
-.insw_align:	tst	r1, #1
-		bne	.insw_bad_alignment
+.Linsw_align:	tst	r1, #1
+		bne	.Linsw_bad_alignment
 
 		ldr	r3, [r0]
 		strb	r3, [r1], #1
@@ -34,16 +34,16 @@ ENTRY(__raw_readsw)
 		teq	r2, #0		@ do we have to check for the zero len?
 		moveq	pc, lr
 		tst	r1, #3
-		bne	.insw_align
+		bne	.Linsw_align
 
-.insw_aligned:	mov	ip, #0xff
+.Linsw_aligned:	mov	ip, #0xff
 		orr	ip, ip, ip, lsl #8
 		stmfd	sp!, {r4, r5, r6, lr}
 
 		subs	r2, r2, #8
-		bmi	.no_insw_8
+		bmi	.Lno_insw_8
 
-.insw_8_lp:	ldr	r3, [r0]
+.Linsw_8_lp:	ldr	r3, [r0]
 		and	r3, r3, ip
 		ldr	r4, [r0]
 		orr	r3, r3, r4, lsl #16
@@ -66,13 +66,13 @@ ENTRY(__raw_readsw)
 		stmia	r1!, {r3 - r6}
 
 		subs	r2, r2, #8
-		bpl	.insw_8_lp
+		bpl	.Linsw_8_lp
 
 		tst	r2, #7
 		LOADREGS(eqfd, sp!, {r4, r5, r6, pc})
 
-.no_insw_8:	tst	r2, #4
-		beq	.no_insw_4
+.Lno_insw_8:	tst	r2, #4
+		beq	.Lno_insw_4
 
 		ldr	r3, [r0]
 		and	r3, r3, ip
@@ -86,8 +86,8 @@ ENTRY(__raw_readsw)
 
 		stmia	r1!, {r3, r4}
 
-.no_insw_4:	tst	r2, #2
-		beq	.no_insw_2
+.Lno_insw_4:	tst	r2, #2
+		beq	.Lno_insw_2
 
 		ldr	r3, [r0]
 		and	r3, r3, ip
@@ -96,7 +96,7 @@ ENTRY(__raw_readsw)
 
 		str	r3, [r1], #4
 
-.no_insw_2:	tst	r2, #1
+.Lno_insw_2:	tst	r2, #1
 		ldrne	r3, [r0]
 		strneb	r3, [r1], #1
 		movne	r3, r3, lsr #8
diff --git a/arch/arm/lib/io-readsw-armv4.S b/arch/arm/lib/io-readsw-armv4.S
index c92b66e..4db1c5f 100644
--- a/arch/arm/lib/io-readsw-armv4.S
+++ b/arch/arm/lib/io-readsw-armv4.S
@@ -18,8 +18,8 @@
 #endif
 		.endm
 
-.insw_align:	movs	ip, r1, lsl #31
-		bne	.insw_noalign
+.Linsw_align:	movs	ip, r1, lsl #31
+		bne	.Linsw_noalign
 		ldrh	ip, [r0]
 		sub	r2, r2, #1
 		strh	ip, [r1], #2
@@ -28,14 +28,14 @@ ENTRY(__raw_readsw)
 		teq	r2, #0
 		moveq	pc, lr
 		tst	r1, #3
-		bne	.insw_align
+		bne	.Linsw_align
 
 		stmfd	sp!, {r4, r5, lr}
 
 		subs	r2, r2, #8
-		bmi	.no_insw_8
+		bmi	.Lno_insw_8
 
-.insw_8_lp:	ldrh	r3, [r0]
+.Linsw_8_lp:	ldrh	r3, [r0]
 		ldrh	r4, [r0]
 		pack	r3, r3, r4
 
@@ -53,10 +53,10 @@ ENTRY(__raw_readsw)
 
 		subs	r2, r2, #8
 		stmia	r1!, {r3 - r5, ip}
-		bpl	.insw_8_lp
+		bpl	.Linsw_8_lp
 
-.no_insw_8:	tst	r2, #4
-		beq	.no_insw_4
+.Lno_insw_8:	tst	r2, #4
+		beq	.Lno_insw_4
 
 		ldrh	r3, [r0]
 		ldrh	r4, [r0]
@@ -68,15 +68,15 @@ ENTRY(__raw_readsw)
 
 		stmia	r1!, {r3, r4}
 
-.no_insw_4:	movs	r2, r2, lsl #31
-		bcc	.no_insw_2
+.Lno_insw_4:	movs	r2, r2, lsl #31
+		bcc	.Lno_insw_2
 
 		ldrh	r3, [r0]
 		ldrh	ip, [r0]
 		pack	r3, r3, ip
 		str	r3, [r1], #4
 
-.no_insw_2:	ldrneh	r3, [r0]
+.Lno_insw_2:	ldrneh	r3, [r0]
 		strneh	r3, [r1]
 
 		ldmfd	sp!, {r4, r5, pc}
@@ -93,7 +93,7 @@ ENTRY(__raw_readsw)
 #define pull_hbyte1		lsr #8
 #endif
 
-.insw_noalign:	stmfd	sp!, {r4, lr}
+.Linsw_noalign:	stmfd	sp!, {r4, lr}
 		ldrccb	ip, [r1, #-1]!
 		bcc	1f
 
diff --git a/arch/arm/lib/io-writesb.S b/arch/arm/lib/io-writesb.S
index 70b2561..08209fc 100644
--- a/arch/arm/lib/io-writesb.S
+++ b/arch/arm/lib/io-writesb.S
@@ -30,7 +30,7 @@
 #endif
 		.endm
 
-.outsb_align:	rsb	ip, ip, #4
+.Loutsb_align:	rsb	ip, ip, #4
 		cmp	ip, r2
 		movgt	ip, r2
 		cmp	ip, #2
@@ -41,44 +41,45 @@
 		ldrgtb	r3, [r1], #1
 		strgtb	r3, [r0]
 		subs	r2, r2, ip
-		bne	.outsb_aligned
+		bne	.Loutsb_aligned
 
 ENTRY(__raw_writesb)
 		teq	r2, #0		@ do we have to check for the zero len?
 		moveq	pc, lr
 		ands	ip, r1, #3
-		bne	.outsb_align
+		bne	.Loutsb_align
 
-.outsb_aligned:	stmfd	sp!, {r4, r5, lr}
+.Loutsb_aligned:
+		stmfd	sp!, {r4, r5, lr}
 
 		subs	r2, r2, #16
-		bmi	.outsb_no_16
+		bmi	.Loutsb_no_16
 
-.outsb_16_lp:	ldmia	r1!, {r3, r4, r5, ip}
+.Loutsb_16_lp:	ldmia	r1!, {r3, r4, r5, ip}
 		outword	r3
 		outword	r4
 		outword	r5
 		outword	ip
 		subs	r2, r2, #16
-		bpl	.outsb_16_lp
+		bpl	.Loutsb_16_lp
 
 		tst	r2, #15
 		LOADREGS(eqfd, sp!, {r4, r5, pc})
 
-.outsb_no_16:	tst	r2, #8
-		beq	.outsb_no_8
+.Loutsb_no_16:	tst	r2, #8
+		beq	.Loutsb_no_8
 
 		ldmia	r1!, {r3, r4}
 		outword	r3
 		outword	r4
 
-.outsb_no_8:	tst	r2, #4
-		beq	.outsb_no_4
+.Loutsb_no_8:	tst	r2, #4
+		beq	.Loutsb_no_4
 
 		ldr	r3, [r1], #4
 		outword	r3
 
-.outsb_no_4:	ands	r2, r2, #3
+.Loutsb_no_4:	ands	r2, r2, #3
 		LOADREGS(eqfd, sp!, {r4, r5, pc})
 
 		cmp	r2, #2
diff --git a/arch/arm/lib/io-writesw-armv3.S b/arch/arm/lib/io-writesw-armv3.S
index 950e7e3..52d62b4 100644
--- a/arch/arm/lib/io-writesw-armv3.S
+++ b/arch/arm/lib/io-writesw-armv3.S
@@ -11,16 +11,16 @@
 #include <asm/assembler.h>
 #include <asm/hardware.h>
 
-.outsw_bad_alignment:
-		adr	r0, .outsw_bad_align_msg
+.Loutsw_bad_alignment:
+		adr	r0, .Loutsw_bad_align_msg
 		mov	r2, lr
 		b	panic
-.outsw_bad_align_msg:
+.Loutsw_bad_align_msg:
 		.asciz	"outsw: bad buffer alignment (0x%p, lr=0x%08lX)\n"
 		.align
 
-.outsw_align:	tst	r1, #1
-		bne	.outsw_bad_alignment
+.Loutsw_align:	tst	r1, #1
+		bne	.Loutsw_bad_alignment
 
 		add	r1, r1, #2
 
@@ -35,14 +35,14 @@ ENTRY(__raw_writesw)
 		teq	r2, #0		@ do we have to check for the zero len?
 		moveq	pc, lr
 		tst	r1, #3
-		bne	.outsw_align
+		bne	.Loutsw_align
 
-.outsw_aligned:	stmfd	sp!, {r4, r5, r6, lr}
+		stmfd	sp!, {r4, r5, r6, lr}
 
 		subs	r2, r2, #8
-		bmi	.no_outsw_8
+		bmi	.Lno_outsw_8
 
-.outsw_8_lp:	ldmia	r1!, {r3, r4, r5, r6}
+.Loutsw_8_lp:	ldmia	r1!, {r3, r4, r5, r6}
 
 		mov	ip, r3, lsl #16
 		orr	ip, ip, ip, lsr #16
@@ -77,13 +77,13 @@ ENTRY(__raw_writesw)
 		str	ip, [r0]
 
 		subs	r2, r2, #8
-		bpl	.outsw_8_lp
+		bpl	.Loutsw_8_lp
 
 		tst	r2, #7
 		LOADREGS(eqfd, sp!, {r4, r5, r6, pc})
 
-.no_outsw_8:	tst	r2, #4
-		beq	.no_outsw_4
+.Lno_outsw_8:	tst	r2, #4
+		beq	.Lno_outsw_4
 
 		ldmia	r1!, {r3, r4}
 
@@ -103,8 +103,8 @@ ENTRY(__raw_writesw)
 		orr	ip, ip, ip, lsl #16
 		str	ip, [r0]
 
-.no_outsw_4:	tst	r2, #2
-		beq	.no_outsw_2
+.Lno_outsw_4:	tst	r2, #2
+		beq	.Lno_outsw_2
 
 		ldr	r3, [r1], #4
 
@@ -116,7 +116,7 @@ ENTRY(__raw_writesw)
 		orr	ip, ip, ip, lsl #16
 		str	ip, [r0]
 
-.no_outsw_2:	tst	r2, #1
+.Lno_outsw_2:	tst	r2, #1
 
 		ldrne	r3, [r1]
 
diff --git a/arch/arm/lib/io-writesw-armv4.S b/arch/arm/lib/io-writesw-armv4.S
index 5e240e4..c8e85bd 100644
--- a/arch/arm/lib/io-writesw-armv4.S
+++ b/arch/arm/lib/io-writesw-armv4.S
@@ -22,8 +22,8 @@
 #endif
 		.endm
 
-.outsw_align:	movs	ip, r1, lsl #31
-		bne	.outsw_noalign
+.Loutsw_align:	movs	ip, r1, lsl #31
+		bne	.Loutsw_noalign
 
 		ldrh	r3, [r1], #2
 		sub	r2, r2, #1
@@ -33,35 +33,35 @@ ENTRY(__raw_writesw)
 		teq	r2, #0
 		moveq	pc, lr
 		ands	r3, r1, #3
-		bne	.outsw_align
+		bne	.Loutsw_align
 
 		stmfd	sp!, {r4, r5, lr}
 
 		subs	r2, r2, #8
-		bmi	.no_outsw_8
+		bmi	.Lno_outsw_8
 
-.outsw_8_lp:	ldmia	r1!, {r3, r4, r5, ip}
+.Loutsw_8_lp:	ldmia	r1!, {r3, r4, r5, ip}
 		subs	r2, r2, #8
 		outword	r3
 		outword	r4
 		outword	r5
 		outword	ip
-		bpl	.outsw_8_lp
+		bpl	.Loutsw_8_lp
 
-.no_outsw_8:	tst	r2, #4
-		beq	.no_outsw_4
+.Lno_outsw_8:	tst	r2, #4
+		beq	.Lno_outsw_4
 
 		ldmia	r1!, {r3, ip}
 		outword	r3
 		outword	ip
 
-.no_outsw_4:	movs	r2, r2, lsl #31
-		bcc	.no_outsw_2
+.Lno_outsw_4:	movs	r2, r2, lsl #31
+		bcc	.Lno_outsw_2
 
 		ldr	r3, [r1], #4
 		outword	r3
 
-.no_outsw_2:	ldrneh	r3, [r1]
+.Lno_outsw_2:	ldrneh	r3, [r1]
 		strneh	r3, [r0]
 
 		ldmfd	sp!, {r4, r5, pc}
@@ -74,7 +74,8 @@ ENTRY(__raw_writesw)
 #define push_hbyte1	lsl #8
 #endif
 
-.outsw_noalign:	ldr	r3, [r1, -r3]!
+.Loutsw_noalign:
+		ldr	r3, [r1, -r3]!
 		subcs	r2, r2, #1
 		bcs	2f
 		subs	r2, r2, #2
diff --git a/arch/arm/lib/lshrdi3.S b/arch/arm/lib/lshrdi3.S
new file mode 100644
index 0000000..46c2ed1
--- /dev/null
+++ b/arch/arm/lib/lshrdi3.S
@@ -0,0 +1,48 @@
+/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+
+#include <linux/linkage.h>
+
+#ifdef __ARMEB__
+#define al r1
+#define ah r0
+#else
+#define al r0
+#define ah r1
+#endif
+
+ENTRY(__lshrdi3)
+
+	subs	r3, r2, #32
+	rsb	ip, r2, #32
+	movmi	al, al, lsr r2
+	movpl	al, ah, lsr r3
+	orrmi	al, al, ah, lsl ip
+	mov	ah, ah, lsr r2
+	mov	pc, lr
+
diff --git a/arch/arm/lib/lshrdi3.c b/arch/arm/lib/lshrdi3.c
deleted file mode 100644
index 3681f49..0000000
--- a/arch/arm/lib/lshrdi3.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/* More subroutines needed by GCC output code on some machines.  */
-/* Compile this one with gcc.  */
-/* Copyright (C) 1989, 92-98, 1999 Free Software Foundation, Inc.
-
-This file is part of GNU CC.
-
-GNU CC is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2, or (at your option)
-any later version.
-
-GNU CC is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with GNU CC; see the file COPYING.  If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
-
-/* As a special exception, if you link this library with other files,
-   some of which are compiled with GCC, to produce an executable,
-   this library does not by itself cause the resulting executable
-   to be covered by the GNU General Public License.
-   This exception does not however invalidate any other reasons why
-   the executable file might be covered by the GNU General Public License.
- */
-/* support functions required by the kernel. based on code from gcc-2.95.3 */
-/* I Molton     29/07/01 */
-
-#include "gcclib.h"
-
-s64 __lshrdi3(s64 u, int b)
-{
-	DIunion w;
-	int bm;
-	DIunion uu;
-
-	if (b == 0)
-		return u;
-
-	uu.ll = u;
-
-	bm = (sizeof(s32) * BITS_PER_UNIT) - b;
-	if (bm <= 0) {
-		w.s.high = 0;
-		w.s.low = (u32) uu.s.high >> -bm;
-	} else {
-		u32 carries = (u32) uu.s.high << bm;
-		w.s.high = (u32) uu.s.high >> b;
-		w.s.low = ((u32) uu.s.low >> b) | carries;
-	}
-
-	return w.ll;
-}
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
index f5a593c..7e71d67 100644
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@@ -1,393 +1,59 @@
 /*
  *  linux/arch/arm/lib/memcpy.S
  *
- *  Copyright (C) 1995-1999 Russell King
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 28, 2005
+ *  Copyright:	MontaVista Software, Inc.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- *  ASM optimised string functions
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
  */
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-		.text
-
-#define ENTER	\
-		mov	ip,sp	;\
-		stmfd	sp!,{r0,r4-r9,fp,ip,lr,pc}	;\
-		sub	fp,ip,#4
-
-#define EXIT	\
-		LOADREGS(ea, fp, {r0, r4 - r9, fp, sp, pc})
-
-#define EXITEQ	\
-		LOADREGS(eqea, fp, {r0, r4 - r9, fp, sp, pc})
-
-/*
- * Prototype: void memcpy(void *to,const void *from,unsigned long n);
- */
-ENTRY(memcpy)
-ENTRY(memmove)
-		ENTER
-		cmp	r1, r0
-		bcc	23f
-		subs	r2, r2, #4
-		blt	6f
-	PLD(	pld	[r1, #0]		)
-		ands	ip, r0, #3
-		bne	7f
-		ands	ip, r1, #3
-		bne	8f
+	.macro ldr1w ptr reg abort
+	ldr \reg, [\ptr], #4
+	.endm
 
-1:		subs	r2, r2, #8
-		blt	5f
-		subs	r2, r2, #20
-		blt	4f
-	PLD(	pld	[r1, #28]		)
-	PLD(	subs	r2, r2, #64		)
-	PLD(	blt	3f			)
-2:	PLD(	pld	[r1, #60]		)
-	PLD(	pld	[r1, #92]		)
-		ldmia	r1!, {r3 - r9, ip}
-		subs	r2, r2, #32
-		stmgeia	r0!, {r3 - r9, ip}
-		ldmgeia	r1!, {r3 - r9, ip}
-		subges	r2, r2, #32
-		stmia	r0!, {r3 - r9, ip}
-		bge	2b
-3:	PLD(	ldmia	r1!, {r3 - r9, ip}	)
-	PLD(	adds	r2, r2, #32		)
-	PLD(	stmgeia	r0!, {r3 - r9, ip}	)
-	PLD(	ldmgeia	r1!, {r3 - r9, ip}	)
-	PLD(	subges	r2, r2, #32		)
-	PLD(	stmia	r0!, {r3 - r9, ip}	)
-4:		cmn	r2, #16
-		ldmgeia	r1!, {r3 - r6}
-		subge	r2, r2, #16
-		stmgeia	r0!, {r3 - r6}
-		adds	r2, r2, #20
-		ldmgeia	r1!, {r3 - r5}
-		subge	r2, r2, #12
-		stmgeia	r0!, {r3 - r5}
-5:		adds	r2, r2, #8
-		blt	6f
-		subs	r2, r2, #4
-		ldrlt	r3, [r1], #4
-		ldmgeia	r1!, {r4, r5}
-		subge	r2, r2, #4
-		strlt	r3, [r0], #4
-		stmgeia	r0!, {r4, r5}
+	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
+	.endm
 
-6:		adds	r2, r2, #4
-		EXITEQ
-		cmp	r2, #2
-		ldrb	r3, [r1], #1
-		ldrgeb	r4, [r1], #1
-		ldrgtb	r5, [r1], #1
-		strb	r3, [r0], #1
-		strgeb	r4, [r0], #1
-		strgtb	r5, [r0], #1
-		EXIT
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+	.endm
 
-7:		rsb	ip, ip, #4
-		cmp	ip, #2
-		ldrb	r3, [r1], #1
-		ldrgeb	r4, [r1], #1
-		ldrgtb	r5, [r1], #1
-		strb	r3, [r0], #1
-		strgeb	r4, [r0], #1
-		strgtb	r5, [r0], #1
-		subs	r2, r2, ip
-		blt	6b
-		ands	ip, r1, #3
-		beq	1b
+	.macro ldr1b ptr reg cond=al abort
+	ldr\cond\()b \reg, [\ptr], #1
+	.endm
 
-8:		bic	r1, r1, #3
-		ldr	r7, [r1], #4
-		cmp	ip, #2
-		bgt	18f
-		beq	13f
-		cmp	r2, #12
-		blt	11f
-	PLD(	pld	[r1, #12]		)
-		sub	r2, r2, #12
-	PLD(	subs	r2, r2, #32		)
-	PLD(	blt	10f			)
-	PLD(	pld	[r1, #28]		)
-9:	PLD(	pld	[r1, #44]		)
-10:		mov	r3, r7, pull #8
-		ldmia	r1!, {r4 - r7}
-		subs	r2, r2, #16
-		orr	r3, r3, r4, push #24
-		mov	r4, r4, pull #8
-		orr	r4, r4, r5, push #24
-		mov	r5, r5, pull #8
-		orr	r5, r5, r6, push #24
-		mov	r6, r6, pull #8
-		orr	r6, r6, r7, push #24
-		stmia	r0!, {r3 - r6}
-		bge	9b
-	PLD(	cmn	r2, #32			)
-	PLD(	bge	10b			)
-	PLD(	add	r2, r2, #32		)
-		adds	r2, r2, #12
-		blt	12f
-11:		mov	r3, r7, pull #8
-		ldr	r7, [r1], #4
-		subs	r2, r2, #4
-		orr	r3, r3, r7, push #24
-		str	r3, [r0], #4
-		bge	11b
-12:		sub	r1, r1, #3
-		b	6b
+	.macro str1w ptr reg abort
+	str \reg, [\ptr], #4
+	.endm
 
-13:		cmp	r2, #12
-		blt	16f
-	PLD(	pld	[r1, #12]		)
-		sub	r2, r2, #12
-	PLD(	subs	r2, r2, #32		)
-	PLD(	blt	15f			)
-	PLD(	pld	[r1, #28]		)
-14:	PLD(	pld	[r1, #44]		)
-15:		mov	r3, r7, pull #16
-		ldmia	r1!, {r4 - r7}
-		subs	r2, r2, #16
-		orr	r3, r3, r4, push #16
-		mov	r4, r4, pull #16
-		orr	r4, r4, r5, push #16
-		mov	r5, r5, pull #16
-		orr	r5, r5, r6, push #16
-		mov	r6, r6, pull #16
-		orr	r6, r6, r7, push #16
-		stmia	r0!, {r3 - r6}
-		bge	14b
-	PLD(	cmn	r2, #32			)
-	PLD(	bge	15b			)
-	PLD(	add	r2, r2, #32		)
-		adds	r2, r2, #12
-		blt	17f
-16:		mov	r3, r7, pull #16
-		ldr	r7, [r1], #4
-		subs	r2, r2, #4
-		orr	r3, r3, r7, push #16
-		str	r3, [r0], #4
-		bge	16b
-17:		sub	r1, r1, #2
-		b	6b
+	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+	.endm
 
-18:		cmp	r2, #12
-		blt	21f
-	PLD(	pld	[r1, #12]		)
-		sub	r2, r2, #12
-	PLD(	subs	r2, r2, #32		)
-	PLD(	blt	20f			)
-	PLD(	pld	[r1, #28]		)
-19:	PLD(	pld	[r1, #44]		)
-20:		mov	r3, r7, pull #24
-		ldmia	r1!, {r4 - r7}
-		subs	r2, r2, #16
-		orr	r3, r3, r4, push #8
-		mov	r4, r4, pull #24
-		orr	r4, r4, r5, push #8
-		mov	r5, r5, pull #24
-		orr	r5, r5, r6, push #8
-		mov	r6, r6, pull #24
-		orr	r6, r6, r7, push #8
-		stmia	r0!, {r3 - r6}
-		bge	19b
-	PLD(	cmn	r2, #32			)
-	PLD(	bge	20b			)
-	PLD(	add	r2, r2, #32		)
-		adds	r2, r2, #12
-		blt	22f
-21:		mov	r3, r7, pull #24
-		ldr	r7, [r1], #4
-		subs	r2, r2, #4
-		orr	r3, r3, r7, push #8
-		str	r3, [r0], #4
-		bge	21b
-22:		sub	r1, r1, #1
-		b	6b
+	.macro str1b ptr reg cond=al abort
+	str\cond\()b \reg, [\ptr], #1
+	.endm
 
+	.macro enter reg1 reg2
+	stmdb sp!, {r0, \reg1, \reg2}
+	.endm
 
-23:		add	r1, r1, r2
-		add	r0, r0, r2
-		subs	r2, r2, #4
-		blt	29f
-	PLD(	pld	[r1, #-4]		)
-		ands	ip, r0, #3
-		bne	30f
-		ands	ip, r1, #3
-		bne	31f
+	.macro exit reg1 reg2
+	ldmfd sp!, {r0, \reg1, \reg2}
+	.endm
 
-24:		subs	r2, r2, #8
-		blt	28f
-		subs	r2, r2, #20
-		blt	27f
-	PLD(	pld	[r1, #-32]		)
-	PLD(	subs	r2, r2, #64		)
-	PLD(	blt	26f			)
-25:	PLD(	pld	[r1, #-64]		)
-	PLD(	pld	[r1, #-96]		)
-		ldmdb	r1!, {r3 - r9, ip}
-		subs	r2, r2, #32
-		stmgedb	r0!, {r3 - r9, ip}
-		ldmgedb	r1!, {r3 - r9, ip}
-		subges	r2, r2, #32
-		stmdb	r0!, {r3 - r9, ip}
-		bge	25b
-26:	PLD(	ldmdb	r1!, {r3 - r9, ip}	)
-	PLD(	adds	r2, r2, #32		)
-	PLD(	stmgedb	r0!, {r3 - r9, ip}	)
-	PLD(	ldmgedb	r1!, {r3 - r9, ip}	)
-	PLD(	subges	r2, r2, #32		)
-	PLD(	stmdb	r0!, {r3 - r9, ip}	)
-27:		cmn	r2, #16
-		ldmgedb	r1!, {r3 - r6}
-		subge	r2, r2, #16
-		stmgedb	r0!, {r3 - r6}
-		adds	r2, r2, #20
-		ldmgedb	r1!, {r3 - r5}
-		subge	r2, r2, #12
-		stmgedb	r0!, {r3 - r5}
-28:		adds	r2, r2, #8
-		blt	29f
-		subs	r2, r2, #4
-		ldrlt	r3, [r1, #-4]!
-		ldmgedb	r1!, {r4, r5}
-		subge	r2, r2, #4
-		strlt	r3, [r0, #-4]!
-		stmgedb	r0!, {r4, r5}
+	.text
 
-29:		adds	r2, r2, #4
-		EXITEQ
-		cmp	r2, #2
-		ldrb	r3, [r1, #-1]!
-		ldrgeb	r4, [r1, #-1]!
-		ldrgtb	r5, [r1, #-1]!
-		strb	r3, [r0, #-1]!
-		strgeb	r4, [r0, #-1]!
-		strgtb	r5, [r0, #-1]!
-		EXIT
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
 
-30:		cmp	ip, #2
-		ldrb	r3, [r1, #-1]!
-		ldrgeb	r4, [r1, #-1]!
-		ldrgtb	r5, [r1, #-1]!
-		strb	r3, [r0, #-1]!
-		strgeb	r4, [r0, #-1]!
-		strgtb	r5, [r0, #-1]!
-		subs	r2, r2, ip
-		blt	29b
-		ands	ip, r1, #3
-		beq	24b
-
-31:		bic	r1, r1, #3
-		ldr	r3, [r1], #0
-		cmp	ip, #2
-		blt	41f
-		beq	36f
-		cmp	r2, #12
-		blt	34f
-	PLD(	pld	[r1, #-16]		)
-		sub	r2, r2, #12
-	PLD(	subs	r2, r2, #32		)
-	PLD(	blt	33f			)
-	PLD(	pld	[r1, #-32]		)
-32:	PLD(	pld	[r1, #-48]		)
-33:		mov	r7, r3, push #8
-		ldmdb	r1!, {r3, r4, r5, r6}
-		subs	r2, r2, #16
-		orr	r7, r7, r6, pull #24
-		mov	r6, r6, push #8
-		orr	r6, r6, r5, pull #24
-		mov	r5, r5, push #8
-		orr	r5, r5, r4, pull #24
-		mov	r4, r4, push #8
-		orr	r4, r4, r3, pull #24
-		stmdb	r0!, {r4, r5, r6, r7}
-		bge	32b
-	PLD(	cmn	r2, #32			)
-	PLD(	bge	33b			)
-	PLD(	add	r2, r2, #32		)
-		adds	r2, r2, #12
-		blt	35f
-34:		mov	ip, r3, push #8
-		ldr	r3, [r1, #-4]!
-		subs	r2, r2, #4
-		orr	ip, ip, r3, pull #24
-		str	ip, [r0, #-4]!
-		bge	34b
-35:		add	r1, r1, #3
-		b	29b
-
-36:		cmp	r2, #12
-		blt	39f
-	PLD(	pld	[r1, #-16]		)
-		sub	r2, r2, #12
-	PLD(	subs	r2, r2, #32		)
-	PLD(	blt	38f			)
-	PLD(	pld	[r1, #-32]		)
-37:	PLD(	pld	[r1, #-48]		)
-38:		mov	r7, r3, push #16
-		ldmdb	r1!, {r3, r4, r5, r6}
-		subs	r2, r2, #16
-		orr	r7, r7, r6, pull #16
-		mov	r6, r6, push #16
-		orr	r6, r6, r5, pull #16
-		mov	r5, r5, push #16
-		orr	r5, r5, r4, pull #16
-		mov	r4, r4, push #16
-		orr	r4, r4, r3, pull #16
-		stmdb	r0!, {r4, r5, r6, r7}
-		bge	37b
-	PLD(	cmn	r2, #32			)
-	PLD(	bge	38b			)
-	PLD(	add	r2, r2, #32		)
-		adds	r2, r2, #12
-		blt	40f
-39:		mov	ip, r3, push #16
-		ldr	r3, [r1, #-4]!
-		subs	r2, r2, #4
-		orr	ip, ip, r3, pull #16
-		str	ip, [r0, #-4]!
-		bge	39b
-40:		add	r1, r1, #2
-		b	29b
+ENTRY(memcpy)
 
-41:		cmp	r2, #12
-		blt	44f
-	PLD(	pld	[r1, #-16]		)
-		sub	r2, r2, #12
-	PLD(	subs	r2, r2, #32		)
-	PLD(	blt	43f			)
-	PLD(	pld	[r1, #-32]		)
-42:	PLD(	pld	[r1, #-48]		)
-43:		mov	r7, r3, push #24
-		ldmdb	r1!, {r3, r4, r5, r6}
-		subs	r2, r2, #16
-		orr	r7, r7, r6, pull #8
-		mov	r6, r6, push #24
-		orr	r6, r6, r5, pull #8
-		mov	r5, r5, push #24
-		orr	r5, r5, r4, pull #8
-		mov	r4, r4, push #24
-		orr	r4, r4, r3, pull #8
-		stmdb	r0!, {r4, r5, r6, r7}
-		bge	42b
-	PLD(	cmn	r2, #32			)
-	PLD(	bge	43b			)
-	PLD(	add	r2, r2, #32		)
-		adds	r2, r2, #12
-		blt	45f
-44:		mov	ip, r3, push #24
-		ldr	r3, [r1, #-4]!
-		subs	r2, r2, #4
-		orr	ip, ip, r3, pull #8
-		str	ip, [r0, #-4]!
-		bge	44b
-45:		add	r1, r1, #1
-		b	29b
+#include "copy_template.S"
 
diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S
new file mode 100644
index 0000000..ef7fddc
--- /dev/null
+++ b/arch/arm/lib/memmove.S
@@ -0,0 +1,206 @@
+/*
+ *  linux/arch/arm/lib/memmove.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 28, 2005
+ *  Copyright:	(C) MontaVista Software Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * This can be used to enable code to cacheline align the source pointer.
+ * Experiments on tested architectures (StrongARM and XScale) didn't show
+ * this a worthwhile thing to do.  That might be different in the future.
+ */
+//#define CALGN(code...)        code
+#define CALGN(code...)
+
+		.text
+
+/*
+ * Prototype: void *memmove(void *dest, const void *src, size_t n);
+ *
+ * Note:
+ *
+ * If the memory regions don't overlap, we simply branch to memcpy which is
+ * normally a bit faster. Otherwise the copy is done going downwards.  This
+ * is a transposition of the code from copy_template.S but with the copy
+ * occurring in the opposite direction.
+ */
+
+ENTRY(memmove)
+
+		subs	ip, r0, r1
+		cmphi	r2, ip
+		bls	memcpy
+
+		stmfd	sp!, {r0, r4, lr}
+		add	r1, r1, r2
+		add	r0, r0, r2
+		subs	r2, r2, #4
+		blt	8f
+		ands	ip, r0, #3
+	PLD(	pld	[r1, #-4]		)
+		bne	9f
+		ands	ip, r1, #3
+		bne	10f
+
+1:		subs	r2, r2, #(28)
+		stmfd	sp!, {r5 - r8}
+		blt	5f
+
+	CALGN(	ands	ip, r1, #31		)
+	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+	CALGN(	bcs	2f			)
+	CALGN(	adr	r4, 6f			)
+	CALGN(	subs	r2, r2, ip		)  @ C is set here
+	CALGN(	add	pc, r4, ip		)
+
+	PLD(	pld	[r1, #-4]		)
+2:	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #-32]		)
+	PLD(	blt	4f			)
+	PLD(	pld	[r1, #-64]		)
+	PLD(	pld	[r1, #-96]		)
+
+3:	PLD(	pld	[r1, #-128]		)
+4:		ldmdb	r1!, {r3, r4, r5, r6, r7, r8, ip, lr}
+		subs	r2, r2, #32
+		stmdb	r0!, {r3, r4, r5, r6, r7, r8, ip, lr}
+		bge	3b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	4b			)
+
+5:		ands	ip, r2, #28
+		rsb	ip, ip, #32
+		addne	pc, pc, ip		@ C is always clear here
+		b	7f
+6:		nop
+		ldr	r3, [r1, #-4]!
+		ldr	r4, [r1, #-4]!
+		ldr	r5, [r1, #-4]!
+		ldr	r6, [r1, #-4]!
+		ldr	r7, [r1, #-4]!
+		ldr	r8, [r1, #-4]!
+		ldr	lr, [r1, #-4]!
+
+		add	pc, pc, ip
+		nop
+		nop
+		str	r3, [r0, #-4]!
+		str	r4, [r0, #-4]!
+		str	r5, [r0, #-4]!
+		str	r6, [r0, #-4]!
+		str	r7, [r0, #-4]!
+		str	r8, [r0, #-4]!
+		str	lr, [r0, #-4]!
+
+	CALGN(	bcs	2b			)
+
+7:		ldmfd	sp!, {r5 - r8}
+
+8:		movs	r2, r2, lsl #31
+		ldrneb	r3, [r1, #-1]!
+		ldrcsb	r4, [r1, #-1]!
+		ldrcsb	ip, [r1, #-1]
+		strneb	r3, [r0, #-1]!
+		strcsb	r4, [r0, #-1]!
+		strcsb	ip, [r0, #-1]
+		ldmfd	sp!, {r0, r4, pc}
+
+9:		cmp	ip, #2
+		ldrgtb	r3, [r1, #-1]!
+		ldrgeb	r4, [r1, #-1]!
+		ldrb	lr, [r1, #-1]!
+		strgtb	r3, [r0, #-1]!
+		strgeb	r4, [r0, #-1]!
+		subs	r2, r2, ip
+		strb	lr, [r0, #-1]!
+		blt	8b
+		ands	ip, r1, #3
+		beq	1b
+
+10:		bic	r1, r1, #3
+		cmp	ip, #2
+		ldr	r3, [r1, #0]
+		beq	17f
+		blt	18f
+
+
+		.macro	backward_copy_shift push pull
+
+		subs	r2, r2, #28
+		blt	14f
+
+	CALGN(	ands	ip, r1, #31		)
+	CALGN(	rsb	ip, ip, #32		)
+	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+	CALGN(	subcc	r2, r2, ip		)
+	CALGN(	bcc	15f			)
+
+11:		stmfd	sp!, {r5 - r9}
+
+	PLD(	pld	[r1, #-4]		)
+	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #-32]		)
+	PLD(	blt	13f			)
+	PLD(	pld	[r1, #-64]		)
+	PLD(	pld	[r1, #-96]		)
+
+12:	PLD(	pld	[r1, #-128]		)
+13:		ldmdb   r1!, {r7, r8, r9, ip}
+		mov     lr, r3, push #\push
+		subs    r2, r2, #32
+		ldmdb   r1!, {r3, r4, r5, r6}
+		orr     lr, lr, ip, pull #\pull
+		mov     ip, ip, push #\push
+		orr     ip, ip, r9, pull #\pull
+		mov     r9, r9, push #\push
+		orr     r9, r9, r8, pull #\pull
+		mov     r8, r8, push #\push
+		orr     r8, r8, r7, pull #\pull
+		mov     r7, r7, push #\push
+		orr     r7, r7, r6, pull #\pull
+		mov     r6, r6, push #\push
+		orr     r6, r6, r5, pull #\pull
+		mov     r5, r5, push #\push
+		orr     r5, r5, r4, pull #\pull
+		mov     r4, r4, push #\push
+		orr     r4, r4, r3, pull #\pull
+		stmdb   r0!, {r4 - r9, ip, lr}
+		bge	12b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	13b			)
+
+		ldmfd	sp!, {r5 - r9}
+
+14:		ands	ip, r2, #28
+		beq	16f
+
+15:		mov     lr, r3, push #\push
+		ldr	r3, [r1, #-4]!
+		subs	ip, ip, #4
+		orr	lr, lr, r3, pull #\pull
+		str	lr, [r0, #-4]!
+		bgt	15b
+	CALGN(	cmp	r2, #0			)
+	CALGN(	bge	11b			)
+
+16:		add	r1, r1, #(\pull / 8)
+		b	8b
+
+		.endm
+
+
+		backward_copy_shift	push=8	pull=24
+
+17:		backward_copy_shift	push=16	pull=16
+
+18:		backward_copy_shift	push=24	pull=8
+
diff --git a/arch/arm/lib/muldi3.S b/arch/arm/lib/muldi3.S
new file mode 100644
index 0000000..c7fbdf0
--- /dev/null
+++ b/arch/arm/lib/muldi3.S
@@ -0,0 +1,44 @@
+/*
+ *  linux/arch/arm/lib/muldi3.S
+ *
+ *  Author:     Nicolas Pitre
+ *  Created:    Oct 19, 2005
+ *  Copyright:  Monta Vista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+#ifdef __ARMEB__
+#define xh r0
+#define xl r1
+#define yh r2
+#define yl r3
+#else
+#define xl r0
+#define xh r1
+#define yl r2
+#define yh r3
+#endif
+
+ENTRY(__muldi3)
+
+	mul	xh, yl, xh
+	mla	xh, xl, yh, xh
+	mov	ip, xl, asr #16
+	mov	yh, yl, asr #16
+	bic	xl, xl, ip, lsl #16
+	bic	yl, yl, yh, lsl #16
+	mla	xh, yh, ip, xh
+	mul	yh, xl, yh
+	mul	xl, yl, xl
+	mul	ip, yl, ip
+	adds	xl, xl, yh, lsl #16
+	adc	xh, xh, yh, lsr #16
+	adds	xl, xl, ip, lsl #16
+	adc	xh, xh, ip, lsr #16
+	mov	pc, lr
+
diff --git a/arch/arm/lib/muldi3.c b/arch/arm/lib/muldi3.c
deleted file mode 100644
index 0a3b933..0000000
--- a/arch/arm/lib/muldi3.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/* More subroutines needed by GCC output code on some machines.  */
-/* Compile this one with gcc.  */
-/* Copyright (C) 1989, 92-98, 1999 Free Software Foundation, Inc.
-
-This file is part of GNU CC.
-
-GNU CC is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2, or (at your option)
-any later version.
-
-GNU CC is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with GNU CC; see the file COPYING.  If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
-
-/* As a special exception, if you link this library with other files,
-   some of which are compiled with GCC, to produce an executable,
-   this library does not by itself cause the resulting executable
-   to be covered by the GNU General Public License.
-   This exception does not however invalidate any other reasons why
-   the executable file might be covered by the GNU General Public License.
- */
-/* support functions required by the kernel. based on code from gcc-2.95.3 */
-/* I Molton     29/07/01 */
-
-#include "gcclib.h"
-
-#define umul_ppmm(xh, xl, a, b) \
-{register u32 __t0, __t1, __t2;                                     \
-  __asm__ ("%@ Inlined umul_ppmm					\n\
-        mov     %2, %5, lsr #16						\n\
-        mov     %0, %6, lsr #16						\n\
-        bic     %3, %5, %2, lsl #16					\n\
-        bic     %4, %6, %0, lsl #16					\n\
-        mul     %1, %3, %4						\n\
-        mul     %4, %2, %4						\n\
-        mul     %3, %0, %3						\n\
-        mul     %0, %2, %0						\n\
-        adds    %3, %4, %3						\n\
-        addcs   %0, %0, #65536						\n\
-        adds    %1, %1, %3, lsl #16					\n\
-        adc     %0, %0, %3, lsr #16"                                    \
-           : "=&r" ((u32) (xh)),                                    \
-             "=r" ((u32) (xl)),                                     \
-             "=&r" (__t0), "=&r" (__t1), "=r" (__t2)                    \
-           : "r" ((u32) (a)),                                       \
-             "r" ((u32) (b)));}
-
-#define __umulsidi3(u, v) \
-  ({DIunion __w;                                                        \
-    umul_ppmm (__w.s.high, __w.s.low, u, v);                            \
-    __w.ll; })
-
-s64 __muldi3(s64 u, s64 v)
-{
-	DIunion w;
-	DIunion uu, vv;
-
-	uu.ll = u, vv.ll = v;
-
-	w.ll = __umulsidi3(uu.s.low, vv.s.low);
-	w.s.high += ((u32) uu.s.low * (u32) vv.s.high
-		     + (u32) uu.s.high * (u32) vv.s.low);
-
-	return w.ll;
-}
diff --git a/arch/arm/lib/sha1.S b/arch/arm/lib/sha1.S
new file mode 100644
index 0000000..ff6ece4
--- /dev/null
+++ b/arch/arm/lib/sha1.S
@@ -0,0 +1,206 @@
+/*
+ *  linux/arch/arm/lib/sha1.S
+ *
+ *  SHA transform optimized for ARM
+ *
+ *  Copyright:	(C) 2005 by Nicolas Pitre <nico@cam.org>
+ *  Created:	September 17, 2005
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *  The reference implementation for this code is linux/lib/sha1.c
+ */
+
+#include <linux/linkage.h>
+
+	.text
+
+
+/*
+ * void sha_transform(__u32 *digest, const char *in, __u32 *W)
+ *
+ * Note: the "in" ptr may be unaligned.
+ */
+
+ENTRY(sha_transform)
+
+	stmfd	sp!, {r4 - r8, lr}
+
+	@ for (i = 0; i < 16; i++)
+	@         W[i] = be32_to_cpu(in[i]); */
+
+#ifdef __ARMEB__
+	mov	r4, r0
+	mov	r0, r2
+	mov	r2, #64
+	bl	memcpy
+	mov	r2, r0
+	mov	r0, r4
+#else
+	mov	r3, r2
+	mov	lr, #16
+1:	ldrb	r4, [r1], #1
+	ldrb	r5, [r1], #1
+	ldrb	r6, [r1], #1
+	ldrb	r7, [r1], #1
+	subs	lr, lr, #1
+	orr	r5, r5, r4, lsl #8
+	orr	r6, r6, r5, lsl #8
+	orr	r7, r7, r6, lsl #8
+	str	r7, [r3], #4
+	bne	1b
+#endif
+
+	@ for (i = 0; i < 64; i++)
+	@         W[i+16] = ror(W[i+13] ^ W[i+8] ^ W[i+2] ^ W[i], 31);
+
+	sub	r3, r2, #4
+	mov	lr, #64
+2:	ldr	r4, [r3, #4]!
+	subs	lr, lr, #1
+	ldr	r5, [r3, #8]
+	ldr	r6, [r3, #32]
+	ldr	r7, [r3, #52]
+	eor	r4, r4, r5
+	eor	r4, r4, r6
+	eor	r4, r4, r7
+	mov	r4, r4, ror #31
+	str	r4, [r3, #64]
+	bne	2b
+
+	/*
+	 * The SHA functions are:
+	 *
+	 * f1(B,C,D) = (D ^ (B & (C ^ D)))
+	 * f2(B,C,D) = (B ^ C ^ D)
+	 * f3(B,C,D) = ((B & C) | (D & (B | C)))
+	 *
+	 * Then the sub-blocks are processed as follows:
+	 *
+	 * A' = ror(A, 27) + f(B,C,D) + E + K + *W++
+	 * B' = A
+	 * C' = ror(B, 2)
+	 * D' = C
+	 * E' = D
+	 *
+	 * We therefore unroll each loop 5 times to avoid register shuffling.
+	 * Also the ror for C (and also D and E which are successivelyderived
+	 * from it) is applied in place to cut on an additional mov insn for
+	 * each round.
+	 */
+
+	.macro	sha_f1, A, B, C, D, E
+	ldr	r3, [r2], #4
+	eor	ip, \C, \D
+	add	\E, r1, \E, ror #2
+	and	ip, \B, ip, ror #2
+	add	\E, \E, \A, ror #27
+	eor	ip, ip, \D, ror #2
+	add	\E, \E, r3
+	add	\E, \E, ip
+	.endm
+
+	.macro	sha_f2, A, B, C, D, E
+	ldr	r3, [r2], #4
+	add	\E, r1, \E, ror #2
+	eor	ip, \B, \C, ror #2
+	add	\E, \E, \A, ror #27
+	eor	ip, ip, \D, ror #2
+	add	\E, \E, r3
+	add	\E, \E, ip
+	.endm
+
+	.macro	sha_f3, A, B, C, D, E
+	ldr	r3, [r2], #4
+	add	\E, r1, \E, ror #2
+	orr	ip, \B, \C, ror #2
+	add	\E, \E, \A, ror #27
+	and	ip, ip, \D, ror #2
+	add	\E, \E, r3
+	and	r3, \B, \C, ror #2
+	orr	ip, ip, r3
+	add	\E, \E, ip
+	.endm
+
+	ldmia	r0, {r4 - r8}
+
+	mov	lr, #4
+	ldr	r1, .L_sha_K + 0
+
+	/* adjust initial values */
+	mov	r6, r6, ror #30
+	mov	r7, r7, ror #30
+	mov	r8, r8, ror #30
+
+3:	subs	lr, lr, #1
+	sha_f1	r4, r5, r6, r7, r8
+	sha_f1	r8, r4, r5, r6, r7
+	sha_f1	r7, r8, r4, r5, r6
+	sha_f1	r6, r7, r8, r4, r5
+	sha_f1	r5, r6, r7, r8, r4
+	bne	3b
+
+	ldr	r1, .L_sha_K + 4
+	mov	lr, #4
+
+4:	subs	lr, lr, #1
+	sha_f2	r4, r5, r6, r7, r8
+	sha_f2	r8, r4, r5, r6, r7
+	sha_f2	r7, r8, r4, r5, r6
+	sha_f2	r6, r7, r8, r4, r5
+	sha_f2	r5, r6, r7, r8, r4
+	bne	4b
+
+	ldr	r1, .L_sha_K + 8
+	mov	lr, #4
+
+5:	subs	lr, lr, #1
+	sha_f3	r4, r5, r6, r7, r8
+	sha_f3	r8, r4, r5, r6, r7
+	sha_f3	r7, r8, r4, r5, r6
+	sha_f3	r6, r7, r8, r4, r5
+	sha_f3	r5, r6, r7, r8, r4
+	bne	5b
+
+	ldr	r1, .L_sha_K + 12
+	mov	lr, #4
+
+6:	subs	lr, lr, #1
+	sha_f2	r4, r5, r6, r7, r8
+	sha_f2	r8, r4, r5, r6, r7
+	sha_f2	r7, r8, r4, r5, r6
+	sha_f2	r6, r7, r8, r4, r5
+	sha_f2	r5, r6, r7, r8, r4
+	bne	6b
+
+	ldmia	r0, {r1, r2, r3, ip, lr}
+	add	r4, r1, r4
+	add	r5, r2, r5
+	add	r6, r3, r6, ror #2
+	add	r7, ip, r7, ror #2
+	add	r8, lr, r8, ror #2
+	stmia	r0, {r4 - r8}
+
+	ldmfd	sp!, {r4 - r8, pc}
+
+.L_sha_K:
+	.word	0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
+
+
+/*
+ * void sha_init(__u32 *buf)
+ */
+
+.L_sha_initial_digest:
+	.word	0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0
+
+ENTRY(sha_init)
+
+	str	lr, [sp, #-4]!
+	adr	r1, .L_sha_initial_digest
+	ldmia	r1, {r1, r2, r3, ip, lr}
+	stmia	r0, {r1, r2, r3, ip, lr}
+	ldr	pc, [sp], #4
+
diff --git a/arch/arm/lib/uaccess.S b/arch/arm/lib/uaccess.S
index d3ed063..0cc450f 100644
--- a/arch/arm/lib/uaccess.S
+++ b/arch/arm/lib/uaccess.S
@@ -27,7 +27,7 @@
  * Returns  : Number of bytes NOT copied.
  */
 
-.c2u_dest_not_aligned:
+.Lc2u_dest_not_aligned:
 		rsb	ip, ip, #4
 		cmp	ip, #2
 		ldrb	r3, [r1], #1
@@ -37,34 +37,32 @@ USER(		strgebt	r3, [r0], #1)			@ May fault
 		ldrgtb	r3, [r1], #1
 USER(		strgtbt	r3, [r0], #1)			@ May fault
 		sub	r2, r2, ip
-		b	.c2u_dest_aligned
+		b	.Lc2u_dest_aligned
 
 ENTRY(__arch_copy_to_user)
 		stmfd	sp!, {r2, r4 - r7, lr}
 		cmp	r2, #4
-		blt	.c2u_not_enough
-	PLD(	pld	[r1, #0]		)
-	PLD(	pld	[r0, #0]		)
+		blt	.Lc2u_not_enough
 		ands	ip, r0, #3
-		bne	.c2u_dest_not_aligned
-.c2u_dest_aligned:
+		bne	.Lc2u_dest_not_aligned
+.Lc2u_dest_aligned:
 
 		ands	ip, r1, #3
-		bne	.c2u_src_not_aligned
+		bne	.Lc2u_src_not_aligned
 /*
  * Seeing as there has to be at least 8 bytes to copy, we can
  * copy one word, and force a user-mode page fault...
  */
 
-.c2u_0fupi:	subs	r2, r2, #4
+.Lc2u_0fupi:	subs	r2, r2, #4
 		addmi	ip, r2, #4
-		bmi	.c2u_0nowords
+		bmi	.Lc2u_0nowords
 		ldr	r3, [r1], #4
 USER(		strt	r3, [r0], #4)			@ May fault
 		mov	ip, r0, lsl #32 - PAGE_SHIFT	@ On each page, use a ld/st??t instruction
 		rsb	ip, ip, #0
 		movs	ip, ip, lsr #32 - PAGE_SHIFT
-		beq	.c2u_0fupi
+		beq	.Lc2u_0fupi
 /*
  * ip = max no. of bytes to copy before needing another "strt" insn
  */
@@ -72,28 +70,16 @@ USER(		strt	r3, [r0], #4)			@ May fault
 		movlt	ip, r2
 		sub	r2, r2, ip
 		subs	ip, ip, #32
-		blt	.c2u_0rem8lp
-	PLD(	pld	[r1, #28]		)
-	PLD(	pld	[r0, #28]		)
-	PLD(	subs	ip, ip, #64			)
-	PLD(	blt	.c2u_0cpynopld		)
-	PLD(	pld	[r1, #60]		)
-	PLD(	pld	[r0, #60]		)
-
-.c2u_0cpy8lp:
-	PLD(	pld	[r1, #92]		)
-	PLD(	pld	[r0, #92]		)
-.c2u_0cpynopld:	ldmia	r1!, {r3 - r6}
+		blt	.Lc2u_0rem8lp
+
+.Lc2u_0cpy8lp:	ldmia	r1!, {r3 - r6}
 		stmia	r0!, {r3 - r6}			@ Shouldnt fault
 		ldmia	r1!, {r3 - r6}
 		subs	ip, ip, #32
 		stmia	r0!, {r3 - r6}			@ Shouldnt fault
-		bpl	.c2u_0cpy8lp
-	PLD(	cmn	ip, #64			)
-	PLD(	bge	.c2u_0cpynopld		)
-	PLD(	add	ip, ip, #64		)
+		bpl	.Lc2u_0cpy8lp
 
-.c2u_0rem8lp:	cmn	ip, #16
+.Lc2u_0rem8lp:	cmn	ip, #16
 		ldmgeia	r1!, {r3 - r6}
 		stmgeia	r0!, {r3 - r6}			@ Shouldnt fault
 		tst	ip, #8
@@ -103,33 +89,33 @@ USER(		strt	r3, [r0], #4)			@ May fault
 		ldrne	r3, [r1], #4
 		strnet	r3, [r0], #4			@ Shouldnt fault
 		ands	ip, ip, #3
-		beq	.c2u_0fupi
-.c2u_0nowords:	teq	ip, #0
-		beq	.c2u_finished
-.c2u_nowords:	cmp	ip, #2
+		beq	.Lc2u_0fupi
+.Lc2u_0nowords:	teq	ip, #0
+		beq	.Lc2u_finished
+.Lc2u_nowords:	cmp	ip, #2
 		ldrb	r3, [r1], #1
 USER(		strbt	r3, [r0], #1)			@ May fault
 		ldrgeb	r3, [r1], #1
 USER(		strgebt	r3, [r0], #1)			@ May fault
 		ldrgtb	r3, [r1], #1
 USER(		strgtbt	r3, [r0], #1)			@ May fault
-		b	.c2u_finished
+		b	.Lc2u_finished
 
-.c2u_not_enough:
+.Lc2u_not_enough:
 		movs	ip, r2
-		bne	.c2u_nowords
-.c2u_finished:	mov	r0, #0
+		bne	.Lc2u_nowords
+.Lc2u_finished:	mov	r0, #0
 		LOADREGS(fd,sp!,{r2, r4 - r7, pc})
 
-.c2u_src_not_aligned:
+.Lc2u_src_not_aligned:
 		bic	r1, r1, #3
 		ldr	r7, [r1], #4
 		cmp	ip, #2
-		bgt	.c2u_3fupi
-		beq	.c2u_2fupi
-.c2u_1fupi:	subs	r2, r2, #4
+		bgt	.Lc2u_3fupi
+		beq	.Lc2u_2fupi
+.Lc2u_1fupi:	subs	r2, r2, #4
 		addmi	ip, r2, #4
-		bmi	.c2u_1nowords
+		bmi	.Lc2u_1nowords
 		mov	r3, r7, pull #8
 		ldr	r7, [r1], #4
 		orr	r3, r3, r7, push #24
@@ -137,23 +123,14 @@ USER(		strt	r3, [r0], #4)			@ May fault
 		mov	ip, r0, lsl #32 - PAGE_SHIFT
 		rsb	ip, ip, #0
 		movs	ip, ip, lsr #32 - PAGE_SHIFT
-		beq	.c2u_1fupi
+		beq	.Lc2u_1fupi
 		cmp	r2, ip
 		movlt	ip, r2
 		sub	r2, r2, ip
 		subs	ip, ip, #16
-		blt	.c2u_1rem8lp
-	PLD(	pld	[r1, #12]		)
-	PLD(	pld	[r0, #12]		)
-	PLD(	subs	ip, ip, #32		)
-	PLD(	blt	.c2u_1cpynopld		)
-	PLD(	pld	[r1, #28]		)
-	PLD(	pld	[r0, #28]		)
-
-.c2u_1cpy8lp:
-	PLD(	pld	[r1, #44]		)
-	PLD(	pld	[r0, #44]		)
-.c2u_1cpynopld:	mov	r3, r7, pull #8
+		blt	.Lc2u_1rem8lp
+
+.Lc2u_1cpy8lp:	mov	r3, r7, pull #8
 		ldmia	r1!, {r4 - r7}
 		subs	ip, ip, #16
 		orr	r3, r3, r4, push #24
@@ -164,12 +141,9 @@ USER(		strt	r3, [r0], #4)			@ May fault
 		mov	r6, r6, pull #8
 		orr	r6, r6, r7, push #24
 		stmia	r0!, {r3 - r6}			@ Shouldnt fault
-		bpl	.c2u_1cpy8lp
-	PLD(	cmn	ip, #32			)
-	PLD(	bge	.c2u_1cpynopld		)
-	PLD(	add	ip, ip, #32		)
+		bpl	.Lc2u_1cpy8lp
 
-.c2u_1rem8lp:	tst	ip, #8
+.Lc2u_1rem8lp:	tst	ip, #8
 		movne	r3, r7, pull #8
 		ldmneia	r1!, {r4, r7}
 		orrne	r3, r3, r4, push #24
@@ -182,21 +156,21 @@ USER(		strt	r3, [r0], #4)			@ May fault
 		orrne	r3, r3, r7, push #24
 		strnet	r3, [r0], #4			@ Shouldnt fault
 		ands	ip, ip, #3
-		beq	.c2u_1fupi
-.c2u_1nowords:	mov	r3, r7, get_byte_1
+		beq	.Lc2u_1fupi
+.Lc2u_1nowords:	mov	r3, r7, get_byte_1
 		teq	ip, #0
-		beq	.c2u_finished
+		beq	.Lc2u_finished
 		cmp	ip, #2
 USER(		strbt	r3, [r0], #1)			@ May fault
 		movge	r3, r7, get_byte_2
 USER(		strgebt	r3, [r0], #1)			@ May fault
 		movgt	r3, r7, get_byte_3
 USER(		strgtbt	r3, [r0], #1)			@ May fault
-		b	.c2u_finished
+		b	.Lc2u_finished
 
-.c2u_2fupi:	subs	r2, r2, #4
+.Lc2u_2fupi:	subs	r2, r2, #4
 		addmi	ip, r2, #4
-		bmi	.c2u_2nowords
+		bmi	.Lc2u_2nowords
 		mov	r3, r7, pull #16
 		ldr	r7, [r1], #4
 		orr	r3, r3, r7, push #16
@@ -204,23 +178,14 @@ USER(		strt	r3, [r0], #4)			@ May fault
 		mov	ip, r0, lsl #32 - PAGE_SHIFT
 		rsb	ip, ip, #0
 		movs	ip, ip, lsr #32 - PAGE_SHIFT
-		beq	.c2u_2fupi
+		beq	.Lc2u_2fupi
 		cmp	r2, ip
 		movlt	ip, r2
 		sub	r2, r2, ip
 		subs	ip, ip, #16
-		blt	.c2u_2rem8lp
-	PLD(	pld	[r1, #12]		)
-	PLD(	pld	[r0, #12]		)
-	PLD(	subs	ip, ip, #32		)
-	PLD(	blt	.c2u_2cpynopld		)
-	PLD(	pld	[r1, #28]		)
-	PLD(	pld	[r0, #28]		)
-
-.c2u_2cpy8lp:
-	PLD(	pld	[r1, #44]		)
-	PLD(	pld	[r0, #44]		)
-.c2u_2cpynopld:	mov	r3, r7, pull #16
+		blt	.Lc2u_2rem8lp
+
+.Lc2u_2cpy8lp:	mov	r3, r7, pull #16
 		ldmia	r1!, {r4 - r7}
 		subs	ip, ip, #16
 		orr	r3, r3, r4, push #16
@@ -231,12 +196,9 @@ USER(		strt	r3, [r0], #4)			@ May fault
 		mov	r6, r6, pull #16
 		orr	r6, r6, r7, push #16
 		stmia	r0!, {r3 - r6}			@ Shouldnt fault
-		bpl	.c2u_2cpy8lp
-	PLD(	cmn	ip, #32			)
-	PLD(	bge	.c2u_2cpynopld		)
-	PLD(	add	ip, ip, #32		)
+		bpl	.Lc2u_2cpy8lp
 
-.c2u_2rem8lp:	tst	ip, #8
+.Lc2u_2rem8lp:	tst	ip, #8
 		movne	r3, r7, pull #16
 		ldmneia	r1!, {r4, r7}
 		orrne	r3, r3, r4, push #16
@@ -249,21 +211,21 @@ USER(		strt	r3, [r0], #4)			@ May fault
 		orrne	r3, r3, r7, push #16
 		strnet	r3, [r0], #4			@ Shouldnt fault
 		ands	ip, ip, #3
-		beq	.c2u_2fupi
-.c2u_2nowords:	mov	r3, r7, get_byte_2
+		beq	.Lc2u_2fupi
+.Lc2u_2nowords:	mov	r3, r7, get_byte_2
 		teq	ip, #0
-		beq	.c2u_finished
+		beq	.Lc2u_finished
 		cmp	ip, #2
 USER(		strbt	r3, [r0], #1)			@ May fault
 		movge	r3, r7, get_byte_3
 USER(		strgebt	r3, [r0], #1)			@ May fault
 		ldrgtb	r3, [r1], #0
 USER(		strgtbt	r3, [r0], #1)			@ May fault
-		b	.c2u_finished
+		b	.Lc2u_finished
 
-.c2u_3fupi:	subs	r2, r2, #4
+.Lc2u_3fupi:	subs	r2, r2, #4
 		addmi	ip, r2, #4
-		bmi	.c2u_3nowords
+		bmi	.Lc2u_3nowords
 		mov	r3, r7, pull #24
 		ldr	r7, [r1], #4
 		orr	r3, r3, r7, push #8
@@ -271,23 +233,14 @@ USER(		strt	r3, [r0], #4)			@ May fault
 		mov	ip, r0, lsl #32 - PAGE_SHIFT
 		rsb	ip, ip, #0
 		movs	ip, ip, lsr #32 - PAGE_SHIFT
-		beq	.c2u_3fupi
+		beq	.Lc2u_3fupi
 		cmp	r2, ip
 		movlt	ip, r2
 		sub	r2, r2, ip
 		subs	ip, ip, #16
-		blt	.c2u_3rem8lp
-	PLD(	pld	[r1, #12]		)
-	PLD(	pld	[r0, #12]		)
-	PLD(	subs	ip, ip, #32		)
-	PLD(	blt	.c2u_3cpynopld		)
-	PLD(	pld	[r1, #28]		)
-	PLD(	pld	[r0, #28]		)
-
-.c2u_3cpy8lp:
-	PLD(	pld	[r1, #44]		)
-	PLD(	pld	[r0, #44]		)
-.c2u_3cpynopld:	mov	r3, r7, pull #24
+		blt	.Lc2u_3rem8lp
+
+.Lc2u_3cpy8lp:	mov	r3, r7, pull #24
 		ldmia	r1!, {r4 - r7}
 		subs	ip, ip, #16
 		orr	r3, r3, r4, push #8
@@ -298,12 +251,9 @@ USER(		strt	r3, [r0], #4)			@ May fault
 		mov	r6, r6, pull #24
 		orr	r6, r6, r7, push #8
 		stmia	r0!, {r3 - r6}			@ Shouldnt fault
-		bpl	.c2u_3cpy8lp
-	PLD(	cmn	ip, #32			)
-	PLD(	bge	.c2u_3cpynopld		)
-	PLD(	add	ip, ip, #32		)
+		bpl	.Lc2u_3cpy8lp
 
-.c2u_3rem8lp:	tst	ip, #8
+.Lc2u_3rem8lp:	tst	ip, #8
 		movne	r3, r7, pull #24
 		ldmneia	r1!, {r4, r7}
 		orrne	r3, r3, r4, push #8
@@ -316,17 +266,17 @@ USER(		strt	r3, [r0], #4)			@ May fault
 		orrne	r3, r3, r7, push #8
 		strnet	r3, [r0], #4			@ Shouldnt fault
 		ands	ip, ip, #3
-		beq	.c2u_3fupi
-.c2u_3nowords:	mov	r3, r7, get_byte_3
+		beq	.Lc2u_3fupi
+.Lc2u_3nowords:	mov	r3, r7, get_byte_3
 		teq	ip, #0
-		beq	.c2u_finished
+		beq	.Lc2u_finished
 		cmp	ip, #2
 USER(		strbt	r3, [r0], #1)			@ May fault
 		ldrgeb	r3, [r1], #1
 USER(		strgebt	r3, [r0], #1)			@ May fault
 		ldrgtb	r3, [r1], #0
 USER(		strgtbt	r3, [r0], #1)			@ May fault
-		b	.c2u_finished
+		b	.Lc2u_finished
 
 		.section .fixup,"ax"
 		.align	0
@@ -340,7 +290,7 @@ USER(		strgtbt	r3, [r0], #1)			@ May fault
  *          : n    - number of bytes to copy
  * Returns  : Number of bytes NOT copied.
  */
-.cfu_dest_not_aligned:
+.Lcfu_dest_not_aligned:
 		rsb	ip, ip, #4
 		cmp	ip, #2
 USER(		ldrbt	r3, [r1], #1)			@ May fault
@@ -350,33 +300,32 @@ USER(		ldrgebt	r3, [r1], #1)			@ May fault
 USER(		ldrgtbt	r3, [r1], #1)			@ May fault
 		strgtb	r3, [r0], #1
 		sub	r2, r2, ip
-		b	.cfu_dest_aligned
+		b	.Lcfu_dest_aligned
 
 ENTRY(__arch_copy_from_user)
 		stmfd	sp!, {r0, r2, r4 - r7, lr}
 		cmp	r2, #4
-		blt	.cfu_not_enough
-	PLD(	pld	[r1, #0]		)
-	PLD(	pld	[r0, #0]		)
+		blt	.Lcfu_not_enough
 		ands	ip, r0, #3
-		bne	.cfu_dest_not_aligned
-.cfu_dest_aligned:
+		bne	.Lcfu_dest_not_aligned
+.Lcfu_dest_aligned:
 		ands	ip, r1, #3
-		bne	.cfu_src_not_aligned
+		bne	.Lcfu_src_not_aligned
+
 /*
  * Seeing as there has to be at least 8 bytes to copy, we can
  * copy one word, and force a user-mode page fault...
  */
 
-.cfu_0fupi:	subs	r2, r2, #4
+.Lcfu_0fupi:	subs	r2, r2, #4
 		addmi	ip, r2, #4
-		bmi	.cfu_0nowords
+		bmi	.Lcfu_0nowords
 USER(		ldrt	r3, [r1], #4)
 		str	r3, [r0], #4
 		mov	ip, r1, lsl #32 - PAGE_SHIFT	@ On each page, use a ld/st??t instruction
 		rsb	ip, ip, #0
 		movs	ip, ip, lsr #32 - PAGE_SHIFT
-		beq	.cfu_0fupi
+		beq	.Lcfu_0fupi
 /*
  * ip = max no. of bytes to copy before needing another "strt" insn
  */
@@ -384,28 +333,16 @@ USER(		ldrt	r3, [r1], #4)
 		movlt	ip, r2
 		sub	r2, r2, ip
 		subs	ip, ip, #32
-		blt	.cfu_0rem8lp
-	PLD(	pld	[r1, #28]		)
-	PLD(	pld	[r0, #28]		)
-	PLD(	subs	ip, ip, #64			)
-	PLD(	blt	.cfu_0cpynopld		)
-	PLD(	pld	[r1, #60]		)
-	PLD(	pld	[r0, #60]		)
-
-.cfu_0cpy8lp:
-	PLD(	pld	[r1, #92]		)
-	PLD(	pld	[r0, #92]		)
-.cfu_0cpynopld:	ldmia	r1!, {r3 - r6}			@ Shouldnt fault
+		blt	.Lcfu_0rem8lp
+
+.Lcfu_0cpy8lp:	ldmia	r1!, {r3 - r6}			@ Shouldnt fault
 		stmia	r0!, {r3 - r6}
 		ldmia	r1!, {r3 - r6}			@ Shouldnt fault
 		subs	ip, ip, #32
 		stmia	r0!, {r3 - r6}
-		bpl	.cfu_0cpy8lp
-	PLD(	cmn	ip, #64			)
-	PLD(	bge	.cfu_0cpynopld		)
-	PLD(	add	ip, ip, #64		)
+		bpl	.Lcfu_0cpy8lp
 
-.cfu_0rem8lp:	cmn	ip, #16
+.Lcfu_0rem8lp:	cmn	ip, #16
 		ldmgeia	r1!, {r3 - r6}			@ Shouldnt fault
 		stmgeia	r0!, {r3 - r6}
 		tst	ip, #8
@@ -415,34 +352,34 @@ USER(		ldrt	r3, [r1], #4)
 		ldrnet	r3, [r1], #4			@ Shouldnt fault
 		strne	r3, [r0], #4
 		ands	ip, ip, #3
-		beq	.cfu_0fupi
-.cfu_0nowords:	teq	ip, #0
-		beq	.cfu_finished
-.cfu_nowords:	cmp	ip, #2
+		beq	.Lcfu_0fupi
+.Lcfu_0nowords:	teq	ip, #0
+		beq	.Lcfu_finished
+.Lcfu_nowords:	cmp	ip, #2
 USER(		ldrbt	r3, [r1], #1)			@ May fault
 		strb	r3, [r0], #1
 USER(		ldrgebt	r3, [r1], #1)			@ May fault
 		strgeb	r3, [r0], #1
 USER(		ldrgtbt	r3, [r1], #1)			@ May fault
 		strgtb	r3, [r0], #1
-		b	.cfu_finished
+		b	.Lcfu_finished
 
-.cfu_not_enough:
+.Lcfu_not_enough:
 		movs	ip, r2
-		bne	.cfu_nowords
-.cfu_finished:	mov	r0, #0
+		bne	.Lcfu_nowords
+.Lcfu_finished:	mov	r0, #0
 		add	sp, sp, #8
 		LOADREGS(fd,sp!,{r4 - r7, pc})
 
-.cfu_src_not_aligned:
+.Lcfu_src_not_aligned:
 		bic	r1, r1, #3
 USER(		ldrt	r7, [r1], #4)			@ May fault
 		cmp	ip, #2
-		bgt	.cfu_3fupi
-		beq	.cfu_2fupi
-.cfu_1fupi:	subs	r2, r2, #4
+		bgt	.Lcfu_3fupi
+		beq	.Lcfu_2fupi
+.Lcfu_1fupi:	subs	r2, r2, #4
 		addmi	ip, r2, #4
-		bmi	.cfu_1nowords
+		bmi	.Lcfu_1nowords
 		mov	r3, r7, pull #8
 USER(		ldrt	r7, [r1], #4)			@ May fault
 		orr	r3, r3, r7, push #24
@@ -450,23 +387,14 @@ USER(		ldrt	r7, [r1], #4)			@ May fault
 		mov	ip, r1, lsl #32 - PAGE_SHIFT
 		rsb	ip, ip, #0
 		movs	ip, ip, lsr #32 - PAGE_SHIFT
-		beq	.cfu_1fupi
+		beq	.Lcfu_1fupi
 		cmp	r2, ip
 		movlt	ip, r2
 		sub	r2, r2, ip
 		subs	ip, ip, #16
-		blt	.cfu_1rem8lp
-	PLD(	pld	[r1, #12]		)
-	PLD(	pld	[r0, #12]		)
-	PLD(	subs	ip, ip, #32		)
-	PLD(	blt	.cfu_1cpynopld		)
-	PLD(	pld	[r1, #28]		)
-	PLD(	pld	[r0, #28]		)
-
-.cfu_1cpy8lp:
-	PLD(	pld	[r1, #44]		)
-	PLD(	pld	[r0, #44]		)
-.cfu_1cpynopld:	mov	r3, r7, pull #8
+		blt	.Lcfu_1rem8lp
+
+.Lcfu_1cpy8lp:	mov	r3, r7, pull #8
 		ldmia	r1!, {r4 - r7}			@ Shouldnt fault
 		subs	ip, ip, #16
 		orr	r3, r3, r4, push #24
@@ -477,12 +405,9 @@ USER(		ldrt	r7, [r1], #4)			@ May fault
 		mov	r6, r6, pull #8
 		orr	r6, r6, r7, push #24
 		stmia	r0!, {r3 - r6}
-		bpl	.cfu_1cpy8lp
-	PLD(	cmn	ip, #32			)
-	PLD(	bge	.cfu_1cpynopld		)
-	PLD(	add	ip, ip, #32		)
+		bpl	.Lcfu_1cpy8lp
 
-.cfu_1rem8lp:	tst	ip, #8
+.Lcfu_1rem8lp:	tst	ip, #8
 		movne	r3, r7, pull #8
 		ldmneia	r1!, {r4, r7}			@ Shouldnt fault
 		orrne	r3, r3, r4, push #24
@@ -495,21 +420,21 @@ USER(		ldrnet	r7, [r1], #4)			@ May fault
 		orrne	r3, r3, r7, push #24
 		strne	r3, [r0], #4
 		ands	ip, ip, #3
-		beq	.cfu_1fupi
-.cfu_1nowords:	mov	r3, r7, get_byte_1
+		beq	.Lcfu_1fupi
+.Lcfu_1nowords:	mov	r3, r7, get_byte_1
 		teq	ip, #0
-		beq	.cfu_finished
+		beq	.Lcfu_finished
 		cmp	ip, #2
 		strb	r3, [r0], #1
 		movge	r3, r7, get_byte_2
 		strgeb	r3, [r0], #1
 		movgt	r3, r7, get_byte_3
 		strgtb	r3, [r0], #1
-		b	.cfu_finished
+		b	.Lcfu_finished
 
-.cfu_2fupi:	subs	r2, r2, #4
+.Lcfu_2fupi:	subs	r2, r2, #4
 		addmi	ip, r2, #4
-		bmi	.cfu_2nowords
+		bmi	.Lcfu_2nowords
 		mov	r3, r7, pull #16
 USER(		ldrt	r7, [r1], #4)			@ May fault
 		orr	r3, r3, r7, push #16
@@ -517,23 +442,15 @@ USER(		ldrt	r7, [r1], #4)			@ May fault
 		mov	ip, r1, lsl #32 - PAGE_SHIFT
 		rsb	ip, ip, #0
 		movs	ip, ip, lsr #32 - PAGE_SHIFT
-		beq	.cfu_2fupi
+		beq	.Lcfu_2fupi
 		cmp	r2, ip
 		movlt	ip, r2
 		sub	r2, r2, ip
 		subs	ip, ip, #16
-		blt	.cfu_2rem8lp
-	PLD(	pld	[r1, #12]		)
-	PLD(	pld	[r0, #12]		)
-	PLD(	subs	ip, ip, #32		)
-	PLD(	blt	.cfu_2cpynopld		)
-	PLD(	pld	[r1, #28]		)
-	PLD(	pld	[r0, #28]		)
-
-.cfu_2cpy8lp:
-	PLD(	pld	[r1, #44]		)
-	PLD(	pld	[r0, #44]		)
-.cfu_2cpynopld:	mov	r3, r7, pull #16
+		blt	.Lcfu_2rem8lp
+
+
+.Lcfu_2cpy8lp:	mov	r3, r7, pull #16
 		ldmia	r1!, {r4 - r7}			@ Shouldnt fault
 		subs	ip, ip, #16
 		orr	r3, r3, r4, push #16
@@ -544,12 +461,9 @@ USER(		ldrt	r7, [r1], #4)			@ May fault
 		mov	r6, r6, pull #16
 		orr	r6, r6, r7, push #16
 		stmia	r0!, {r3 - r6}
-		bpl	.cfu_2cpy8lp
-	PLD(	cmn	ip, #32			)
-	PLD(	bge	.cfu_2cpynopld		)
-	PLD(	add	ip, ip, #32		)
+		bpl	.Lcfu_2cpy8lp
 
-.cfu_2rem8lp:	tst	ip, #8
+.Lcfu_2rem8lp:	tst	ip, #8
 		movne	r3, r7, pull #16
 		ldmneia	r1!, {r4, r7}			@ Shouldnt fault
 		orrne	r3, r3, r4, push #16
@@ -562,21 +476,21 @@ USER(		ldrnet	r7, [r1], #4)			@ May fault
 		orrne	r3, r3, r7, push #16
 		strne	r3, [r0], #4
 		ands	ip, ip, #3
-		beq	.cfu_2fupi
-.cfu_2nowords:	mov	r3, r7, get_byte_2
+		beq	.Lcfu_2fupi
+.Lcfu_2nowords:	mov	r3, r7, get_byte_2
 		teq	ip, #0
-		beq	.cfu_finished
+		beq	.Lcfu_finished
 		cmp	ip, #2
 		strb	r3, [r0], #1
 		movge	r3, r7, get_byte_3
 		strgeb	r3, [r0], #1
 USER(		ldrgtbt	r3, [r1], #0)			@ May fault
 		strgtb	r3, [r0], #1
-		b	.cfu_finished
+		b	.Lcfu_finished
 
-.cfu_3fupi:	subs	r2, r2, #4
+.Lcfu_3fupi:	subs	r2, r2, #4
 		addmi	ip, r2, #4
-		bmi	.cfu_3nowords
+		bmi	.Lcfu_3nowords
 		mov	r3, r7, pull #24
 USER(		ldrt	r7, [r1], #4)			@ May fault
 		orr	r3, r3, r7, push #8
@@ -584,23 +498,14 @@ USER(		ldrt	r7, [r1], #4)			@ May fault
 		mov	ip, r1, lsl #32 - PAGE_SHIFT
 		rsb	ip, ip, #0
 		movs	ip, ip, lsr #32 - PAGE_SHIFT
-		beq	.cfu_3fupi
+		beq	.Lcfu_3fupi
 		cmp	r2, ip
 		movlt	ip, r2
 		sub	r2, r2, ip
 		subs	ip, ip, #16
-		blt	.cfu_3rem8lp
-	PLD(	pld	[r1, #12]		)
-	PLD(	pld	[r0, #12]		)
-	PLD(	subs	ip, ip, #32		)
-	PLD(	blt	.cfu_3cpynopld		)
-	PLD(	pld	[r1, #28]		)
-	PLD(	pld	[r0, #28]		)
-
-.cfu_3cpy8lp:
-	PLD(	pld	[r1, #44]		)
-	PLD(	pld	[r0, #44]		)
-.cfu_3cpynopld:	mov	r3, r7, pull #24
+		blt	.Lcfu_3rem8lp
+
+.Lcfu_3cpy8lp:	mov	r3, r7, pull #24
 		ldmia	r1!, {r4 - r7}			@ Shouldnt fault
 		orr	r3, r3, r4, push #8
 		mov	r4, r4, pull #24
@@ -611,12 +516,9 @@ USER(		ldrt	r7, [r1], #4)			@ May fault
 		orr	r6, r6, r7, push #8
 		stmia	r0!, {r3 - r6}
 		subs	ip, ip, #16
-		bpl	.cfu_3cpy8lp
-	PLD(	cmn	ip, #32			)
-	PLD(	bge	.cfu_3cpynopld		)
-	PLD(	add	ip, ip, #32		)
+		bpl	.Lcfu_3cpy8lp
 
-.cfu_3rem8lp:	tst	ip, #8
+.Lcfu_3rem8lp:	tst	ip, #8
 		movne	r3, r7, pull #24
 		ldmneia	r1!, {r4, r7}			@ Shouldnt fault
 		orrne	r3, r3, r4, push #8
@@ -629,17 +531,17 @@ USER(		ldrnet	r7, [r1], #4)			@ May fault
 		orrne	r3, r3, r7, push #8
 		strne	r3, [r0], #4
 		ands	ip, ip, #3
-		beq	.cfu_3fupi
-.cfu_3nowords:	mov	r3, r7, get_byte_3
+		beq	.Lcfu_3fupi
+.Lcfu_3nowords:	mov	r3, r7, get_byte_3
 		teq	ip, #0
-		beq	.cfu_finished
+		beq	.Lcfu_finished
 		cmp	ip, #2
 		strb	r3, [r0], #1
 USER(		ldrgebt	r3, [r1], #1)			@ May fault
 		strgeb	r3, [r0], #1
 USER(		ldrgtbt	r3, [r1], #1)			@ May fault
 		strgtb	r3, [r0], #1
-		b	.cfu_finished
+		b	.Lcfu_finished
 
 		.section .fixup,"ax"
 		.align	0
@@ -657,41 +559,3 @@ USER(		ldrgtbt	r3, [r1], #1)			@ May fault
 		LOADREGS(fd,sp!, {r4 - r7, pc})
 		.previous
 
-/* Prototype: int __arch_clear_user(void *addr, size_t sz)
- * Purpose  : clear some user memory
- * Params   : addr - user memory address to clear
- *          : sz   - number of bytes to clear
- * Returns  : number of bytes NOT cleared
- */
-ENTRY(__arch_clear_user)
-		stmfd	sp!, {r1, lr}
-		mov	r2, #0
-		cmp	r1, #4
-		blt	2f
-		ands	ip, r0, #3
-		beq	1f
-		cmp	ip, #2
-USER(		strbt	r2, [r0], #1)
-USER(		strlebt	r2, [r0], #1)
-USER(		strltbt	r2, [r0], #1)
-		rsb	ip, ip, #4
-		sub	r1, r1, ip		@  7  6  5  4  3  2  1
-1:		subs	r1, r1, #8		@ -1 -2 -3 -4 -5 -6 -7
-USER(		strplt	r2, [r0], #4)
-USER(		strplt	r2, [r0], #4)
-		bpl	1b
-		adds	r1, r1, #4		@  3  2  1  0 -1 -2 -3
-USER(		strplt	r2, [r0], #4)
-2:		tst	r1, #2			@ 1x 1x 0x 0x 1x 1x 0x
-USER(		strnebt	r2, [r0], #1)
-USER(		strnebt	r2, [r0], #1)
-		tst	r1, #1			@ x1 x0 x1 x0 x1 x0 x1
-USER(		strnebt	r2, [r0], #1)
-		mov	r0, #0
-		LOADREGS(fd,sp!, {r1, pc})
-
-		.section .fixup,"ax"
-		.align	0
-9001:		LOADREGS(fd,sp!, {r0, pc})
-		.previous
-
diff --git a/arch/arm/lib/ucmpdi2.S b/arch/arm/lib/ucmpdi2.S
new file mode 100644
index 0000000..112630f
--- /dev/null
+++ b/arch/arm/lib/ucmpdi2.S
@@ -0,0 +1,35 @@
+/*
+ *  linux/arch/arm/lib/ucmpdi2.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Oct 19, 2005
+ *  Copyright:	Monta Vista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+#ifdef __ARMEB__
+#define xh r0
+#define xl r1
+#define yh r2
+#define yl r3
+#else
+#define xl r0
+#define xh r1
+#define yl r2
+#define yh r3
+#endif
+
+ENTRY(__ucmpdi2)
+
+	cmp	xh, yh
+	cmpeq	xl, yl
+	movlo	r0, #0
+	moveq	r0, #1
+	movhi	r0, #2
+	mov	pc, lr
+
diff --git a/arch/arm/lib/ucmpdi2.c b/arch/arm/lib/ucmpdi2.c
deleted file mode 100644
index 57f3f2d..0000000
--- a/arch/arm/lib/ucmpdi2.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/* More subroutines needed by GCC output code on some machines.  */
-/* Compile this one with gcc.  */
-/* Copyright (C) 1989, 92-98, 1999 Free Software Foundation, Inc.
-
-This file is part of GNU CC.
-
-GNU CC is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2, or (at your option)
-any later version.
-
-GNU CC is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with GNU CC; see the file COPYING.  If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
-
-/* As a special exception, if you link this library with other files,
-   some of which are compiled with GCC, to produce an executable,
-   this library does not by itself cause the resulting executable
-   to be covered by the GNU General Public License.
-   This exception does not however invalidate any other reasons why
-   the executable file might be covered by the GNU General Public License.
- */
-/* support functions required by the kernel. based on code from gcc-2.95.3 */
-/* I Molton     29/07/01 */
-
-#include "gcclib.h"
-
-int __ucmpdi2(s64 a, s64 b)
-{
-	DIunion au, bu;
-
-	au.ll = a, bu.ll = b;
-
-	if ((u32) au.s.high < (u32) bu.s.high)
-		return 0;
-	else if ((u32) au.s.high > (u32) bu.s.high)
-		return 2;
-	if ((u32) au.s.low < (u32) bu.s.low)
-		return 0;
-	else if ((u32) au.s.low > (u32) bu.s.low)
-		return 2;
-	return 1;
-}