diff options
-rw-r--r-- | arch/arm/lib/Makefile | 5 | ||||
-rw-r--r-- | arch/arm/lib/copy_template.S | 255 | ||||
-rw-r--r-- | arch/arm/lib/memcpy.S | 410 | ||||
-rw-r--r-- | arch/arm/lib/memmove.S | 206 |
4 files changed, 502 insertions, 374 deletions
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index d3d9b21..8f9770f 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -7,8 +7,9 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \ csumpartialcopy.o csumpartialcopyuser.o clearbit.o \ copy_page.o delay.o findbit.o memchr.o memcpy.o \ - memset.o memzero.o setbit.o strncpy_from_user.o \ - strnlen_user.o strchr.o strrchr.o testchangebit.o \ + memmove.o memset.o memzero.o setbit.o \ + strncpy_from_user.o strnlen_user.o \ + strchr.o strrchr.o testchangebit.o \ testclearbit.o testsetbit.o uaccess.o \ getuser.o putuser.o clear_user.o \ ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \ diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S new file mode 100644 index 0000000..838e435 --- /dev/null +++ b/arch/arm/lib/copy_template.S @@ -0,0 +1,255 @@ +/* + * linux/arch/arm/lib/copy_template.s + * + * Code template for optimized memory copy functions + * + * Author: Nicolas Pitre + * Created: Sep 28, 2005 + * Copyright: MontaVista Software, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * This can be used to enable code to cacheline align the source pointer. + * Experiments on tested architectures (StrongARM and XScale) didn't show + * this a worthwhile thing to do. That might be different in the future. + */ +//#define CALGN(code...) code +#define CALGN(code...) + +/* + * Theory of operation + * ------------------- + * + * This file provides the core code for a forward memory copy used in + * the implementation of memcopy(), copy_to_user() and copy_from_user(). + * + * The including file must define the following accessor macros + * according to the need of the given function: + * + * ldr1w ptr reg abort + * + * This loads one word from 'ptr', stores it in 'reg' and increments + * 'ptr' to the next word. The 'abort' argument is used for fixup tables. + * + * ldr4w ptr reg1 reg2 reg3 reg4 abort + * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + * + * This loads four or eight words starting from 'ptr', stores them + * in provided registers and increments 'ptr' past those words. + * The'abort' argument is used for fixup tables. + * + * ldr1b ptr reg cond abort + * + * Similar to ldr1w, but it loads a byte and increments 'ptr' one byte. + * It also must apply the condition code if provided, otherwise the + * "al" condition is assumed by default. + * + * str1w ptr reg abort + * str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + * str1b ptr reg cond abort + * + * Same as their ldr* counterparts, but data is stored to 'ptr' location + * rather than being loaded. + * + * enter reg1 reg2 + * + * Preserve the provided registers on the stack plus any additional + * data as needed by the implementation including this code. Called + * upon code entry. + * + * exit reg1 reg2 + * + * Restore registers with the values previously saved with the + * 'preserv' macro. Called upon code termination. + */ + + + enter r4, lr + + subs r2, r2, #4 + blt 8f + ands ip, r0, #3 + PLD( pld [r1, #0] ) + bne 9f + ands ip, r1, #3 + bne 10f + +1: subs r2, r2, #(28) + stmfd sp!, {r5 - r8} + blt 5f + + CALGN( ands ip, r1, #31 ) + CALGN( rsb r3, ip, #32 ) + CALGN( sbcnes r4, r3, r2 ) @ C is always set here + CALGN( bcs 2f ) + CALGN( adr r4, 6f ) + CALGN( subs r2, r2, r3 ) @ C gets set + CALGN( add pc, r4, ip ) + + PLD( pld [r1, #0] ) +2: PLD( subs r2, r2, #96 ) + PLD( pld [r1, #28] ) + PLD( blt 4f ) + PLD( pld [r1, #60] ) + PLD( pld [r1, #92] ) + +3: PLD( pld [r1, #124] ) +4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + subs r2, r2, #32 + str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f + bge 3b + PLD( cmn r2, #96 ) + PLD( bge 4b ) + +5: ands ip, r2, #28 + rsb ip, ip, #32 + addne pc, pc, ip @ C is always clear here + b 7f +6: nop + ldr1w r1, r3, abort=20f + ldr1w r1, r4, abort=20f + ldr1w r1, r5, abort=20f + ldr1w r1, r6, abort=20f + ldr1w r1, r7, abort=20f + ldr1w r1, r8, abort=20f + ldr1w r1, lr, abort=20f + + add pc, pc, ip + nop + nop + str1w r0, r3, abort=20f + str1w r0, r4, abort=20f + str1w r0, r5, abort=20f + str1w r0, r6, abort=20f + str1w r0, r7, abort=20f + str1w r0, r8, abort=20f + str1w r0, lr, abort=20f + + CALGN( bcs 2b ) + +7: ldmfd sp!, {r5 - r8} + +8: movs r2, r2, lsl #31 + ldr1b r1, r3, ne, abort=21f + ldr1b r1, r4, cs, abort=21f + ldr1b r1, ip, cs, abort=21f + str1b r0, r3, ne, abort=21f + str1b r0, r4, cs, abort=21f + str1b r0, ip, cs, abort=21f + + exit r4, pc + +9: rsb ip, ip, #4 + cmp ip, #2 + ldr1b r1, r3, gt, abort=21f + ldr1b r1, r4, ge, abort=21f + ldr1b r1, lr, abort=21f + str1b r0, r3, gt, abort=21f + str1b r0, r4, ge, abort=21f + subs r2, r2, ip + str1b r0, lr, abort=21f + blt 8b + ands ip, r1, #3 + beq 1b + +10: bic r1, r1, #3 + cmp ip, #2 + ldr1w r1, lr, abort=21f + beq 17f + bgt 18f + + + .macro forward_copy_shift pull push + + subs r2, r2, #28 + blt 14f + + CALGN( ands ip, r1, #31 ) + CALGN( rsb ip, ip, #32 ) + CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( subcc r2, r2, ip ) + CALGN( bcc 15f ) + +11: stmfd sp!, {r5 - r9} + + PLD( pld [r1, #0] ) + PLD( subs r2, r2, #96 ) + PLD( pld [r1, #28] ) + PLD( blt 13f ) + PLD( pld [r1, #60] ) + PLD( pld [r1, #92] ) + +12: PLD( pld [r1, #124] ) +13: ldr4w r1, r4, r5, r6, r7, abort=19f + mov r3, lr, pull #\pull + subs r2, r2, #32 + ldr4w r1, r8, r9, ip, lr, abort=19f + orr r3, r3, r4, push #\push + mov r4, r4, pull #\pull + orr r4, r4, r5, push #\push + mov r5, r5, pull #\pull + orr r5, r5, r6, push #\push + mov r6, r6, pull #\pull + orr r6, r6, r7, push #\push + mov r7, r7, pull #\pull + orr r7, r7, r8, push #\push + mov r8, r8, pull #\pull + orr r8, r8, r9, push #\push + mov r9, r9, pull #\pull + orr r9, r9, ip, push #\push + mov ip, ip, pull #\pull + orr ip, ip, lr, push #\push + str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f + bge 12b + PLD( cmn r2, #96 ) + PLD( bge 13b ) + + ldmfd sp!, {r5 - r9} + +14: ands ip, r2, #28 + beq 16f + +15: mov r3, lr, pull #\pull + ldr1w r1, lr, abort=21f + subs ip, ip, #4 + orr r3, r3, lr, push #\push + str1w r0, r3, abort=21f + bgt 15b + CALGN( cmp r2, #0 ) + CALGN( bge 11b ) + +16: sub r1, r1, #(\push / 8) + b 8b + + .endm + + + forward_copy_shift pull=8 push=24 + +17: forward_copy_shift pull=16 push=16 + +18: forward_copy_shift pull=24 push=8 + + +/* + * Abort preanble and completion macros. + * If a fixup handler is required then those macros must surround it. + * It is assumed that the fixup code will handle the private part of + * the exit macro. + */ + + .macro copy_abort_preamble +19: ldmfd sp!, {r5 - r9} + b 21f +20: ldmfd sp!, {r5 - r8} +21: + .endm + + .macro copy_abort_end + ldmfd sp!, {r4, pc} + .endm + diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S index f5a593c..7e71d67 100644 --- a/arch/arm/lib/memcpy.S +++ b/arch/arm/lib/memcpy.S @@ -1,393 +1,59 @@ /* * linux/arch/arm/lib/memcpy.S * - * Copyright (C) 1995-1999 Russell King + * Author: Nicolas Pitre + * Created: Sep 28, 2005 + * Copyright: MontaVista Software, Inc. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * ASM optimised string functions + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. */ + #include <linux/linkage.h> #include <asm/assembler.h> - .text - -#define ENTER \ - mov ip,sp ;\ - stmfd sp!,{r0,r4-r9,fp,ip,lr,pc} ;\ - sub fp,ip,#4 - -#define EXIT \ - LOADREGS(ea, fp, {r0, r4 - r9, fp, sp, pc}) - -#define EXITEQ \ - LOADREGS(eqea, fp, {r0, r4 - r9, fp, sp, pc}) - -/* - * Prototype: void memcpy(void *to,const void *from,unsigned long n); - */ -ENTRY(memcpy) -ENTRY(memmove) - ENTER - cmp r1, r0 - bcc 23f - subs r2, r2, #4 - blt 6f - PLD( pld [r1, #0] ) - ands ip, r0, #3 - bne 7f - ands ip, r1, #3 - bne 8f + .macro ldr1w ptr reg abort + ldr \reg, [\ptr], #4 + .endm -1: subs r2, r2, #8 - blt 5f - subs r2, r2, #20 - blt 4f - PLD( pld [r1, #28] ) - PLD( subs r2, r2, #64 ) - PLD( blt 3f ) -2: PLD( pld [r1, #60] ) - PLD( pld [r1, #92] ) - ldmia r1!, {r3 - r9, ip} - subs r2, r2, #32 - stmgeia r0!, {r3 - r9, ip} - ldmgeia r1!, {r3 - r9, ip} - subges r2, r2, #32 - stmia r0!, {r3 - r9, ip} - bge 2b -3: PLD( ldmia r1!, {r3 - r9, ip} ) - PLD( adds r2, r2, #32 ) - PLD( stmgeia r0!, {r3 - r9, ip} ) - PLD( ldmgeia r1!, {r3 - r9, ip} ) - PLD( subges r2, r2, #32 ) - PLD( stmia r0!, {r3 - r9, ip} ) -4: cmn r2, #16 - ldmgeia r1!, {r3 - r6} - subge r2, r2, #16 - stmgeia r0!, {r3 - r6} - adds r2, r2, #20 - ldmgeia r1!, {r3 - r5} - subge r2, r2, #12 - stmgeia r0!, {r3 - r5} -5: adds r2, r2, #8 - blt 6f - subs r2, r2, #4 - ldrlt r3, [r1], #4 - ldmgeia r1!, {r4, r5} - subge r2, r2, #4 - strlt r3, [r0], #4 - stmgeia r0!, {r4, r5} + .macro ldr4w ptr reg1 reg2 reg3 reg4 abort + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} + .endm -6: adds r2, r2, #4 - EXITEQ - cmp r2, #2 - ldrb r3, [r1], #1 - ldrgeb r4, [r1], #1 - ldrgtb r5, [r1], #1 - strb r3, [r0], #1 - strgeb r4, [r0], #1 - strgtb r5, [r0], #1 - EXIT + .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} + .endm -7: rsb ip, ip, #4 - cmp ip, #2 - ldrb r3, [r1], #1 - ldrgeb r4, [r1], #1 - ldrgtb r5, [r1], #1 - strb r3, [r0], #1 - strgeb r4, [r0], #1 - strgtb r5, [r0], #1 - subs r2, r2, ip - blt 6b - ands ip, r1, #3 - beq 1b + .macro ldr1b ptr reg cond=al abort + ldr\cond\()b \reg, [\ptr], #1 + .endm -8: bic r1, r1, #3 - ldr r7, [r1], #4 - cmp ip, #2 - bgt 18f - beq 13f - cmp r2, #12 - blt 11f - PLD( pld [r1, #12] ) - sub r2, r2, #12 - PLD( subs r2, r2, #32 ) - PLD( blt 10f ) - PLD( pld [r1, #28] ) -9: PLD( pld [r1, #44] ) -10: mov r3, r7, pull #8 - ldmia r1!, {r4 - r7} - subs r2, r2, #16 - orr r3, r3, r4, push #24 - mov r4, r4, pull #8 - orr r4, r4, r5, push #24 - mov r5, r5, pull #8 - orr r5, r5, r6, push #24 - mov r6, r6, pull #8 - orr r6, r6, r7, push #24 - stmia r0!, {r3 - r6} - bge 9b - PLD( cmn r2, #32 ) - PLD( bge 10b ) - PLD( add r2, r2, #32 ) - adds r2, r2, #12 - blt 12f -11: mov r3, r7, pull #8 - ldr r7, [r1], #4 - subs r2, r2, #4 - orr r3, r3, r7, push #24 - str r3, [r0], #4 - bge 11b -12: sub r1, r1, #3 - b 6b + .macro str1w ptr reg abort + str \reg, [\ptr], #4 + .endm -13: cmp r2, #12 - blt 16f - PLD( pld [r1, #12] ) - sub r2, r2, #12 - PLD( subs r2, r2, #32 ) - PLD( blt 15f ) - PLD( pld [r1, #28] ) -14: PLD( pld [r1, #44] ) -15: mov r3, r7, pull #16 - ldmia r1!, {r4 - r7} - subs r2, r2, #16 - orr r3, r3, r4, push #16 - mov r4, r4, pull #16 - orr r4, r4, r5, push #16 - mov r5, r5, pull #16 - orr r5, r5, r6, push #16 - mov r6, r6, pull #16 - orr r6, r6, r7, push #16 - stmia r0!, {r3 - r6} - bge 14b - PLD( cmn r2, #32 ) - PLD( bge 15b ) - PLD( add r2, r2, #32 ) - adds r2, r2, #12 - blt 17f -16: mov r3, r7, pull #16 - ldr r7, [r1], #4 - subs r2, r2, #4 - orr r3, r3, r7, push #16 - str r3, [r0], #4 - bge 16b -17: sub r1, r1, #2 - b 6b + .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort + stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} + .endm -18: cmp r2, #12 - blt 21f - PLD( pld [r1, #12] ) - sub r2, r2, #12 - PLD( subs r2, r2, #32 ) - PLD( blt 20f ) - PLD( pld [r1, #28] ) -19: PLD( pld [r1, #44] ) -20: mov r3, r7, pull #24 - ldmia r1!, {r4 - r7} - subs r2, r2, #16 - orr r3, r3, r4, push #8 - mov r4, r4, pull #24 - orr r4, r4, r5, push #8 - mov r5, r5, pull #24 - orr r5, r5, r6, push #8 - mov r6, r6, pull #24 - orr r6, r6, r7, push #8 - stmia r0!, {r3 - r6} - bge 19b - PLD( cmn r2, #32 ) - PLD( bge 20b ) - PLD( add r2, r2, #32 ) - adds r2, r2, #12 - blt 22f -21: mov r3, r7, pull #24 - ldr r7, [r1], #4 - subs r2, r2, #4 - orr r3, r3, r7, push #8 - str r3, [r0], #4 - bge 21b -22: sub r1, r1, #1 - b 6b + .macro str1b ptr reg cond=al abort + str\cond\()b \reg, [\ptr], #1 + .endm + .macro enter reg1 reg2 + stmdb sp!, {r0, \reg1, \reg2} + .endm -23: add r1, r1, r2 - add r0, r0, r2 - subs r2, r2, #4 - blt 29f - PLD( pld [r1, #-4] ) - ands ip, r0, #3 - bne 30f - ands ip, r1, #3 - bne 31f + .macro exit reg1 reg2 + ldmfd sp!, {r0, \reg1, \reg2} + .endm -24: subs r2, r2, #8 - blt 28f - subs r2, r2, #20 - blt 27f - PLD( pld [r1, #-32] ) - PLD( subs r2, r2, #64 ) - PLD( blt 26f ) -25: PLD( pld [r1, #-64] ) - PLD( pld [r1, #-96] ) - ldmdb r1!, {r3 - r9, ip} - subs r2, r2, #32 - stmgedb r0!, {r3 - r9, ip} - ldmgedb r1!, {r3 - r9, ip} - subges r2, r2, #32 - stmdb r0!, {r3 - r9, ip} - bge 25b -26: PLD( ldmdb r1!, {r3 - r9, ip} ) - PLD( adds r2, r2, #32 ) - PLD( stmgedb r0!, {r3 - r9, ip} ) - PLD( ldmgedb r1!, {r3 - r9, ip} ) - PLD( subges r2, r2, #32 ) - PLD( stmdb r0!, {r3 - r9, ip} ) -27: cmn r2, #16 - ldmgedb r1!, {r3 - r6} - subge r2, r2, #16 - stmgedb r0!, {r3 - r6} - adds r2, r2, #20 - ldmgedb r1!, {r3 - r5} - subge r2, r2, #12 - stmgedb r0!, {r3 - r5} -28: adds r2, r2, #8 - blt 29f - subs r2, r2, #4 - ldrlt r3, [r1, #-4]! - ldmgedb r1!, {r4, r5} - subge r2, r2, #4 - strlt r3, [r0, #-4]! - stmgedb r0!, {r4, r5} + .text -29: adds r2, r2, #4 - EXITEQ - cmp r2, #2 - ldrb r3, [r1, #-1]! - ldrgeb r4, [r1, #-1]! - ldrgtb r5, [r1, #-1]! - strb r3, [r0, #-1]! - strgeb r4, [r0, #-1]! - strgtb r5, [r0, #-1]! - EXIT +/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */ -30: cmp ip, #2 - ldrb r3, [r1, #-1]! - ldrgeb r4, [r1, #-1]! - ldrgtb r5, [r1, #-1]! - strb r3, [r0, #-1]! - strgeb r4, [r0, #-1]! - strgtb r5, [r0, #-1]! - subs r2, r2, ip - blt 29b - ands ip, r1, #3 - beq 24b - -31: bic r1, r1, #3 - ldr r3, [r1], #0 - cmp ip, #2 - blt 41f - beq 36f - cmp r2, #12 - blt 34f - PLD( pld [r1, #-16] ) - sub r2, r2, #12 - PLD( subs r2, r2, #32 ) - PLD( blt 33f ) - PLD( pld [r1, #-32] ) -32: PLD( pld [r1, #-48] ) -33: mov r7, r3, push #8 - ldmdb r1!, {r3, r4, r5, r6} - subs r2, r2, #16 - orr r7, r7, r6, pull #24 - mov r6, r6, push #8 - orr r6, r6, r5, pull #24 - mov r5, r5, push #8 - orr r5, r5, r4, pull #24 - mov r4, r4, push #8 - orr r4, r4, r3, pull #24 - stmdb r0!, {r4, r5, r6, r7} - bge 32b - PLD( cmn r2, #32 ) - PLD( bge 33b ) - PLD( add r2, r2, #32 ) - adds r2, r2, #12 - blt 35f -34: mov ip, r3, push #8 - ldr r3, [r1, #-4]! - subs r2, r2, #4 - orr ip, ip, r3, pull #24 - str ip, [r0, #-4]! - bge 34b -35: add r1, r1, #3 - b 29b - -36: cmp r2, #12 - blt 39f - PLD( pld [r1, #-16] ) - sub r2, r2, #12 - PLD( subs r2, r2, #32 ) - PLD( blt 38f ) - PLD( pld [r1, #-32] ) -37: PLD( pld [r1, #-48] ) -38: mov r7, r3, push #16 - ldmdb r1!, {r3, r4, r5, r6} - subs r2, r2, #16 - orr r7, r7, r6, pull #16 - mov r6, r6, push #16 - orr r6, r6, r5, pull #16 - mov r5, r5, push #16 - orr r5, r5, r4, pull #16 - mov r4, r4, push #16 - orr r4, r4, r3, pull #16 - stmdb r0!, {r4, r5, r6, r7} - bge 37b - PLD( cmn r2, #32 ) - PLD( bge 38b ) - PLD( add r2, r2, #32 ) - adds r2, r2, #12 - blt 40f -39: mov ip, r3, push #16 - ldr r3, [r1, #-4]! - subs r2, r2, #4 - orr ip, ip, r3, pull #16 - str ip, [r0, #-4]! - bge 39b -40: add r1, r1, #2 - b 29b +ENTRY(memcpy) -41: cmp r2, #12 - blt 44f - PLD( pld [r1, #-16] ) - sub r2, r2, #12 - PLD( subs r2, r2, #32 ) - PLD( blt 43f ) - PLD( pld [r1, #-32] ) -42: PLD( pld [r1, #-48] ) -43: mov r7, r3, push #24 - ldmdb r1!, {r3, r4, r5, r6} - subs r2, r2, #16 - orr r7, r7, r6, pull #8 - mov r6, r6, push #24 - orr r6, r6, r5, pull #8 - mov r5, r5, push #24 - orr r5, r5, r4, pull #8 - mov r4, r4, push #24 - orr r4, r4, r3, pull #8 - stmdb r0!, {r4, r5, r6, r7} - bge 42b - PLD( cmn r2, #32 ) - PLD( bge 43b ) - PLD( add r2, r2, #32 ) - adds r2, r2, #12 - blt 45f -44: mov ip, r3, push #24 - ldr r3, [r1, #-4]! - subs r2, r2, #4 - orr ip, ip, r3, pull #8 - str ip, [r0, #-4]! - bge 44b -45: add r1, r1, #1 - b 29b +#include "copy_template.S" diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S new file mode 100644 index 0000000..ef7fddc --- /dev/null +++ b/arch/arm/lib/memmove.S @@ -0,0 +1,206 @@ +/* + * linux/arch/arm/lib/memmove.S + * + * Author: Nicolas Pitre + * Created: Sep 28, 2005 + * Copyright: (C) MontaVista Software Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * This can be used to enable code to cacheline align the source pointer. + * Experiments on tested architectures (StrongARM and XScale) didn't show + * this a worthwhile thing to do. That might be different in the future. + */ +//#define CALGN(code...) code +#define CALGN(code...) + + .text + +/* + * Prototype: void *memmove(void *dest, const void *src, size_t n); + * + * Note: + * + * If the memory regions don't overlap, we simply branch to memcpy which is + * normally a bit faster. Otherwise the copy is done going downwards. This + * is a transposition of the code from copy_template.S but with the copy + * occurring in the opposite direction. + */ + +ENTRY(memmove) + + subs ip, r0, r1 + cmphi r2, ip + bls memcpy + + stmfd sp!, {r0, r4, lr} + add r1, r1, r2 + add r0, r0, r2 + subs r2, r2, #4 + blt 8f + ands ip, r0, #3 + PLD( pld [r1, #-4] ) + bne 9f + ands ip, r1, #3 + bne 10f + +1: subs r2, r2, #(28) + stmfd sp!, {r5 - r8} + blt 5f + + CALGN( ands ip, r1, #31 ) + CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( bcs 2f ) + CALGN( adr r4, 6f ) + CALGN( subs r2, r2, ip ) @ C is set here + CALGN( add pc, r4, ip ) + + PLD( pld [r1, #-4] ) +2: PLD( subs r2, r2, #96 ) + PLD( pld [r1, #-32] ) + PLD( blt 4f ) + PLD( pld [r1, #-64] ) + PLD( pld [r1, #-96] ) + +3: PLD( pld [r1, #-128] ) +4: ldmdb r1!, {r3, r4, r5, r6, r7, r8, ip, lr} + subs r2, r2, #32 + stmdb r0!, {r3, r4, r5, r6, r7, r8, ip, lr} + bge 3b + PLD( cmn r2, #96 ) + PLD( bge 4b ) + +5: ands ip, r2, #28 + rsb ip, ip, #32 + addne pc, pc, ip @ C is always clear here + b 7f +6: nop + ldr r3, [r1, #-4]! + ldr r4, [r1, #-4]! + ldr r5, [r1, #-4]! + ldr r6, [r1, #-4]! + ldr r7, [r1, #-4]! + ldr r8, [r1, #-4]! + ldr lr, [r1, #-4]! + + add pc, pc, ip + nop + nop + str r3, [r0, #-4]! + str r4, [r0, #-4]! + str r5, [r0, #-4]! + str r6, [r0, #-4]! + str r7, [r0, #-4]! + str r8, [r0, #-4]! + str lr, [r0, #-4]! + + CALGN( bcs 2b ) + +7: ldmfd sp!, {r5 - r8} + +8: movs r2, r2, lsl #31 + ldrneb r3, [r1, #-1]! + ldrcsb r4, [r1, #-1]! + ldrcsb ip, [r1, #-1] + strneb r3, [r0, #-1]! + strcsb r4, [r0, #-1]! + strcsb ip, [r0, #-1] + ldmfd sp!, {r0, r4, pc} + +9: cmp ip, #2 + ldrgtb r3, [r1, #-1]! + ldrgeb r4, [r1, #-1]! + ldrb lr, [r1, #-1]! + strgtb r3, [r0, #-1]! + strgeb r4, [r0, #-1]! + subs r2, r2, ip + strb lr, [r0, #-1]! + blt 8b + ands ip, r1, #3 + beq 1b + +10: bic r1, r1, #3 + cmp ip, #2 + ldr r3, [r1, #0] + beq 17f + blt 18f + + + .macro backward_copy_shift push pull + + subs r2, r2, #28 + blt 14f + + CALGN( ands ip, r1, #31 ) + CALGN( rsb ip, ip, #32 ) + CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( subcc r2, r2, ip ) + CALGN( bcc 15f ) + +11: stmfd sp!, {r5 - r9} + + PLD( pld [r1, #-4] ) + PLD( subs r2, r2, #96 ) + PLD( pld [r1, #-32] ) + PLD( blt 13f ) + PLD( pld [r1, #-64] ) + PLD( pld [r1, #-96] ) + +12: PLD( pld [r1, #-128] ) +13: ldmdb r1!, {r7, r8, r9, ip} + mov lr, r3, push #\push + subs r2, r2, #32 + ldmdb r1!, {r3, r4, r5, r6} + orr lr, lr, ip, pull #\pull + mov ip, ip, push #\push + orr ip, ip, r9, pull #\pull + mov r9, r9, push #\push + orr r9, r9, r8, pull #\pull + mov r8, r8, push #\push + orr r8, r8, r7, pull #\pull + mov r7, r7, push #\push + orr r7, r7, r6, pull #\pull + mov r6, r6, push #\push + orr r6, r6, r5, pull #\pull + mov r5, r5, push #\push + orr r5, r5, r4, pull #\pull + mov r4, r4, push #\push + orr r4, r4, r3, pull #\pull + stmdb r0!, {r4 - r9, ip, lr} + bge 12b + PLD( cmn r2, #96 ) + PLD( bge 13b ) + + ldmfd sp!, {r5 - r9} + +14: ands ip, r2, #28 + beq 16f + +15: mov lr, r3, push #\push + ldr r3, [r1, #-4]! + subs ip, ip, #4 + orr lr, lr, r3, pull #\pull + str lr, [r0, #-4]! + bgt 15b + CALGN( cmp r2, #0 ) + CALGN( bge 11b ) + +16: add r1, r1, #(\pull / 8) + b 8b + + .endm + + + backward_copy_shift push=8 pull=24 + +17: backward_copy_shift push=16 pull=16 + +18: backward_copy_shift push=24 pull=8 + |