From 6323875db20fd8ca8c8fbbd608bc377f2d4c8cf5 Mon Sep 17 00:00:00 2001
From: Dave Martin <dave.martin@linaro.org>
Date: Mon, 29 Nov 2010 19:43:25 +0100
Subject: ARM: 6501/1: Thumb-2: Correct data alignment for CONFIG_THUMB2_KERNEL
 in mm/proc-v7.S

Directives such as .long and .word do not magically cause the
assembler location counter to become aligned in gas.  As a result,
using these directives in code sections can result in misaligned
data words when building a Thumb-2 kernel (CONFIG_THUMB2_KERNEL).

This is a Bad Thing, since the ABI permits the compiler to assume
that fundamental types of word size or above are word- aligned when
accessing them from C.  If the data is not really word-aligned,
this can cause impaired performance and stray alignment faults in
some circumstances.

In general, the following rules should be applied when using data
word declaration directives inside code sections:

    * .quad and .double:
         .align 3

    * .long, .word, .single, .float:
         .align (or .align 2)

    * .short:
        No explicit alignment required, since Thumb-2
        instructions are always 2 or 4 bytes in size.
        immediately after an instruction.

In this specific case, we can achieve the desired alignment by
forcing a 32-bit branch instruction using the W() macro, since the
assembler location counter is already 32-bit aligned in this case.

Reviewed-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Dave Martin <dave.martin@linaro.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/proc-v7.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/arm/mm')

diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
index 53cbe22..9b9ff5d 100644
--- a/arch/arm/mm/proc-v7.S
+++ b/arch/arm/mm/proc-v7.S
@@ -381,7 +381,7 @@ __v7_ca9mp_proc_info:
 		PMD_SECT_XN | \
 		PMD_SECT_AP_WRITE | \
 		PMD_SECT_AP_READ
-	b	__v7_ca9mp_setup
+	W(b)	__v7_ca9mp_setup
 	.long	cpu_arch_name
 	.long	cpu_elf_name
 	.long	HWCAP_SWP|HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT|HWCAP_EDSP|HWCAP_TLS
@@ -413,7 +413,7 @@ __v7_proc_info:
 		PMD_SECT_XN | \
 		PMD_SECT_AP_WRITE | \
 		PMD_SECT_AP_READ
-	b	__v7_setup
+	W(b)	__v7_setup
 	.long	cpu_arch_name
 	.long	cpu_elf_name
 	.long	HWCAP_SWP|HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT|HWCAP_EDSP|HWCAP_TLS
-- 
cgit v1.1


From f91e2c3bd427239c198351f44814dd39db91afe0 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Tue, 7 Dec 2010 16:52:04 +0100
Subject: ARM: 6527/1: Use CTR instead of CCSIDR for the D-cache line size on
 ARMv7

The current implementation of the dcache_line_size macro reads the L1
cache size from the CCSIDR register. This, however, is not guaranteed to
be the smallest cache line in the cache hierarchy. The patch changes to
the macro to use the more architecturally correct CTR register.

Reported-by: Kevin Sapp <ksapp@quicinc.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/proc-macros.S | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/arm/mm')

diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S
index 7d63bea..321555b 100644
--- a/arch/arm/mm/proc-macros.S
+++ b/arch/arm/mm/proc-macros.S
@@ -61,14 +61,14 @@
 	.endm
 
 /*
- * cache_line_size - get the cache line size from the CSIDR register
- * (available on ARMv7+). It assumes that the CSSR register was configured
- * to access the L1 data cache CSIDR.
+ * dcache_line_size - get the minimum D-cache line size from the CTR register
+ * on ARMv7.
  */
 	.macro	dcache_line_size, reg, tmp
-	mrc	p15, 1, \tmp, c0, c0, 0		@ read CSIDR
-	and	\tmp, \tmp, #7			@ cache line size encoding
-	mov	\reg, #16			@ size offset
+	mrc	p15, 0, \tmp, c0, c0, 1		@ read ctr
+	lsr	\tmp, \tmp, #16
+	and	\tmp, \tmp, #0xf		@ cache line size encoding
+	mov	\reg, #4			@ bytes per word
 	mov	\reg, \reg, lsl \tmp		@ actual cache line size
 	.endm
 
-- 
cgit v1.1


From da30e0ac0f9a521f0cfec8145ddd1ad131f66d61 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Tue, 7 Dec 2010 16:56:29 +0100
Subject: ARM: 6528/1: Use CTR for the I-cache line size on ARMv7

The current implementation of the v7_coherent_*_range function assumes
that the D and I cache lines have the same size, which is incorrect
architecturally. This patch adds the icache_line_size macro which reads
the CTR register. The main loop in v7_coherent_*_range is split in two
independent loops or the D and I caches. This also has the performance
advantage that the DSB is moved outside the main loop.

Reported-by: Kevin Sapp <ksapp@quicinc.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/cache-v7.S    | 27 +++++++++++++++++----------
 arch/arm/mm/proc-macros.S | 10 ++++++++++
 2 files changed, 27 insertions(+), 10 deletions(-)

(limited to 'arch/arm/mm')

diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index a3ebf7a..6136e68 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -173,15 +173,22 @@ ENTRY(v7_coherent_user_range)
  UNWIND(.fnstart		)
 	dcache_line_size r2, r3
 	sub	r3, r2, #1
-	bic	r0, r0, r3
+	bic	r12, r0, r3
 1:
- USER(	mcr	p15, 0, r0, c7, c11, 1	)	@ clean D line to the point of unification
+ USER(	mcr	p15, 0, r12, c7, c11, 1	)	@ clean D line to the point of unification
+	add	r12, r12, r2
+	cmp	r12, r1
+	blo	1b
 	dsb
- USER(	mcr	p15, 0, r0, c7, c5, 1	)	@ invalidate I line
-	add	r0, r0, r2
+	icache_line_size r2, r3
+	sub	r3, r2, #1
+	bic	r12, r0, r3
 2:
-	cmp	r0, r1
-	blo	1b
+ USER(	mcr	p15, 0, r12, c7, c5, 1	)	@ invalidate I line
+	add	r12, r12, r2
+	cmp	r12, r1
+	blo	2b
+3:
 	mov	r0, #0
 	ALT_SMP(mcr	p15, 0, r0, c7, c1, 6)	@ invalidate BTB Inner Shareable
 	ALT_UP(mcr	p15, 0, r0, c7, c5, 6)	@ invalidate BTB
@@ -194,10 +201,10 @@ ENTRY(v7_coherent_user_range)
  * isn't mapped, just try the next page.
  */
 9001:
-	mov	r0, r0, lsr #12
-	mov	r0, r0, lsl #12
-	add	r0, r0, #4096
-	b	2b
+	mov	r12, r12, lsr #12
+	mov	r12, r12, lsl #12
+	add	r12, r12, #4096
+	b	3b
  UNWIND(.fnend		)
 ENDPROC(v7_coherent_kern_range)
 ENDPROC(v7_coherent_user_range)
diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S
index 321555b..b795afd 100644
--- a/arch/arm/mm/proc-macros.S
+++ b/arch/arm/mm/proc-macros.S
@@ -72,6 +72,16 @@
 	mov	\reg, \reg, lsl \tmp		@ actual cache line size
 	.endm
 
+/*
+ * icache_line_size - get the minimum I-cache line size from the CTR register
+ * on ARMv7.
+ */
+	.macro	icache_line_size, reg, tmp
+	mrc	p15, 0, \tmp, c0, c0, 1		@ read ctr
+	and	\tmp, \tmp, #0xf		@ cache line size encoding
+	mov	\reg, #4			@ bytes per word
+	mov	\reg, \reg, lsl \tmp		@ actual cache line size
+	.endm
 
 /*
  * Sanity check the PTE configuration for the code below - which makes
-- 
cgit v1.1


From 85b093bcc5322baa811a03ec73de0909c157f181 Mon Sep 17 00:00:00 2001
From: Valentine Barshak <vbarshak@mvista.com>
Date: Tue, 14 Dec 2010 00:03:16 +0100
Subject: ARM: 6535/1: V6 MPCore v6_dma_inv_range and v6_dma_flush_range RWFO
 fix

Cache ownership must be acquired by reading/writing data from the
cache line to make cache operation have the desired effect on the
SMP MPCore CPU. However, the ownership is never acquired in the
v6_dma_inv_range function when cleaning the first line and
flushing the last one, in case the address is not aligned
to D_CACHE_LINE_SIZE boundary.
Fix this by reading/writing data if needed, before performing
cache operations.
While at it, fix v6_dma_flush_range to prevent RWFO outside
the buffer.

Cc: stable@kernel.org
Signed-off-by: Valentine Barshak <vbarshak@mvista.com>
Signed-off-by: George G. Davis <gdavis@mvista.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/cache-v6.S | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

(limited to 'arch/arm/mm')

diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 99fa688..c96fa1b 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -203,6 +203,10 @@ ENTRY(v6_flush_kern_dcache_area)
  *	- end     - virtual end address of region
  */
 v6_dma_inv_range:
+#ifdef CONFIG_DMA_CACHE_RWFO
+	ldrb	r2, [r0]			@ read for ownership
+	strb	r2, [r0]			@ write for ownership
+#endif
 	tst	r0, #D_CACHE_LINE_SIZE - 1
 	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
 #ifdef HARVARD_CACHE
@@ -211,6 +215,10 @@ v6_dma_inv_range:
 	mcrne	p15, 0, r0, c7, c11, 1		@ clean unified line
 #endif
 	tst	r1, #D_CACHE_LINE_SIZE - 1
+#ifdef CONFIG_DMA_CACHE_RWFO
+	ldrneb	r2, [r1, #-1]			@ read for ownership
+	strneb	r2, [r1, #-1]			@ write for ownership
+#endif
 	bic	r1, r1, #D_CACHE_LINE_SIZE - 1
 #ifdef HARVARD_CACHE
 	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D line
@@ -218,10 +226,6 @@ v6_dma_inv_range:
 	mcrne	p15, 0, r1, c7, c15, 1		@ clean & invalidate unified line
 #endif
 1:
-#ifdef CONFIG_DMA_CACHE_RWFO
-	ldr	r2, [r0]			@ read for ownership
-	str	r2, [r0]			@ write for ownership
-#endif
 #ifdef HARVARD_CACHE
 	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D line
 #else
@@ -229,6 +233,10 @@ v6_dma_inv_range:
 #endif
 	add	r0, r0, #D_CACHE_LINE_SIZE
 	cmp	r0, r1
+#ifdef CONFIG_DMA_CACHE_RWFO
+	ldrlo	r2, [r0]			@ read for ownership
+	strlo	r2, [r0]			@ write for ownership
+#endif
 	blo	1b
 	mov	r0, #0
 	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
@@ -263,12 +271,12 @@ v6_dma_clean_range:
  *	- end     - virtual end address of region
  */
 ENTRY(v6_dma_flush_range)
-	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
-1:
 #ifdef CONFIG_DMA_CACHE_RWFO
-	ldr	r2, [r0]			@ read for ownership
-	str	r2, [r0]			@ write for ownership
+	ldrb	r2, [r0]		@ read for ownership
+	strb	r2, [r0]		@ write for ownership
 #endif
+	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
+1:
 #ifdef HARVARD_CACHE
 	mcr	p15, 0, r0, c7, c14, 1		@ clean & invalidate D line
 #else
@@ -276,6 +284,10 @@ ENTRY(v6_dma_flush_range)
 #endif
 	add	r0, r0, #D_CACHE_LINE_SIZE
 	cmp	r0, r1
+#ifdef CONFIG_DMA_CACHE_RWFO
+	ldrlob	r2, [r0]			@ read for ownership
+	strlob	r2, [r0]			@ write for ownership
+#endif
 	blo	1b
 	mov	r0, #0
 	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
-- 
cgit v1.1