From 6323875db20fd8ca8c8fbbd608bc377f2d4c8cf5 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 29 Nov 2010 19:43:25 +0100 Subject: ARM: 6501/1: Thumb-2: Correct data alignment for CONFIG_THUMB2_KERNEL in mm/proc-v7.S Directives such as .long and .word do not magically cause the assembler location counter to become aligned in gas. As a result, using these directives in code sections can result in misaligned data words when building a Thumb-2 kernel (CONFIG_THUMB2_KERNEL). This is a Bad Thing, since the ABI permits the compiler to assume that fundamental types of word size or above are word- aligned when accessing them from C. If the data is not really word-aligned, this can cause impaired performance and stray alignment faults in some circumstances. In general, the following rules should be applied when using data word declaration directives inside code sections: * .quad and .double: .align 3 * .long, .word, .single, .float: .align (or .align 2) * .short: No explicit alignment required, since Thumb-2 instructions are always 2 or 4 bytes in size. immediately after an instruction. In this specific case, we can achieve the desired alignment by forcing a 32-bit branch instruction using the W() macro, since the assembler location counter is already 32-bit aligned in this case. Reviewed-by: Will Deacon Signed-off-by: Dave Martin Acked-by: Catalin Marinas Signed-off-by: Russell King --- arch/arm/mm/proc-v7.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/arm/mm') diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index 53cbe22..9b9ff5d 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S @@ -381,7 +381,7 @@ __v7_ca9mp_proc_info: PMD_SECT_XN | \ PMD_SECT_AP_WRITE | \ PMD_SECT_AP_READ - b __v7_ca9mp_setup + W(b) __v7_ca9mp_setup .long cpu_arch_name .long cpu_elf_name .long HWCAP_SWP|HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT|HWCAP_EDSP|HWCAP_TLS @@ -413,7 +413,7 @@ __v7_proc_info: PMD_SECT_XN | \ PMD_SECT_AP_WRITE | \ PMD_SECT_AP_READ - b __v7_setup + W(b) __v7_setup .long cpu_arch_name .long cpu_elf_name .long HWCAP_SWP|HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT|HWCAP_EDSP|HWCAP_TLS -- cgit v1.1 From f91e2c3bd427239c198351f44814dd39db91afe0 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 7 Dec 2010 16:52:04 +0100 Subject: ARM: 6527/1: Use CTR instead of CCSIDR for the D-cache line size on ARMv7 The current implementation of the dcache_line_size macro reads the L1 cache size from the CCSIDR register. This, however, is not guaranteed to be the smallest cache line in the cache hierarchy. The patch changes to the macro to use the more architecturally correct CTR register. Reported-by: Kevin Sapp Signed-off-by: Catalin Marinas Signed-off-by: Russell King --- arch/arm/mm/proc-macros.S | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/arm/mm') diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S index 7d63bea..321555b 100644 --- a/arch/arm/mm/proc-macros.S +++ b/arch/arm/mm/proc-macros.S @@ -61,14 +61,14 @@ .endm /* - * cache_line_size - get the cache line size from the CSIDR register - * (available on ARMv7+). It assumes that the CSSR register was configured - * to access the L1 data cache CSIDR. + * dcache_line_size - get the minimum D-cache line size from the CTR register + * on ARMv7. */ .macro dcache_line_size, reg, tmp - mrc p15, 1, \tmp, c0, c0, 0 @ read CSIDR - and \tmp, \tmp, #7 @ cache line size encoding - mov \reg, #16 @ size offset + mrc p15, 0, \tmp, c0, c0, 1 @ read ctr + lsr \tmp, \tmp, #16 + and \tmp, \tmp, #0xf @ cache line size encoding + mov \reg, #4 @ bytes per word mov \reg, \reg, lsl \tmp @ actual cache line size .endm -- cgit v1.1 From da30e0ac0f9a521f0cfec8145ddd1ad131f66d61 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 7 Dec 2010 16:56:29 +0100 Subject: ARM: 6528/1: Use CTR for the I-cache line size on ARMv7 The current implementation of the v7_coherent_*_range function assumes that the D and I cache lines have the same size, which is incorrect architecturally. This patch adds the icache_line_size macro which reads the CTR register. The main loop in v7_coherent_*_range is split in two independent loops or the D and I caches. This also has the performance advantage that the DSB is moved outside the main loop. Reported-by: Kevin Sapp Signed-off-by: Catalin Marinas Signed-off-by: Russell King --- arch/arm/mm/cache-v7.S | 27 +++++++++++++++++---------- arch/arm/mm/proc-macros.S | 10 ++++++++++ 2 files changed, 27 insertions(+), 10 deletions(-) (limited to 'arch/arm/mm') diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index a3ebf7a..6136e68 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -173,15 +173,22 @@ ENTRY(v7_coherent_user_range) UNWIND(.fnstart ) dcache_line_size r2, r3 sub r3, r2, #1 - bic r0, r0, r3 + bic r12, r0, r3 1: - USER( mcr p15, 0, r0, c7, c11, 1 ) @ clean D line to the point of unification + USER( mcr p15, 0, r12, c7, c11, 1 ) @ clean D line to the point of unification + add r12, r12, r2 + cmp r12, r1 + blo 1b dsb - USER( mcr p15, 0, r0, c7, c5, 1 ) @ invalidate I line - add r0, r0, r2 + icache_line_size r2, r3 + sub r3, r2, #1 + bic r12, r0, r3 2: - cmp r0, r1 - blo 1b + USER( mcr p15, 0, r12, c7, c5, 1 ) @ invalidate I line + add r12, r12, r2 + cmp r12, r1 + blo 2b +3: mov r0, #0 ALT_SMP(mcr p15, 0, r0, c7, c1, 6) @ invalidate BTB Inner Shareable ALT_UP(mcr p15, 0, r0, c7, c5, 6) @ invalidate BTB @@ -194,10 +201,10 @@ ENTRY(v7_coherent_user_range) * isn't mapped, just try the next page. */ 9001: - mov r0, r0, lsr #12 - mov r0, r0, lsl #12 - add r0, r0, #4096 - b 2b + mov r12, r12, lsr #12 + mov r12, r12, lsl #12 + add r12, r12, #4096 + b 3b UNWIND(.fnend ) ENDPROC(v7_coherent_kern_range) ENDPROC(v7_coherent_user_range) diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S index 321555b..b795afd 100644 --- a/arch/arm/mm/proc-macros.S +++ b/arch/arm/mm/proc-macros.S @@ -72,6 +72,16 @@ mov \reg, \reg, lsl \tmp @ actual cache line size .endm +/* + * icache_line_size - get the minimum I-cache line size from the CTR register + * on ARMv7. + */ + .macro icache_line_size, reg, tmp + mrc p15, 0, \tmp, c0, c0, 1 @ read ctr + and \tmp, \tmp, #0xf @ cache line size encoding + mov \reg, #4 @ bytes per word + mov \reg, \reg, lsl \tmp @ actual cache line size + .endm /* * Sanity check the PTE configuration for the code below - which makes -- cgit v1.1 From 85b093bcc5322baa811a03ec73de0909c157f181 Mon Sep 17 00:00:00 2001 From: Valentine Barshak Date: Tue, 14 Dec 2010 00:03:16 +0100 Subject: ARM: 6535/1: V6 MPCore v6_dma_inv_range and v6_dma_flush_range RWFO fix Cache ownership must be acquired by reading/writing data from the cache line to make cache operation have the desired effect on the SMP MPCore CPU. However, the ownership is never acquired in the v6_dma_inv_range function when cleaning the first line and flushing the last one, in case the address is not aligned to D_CACHE_LINE_SIZE boundary. Fix this by reading/writing data if needed, before performing cache operations. While at it, fix v6_dma_flush_range to prevent RWFO outside the buffer. Cc: stable@kernel.org Signed-off-by: Valentine Barshak Signed-off-by: George G. Davis Acked-by: Catalin Marinas Signed-off-by: Russell King --- arch/arm/mm/cache-v6.S | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/arm/mm') diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S index 99fa688..c96fa1b 100644 --- a/arch/arm/mm/cache-v6.S +++ b/arch/arm/mm/cache-v6.S @@ -203,6 +203,10 @@ ENTRY(v6_flush_kern_dcache_area) * - end - virtual end address of region */ v6_dma_inv_range: +#ifdef CONFIG_DMA_CACHE_RWFO + ldrb r2, [r0] @ read for ownership + strb r2, [r0] @ write for ownership +#endif tst r0, #D_CACHE_LINE_SIZE - 1 bic r0, r0, #D_CACHE_LINE_SIZE - 1 #ifdef HARVARD_CACHE @@ -211,6 +215,10 @@ v6_dma_inv_range: mcrne p15, 0, r0, c7, c11, 1 @ clean unified line #endif tst r1, #D_CACHE_LINE_SIZE - 1 +#ifdef CONFIG_DMA_CACHE_RWFO + ldrneb r2, [r1, #-1] @ read for ownership + strneb r2, [r1, #-1] @ write for ownership +#endif bic r1, r1, #D_CACHE_LINE_SIZE - 1 #ifdef HARVARD_CACHE mcrne p15, 0, r1, c7, c14, 1 @ clean & invalidate D line @@ -218,10 +226,6 @@ v6_dma_inv_range: mcrne p15, 0, r1, c7, c15, 1 @ clean & invalidate unified line #endif 1: -#ifdef CONFIG_DMA_CACHE_RWFO - ldr r2, [r0] @ read for ownership - str r2, [r0] @ write for ownership -#endif #ifdef HARVARD_CACHE mcr p15, 0, r0, c7, c6, 1 @ invalidate D line #else @@ -229,6 +233,10 @@ v6_dma_inv_range: #endif add r0, r0, #D_CACHE_LINE_SIZE cmp r0, r1 +#ifdef CONFIG_DMA_CACHE_RWFO + ldrlo r2, [r0] @ read for ownership + strlo r2, [r0] @ write for ownership +#endif blo 1b mov r0, #0 mcr p15, 0, r0, c7, c10, 4 @ drain write buffer @@ -263,12 +271,12 @@ v6_dma_clean_range: * - end - virtual end address of region */ ENTRY(v6_dma_flush_range) - bic r0, r0, #D_CACHE_LINE_SIZE - 1 -1: #ifdef CONFIG_DMA_CACHE_RWFO - ldr r2, [r0] @ read for ownership - str r2, [r0] @ write for ownership + ldrb r2, [r0] @ read for ownership + strb r2, [r0] @ write for ownership #endif + bic r0, r0, #D_CACHE_LINE_SIZE - 1 +1: #ifdef HARVARD_CACHE mcr p15, 0, r0, c7, c14, 1 @ clean & invalidate D line #else @@ -276,6 +284,10 @@ ENTRY(v6_dma_flush_range) #endif add r0, r0, #D_CACHE_LINE_SIZE cmp r0, r1 +#ifdef CONFIG_DMA_CACHE_RWFO + ldrlob r2, [r0] @ read for ownership + strlob r2, [r0] @ write for ownership +#endif blo 1b mov r0, #0 mcr p15, 0, r0, c7, c10, 4 @ drain write buffer -- cgit v1.1