From f317820cb6ee3fb173319bf76e0e62437be78ad2 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Fri, 2 Nov 2012 14:20:24 +0000
Subject: x86/xor: Add alternative SSE implementation only prefetching once per 64-byte line

On CPUs with 64-byte last level cache lines, this yields roughly 10%
better performance, independent of CPU vendor or specific model (as far
as I was able to test).

Signed-off-by: Jan Beulich
Acked-by: H. Peter Anvin
Cc: Linus Torvalds
Link: http://lkml.kernel.org/r/5093E4B802000078000A615E@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/xor_64.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'arch/x86/include/asm/xor_64.h')

diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1baf89d..546f1e3 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -13,17 +13,15 @@ static struct xor_block_template xor_block_sse = {
 /* Also try the AVX routines */
 #include <asm/xor_avx.h>
 
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched. */
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES \
 do { \
 	AVX_XOR_SPEED; \
+	xor_speed(&xor_block_sse_pf64); \
 	xor_speed(&xor_block_sse); \
 } while (0)
 
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	AVX_SELECT(&xor_block_sse)
-
 #endif /* _ASM_X86_XOR_64_H */
--
cgit v1.1
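
For illustration, here is a plain-C sketch of the "prefetch once per 64-byte
line" idea behind the xor_block_sse_pf64 template registered above. The real
routines are hand-written SSE assembly in the kernel headers; the function
name, the fixed 64-byte line size and the prefetch distance below are
assumptions chosen only for this example, not the kernel's implementation.

/*
 * Illustrative sketch only -- not the kernel's SSE assembly.
 * Assumed: a 64-byte last-level cache line and an arbitrary
 * prefetch distance of four lines ahead.
 */
#include <stddef.h>
#include <stdint.h>

#define LINE_BYTES 64			/* assumed LLC line size */
#define PF_DIST    (4 * LINE_BYTES)	/* prefetch a few lines ahead */

static void xor_2_pf64_sketch(size_t bytes, uint64_t *p1, const uint64_t *p2)
{
	size_t lines = bytes / LINE_BYTES;

	while (lines--) {
		/* Issue a single prefetch per 64-byte line of each buffer. */
		__builtin_prefetch((const char *)p1 + PF_DIST, 1, 3);
		__builtin_prefetch((const char *)p2 + PF_DIST, 0, 3);

		/* XOR the whole line: 8 x 64-bit words. */
		for (int i = 0; i < LINE_BYTES / 8; i++)
			p1[i] ^= p2[i];

		p1 += LINE_BYTES / 8;
		p2 += LINE_BYTES / 8;
	}
}

Per the subject line, the point is simply to issue one software prefetch per
64-byte line instead of more frequent ones, which on 64-byte-line CPUs covers
the same data with fewer prefetch instructions and accounts for the reported
~10% improvement.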