From 35129dde88afad07f54b332d4f9eda2d254b80f2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 13 Jul 2017 18:16:00 +0100 Subject: md/raid6: use faster multiplication for ARM NEON delta syndrome The P/Q left side optimization in the delta syndrome simply involves repeatedly multiplying a value by polynomial 'x' in GF(2^8). Given that 'x * x * x * x' equals 'x^4' even in the polynomial world, we can accelerate this substantially by performing up to 4 such operations at once, using the NEON instructions for polynomial multiplication. Results on a Cortex-A57 running in 64-bit mode: Before: ------- raid6: neonx1 xor() 1680 MB/s raid6: neonx2 xor() 2286 MB/s raid6: neonx4 xor() 3162 MB/s raid6: neonx8 xor() 3389 MB/s After: ------ raid6: neonx1 xor() 2281 MB/s raid6: neonx2 xor() 3362 MB/s raid6: neonx4 xor() 3787 MB/s raid6: neonx8 xor() 4239 MB/s While we're at it, simplify MASK() by using a signed shift rather than a vector compare involving a temp register. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- lib/raid6/neon.uc | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc index 4fa51b7..d5242f5 100644 --- a/lib/raid6/neon.uc +++ b/lib/raid6/neon.uc @@ -46,8 +46,12 @@ static inline unative_t SHLBYTE(unative_t v) */ static inline unative_t MASK(unative_t v) { - const uint8x16_t temp = NBYTES(0); - return (unative_t)vcltq_s8((int8x16_t)v, (int8x16_t)temp); + return (unative_t)vshrq_n_s8((int8x16_t)v, 7); +} + +static inline unative_t PMUL(unative_t v, unative_t u) +{ + return (unative_t)vmulq_p8((poly8x16_t)v, (poly8x16_t)u); } void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) @@ -110,7 +114,30 @@ void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop, wq$$ = veorq_u8(w1$$, wd$$); } /* P/Q left side optimization */ - for ( z = start-1 ; z >= 0 ; z-- ) { + for ( z = start-1 ; z >= 3 ; z -= 4 ) { + w2$$ = vshrq_n_u8(wq$$, 4); + w1$$ = vshlq_n_u8(wq$$, 4); + + w2$$ = PMUL(w2$$, x1d); + wq$$ = veorq_u8(w1$$, w2$$); + } + + switch (z) { + case 2: + w2$$ = vshrq_n_u8(wq$$, 5); + w1$$ = vshlq_n_u8(wq$$, 3); + + w2$$ = PMUL(w2$$, x1d); + wq$$ = veorq_u8(w1$$, w2$$); + break; + case 1: + w2$$ = vshrq_n_u8(wq$$, 6); + w1$$ = vshlq_n_u8(wq$$, 2); + + w2$$ = PMUL(w2$$, x1d); + wq$$ = veorq_u8(w1$$, w2$$); + break; + case 0: w2$$ = MASK(wq$$); w1$$ = SHLBYTE(wq$$); -- cgit v1.1 From 6ec4e2514decd6fb4782a9364fa71d6244d05af4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 13 Jul 2017 18:16:01 +0100 Subject: md/raid6: implement recovery using ARM NEON intrinsics Provide a NEON accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- lib/raid6/Makefile | 4 +- lib/raid6/algos.c | 3 ++ lib/raid6/recov_neon.c | 110 ++++++++++++++++++++++++++++++++++++++++ lib/raid6/recov_neon_inner.c | 117 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 lib/raid6/recov_neon.c create mode 100644 lib/raid6/recov_neon_inner.c (limited to 'lib') diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 3057011..a93adf6 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -5,7 +5,7 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o -raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o +raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o raid6_pq-$(CONFIG_TILEGX) += tilegx8.o raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o @@ -26,7 +26,9 @@ NEON_FLAGS := -ffreestanding ifeq ($(ARCH),arm) NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon endif +CFLAGS_recov_neon_inner.o += $(NEON_FLAGS) ifeq ($(ARCH),arm64) +CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 7857049..4769947 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -113,6 +113,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = { #ifdef CONFIG_S390 &raid6_recov_s390xc, #endif +#if defined(CONFIG_KERNEL_MODE_NEON) + &raid6_recov_neon, +#endif &raid6_recov_intx1, NULL }; diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c new file mode 100644 index 0000000..eeb5c40 --- /dev/null +++ b/lib/raid6/recov_neon.c @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Copyright (C) 2017 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include + +#ifdef __KERNEL__ +#include +#else +#define kernel_neon_begin() +#define kernel_neon_end() +#define cpu_has_neon() (1) +#endif + +static int raid6_has_neon(void) +{ + return cpu_has_neon(); +} + +void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, + uint8_t *dq, const uint8_t *pbmul, + const uint8_t *qmul); + +void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, + const uint8_t *qmul); + +static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_neon_begin(); + __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul); + kernel_neon_end(); +} + +static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_neon_begin(); + __raid6_datap_recov_neon(bytes, p, q, dq, qmul); + kernel_neon_end(); +} + +const struct raid6_recov_calls raid6_recov_neon = { + .data2 = raid6_2data_recov_neon, + .datap = raid6_datap_recov_neon, + .valid = raid6_has_neon, + .name = "neon", + .priority = 10, +}; diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c new file mode 100644 index 0000000..8cd20c9 --- /dev/null +++ b/lib/raid6/recov_neon_inner.c @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Copyright (C) 2017 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include + +static const uint8x16_t x0f = { + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, +}; + +#ifdef CONFIG_ARM +/* + * AArch32 does not provide this intrinsic natively because it does not + * implement the underlying instruction. AArch32 only provides a 64-bit + * wide vtbl.8 instruction, so use that instead. + */ +static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b) +{ + union { + uint8x16_t val; + uint8x8x2_t pair; + } __a = { a }; + + return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)), + vtbl2_u8(__a.pair, vget_high_u8(b))); +} +#endif + +void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, + uint8_t *dq, const uint8_t *pbmul, + const uint8_t *qmul) +{ + uint8x16_t pm0 = vld1q_u8(pbmul); + uint8x16_t pm1 = vld1q_u8(pbmul + 16); + uint8x16_t qm0 = vld1q_u8(qmul); + uint8x16_t qm1 = vld1q_u8(qmul + 16); + + /* + * while ( bytes-- ) { + * uint8_t px, qx, db; + * + * px = *p ^ *dp; + * qx = qmul[*q ^ *dq]; + * *dq++ = db = pbmul[px] ^ qx; + * *dp++ = db ^ px; + * p++; q++; + * } + */ + + while (bytes) { + uint8x16_t vx, vy, px, qx, db; + + px = veorq_u8(vld1q_u8(p), vld1q_u8(dp)); + vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq)); + + vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4); + vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f)); + vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f)); + qx = veorq_u8(vx, vy); + + vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4); + vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f)); + vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f)); + vx = veorq_u8(vx, vy); + db = veorq_u8(vx, qx); + + vst1q_u8(dq, db); + vst1q_u8(dp, veorq_u8(db, px)); + + bytes -= 16; + p += 16; + q += 16; + dp += 16; + dq += 16; + } +} + +void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, + const uint8_t *qmul) +{ + uint8x16_t qm0 = vld1q_u8(qmul); + uint8x16_t qm1 = vld1q_u8(qmul + 16); + + /* + * while (bytes--) { + * *p++ ^= *dq = qmul[*q ^ *dq]; + * q++; dq++; + * } + */ + + while (bytes) { + uint8x16_t vx, vy; + + vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq)); + + vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4); + vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f)); + vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f)); + vx = veorq_u8(vx, vy); + vy = veorq_u8(vx, vld1q_u8(p)); + + vst1q_u8(dq, vx); + vst1q_u8(p, vy); + + bytes -= 16; + p += 16; + q += 16; + dq += 16; + } +} -- cgit v1.1