From 7e018bb933d5291155739614e422773c4c2d8781 Mon Sep 17 00:00:00 2001
From: Jukka Ojanen
Date: Thu, 9 Jul 2015 15:37:53 +0300
Subject: Unroll loops to process 64 byte cache line per iteration

---
 src/ffts_real.c | 244 +++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 205 insertions(+), 39 deletions(-)

(limited to 'src')

diff --git a/src/ffts_real.c b/src/ffts_real.c
index a737696..0327f15 100644
--- a/src/ffts_real.c
+++ b/src/ffts_real.c
@@ -152,22 +152,36 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
     );
 }
 #elif HAVE_SSE3
-    if (N < 8) {
+    if (FFTS_UNLIKELY(N <= 8)) {
         __m128 t0 = _mm_load_ps(buf);
-        __m128 t1 = _mm_load_ps(A);
-        __m128 t2 = _mm_load_ps(B);
+        __m128 t1 = _mm_load_ps(buf + N - 4);
+        __m128 t2 = _mm_load_ps(A);
+        __m128 t3 = _mm_load_ps(B);
 
         _mm_store_ps(out, _mm_add_ps(_mm_addsub_ps(
-            _mm_mul_ps(t0, _mm_moveldup_ps(t1)),
+            _mm_mul_ps(t0, _mm_moveldup_ps(t2)),
             _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,3,0,1)),
-            _mm_movehdup_ps(t1))), _mm_addsub_ps(
-            _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3,3,1,1)),
-            _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1))),
-            _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,2,0,0)), t2))));
+            _mm_movehdup_ps(t2))), _mm_addsub_ps(
+            _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)),
+            _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1))),
+            _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), t3))));
+
+        if (N == 8) {
+            t2 = _mm_load_ps(A + 4);
+            t3 = _mm_load_ps(B + 4);
+
+            _mm_store_ps(out + 4, _mm_add_ps(_mm_addsub_ps(
+                _mm_mul_ps(t1, _mm_moveldup_ps(t2)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t2))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(2,2,0,0)), t3))));
+        }
     } else {
         __m128 t0 = _mm_load_ps(buf);
 
-        for (i = 0; i < N; i += 8) {
+        for (i = 0; i < N; i += 16) {
             __m128 t1 = _mm_load_ps(buf + i);
             __m128 t2 = _mm_load_ps(buf + N - i - 4);
             __m128 t3 = _mm_load_ps(A + i);
@@ -193,28 +207,69 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
             _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
             _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
             _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+            t1 = _mm_load_ps(buf + i + 8);
+            t2 = _mm_load_ps(buf + N - i - 12);
+            t3 = _mm_load_ps(A + i + 8);
+            t4 = _mm_load_ps(B + i + 8);
+
+            _mm_store_ps(out + i + 8, _mm_add_ps(_mm_addsub_ps(
+                _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t3))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+            t0 = _mm_load_ps(buf + N - i - 16);
+            t1 = _mm_load_ps(buf + i + 12);
+            t3 = _mm_load_ps(A + i + 12);
+            t4 = _mm_load_ps(B + i + 12);
+
+            _mm_store_ps(out + i + 12, _mm_add_ps(_mm_addsub_ps(
+                _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t3))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
         }
     }
 #elif HAVE_SSE
-    if (N < 8) {
+    if (FFTS_UNLIKELY(N <= 8)) {
         __m128 c0 = _mm_load_ps((const float*) sign_mask_even);
         __m128 t0 = _mm_load_ps(buf);
-        __m128 t1 = _mm_load_ps(A);
-        __m128 t2 = _mm_load_ps(B);
+        __m128 t1 = _mm_load_ps(buf + N - 4);
+        __m128 t2 = _mm_load_ps(A);
+        __m128 t3 = _mm_load_ps(B);
 
         _mm_store_ps(out, _mm_add_ps(_mm_add_ps(_mm_add_ps(
-            _mm_mul_ps(t0, _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,2,0,0))),
+            _mm_mul_ps(t0, _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,0,0))),
             _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,3,0,1)),
-            _mm_xor_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3,3,1,1)), c0))),
-            _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,2,0,0)), t2)),
-            _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3,3,1,1)),
-            _mm_shuffle_ps(_mm_xor_ps(t2, c0), _mm_xor_ps(t2, c0),
+            _mm_xor_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3,3,1,1)), c0))),
+            _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), t3)),
+            _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)),
+            _mm_shuffle_ps(_mm_xor_ps(t3, c0), _mm_xor_ps(t3, c0),
             _MM_SHUFFLE(2,3,0,1)))));
+
+        if (N == 8) {
+            t2 = _mm_load_ps(A + 4);
+            t3 = _mm_load_ps(B + 4);
+
+            _mm_store_ps(out + 4, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+                _mm_mul_ps(t1, _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(2,2,0,0)), t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(_mm_xor_ps(t3, c0), _mm_xor_ps(t3, c0),
+                _MM_SHUFFLE(2,3,0,1)))));
+        }
     } else {
         __m128 c0 = _mm_load_ps((const float*) sign_mask_even);
         __m128 t0 = _mm_load_ps(buf);
 
-        for (i = 0; i < N; i += 8) {
+        for (i = 0; i < N; i += 16) {
             __m128 t1 = _mm_load_ps(buf + i);
             __m128 t2 = _mm_load_ps(buf + N - i - 4);
             __m128 t3 = _mm_load_ps(A + i);
@@ -242,6 +297,34 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
             _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
             _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0),
             _MM_SHUFFLE(2,3,0,1)))));
+
+            t1 = _mm_load_ps(buf + i + 8);
+            t2 = _mm_load_ps(buf + N - i - 12);
+            t3 = _mm_load_ps(A + i + 8);
+            t4 = _mm_load_ps(B + i + 8);
+
+            _mm_store_ps(out + i + 8, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+                _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4)),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0),
+                _MM_SHUFFLE(2,3,0,1)))));
+
+            t0 = _mm_load_ps(buf + N - i - 16);
+            t1 = _mm_load_ps(buf + i + 12);
+            t3 = _mm_load_ps(A + i + 12);
+            t4 = _mm_load_ps(B + i + 12);
+
+            _mm_store_ps(out + i + 12, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+                _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0),
+                _MM_SHUFFLE(2,3,0,1)))));
         }
     }
 #else
@@ -326,23 +409,37 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
     );
 }
 #elif HAVE_SSE3
-    if (N < 8) {
-        __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[4]);
+    if (FFTS_UNLIKELY(N <= 8)) {
+        __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
         __m128 t1 = _mm_load_ps(in);
-        __m128 t2 = _mm_load_ps(A);
-        __m128 t3 = _mm_load_ps(B);
+        __m128 t2 = _mm_load_ps(in + N - 4);
+        __m128 t3 = _mm_load_ps(A);
+        __m128 t4 = _mm_load_ps(B);
 
         _mm_store_ps(buf, _mm_sub_ps(_mm_addsub_ps(
-            _mm_mul_ps(t1, _mm_moveldup_ps(t2)),
+            _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
             _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
-            _mm_movehdup_ps(t2))), _mm_addsub_ps(
-            _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)),
-            _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1))),
-            _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), t3))));
+            _mm_movehdup_ps(t3))), _mm_addsub_ps(
+            _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+            _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+            _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+        if (N == 8) {
+            t3 = _mm_load_ps(A + 4);
+            t4 = _mm_load_ps(B + 4);
+
+            _mm_store_ps(buf + 4, _mm_sub_ps(_mm_addsub_ps(
+                _mm_mul_ps(t2, _mm_moveldup_ps(t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t3))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,2,0,0)), t4))));
+        }
     } else {
         __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
 
-        for (i = 0; i < N; i += 8) {
+        for (i = 0; i < N; i += 16) {
             __m128 t1 = _mm_load_ps(in + i);
             __m128 t2 = _mm_load_ps(in + N - i - 4);
             __m128 t3 = _mm_load_ps(A + i);
@@ -368,29 +465,70 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
             _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
             _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
             _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+            t1 = _mm_load_ps(in + i + 8);
+            t2 = _mm_load_ps(in + N - i - 12);
+            t3 = _mm_load_ps(A + i + 8);
+            t4 = _mm_load_ps(B + i + 8);
+
+            _mm_store_ps(buf + i + 8, _mm_sub_ps(_mm_addsub_ps(
+                _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t3))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+            t0 = _mm_load_ps(in + N - i - 16);
+            t1 = _mm_load_ps(in + i + 12);
+            t3 = _mm_load_ps(A + i + 12);
+            t4 = _mm_load_ps(B + i + 12);
+
+            _mm_store_ps(buf + i + 12, _mm_sub_ps(_mm_addsub_ps(
+                _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t3))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
         }
     }
 #elif HAVE_SSE
-    if (N < 8) {
+    if (FFTS_UNLIKELY(N <= 8)) {
         __m128 c0 = _mm_load_ps((const float*) sign_mask_odd);
-        __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[4]);
+        __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
         __m128 t1 = _mm_load_ps(in);
-        __m128 t2 = _mm_load_ps(A);
-        __m128 t3 = _mm_load_ps(B);
+        __m128 t2 = _mm_load_ps(in + N - 4);
+        __m128 t3 = _mm_load_ps(A);
+        __m128 t4 = _mm_load_ps(B);
 
         _mm_store_ps(buf, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
-            _mm_mul_ps(t1, _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,0,0))),
+            _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
             _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
-            _mm_xor_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3,3,1,1)), c0))),
-            _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)),
-            _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1)))),
-            _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)),
-            _mm_xor_ps(t3, c0))));
+            _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+            _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+            _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+            _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)),
+            _mm_xor_ps(t4, c0))));
+
+        if (N == 8) {
+            t3 = _mm_load_ps(A + 4);
+            t4 = _mm_load_ps(B + 4);
+
+            _mm_store_ps(buf + 4, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+                _mm_mul_ps(t2, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,2,0,0)),
+                _mm_xor_ps(t4, c0))));
+        }
    } else {
         __m128 c0 = _mm_load_ps((const float*) sign_mask_odd);
         __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
 
-        for (i = 0; i < N; i += 8) {
+        for (i = 0; i < N; i += 16) {
             __m128 t1 = _mm_load_ps(in + i);
             __m128 t2 = _mm_load_ps(in + N - i - 4);
             __m128 t3 = _mm_load_ps(A + i);
@@ -418,6 +556,34 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
             _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
             _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)),
             _mm_xor_ps(t4, c0))));
+
+            t1 = _mm_load_ps(in + i + 8);
+            t2 = _mm_load_ps(in + N - i - 12);
+            t3 = _mm_load_ps(A + i + 8);
+            t4 = _mm_load_ps(B + i + 8);
+
+            _mm_store_ps(buf + i + 8, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+                _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)),
+                _mm_xor_ps(t4, c0))));
+
+            t0 = _mm_load_ps(in + N - i - 16);
+            t1 = _mm_load_ps(in + i + 12);
+            t3 = _mm_load_ps(A + i + 12);
+            t4 = _mm_load_ps(B + i + 12);
+
+            _mm_store_ps(buf + i + 12, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+                _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)),
+                _mm_xor_ps(t4, c0))));
         }
     }
 #else
--
cgit v1.1
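A note for readers skimming the patch: both ffts_execute_1d_real and ffts_execute_1d_real_inv previously advanced their SSE/SSE3 loops by 8 floats (32 bytes) per iteration; the patch unrolls them to 16 floats (64 bytes), so each iteration consumes exactly one cache line and pays the loop branch half as often. The guard on the small-size path also changes from N < 8 to FFTS_UNLIKELY(N <= 8), so N == 8 is handled by straight-line code and the branch is hinted as rarely taken.

Below is a minimal standalone sketch of the same unrolling pattern, not code from ffts_real.c; the function scale_by and its workload are invented for illustration. Like the patched loops, it assumes 16-byte-aligned buffers and a length that is a multiple of 16, as _mm_load_ps/_mm_store_ps require.

/* Illustrative sketch only: process one 64-byte cache line (four
 * __m128 registers = 16 floats) per loop iteration. */
#include <xmmintrin.h>

static void scale_by(float *out, const float *in, float s, int n)
{
    const __m128 vs = _mm_set1_ps(s); /* broadcast s to all four lanes */
    int i;

    /* 16 floats = 64 bytes per iteration, matching the patch's stride */
    for (i = 0; i < n; i += 16) {
        _mm_store_ps(out + i,      _mm_mul_ps(_mm_load_ps(in + i),      vs));
        _mm_store_ps(out + i + 4,  _mm_mul_ps(_mm_load_ps(in + i + 4),  vs));
        _mm_store_ps(out + i + 8,  _mm_mul_ps(_mm_load_ps(in + i + 8),  vs));
        _mm_store_ps(out + i + 12, _mm_mul_ps(_mm_load_ps(in + i + 12), vs));
    }
}

The real loops above additionally carry registers across the unrolled steps (e.g. t0 holds a mirrored load that the next block reuses), which a straight unroll like this sketch does not need; the 64-byte stride is the shared idea.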