From 95c783a2afd9a2e299812be2623fecd415bd1c41 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Tue, 7 Jul 2015 19:14:22 +0300 Subject: Add SSE3 optimized version of ffts_execute_1d_real_inv --- src/ffts_real.c | 98 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 20 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index f3fbaae..5c01103 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -47,9 +47,12 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #else /* avoid using negative zero as some configurations have problems with those */ -static const FFTS_ALIGN(16) unsigned int sign_mask[4] = { +static const FFTS_ALIGN(16) unsigned int sign_mask_even[4] = { 0x80000000, 0, 0x80000000, 0 }; +static const FFTS_ALIGN(16) unsigned int sign_mask_odd[4] = { + 0, 0x80000000, 0, 0x80000000 +}; #endif #endif @@ -150,9 +153,9 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) } #elif HAVE_SSE3 if (N < 8) { - const __m128 t0 = _mm_load_ps(buf); - const __m128 t1 = _mm_load_ps(A); - const __m128 t2 = _mm_load_ps(B); + __m128 t0 = _mm_load_ps(buf); + __m128 t1 = _mm_load_ps(A); + __m128 t2 = _mm_load_ps(B); _mm_store_ps(out, _mm_add_ps(_mm_addsub_ps( _mm_mul_ps(t0, _mm_moveldup_ps(t1)), @@ -194,10 +197,10 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) } #elif HAVE_SSE if (N < 8) { - const __m128 c0 = _mm_load_ps((const float*) sign_mask); - const __m128 t0 = _mm_load_ps(buf); - const __m128 t1 = _mm_load_ps(A); - const __m128 t2 = _mm_load_ps(B); + __m128 c0 = _mm_load_ps((const float*) sign_mask_even); + __m128 t0 = _mm_load_ps(buf); + __m128 t1 = _mm_load_ps(A); + __m128 t2 = _mm_load_ps(B); _mm_store_ps(out, _mm_add_ps(_mm_add_ps(_mm_add_ps( _mm_mul_ps(t0, _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,2,0,0))), @@ -208,7 +211,7 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) _mm_shuffle_ps(_mm_xor_ps(t2, c0), _mm_xor_ps(t2, c0), _MM_SHUFFLE(2,3,0,1))))); } else { - const __m128 c0 = _mm_load_ps((const float*) sign_mask); + __m128 c0 = _mm_load_ps((const float*) sign_mask_even); __m128 t0 = _mm_load_ps(buf); for (i = 0; i < N; i += 8) { @@ -322,18 +325,69 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -#elif HAVE_SSE +#elif HAVE_SSE3 if (N < 8) { - for (i = 0; i < N/2; i++) { - buf[2*i + 0] = - in[ 2*i + 0] * A[2*i + 0] + in[ 2*i + 1] * A[2*i + 1] + - in[N - 2*i + 0] * B[2*i + 0] - in[N - 2*i + 1] * B[2*i + 1]; - buf[2*i + 1] = - in[ 2*i + 1] * A[2*i + 0] - in[ 2*i + 0] * A[2*i + 1] - - in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0]; + __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[4]); + __m128 t1 = _mm_load_ps(in); + __m128 t2 = _mm_load_ps(A); + __m128 t3 = _mm_load_ps(B); + + _mm_store_ps(buf, _mm_sub_ps(_mm_addsub_ps( + _mm_mul_ps(t1, _mm_moveldup_ps(t2)), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t2))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), t3)))); + } else { + __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]); + + for (i = 0; i < N; i += 8) { + __m128 t1 = _mm_load_ps(in + i); + __m128 t2 = _mm_load_ps(in + N - i - 4); + __m128 t3 = _mm_load_ps(A + i); + __m128 t4 = _mm_load_ps(B + i); + + _mm_store_ps(buf + i, _mm_sub_ps(_mm_addsub_ps( + _mm_mul_ps(t1, _mm_moveldup_ps(t3)), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t3))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4)))); + + t0 = _mm_load_ps(in + N - i - 8); + t1 = _mm_load_ps(in + i + 4); + t3 = _mm_load_ps(A + i + 4); + t4 = _mm_load_ps(B + i + 4); + + _mm_store_ps(buf + i + 4, _mm_sub_ps(_mm_addsub_ps( + _mm_mul_ps(t1, _mm_moveldup_ps(t3)), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t3))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)))); } + } +#elif HAVE_SSE + if (N < 8) { + __m128 c0 = _mm_load_ps((const float*) sign_mask_odd); + __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[4]); + __m128 t1 = _mm_load_ps(in); + __m128 t2 = _mm_load_ps(A); + __m128 t3 = _mm_load_ps(B); + + _mm_store_ps(buf, _mm_add_ps(_mm_sub_ps(_mm_add_ps( + _mm_mul_ps(t1, _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_xor_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1)))), + _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), + _mm_xor_ps(t3, c0)))); } else { - const __m128 c0 = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f); + __m128 c0 = _mm_load_ps((const float*) sign_mask_odd); __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]); for (i = 0; i < N; i += 8) { @@ -341,7 +395,7 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) __m128 t2 = _mm_load_ps(in + N - i - 4); __m128 t3 = _mm_load_ps(A + i); __m128 t4 = _mm_load_ps(B + i); - + _mm_store_ps(buf + i, _mm_add_ps(_mm_sub_ps(_mm_add_ps( _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))), _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), @@ -429,14 +483,18 @@ ffts_init_1d_real(size_t N, int sign) #ifdef HAVE_SSE3 p->B[2 * i + 0] = (float) (-0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); #else - p->B[2 * i + 0] = (float) (0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); + p->B[2 * i + 0] = (float) ( 0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); #endif p->B[2 * i + 1] = (float) ( 0.5 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i))); } } else { for (i = 0; i < N/2; i++) { p->A[2 * i + 0] = (float) (1.0 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i))); +#ifdef HAVE_SSE3 + p->A[2 * i + 1] = (float) (1.0 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i))); +#else p->A[2 * i + 1] = (float) (1.0 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i))); +#endif p->B[2 * i + 0] = (float) (1.0 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); p->B[2 * i + 1] = (float) (1.0 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i))); } -- cgit v1.1