summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJukka Ojanen <jukka.ojanen@linkotec.net>2015-07-06 12:10:17 +0300
committerJukka Ojanen <jukka.ojanen@linkotec.net>2015-07-06 12:10:17 +0300
commit6bf4e36dd29a12136f018c208f830dbaac05f182 (patch)
tree2d7c36c48f0863b53f3eea35311173c0d26d6d52
parentfbcfb21e9de85b6443848c721523d3793ae668ff (diff)
downloadffts-6bf4e36dd29a12136f018c208f830dbaac05f182.zip
ffts-6bf4e36dd29a12136f018c208f830dbaac05f182.tar.gz
SSE optimized versions of ffts_execute_1d_real and ffts_execute_1d_real_inv
-rw-r--r--src/ffts_real.c104
1 files changed, 100 insertions, 4 deletions
diff --git a/src/ffts_real.c b/src/ffts_real.c
index 82a9e79..0dd24d8 100644
--- a/src/ffts_real.c
+++ b/src/ffts_real.c
@@ -134,10 +134,58 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
+#elif HAVE_SSE
+ if (N < 8) {
+ for (i = 0; i < N/2; i++) {
+ out[2*i + 0] =
+ buf[ 2*i + 0] * A[2*i + 0] - buf[ 2*i + 1] * A[2*i + 1] +
+ buf[N - 2*i + 0] * B[2*i + 0] + buf[N - 2*i + 1] * B[2*i + 1];
+ out[2*i + 1] =
+ buf[ 2*i + 1] * A[2*i + 0] + buf[ 2*i + 0] * A[2*i + 1] +
+ buf[N - 2*i + 0] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0];
+ }
+ } else {
+ const __m128 c0 = _mm_set_ps(0.0f, -0.0f, 0.0f, -0.0f);
+ __m128 t0 = _mm_load_ps(buf);
+
+ for (i = 0; i < N; i += 8) {
+ __m128 t1 = _mm_load_ps(buf + i);
+ __m128 t2 = _mm_load_ps(buf + N - i - 4);
+ __m128 t3 = _mm_load_ps(A + i);
+ __m128 t4 = _mm_load_ps(B + i);
+
+ _mm_store_ps(out + i, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+ _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4)),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0),
+ _MM_SHUFFLE(2,3,0,1)))));
+
+ t0 = _mm_load_ps(buf + N - i - 8);
+ t1 = _mm_load_ps(buf + i + 4);
+ t3 = _mm_load_ps(A + i + 4);
+ t4 = _mm_load_ps(B + i + 4);
+
+ _mm_store_ps(out + i + 4, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+ _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0),
+ _MM_SHUFFLE(2,3,0,1)))));
+ }
+ }
#else
for (i = 0; i < N/2; i++) {
- out[2*i + 0] = buf[2*i + 0] * A[2*i] - buf[2*i + 1] * A[2*i + 1] + buf[N - 2*i] * B[2*i + 0] + buf[N - 2*i + 1] * B[2*i + 1];
- out[2*i + 1] = buf[2*i + 1] * A[2*i] + buf[2*i + 0] * A[2*i + 1] + buf[N - 2*i] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0];
+ out[2*i + 0] =
+ buf[ 2*i + 0] * A[2*i + 0] - buf[ 2*i + 1] * A[2*i + 1] +
+ buf[N - 2*i + 0] * B[2*i + 0] + buf[N - 2*i + 1] * B[2*i + 1];
+ out[2*i + 1] =
+ buf[ 2*i + 1] * A[2*i + 0] + buf[ 2*i + 0] * A[2*i + 1] +
+ buf[N - 2*i + 0] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0];
}
#endif
@@ -211,10 +259,58 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
+#elif HAVE_SSE
+ if (N < 8) {
+ for (i = 0; i < N/2; i++) {
+ buf[2*i + 0] =
+ in[ 2*i + 0] * A[2*i + 0] + in[ 2*i + 1] * A[2*i + 1] +
+ in[N - 2*i + 0] * B[2*i + 0] - in[N - 2*i + 1] * B[2*i + 1];
+ buf[2*i + 1] =
+ in[ 2*i + 1] * A[2*i + 0] - in[ 2*i + 0] * A[2*i + 1] -
+ in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0];
+ }
+ } else {
+ const __m128 c0 = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f);
+ __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
+
+ for (i = 0; i < N; i += 8) {
+ __m128 t1 = _mm_load_ps(in + i);
+ __m128 t2 = _mm_load_ps(in + N - i - 4);
+ __m128 t3 = _mm_load_ps(A + i);
+ __m128 t4 = _mm_load_ps(B + i);
+
+ _mm_store_ps(buf + i, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+ _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)),
+ _mm_xor_ps(t4, c0))));
+
+ t0 = _mm_load_ps(in + N - i - 8);
+ t1 = _mm_load_ps(in + i + 4);
+ t3 = _mm_load_ps(A + i + 4);
+ t4 = _mm_load_ps(B + i + 4);
+
+ _mm_store_ps(buf + i + 4, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+ _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)),
+ _mm_xor_ps(t4, c0))));
+ }
+ }
#else
for (i = 0; i < N/2; i++) {
- buf[2*i + 0] = in[2*i + 0] * A[2*i] + in[2*i + 1] * A[2*i + 1] + in[N - 2*i] * B[2*i + 0] - in[N - 2*i + 1] * B[2*i + 1];
- buf[2*i + 1] = in[2*i + 1] * A[2*i] - in[2*i + 0] * A[2*i + 1] - in[N - 2*i] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0];
+ buf[2*i + 0] =
+ in[ 2*i + 0] * A[2*i + 0] + in[ 2*i + 1] * A[2*i + 1] +
+ in[N - 2*i + 0] * B[2*i + 0] - in[N - 2*i + 1] * B[2*i + 1];
+ buf[2*i + 1] =
+ in[ 2*i + 1] * A[2*i + 0] - in[ 2*i + 0] * A[2*i + 1] -
+ in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0];
}
#endif
OpenPOWER on IntegriCloud