summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJukka Ojanen <jukka.ojanen@linkotec.net>2015-07-07 19:14:22 +0300
committerJukka Ojanen <jukka.ojanen@linkotec.net>2015-07-07 19:14:22 +0300
commit95c783a2afd9a2e299812be2623fecd415bd1c41 (patch)
tree726e9853dee1ade228ef5d99b9e2a1014440fae1
parentea0c10a22b233af7ef9ddd9bd6b71d3ab9208cff (diff)
downloadffts-95c783a2afd9a2e299812be2623fecd415bd1c41.zip
ffts-95c783a2afd9a2e299812be2623fecd415bd1c41.tar.gz
Add SSE3 optimized version of ffts_execute_1d_real_inv
-rw-r--r--src/ffts_real.c98
1 files changed, 78 insertions, 20 deletions
diff --git a/src/ffts_real.c b/src/ffts_real.c
index f3fbaae..5c01103 100644
--- a/src/ffts_real.c
+++ b/src/ffts_real.c
@@ -47,9 +47,12 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <intrin.h>
#else
/* avoid using negative zero as some configurations have problems with those */
-static const FFTS_ALIGN(16) unsigned int sign_mask[4] = {
+static const FFTS_ALIGN(16) unsigned int sign_mask_even[4] = {
0x80000000, 0, 0x80000000, 0
};
+static const FFTS_ALIGN(16) unsigned int sign_mask_odd[4] = {
+ 0, 0x80000000, 0, 0x80000000
+};
#endif
#endif
@@ -150,9 +153,9 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
}
#elif HAVE_SSE3
if (N < 8) {
- const __m128 t0 = _mm_load_ps(buf);
- const __m128 t1 = _mm_load_ps(A);
- const __m128 t2 = _mm_load_ps(B);
+ __m128 t0 = _mm_load_ps(buf);
+ __m128 t1 = _mm_load_ps(A);
+ __m128 t2 = _mm_load_ps(B);
_mm_store_ps(out, _mm_add_ps(_mm_addsub_ps(
_mm_mul_ps(t0, _mm_moveldup_ps(t1)),
@@ -194,10 +197,10 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
}
#elif HAVE_SSE
if (N < 8) {
- const __m128 c0 = _mm_load_ps((const float*) sign_mask);
- const __m128 t0 = _mm_load_ps(buf);
- const __m128 t1 = _mm_load_ps(A);
- const __m128 t2 = _mm_load_ps(B);
+ __m128 c0 = _mm_load_ps((const float*) sign_mask_even);
+ __m128 t0 = _mm_load_ps(buf);
+ __m128 t1 = _mm_load_ps(A);
+ __m128 t2 = _mm_load_ps(B);
_mm_store_ps(out, _mm_add_ps(_mm_add_ps(_mm_add_ps(
_mm_mul_ps(t0, _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,2,0,0))),
@@ -208,7 +211,7 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
_mm_shuffle_ps(_mm_xor_ps(t2, c0), _mm_xor_ps(t2, c0),
_MM_SHUFFLE(2,3,0,1)))));
} else {
- const __m128 c0 = _mm_load_ps((const float*) sign_mask);
+ __m128 c0 = _mm_load_ps((const float*) sign_mask_even);
__m128 t0 = _mm_load_ps(buf);
for (i = 0; i < N; i += 8) {
@@ -322,18 +325,69 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-#elif HAVE_SSE
+#elif HAVE_SSE3
if (N < 8) {
- for (i = 0; i < N/2; i++) {
- buf[2*i + 0] =
- in[ 2*i + 0] * A[2*i + 0] + in[ 2*i + 1] * A[2*i + 1] +
- in[N - 2*i + 0] * B[2*i + 0] - in[N - 2*i + 1] * B[2*i + 1];
- buf[2*i + 1] =
- in[ 2*i + 1] * A[2*i + 0] - in[ 2*i + 0] * A[2*i + 1] -
- in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0];
+ __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[4]);
+ __m128 t1 = _mm_load_ps(in);
+ __m128 t2 = _mm_load_ps(A);
+ __m128 t3 = _mm_load_ps(B);
+
+ _mm_store_ps(buf, _mm_sub_ps(_mm_addsub_ps(
+ _mm_mul_ps(t1, _mm_moveldup_ps(t2)),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_movehdup_ps(t2))), _mm_addsub_ps(
+ _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), t3))));
+ } else {
+ __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
+
+ for (i = 0; i < N; i += 8) {
+ __m128 t1 = _mm_load_ps(in + i);
+ __m128 t2 = _mm_load_ps(in + N - i - 4);
+ __m128 t3 = _mm_load_ps(A + i);
+ __m128 t4 = _mm_load_ps(B + i);
+
+ _mm_store_ps(buf + i, _mm_sub_ps(_mm_addsub_ps(
+ _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_movehdup_ps(t3))), _mm_addsub_ps(
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+ t0 = _mm_load_ps(in + N - i - 8);
+ t1 = _mm_load_ps(in + i + 4);
+ t3 = _mm_load_ps(A + i + 4);
+ t4 = _mm_load_ps(B + i + 4);
+
+ _mm_store_ps(buf + i + 4, _mm_sub_ps(_mm_addsub_ps(
+ _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_movehdup_ps(t3))), _mm_addsub_ps(
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
}
+ }
+#elif HAVE_SSE
+ if (N < 8) {
+ __m128 c0 = _mm_load_ps((const float*) sign_mask_odd);
+ __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[4]);
+ __m128 t1 = _mm_load_ps(in);
+ __m128 t2 = _mm_load_ps(A);
+ __m128 t3 = _mm_load_ps(B);
+
+ _mm_store_ps(buf, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+ _mm_mul_ps(t1, _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,0,0))),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_xor_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3,3,1,1)), c0))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1)))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)),
+ _mm_xor_ps(t3, c0))));
} else {
- const __m128 c0 = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f);
+ __m128 c0 = _mm_load_ps((const float*) sign_mask_odd);
__m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
for (i = 0; i < N; i += 8) {
@@ -341,7 +395,7 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
__m128 t2 = _mm_load_ps(in + N - i - 4);
__m128 t3 = _mm_load_ps(A + i);
__m128 t4 = _mm_load_ps(B + i);
-
+
_mm_store_ps(buf + i, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
_mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
_mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
@@ -429,14 +483,18 @@ ffts_init_1d_real(size_t N, int sign)
#ifdef HAVE_SSE3
p->B[2 * i + 0] = (float) (-0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i)));
#else
- p->B[2 * i + 0] = (float) (0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i)));
+ p->B[2 * i + 0] = (float) ( 0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i)));
#endif
p->B[2 * i + 1] = (float) ( 0.5 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i)));
}
} else {
for (i = 0; i < N/2; i++) {
p->A[2 * i + 0] = (float) (1.0 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i)));
+#ifdef HAVE_SSE3
+ p->A[2 * i + 1] = (float) (1.0 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i)));
+#else
p->A[2 * i + 1] = (float) (1.0 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i)));
+#endif
p->B[2 * i + 0] = (float) (1.0 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i)));
p->B[2 * i + 1] = (float) (1.0 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i)));
}
OpenPOWER on IntegriCloud