summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJukka Ojanen <jukka.ojanen@linkotec.net>2015-07-07 12:50:30 +0300
committerJukka Ojanen <jukka.ojanen@linkotec.net>2015-07-07 12:50:30 +0300
commitea0c10a22b233af7ef9ddd9bd6b71d3ab9208cff (patch)
tree164a7cd5abcd7e0b20b69f9a9238f2f08ea7bb22
parent8c3b06d4790ef37d541212bdc689f5b0ecab7245 (diff)
downloadffts-ea0c10a22b233af7ef9ddd9bd6b71d3ab9208cff.zip
ffts-ea0c10a22b233af7ef9ddd9bd6b71d3ab9208cff.tar.gz
Add SSE3 optimized version of ffts_execute_1d_real
-rw-r--r--src/ffts_real.c93
1 files changed, 80 insertions, 13 deletions
diff --git a/src/ffts_real.c b/src/ffts_real.c
index f3b5126..f3fbaae 100644
--- a/src/ffts_real.c
+++ b/src/ffts_real.c
@@ -39,6 +39,18 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <arm_neon.h>
#elif HAVE_SSE
#include <xmmintrin.h>
+
+/* check if have SSE3 intrinsics */
+#ifdef HAVE_PMMINTRIN_H
+#include <pmmintrin.h>
+#elif HAVE_INTRIN_H
+#include <intrin.h>
+#else
+/* avoid using negative zero as some configurations have problems with those */
+static const FFTS_ALIGN(16) unsigned int sign_mask[4] = {
+ 0x80000000, 0, 0x80000000, 0
+};
+#endif
#endif
static void
@@ -88,8 +100,10 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
p->plans[0]->transform(p->plans[0], input, buf);
+#ifndef HAVE_SSE
buf[N + 0] = buf[0];
buf[N + 1] = buf[1];
+#endif
#ifdef __ARM_NEON__
for (i = 0; i < N/2; i += 2) {
@@ -134,18 +148,67 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-#elif HAVE_SSE
+#elif HAVE_SSE3
if (N < 8) {
- for (i = 0; i < N/2; i++) {
- out[2*i + 0] =
- buf[ 2*i + 0] * A[2*i + 0] - buf[ 2*i + 1] * A[2*i + 1] +
- buf[N - 2*i + 0] * B[2*i + 0] + buf[N - 2*i + 1] * B[2*i + 1];
- out[2*i + 1] =
- buf[ 2*i + 1] * A[2*i + 0] + buf[ 2*i + 0] * A[2*i + 1] +
- buf[N - 2*i + 0] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0];
+ const __m128 t0 = _mm_load_ps(buf);
+ const __m128 t1 = _mm_load_ps(A);
+ const __m128 t2 = _mm_load_ps(B);
+
+ _mm_store_ps(out, _mm_add_ps(_mm_addsub_ps(
+ _mm_mul_ps(t0, _mm_moveldup_ps(t1)),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,3,0,1)),
+ _mm_movehdup_ps(t1))), _mm_addsub_ps(
+ _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,2,0,0)), t2))));
+ } else {
+ __m128 t0 = _mm_load_ps(buf);
+
+ for (i = 0; i < N; i += 8) {
+ __m128 t1 = _mm_load_ps(buf + i);
+ __m128 t2 = _mm_load_ps(buf + N - i - 4);
+ __m128 t3 = _mm_load_ps(A + i);
+ __m128 t4 = _mm_load_ps(B + i);
+
+ _mm_store_ps(out + i, _mm_add_ps(_mm_addsub_ps(
+ _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_movehdup_ps(t3))), _mm_addsub_ps(
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+ t0 = _mm_load_ps(buf + N - i - 8);
+ t1 = _mm_load_ps(buf + i + 4);
+ t3 = _mm_load_ps(A + i + 4);
+ t4 = _mm_load_ps(B + i + 4);
+
+ _mm_store_ps(out + i + 4, _mm_add_ps(_mm_addsub_ps(
+ _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_movehdup_ps(t3))), _mm_addsub_ps(
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
}
+ }
+#elif HAVE_SSE
+ if (N < 8) {
+ const __m128 c0 = _mm_load_ps((const float*) sign_mask);
+ const __m128 t0 = _mm_load_ps(buf);
+ const __m128 t1 = _mm_load_ps(A);
+ const __m128 t2 = _mm_load_ps(B);
+
+ _mm_store_ps(out, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+ _mm_mul_ps(t0, _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,2,0,0))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,3,0,1)),
+ _mm_xor_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3,3,1,1)), c0))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,2,0,0)), t2)),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(_mm_xor_ps(t2, c0), _mm_xor_ps(t2, c0),
+ _MM_SHUFFLE(2,3,0,1)))));
} else {
- const __m128 c0 = _mm_set_ps(0.0f, -0.0f, 0.0f, -0.0f);
+ const __m128 c0 = _mm_load_ps((const float*) sign_mask);
__m128 t0 = _mm_load_ps(buf);
for (i = 0; i < N; i += 8) {
@@ -278,7 +341,7 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
__m128 t2 = _mm_load_ps(in + N - i - 4);
__m128 t3 = _mm_load_ps(A + i);
__m128 t4 = _mm_load_ps(B + i);
-
+
_mm_store_ps(buf + i, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
_mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
_mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
@@ -361,10 +424,14 @@ ffts_init_1d_real(size_t N, int sign)
if (sign < 0) {
for (i = 0; i < N/2; i++) {
- p->A[2 * i + 0] = (float) (0.5 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i)));
- p->A[2 * i + 1] = (float) (0.5 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i)));
+ p->A[2 * i + 0] = (float) ( 0.5 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i)));
+ p->A[2 * i + 1] = (float) ( 0.5 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i)));
+#ifdef HAVE_SSE3
+ p->B[2 * i + 0] = (float) (-0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i)));
+#else
p->B[2 * i + 0] = (float) (0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i)));
- p->B[2 * i + 1] = (float) (0.5 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i)));
+#endif
+ p->B[2 * i + 1] = (float) ( 0.5 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i)));
}
} else {
for (i = 0; i < N/2; i++) {
OpenPOWER on IntegriCloud