path: root/src/ffts_real.c
author     Jukka Ojanen <jukka.ojanen@linkotec.net>    2015-07-09 15:37:53 +0300
committer  Jukka Ojanen <jukka.ojanen@linkotec.net>    2015-07-09 15:37:53 +0300
commit     7e018bb933d5291155739614e422773c4c2d8781 (patch)
tree       446f20ed4b744803a6338033d1d14478ed1d45a5 /src/ffts_real.c
parent     06eb1e603f9527c1cf205d630fa5c58bd808f9fb (diff)
Unroll loops to process 64 byte cache line per iteration
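
The stride of the main SSE3/SSE loops grows from 8 to 16 floats, i.e. from 32 to 64 bytes of each stream per iteration, and the small-size path now also covers N == 8 without entering the loop. Below is a minimal sketch of the unrolling pattern only, with hypothetical names (scale4x is illustrative, not FFTS code); a scalar reference for the recombination kernel itself appears after the diff.

#include <xmmintrin.h>
#include <stddef.h>

/* Illustrative only: the same unrolling idea as the hunks below, reduced to a
 * trivial kernel. Four 16-byte SSE stores per iteration cover one 64-byte
 * cache line. scale4x() and its arguments are hypothetical, not FFTS code. */
static void scale4x(float *dst, const float *src, float k, size_t n)
{
    const __m128 vk = _mm_set1_ps(k);
    size_t i;

    /* assumes n is a multiple of 16 and dst/src are 16-byte aligned,
     * as holds for the power-of-two sized buffers in ffts_real.c */
    for (i = 0; i < n; i += 16) {
        _mm_store_ps(dst + i,      _mm_mul_ps(vk, _mm_load_ps(src + i)));
        _mm_store_ps(dst + i +  4, _mm_mul_ps(vk, _mm_load_ps(src + i +  4)));
        _mm_store_ps(dst + i +  8, _mm_mul_ps(vk, _mm_load_ps(src + i +  8)));
        _mm_store_ps(dst + i + 12, _mm_mul_ps(vk, _mm_load_ps(src + i + 12)));
    }
}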
Diffstat (limited to 'src/ffts_real.c')
-rw-r--r--  src/ffts_real.c | 244
1 file changed, 205 insertions(+), 39 deletions(-)
diff --git a/src/ffts_real.c b/src/ffts_real.c
index a737696..0327f15 100644
--- a/src/ffts_real.c
+++ b/src/ffts_real.c
@@ -152,22 +152,36 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
);
}
#elif HAVE_SSE3
- if (N < 8) {
+ if (FFTS_UNLIKELY(N <= 8)) {
__m128 t0 = _mm_load_ps(buf);
- __m128 t1 = _mm_load_ps(A);
- __m128 t2 = _mm_load_ps(B);
+ __m128 t1 = _mm_load_ps(buf + N - 4);
+ __m128 t2 = _mm_load_ps(A);
+ __m128 t3 = _mm_load_ps(B);
_mm_store_ps(out, _mm_add_ps(_mm_addsub_ps(
- _mm_mul_ps(t0, _mm_moveldup_ps(t1)),
+ _mm_mul_ps(t0, _mm_moveldup_ps(t2)),
_mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,3,0,1)),
- _mm_movehdup_ps(t1))), _mm_addsub_ps(
- _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3,3,1,1)),
- _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1))),
- _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,2,0,0)), t2))));
+ _mm_movehdup_ps(t2))), _mm_addsub_ps(
+ _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), t3))));
+
+ if (N == 8) {
+ t2 = _mm_load_ps(A + 4);
+ t3 = _mm_load_ps(B + 4);
+
+ _mm_store_ps(out + 4, _mm_add_ps(_mm_addsub_ps(
+ _mm_mul_ps(t1, _mm_moveldup_ps(t2)),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_movehdup_ps(t2))), _mm_addsub_ps(
+ _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1))),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(2,2,0,0)), t3))));
+ }
} else {
__m128 t0 = _mm_load_ps(buf);
- for (i = 0; i < N; i += 8) {
+ for (i = 0; i < N; i += 16) {
__m128 t1 = _mm_load_ps(buf + i);
__m128 t2 = _mm_load_ps(buf + N - i - 4);
__m128 t3 = _mm_load_ps(A + i);
@@ -193,28 +207,69 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
_mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
_mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
_mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+ t1 = _mm_load_ps(buf + i + 8);
+ t2 = _mm_load_ps(buf + N - i - 12);
+ t3 = _mm_load_ps(A + i + 8);
+ t4 = _mm_load_ps(B + i + 8);
+
+ _mm_store_ps(out + i + 8, _mm_add_ps(_mm_addsub_ps(
+ _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_movehdup_ps(t3))), _mm_addsub_ps(
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+ t0 = _mm_load_ps(buf + N - i - 16);
+ t1 = _mm_load_ps(buf + i + 12);
+ t3 = _mm_load_ps(A + i + 12);
+ t4 = _mm_load_ps(B + i + 12);
+
+ _mm_store_ps(out + i + 12, _mm_add_ps(_mm_addsub_ps(
+ _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_movehdup_ps(t3))), _mm_addsub_ps(
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
}
}
#elif HAVE_SSE
- if (N < 8) {
+ if (FFTS_UNLIKELY(N <= 8)) {
__m128 c0 = _mm_load_ps((const float*) sign_mask_even);
__m128 t0 = _mm_load_ps(buf);
- __m128 t1 = _mm_load_ps(A);
- __m128 t2 = _mm_load_ps(B);
+ __m128 t1 = _mm_load_ps(buf + N - 4);
+ __m128 t2 = _mm_load_ps(A);
+ __m128 t3 = _mm_load_ps(B);
_mm_store_ps(out, _mm_add_ps(_mm_add_ps(_mm_add_ps(
- _mm_mul_ps(t0, _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,2,0,0))),
+ _mm_mul_ps(t0, _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,0,0))),
_mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,3,0,1)),
- _mm_xor_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3,3,1,1)), c0))),
- _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,2,0,0)), t2)),
- _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3,3,1,1)),
- _mm_shuffle_ps(_mm_xor_ps(t2, c0), _mm_xor_ps(t2, c0),
+ _mm_xor_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3,3,1,1)), c0))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), t3)),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(_mm_xor_ps(t3, c0), _mm_xor_ps(t3, c0),
_MM_SHUFFLE(2,3,0,1)))));
+
+ if (N == 8) {
+ t2 = _mm_load_ps(A + 4);
+ t3 = _mm_load_ps(B + 4);
+
+ _mm_store_ps(out + 4, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+ _mm_mul_ps(t1, _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,0,0))),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_xor_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3,3,1,1)), c0))),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(2,2,0,0)), t3)),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(_mm_xor_ps(t3, c0), _mm_xor_ps(t3, c0),
+ _MM_SHUFFLE(2,3,0,1)))));
+ }
} else {
__m128 c0 = _mm_load_ps((const float*) sign_mask_even);
__m128 t0 = _mm_load_ps(buf);
- for (i = 0; i < N; i += 8) {
+ for (i = 0; i < N; i += 16) {
__m128 t1 = _mm_load_ps(buf + i);
__m128 t2 = _mm_load_ps(buf + N - i - 4);
__m128 t3 = _mm_load_ps(A + i);
@@ -242,6 +297,34 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
_mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
_mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0),
_MM_SHUFFLE(2,3,0,1)))));
+
+ t1 = _mm_load_ps(buf + i + 8);
+ t2 = _mm_load_ps(buf + N - i - 12);
+ t3 = _mm_load_ps(A + i + 8);
+ t4 = _mm_load_ps(B + i + 8);
+
+ _mm_store_ps(out + i + 8, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+ _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4)),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0),
+ _MM_SHUFFLE(2,3,0,1)))));
+
+ t0 = _mm_load_ps(buf + N - i - 16);
+ t1 = _mm_load_ps(buf + i + 12);
+ t3 = _mm_load_ps(A + i + 12);
+ t4 = _mm_load_ps(B + i + 12);
+
+ _mm_store_ps(out + i + 12, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+ _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0),
+ _MM_SHUFFLE(2,3,0,1)))));
}
}
#else
@@ -326,23 +409,37 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
);
}
#elif HAVE_SSE3
- if (N < 8) {
- __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[4]);
+ if (FFTS_UNLIKELY(N <= 8)) {
+ __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
__m128 t1 = _mm_load_ps(in);
- __m128 t2 = _mm_load_ps(A);
- __m128 t3 = _mm_load_ps(B);
+ __m128 t2 = _mm_load_ps(in + N - 4);
+ __m128 t3 = _mm_load_ps(A);
+ __m128 t4 = _mm_load_ps(B);
_mm_store_ps(buf, _mm_sub_ps(_mm_addsub_ps(
- _mm_mul_ps(t1, _mm_moveldup_ps(t2)),
+ _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
_mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
- _mm_movehdup_ps(t2))), _mm_addsub_ps(
- _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)),
- _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1))),
- _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), t3))));
+ _mm_movehdup_ps(t3))), _mm_addsub_ps(
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+ if (N == 8) {
+ t3 = _mm_load_ps(A + 4);
+ t4 = _mm_load_ps(B + 4);
+
+ _mm_store_ps(buf + 4, _mm_sub_ps(_mm_addsub_ps(
+ _mm_mul_ps(t2, _mm_moveldup_ps(t3)),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)),
+ _mm_movehdup_ps(t3))), _mm_addsub_ps(
+ _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,2,0,0)), t4))));
+ }
} else {
__m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
- for (i = 0; i < N; i += 8) {
+ for (i = 0; i < N; i += 16) {
__m128 t1 = _mm_load_ps(in + i);
__m128 t2 = _mm_load_ps(in + N - i - 4);
__m128 t3 = _mm_load_ps(A + i);
@@ -368,29 +465,70 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
_mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
_mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
_mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+ t1 = _mm_load_ps(in + i + 8);
+ t2 = _mm_load_ps(in + N - i - 12);
+ t3 = _mm_load_ps(A + i + 8);
+ t4 = _mm_load_ps(B + i + 8);
+
+ _mm_store_ps(buf + i + 8, _mm_sub_ps(_mm_addsub_ps(
+ _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_movehdup_ps(t3))), _mm_addsub_ps(
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+ t0 = _mm_load_ps(in + N - i - 16);
+ t1 = _mm_load_ps(in + i + 12);
+ t3 = _mm_load_ps(A + i + 12);
+ t4 = _mm_load_ps(B + i + 12);
+
+ _mm_store_ps(buf + i + 12, _mm_sub_ps(_mm_addsub_ps(
+ _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_movehdup_ps(t3))), _mm_addsub_ps(
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
}
}
#elif HAVE_SSE
- if (N < 8) {
+ if (FFTS_UNLIKELY(N <= 8)) {
__m128 c0 = _mm_load_ps((const float*) sign_mask_odd);
- __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[4]);
+ __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
__m128 t1 = _mm_load_ps(in);
- __m128 t2 = _mm_load_ps(A);
- __m128 t3 = _mm_load_ps(B);
+ __m128 t2 = _mm_load_ps(in + N - 4);
+ __m128 t3 = _mm_load_ps(A);
+ __m128 t4 = _mm_load_ps(B);
_mm_store_ps(buf, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
- _mm_mul_ps(t1, _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,0,0))),
+ _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
_mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
- _mm_xor_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3,3,1,1)), c0))),
- _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)),
- _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1)))),
- _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)),
- _mm_xor_ps(t3, c0))));
+ _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)),
+ _mm_xor_ps(t4, c0))));
+
+ if (N == 8) {
+ t3 = _mm_load_ps(A + 4);
+ t4 = _mm_load_ps(B + 4);
+
+ _mm_store_ps(buf + 4, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+ _mm_mul_ps(t2, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)),
+ _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,2,0,0)),
+ _mm_xor_ps(t4, c0))));
+ }
} else {
__m128 c0 = _mm_load_ps((const float*) sign_mask_odd);
__m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
- for (i = 0; i < N; i += 8) {
+ for (i = 0; i < N; i += 16) {
__m128 t1 = _mm_load_ps(in + i);
__m128 t2 = _mm_load_ps(in + N - i - 4);
__m128 t3 = _mm_load_ps(A + i);
@@ -418,6 +556,34 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
_mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
_mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)),
_mm_xor_ps(t4, c0))));
+
+ t1 = _mm_load_ps(in + i + 8);
+ t2 = _mm_load_ps(in + N - i - 12);
+ t3 = _mm_load_ps(A + i + 8);
+ t4 = _mm_load_ps(B + i + 8);
+
+ _mm_store_ps(buf + i + 8, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+ _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+ _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)),
+ _mm_xor_ps(t4, c0))));
+
+ t0 = _mm_load_ps(in + N - i - 16);
+ t1 = _mm_load_ps(in + i + 12);
+ t3 = _mm_load_ps(A + i + 12);
+ t4 = _mm_load_ps(B + i + 12);
+
+ _mm_store_ps(buf + i + 12, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+ _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+ _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+ _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+ _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+ _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)),
+ _mm_xor_ps(t4, c0))));
}
}
#else
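
For reference, a scalar sketch of the forward recombination step these hunks vectorize, written against the conventional real-FFT post-processing G[k] = Z[k]*A[k] + conj(Z[N/2-k])*B[k]. The complex-typed helper and its indexing are an assumption for illustration and do not mirror the interleaved float layout FFTS actually uses.

#include <complex.h>
#include <stddef.h>

/* Hedged scalar reference (not FFTS code): combine the half-length complex
 * transform z[] with precomputed twiddle tables a[] and b[] to obtain the
 * spectrum of the real input. half_n is assumed to be a power of two so the
 * mirrored index can wrap with a mask. */
static void real_fft_combine(float complex *out, const float complex *z,
                             const float complex *a, const float complex *b,
                             size_t half_n)
{
    size_t k;

    for (k = 0; k < half_n; k++) {
        float complex zm = conjf(z[(half_n - k) & (half_n - 1)]);
        out[k] = z[k] * a[k] + zm * b[k];
    }
}

The inverse path in ffts_execute_1d_real_inv follows the same load/shuffle structure, with the combining signs flipped (the SSE3 hunks use _mm_sub_ps where the forward path uses _mm_add_ps).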