summary | refs | log | tree | commit | diff | stats
path: root/src/ffts_nd.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/ffts_nd.c')
-rw-r--r-- src/ffts_nd.c 112
1 files changed, 1 insertions, 111 deletions
diff --git a/src/ffts_nd.c b/src/ffts_nd.c
index 49c6229..64220f1 100644
--- a/src/ffts_nd.c
+++ b/src/ffts_nd.c
@@ -34,15 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ffts_nd.h"
#include "ffts_internal.h"
-
-#ifdef HAVE_NEON
-#include "neon.h"
-#include <arm_neon.h>
-#elif HAVE_SSE2
-#include <emmintrin.h>
-#endif
-
-#define TSIZE 8
+#include "ffts_transpose.h"
static void
ffts_free_nd(ffts_plan_t *p)
@@ -86,108 +78,6 @@ ffts_free_nd(ffts_plan_t *p)
}
static void
-ffts_transpose(uint64_t *in, uint64_t *out, int w, int h)
-{
-#ifdef HAVE_NEON
-#if 0
- neon_transpose4(in, out, w, h);
-#else
- neon_transpose8(in, out, w, h);
-#endif
-#else
-#ifdef HAVE_SSE
- uint64_t FFTS_ALIGN(64) tmp[TSIZE*TSIZE];
- int tx, ty;
- /* int x; */
- int y;
- int tw = w / TSIZE;
- int th = h / TSIZE;
-
- for (ty = 0; ty < th; ty++) {
- for (tx = 0; tx < tw; tx++) {
- uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE;
- uint64_t *op0 = tmp; /* out + h*TSIZE*tx + ty*TSIZE; */
-
- /* copy/transpose to tmp */
- for (y = 0; y < TSIZE; y += 2) {
- /* for (x=0;x<TSIZE;x+=2) {
- op[x*TSIZE] = ip[x];
- */
- __m128d q0 = _mm_load_pd((double*)(ip0 + 0*w));
- __m128d q1 = _mm_load_pd((double*)(ip0 + 1*w));
- __m128d q2 = _mm_load_pd((double*)(ip0 + 2*w));
- __m128d q3 = _mm_load_pd((double*)(ip0 + 3*w));
- __m128d q4 = _mm_load_pd((double*)(ip0 + 4*w));
- __m128d q5 = _mm_load_pd((double*)(ip0 + 5*w));
- __m128d q6 = _mm_load_pd((double*)(ip0 + 6*w));
- __m128d q7 = _mm_load_pd((double*)(ip0 + 7*w));
-
- __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
- __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
- __m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0));
- __m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1));
- __m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0));
- __m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1));
- __m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0));
- __m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1));
-
- ip0 += 2;
- /* _mm_store_pd((double *)(op0 + y*h + x), t0);
- _mm_store_pd((double *)(op0 + y*h + x + h), t1);
- */
-
- _mm_store_pd((double*)(op0 + 0 ), t0);
- _mm_store_pd((double*)(op0 + 0 + TSIZE), t1);
- _mm_store_pd((double*)(op0 + 2 ), t2);
- _mm_store_pd((double*)(op0 + 2 + TSIZE), t3);
- _mm_store_pd((double*)(op0 + 4 ), t4);
- _mm_store_pd((double*)(op0 + 4 + TSIZE), t5);
- _mm_store_pd((double*)(op0 + 6 ), t6);
- _mm_store_pd((double*)(op0 + 6 + TSIZE), t7);
- /* } */
-
- op0 += 2*TSIZE;
- }
-
- op0 = out + h*tx*TSIZE + ty*TSIZE;
- ip0 = tmp;
- for (y = 0; y < TSIZE; y += 1) {
- /* memcpy(op0, ip0, TSIZE * sizeof(*ip0)); */
-
- __m128d q0 = _mm_load_pd((double*)(ip0 + 0));
- __m128d q1 = _mm_load_pd((double*)(ip0 + 2));
- __m128d q2 = _mm_load_pd((double*)(ip0 + 4));
- __m128d q3 = _mm_load_pd((double*)(ip0 + 6));
-
- _mm_store_pd((double*)(op0 + 0), q0);
- _mm_store_pd((double*)(op0 + 2), q1);
- _mm_store_pd((double*)(op0 + 4), q2);
- _mm_store_pd((double*)(op0 + 6), q3);
-
- op0 += h;
- ip0 += TSIZE;
- }
- }
- }
- /*
- size_t i,j;
- for(i=0;i<w;i+=2) {
- for(j=0;j<h;j+=2) {
- // out[i*h + j] = in[j*w + i];
- __m128d q0 = _mm_load_pd((double *)(in + j*w + i));
- __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w));
- __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
- __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
- _mm_store_pd((double *)(out + i*h + j), t0);
- _mm_store_pd((double *)(out + i*h + j + h), t1);
- }
- }
- */
-#endif
-#endif
-}
-
-static void
ffts_execute_nd(ffts_plan_t *p, const void *in, void *out)
{
uint64_t *din = (uint64_t*) in;
OpenPOWER on IntegriCloud