-rw-r--r-- | config.h.in | 10
-rwxr-xr-x | configure | 102
-rw-r--r-- | configure.ac | 13
-rw-r--r-- | include/ffts.h | 4
-rw-r--r-- | src/Makefile.am | 1
-rw-r--r-- | src/Makefile.in | 1
-rw-r--r-- | src/cp_sse.c | 446
-rw-r--r-- | src/cp_sse.h | 11
-rw-r--r-- | src/macros.h | 528
-rw-r--r-- | src/neon_float.h | 1037
-rw-r--r-- | src/patterns.c | 54
-rw-r--r-- | src/patterns.h | 6
-rw-r--r-- | src/sse_float.h | 33
13 files changed, 1790 insertions, 456 deletions
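
For context, the public API after this patch (see the include/ffts.h hunk below): the plan now records N, ffts_execute() no longer takes an N argument, and ffts_free() is new. A minimal usage sketch, not part of the patch; it assumes that valloc() provides sufficiently aligned buffers (the NEON path's FFTS_MALLOC is defined as valloc) and that sign = -1 selects the forward transform:

#include <complex.h>
#include <stdlib.h>
#include "ffts.h"

int main(void) {
    size_t N = 64;
    ffts_plan_t *p = ffts_init(N, -1);  /* sign = -1: assumed forward */
    /* page-aligned allocations, covering the 16/32-byte alignment requirement */
    complex float *in  = valloc(N * sizeof(complex float));
    complex float *out = valloc(N * sizeof(complex float));
    for (size_t i = 0; i < N; i++) in[i] = (float)i;
    ffts_execute(p, in, out);  /* N now comes from the plan, not the call */
    ffts_free(p);              /* new in this commit */
    free(in); free(out);
    return 0;
}
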
diff --git a/config.h.in b/config.h.in index 69071a2..7922cd6 100644 --- a/config.h.in +++ b/config.h.in @@ -12,13 +12,12 @@ /* Define to 1 if you have the `m' library (-lm). */ #undef HAVE_LIBM -/* Define to 1 if your system has a GNU libc compatible `malloc' function, and - to 0 otherwise. */ -#undef HAVE_MALLOC - /* Define to 1 if you have the <memory.h> header file. */ #undef HAVE_MEMORY_H +/* Define to FFT with ARM NEON. */ +#undef HAVE_NEON + /* Define to 1 if you have the `pow' function. */ #undef HAVE_POW @@ -97,9 +96,6 @@ such a type exists and the standard includes do not define it. */ #undef int32_t -/* Define to rpl_malloc if the replacement function should be used. */ -#undef malloc - /* Define to the equivalent of the C99 'restrict' keyword, or to nothing if this is not supported. Do not define if restrict is supported directly. */ @@ -628,6 +628,8 @@ LIBOBJS EGREP GREP CPP +HAVE_NEON_FALSE +HAVE_NEON_TRUE am__fastdepCC_FALSE am__fastdepCC_TRUE CCDEPMODE @@ -717,6 +719,7 @@ ac_user_opts=' enable_option_checking enable_dependency_tracking enable_single +enable_neon ' ac_precious_vars='build_alias host_alias @@ -1349,6 +1352,7 @@ Optional Features: --disable-dependency-tracking speeds up one-time build --enable-single compile single-precision library + --enable-neon enable NEON extensions Some influential environment variables: CXX C++ compiler command @@ -4270,6 +4274,7 @@ fi #SFFT_CFLAGS="$CFLAGS" #SFFT_CC="$CC" +SIMD=sse # Check whether --enable-single was given. if test "${enable_single+set}" = set; then : @@ -4289,6 +4294,31 @@ $as_echo "#define FFTS_PREC_SINGLE 0" >>confdefs.h fi +# Check whether --enable-neon was given. +if test "${enable_neon+set}" = set; then : + enableval=$enable_neon; have_neon=$enableval +else + have_neon=no +fi + +if test "$have_neon" = "yes"; then + if test "$SIMD" != "sse"; then + as_fn_error $? "conflicting SIMD extensions specified" "$LINENO" 5 + fi + +$as_echo "#define HAVE_NEON 1" >>confdefs.h + +fi + if test "$have_neon" = "yes"; then + HAVE_NEON_TRUE= + HAVE_NEON_FALSE='#' +else + HAVE_NEON_TRUE='#' + HAVE_NEON_FALSE= +fi + + + #if test "$ord_sr" = "no"; then # AC_DEFINE(SFFT_ORD_SR,0,[Define to enable ordinary split radix.]) #fi @@ -4968,73 +4998,7 @@ _ACEOF # Checks for library functions. -for ac_header in stdlib.h -do : - ac_fn_c_check_header_mongrel "$LINENO" "stdlib.h" "ac_cv_header_stdlib_h" "$ac_includes_default" -if test "x$ac_cv_header_stdlib_h" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_STDLIB_H 1 -_ACEOF - -fi - -done - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for GNU libc compatible malloc" >&5 -$as_echo_n "checking for GNU libc compatible malloc... " >&6; } -if ${ac_cv_func_malloc_0_nonnull+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test "$cross_compiling" = yes; then : - ac_cv_func_malloc_0_nonnull=no -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#if defined STDC_HEADERS || defined HAVE_STDLIB_H -# include <stdlib.h> -#else -char *malloc (); -#endif - -int -main () -{ -return ! 
malloc (0); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_run "$LINENO"; then : - ac_cv_func_malloc_0_nonnull=yes -else - ac_cv_func_malloc_0_nonnull=no -fi -rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ - conftest.$ac_objext conftest.beam conftest.$ac_ext -fi - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_malloc_0_nonnull" >&5 -$as_echo "$ac_cv_func_malloc_0_nonnull" >&6; } -if test $ac_cv_func_malloc_0_nonnull = yes; then : - -$as_echo "#define HAVE_MALLOC 1" >>confdefs.h - -else - $as_echo "#define HAVE_MALLOC 0" >>confdefs.h - - case " $LIBOBJS " in - *" malloc.$ac_objext "* ) ;; - *) LIBOBJS="$LIBOBJS malloc.$ac_objext" - ;; -esac - - -$as_echo "#define malloc rpl_malloc" >>confdefs.h - -fi - - +#AC_FUNC_MALLOC for ac_func in gettimeofday pow do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` @@ -5188,6 +5152,10 @@ if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then as_fn_error $? "conditional \"am__fastdepCC\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${HAVE_NEON_TRUE}" && test -z "${HAVE_NEON_FALSE}"; then + as_fn_error $? "conditional \"HAVE_NEON\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi : "${CONFIG_STATUS=./config.status}" ac_write_fail=0 diff --git a/configure.ac b/configure.ac index 06cb913..25d6c71 100644 --- a/configure.ac +++ b/configure.ac @@ -20,6 +20,7 @@ AC_PROG_CC #SFFT_CFLAGS="$CFLAGS" #SFFT_CC="$CC" +SIMD=sse AC_ARG_ENABLE(single, [AC_HELP_STRING([--enable-single],[compile single-precision library])], sfft_single=$enableval, sfft_single=no) if test "$sfft_single" = "yes"; then @@ -29,6 +30,16 @@ if test "$sfft_single" = "no"; then AC_DEFINE(FFTS_PREC_SINGLE,0,[Define to FFT in single precision.]) fi +AC_ARG_ENABLE(neon, [AC_HELP_STRING([--enable-neon],[enable NEON extensions])], have_neon=$enableval, have_neon=no) +if test "$have_neon" = "yes"; then + if test "$SIMD" != "sse"; then + AC_MSG_ERROR([conflicting SIMD extensions specified]) + fi + AC_DEFINE(HAVE_NEON,1,[Define to FFT with ARM NEON.]) +fi +AM_CONDITIONAL(HAVE_NEON, test "$have_neon" = "yes") + + #if test "$ord_sr" = "no"; then # AC_DEFINE(SFFT_ORD_SR,0,[Define to enable ordinary split radix.]) #fi @@ -49,7 +60,7 @@ AC_TYPE_SIZE_T AC_TYPE_UINT64_T # Checks for library functions. 
-AC_FUNC_MALLOC +#AC_FUNC_MALLOC AC_CHECK_FUNCS([gettimeofday pow]) diff --git a/include/ffts.h b/include/ffts.h index e266491..9bd0dbe 100644 --- a/include/ffts.h +++ b/include/ffts.h @@ -45,7 +45,7 @@ struct _ffts_plan_t { ptrdiff_t *is; ptrdiff_t *offsets; void __attribute__ ((aligned(32))) **ws; - void (*firstpass)(const float * restrict, float * restrict, size_t, struct _ffts_plan_t * restrict); + void (*firstpass)(const float * restrict, float * restrict, struct _ffts_plan_t * restrict); size_t i0, i1, i2; uint64_t n_bits, leaftime; @@ -57,6 +57,6 @@ typedef struct _ffts_plan_t ffts_plan_t; void ffts_execute(ffts_plan_t * restrict, const void * restrict, const void * restrict); ffts_plan_t *ffts_init(size_t N, int sign); - +void ffts_free(ffts_plan_t *); #endif diff --git a/src/Makefile.am b/src/Makefile.am index a2ea644..f005968 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -8,6 +8,7 @@ all: $(OBJLIBS) %.o: %.c $(HDRS) $(CC) $(CFLAGS) -c -o $@ $< -I../include + $(CC) $(CFLAGS) -S $< -I../include $(OBJLIBS): $(OBJS) $(AR) rcs libffts.a $(OBJS) diff --git a/src/Makefile.in b/src/Makefile.in index fbc26bb..340688f 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -346,6 +346,7 @@ all: $(OBJLIBS) %.o: %.c $(HDRS) $(CC) $(CFLAGS) -c -o $@ $< -I../include + $(CC) $(CFLAGS) -S $< -I../include $(OBJLIBS): $(OBJS) $(AR) rcs libffts.a $(OBJS) diff --git a/src/cp_sse.c b/src/cp_sse.c index 8b09031..2ac13c4 100644 --- a/src/cp_sse.c +++ b/src/cp_sse.c @@ -3,213 +3,174 @@ #include "patterns.h" __INLINE void -firstpass_type_1(const float * restrict in, float * restrict out, size_t N, ffts_plan_t * restrict p) { - size_t i, i0 = p->i0, i1 = p->i1; +firstpass_type_1(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) { + size_t i, ii0 = p->i0, ii1 = p->i1; size_t *offsets = (size_t *)p->offsets; size_t *is = (size_t *)p->is; - for(i=i0;i>0;--i) LEAF_EE(&is, in, &offsets, out); - for(i=i1;i>0;--i) LEAF_OO(&is, in, &offsets, out); +#ifdef __ARM_NEON__ + const data_t *i0=in+is[0],*i1=in+is[1],*i2=in+is[2],*i3=in+is[3],*i4=in+is[4],*i5=in+is[5],*i6=in+is[6],*i7=in+is[7]; + for(i=ii0;i>0;--i) { + neon_shl8_ee(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7); + offsets += 2; + } + for(i=ii1;i>0;--i) { + neon_shl8_oo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5); + offsets += 2; + } + neon_shl8_oe(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5); + offsets += 2; + for(i=ii1;i>0;--i) { + neon_shl8_ee(out+offsets[0],out+offsets[1],&i6,&i7,&i4,&i5,&i0,&i1,&i3,&i2); + offsets += 2; + } + +#else + for(i=ii0;i>0;--i) LEAF_EE(&is, in, &offsets, out); + for(i=ii1;i>0;--i) LEAF_OO(&is, in, &offsets, out); LEAF_OE(&is, in, &offsets, out); - for(i=i1;i>0;--i) LEAF_EE(&is, in, &offsets, out); + for(i=ii1;i>0;--i) LEAF_EE(&is, in, &offsets, out); +#endif } __INLINE void -firstpass_type_2(const float * restrict in, float * restrict out, size_t N, ffts_plan_t * restrict p) { - size_t i, i0 = p->i0, i1 = p->i1; +firstpass_type_2(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) { + size_t i, ii0 = p->i0, ii1 = p->i1; size_t *offsets = (size_t *)p->offsets; size_t *is = (size_t *)p->is; - for(i=i0;i>0;--i) LEAF_EE(&is, in, &offsets, out); +#ifdef __ARM_NEON__ + const data_t *i0=in+is[0],*i1=in+is[1],*i2=in+is[2],*i3=in+is[3],*i4=in+is[4],*i5=in+is[5],*i6=in+is[6],*i7=in+is[7]; + + for(i=ii0;i>0;--i) { + neon_shl8_ee(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7); + offsets+=2; + } + 
neon_shl8_eo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7); + offsets += 2; + for(i=ii1;i>0;--i) { + neon_shl8_oo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5); + offsets += 2; + } + for(i=ii1;i>0;--i) { + neon_shl8_ee(out+offsets[0],out+offsets[1],&i6,&i7,&i4,&i5,&i0,&i1,&i3,&i2); + offsets += 2; + } + +#else + for(i=ii0;i>0;--i) LEAF_EE(&is, in, &offsets, out); LEAF_EO(&is, in, &offsets, out); - for(i=i1;i>0;--i) LEAF_OO(&is, in, &offsets, out); - for(i=i1;i>0;--i) LEAF_EE(&is, in, &offsets, out); + for(i=ii1;i>0;--i) LEAF_OO(&is, in, &offsets, out); + for(i=ii1;i>0;--i) LEAF_EE(&is, in, &offsets, out); +#endif } __INLINE void -firstpass_64(const float * restrict in, float * restrict out, size_t N, ffts_plan_t * restrict p) { +firstpass_64(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) { size_t *offsets = (size_t *)p->offsets; size_t *is = (size_t *)p->is; LEAF_EE(&is, in, &offsets, out); LEAF_OE(&is, in, &offsets, out); } -void -firstpass_32(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { - __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; - float *LUT8 = p->ws[0]; - float *LUT16 = p->ws[1]; - float *LUT32 = p->ws[2]; - - L_4_4(in+0,in+32,in+16,in+48,&r0_1,&r2_3,&r16_17,&r18_19); - L_2_2(in+8,in+40,in+56,in+24,&r4_5,&r6_7,&r20_21,&r22_23); - K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); - L_4_2(in+4,in+36,in+20,in+52,&r8_9,&r10_11,&r28_29,&r30_31); - L_4_4(in+60,in+28,in+12,in+44,&r12_13,&r14_15,&r24_25,&r26_27); - K_N(_mm_load_ps(LUT16),_mm_load_ps(LUT16+4),&r0_1,&r4_5,&r8_9,&r12_13); - K_N(_mm_load_ps(LUT16+8),_mm_load_ps(LUT16+12),&r2_3,&r6_7,&r10_11,&r14_15); - K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r16_17,&r18_19,&r20_21,&r22_23); - K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r24_25,&r26_27,&r28_29,&r30_31); - K_N(_mm_load_ps(LUT32),_mm_load_ps(LUT32+4),&r0_1,&r8_9,&r16_17,&r24_25); - S_4(r0_1,r8_9,r16_17,r24_25,out+0,out+16,out+32,out+48); - K_N(_mm_load_ps(LUT32+8),_mm_load_ps(LUT32+12),&r2_3,&r10_11,&r18_19,&r26_27); - S_4(r2_3,r10_11,r18_19,r26_27,out+4,out+20,out+36,out+52); - K_N(_mm_load_ps(LUT32+16),_mm_load_ps(LUT32+20),&r4_5,&r12_13,&r20_21,&r28_29); - S_4(r4_5,r12_13,r20_21,r28_29,out+8,out+24,out+40,out+56); - K_N(_mm_load_ps(LUT32+24),_mm_load_ps(LUT32+28),&r6_7,&r14_15,&r22_23,&r30_31); - S_4(r6_7,r14_15,r22_23,r30_31,out+12,out+28,out+44,out+60); - -} - -void -firstpass_16(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { - __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; - float *LUT8 = p->ws[0]; - float *LUT16 = p->ws[1]; - - L_4_4(in+0,in+16,in+8,in+24,&r0_1,&r2_3,&r8_9,&r10_11); - L_2_4(in+4,in+20,in+28,in+12,&r4_5,&r6_7,&r14_15,&r12_13); - K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); - K_N(_mm_load_ps(LUT16),_mm_load_ps(LUT16+4),&r0_1,&r4_5,&r8_9,&r12_13); - S_4(r0_1,r4_5,r8_9,r12_13,out+0,out+8,out+16,out+24); - K_N(_mm_load_ps(LUT16+8),_mm_load_ps(LUT16+12),&r2_3,&r6_7,&r10_11,&r14_15); - S_4(r2_3,r6_7,r10_11,r14_15,out+4,out+12,out+20,out+28); -} - -void -firstpass_8(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { - __m128 r0_1,r2_3,r4_5,r6_7; - float *LUT8 = p->ws[0]; - L_4_2(in+0,in+8,in+4,in+12,&r0_1,&r2_3,&r4_5,&r6_7); - K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); - S_4(r0_1,r2_3,r4_5,r6_7,out+0,out+4,out+8,out+12); -} -void 
-firstpass_4(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { - __m128 r0,r1,r2,r3; - L_4(in+0,in+4,in+2,in+6,&r0,&r1,&r2,&r3); - S_4(r0,r1,r2,r3,out+0,out+2,out+4,out+6); -} -void -firstpass_2(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { - __m128 r0,r1; - L_S2(in+0,in+2,&r0,&r1); - S_2(r0,r1,out+0,out+2); -} - -void X_8(data_t * restrict data0, size_t N, const data_t * restrict LUT) { - data_t *data2 = data0 + 2*N/4; - data_t *data4 = data0 + 4*N/4; - data_t *data6 = data0 + 6*N/4; - data_t *data1 = data0 + 1*N/4; - data_t *data3 = data0 + 3*N/4; - data_t *data5 = data0 + 5*N/4; - data_t *data7 = data0 + 7*N/4; - size_t k, n4 = N/4; - - for(k=N/8/2;k>0;--k) { - __m128 r0, r1, r2, r3, r4, r5, r6, r7; - r0 = _mm_load_ps(data0); - r1 = _mm_load_ps(data1); - r2 = _mm_load_ps(data2); - r3 = _mm_load_ps(data3); - K_N(_mm_load_ps(LUT), _mm_load_ps(LUT+4), &r0, &r1, &r2, &r3); - r4 = _mm_load_ps(data4); - r6 = _mm_load_ps(data6); - K_N(_mm_load_ps(LUT+8), _mm_load_ps(LUT+12), &r0, &r2, &r4, &r6); - r5 = _mm_load_ps(data5); - r7 = _mm_load_ps(data7); - K_N(_mm_load_ps(LUT+16), _mm_load_ps(LUT+20), &r1, &r3, &r5, &r7); - LUT += 24; - _mm_store_ps(data0, r0); data0 += 4; - _mm_store_ps(data1, r1); data1 += 4; - _mm_store_ps(data2, r2); data2 += 4; - _mm_store_ps(data3, r3); data3 += 4; - _mm_store_ps(data4, r4); data4 += 4; - _mm_store_ps(data5, r5); data5 += 4; - _mm_store_ps(data6, r6); data6 += 4; - _mm_store_ps(data7, r7); data7 += 4; - } -} -void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out, size_t N) { +void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out) { transform_index_t *ps = p->transforms; + int leafN = 8; + p->firstpass((const float *)in, (float *)out, p); + size_t ps0_next = ps[0]; + while(ps0_next) { + size_t ps0 = ps0_next; + size_t ps1 = ps[1]; + ps0_next = ps[2]; + ps += 2; - p->firstpass((const float *)in, (float *)out, N, p); - while(ps[0]) { - - if(ps[0] == 32) { + if(ps0 == 2*leafN) { float *LUT = (float *)p->ws[0]; - float *data = (float *)(out) + ps[1]; - size_t n = 32; - size_t i; - for(i=0;i<n/4/2;i++) { - __m128 uk = _mm_load_ps(data); - __m128 uk2 = _mm_load_ps(data + 2*n/4); - __m128 zk_p = _mm_load_ps(data + 4*n/4); - __m128 zk_n = _mm_load_ps(data + 6*n/4); - - K_N(_mm_load_ps(LUT), _mm_load_ps(LUT+4), &uk, &uk2, &zk_p, &zk_n); - - _mm_store_ps(data, uk); - _mm_store_ps(data + 2*n/4, uk2); - _mm_store_ps(data + 4*n/4, zk_p); - _mm_store_ps(data + 6*n/4, zk_n); - - LUT += 8; - data += 4; - } - + float *data = (float *)(out) + ps1; + #ifdef __ARM_NEON__ + X_4_SPLIT(data, 16, LUT); + #else + X_4(data, 16, LUT); + #endif }else{ - int index = __builtin_ctzl(ps[0])-5; - float *LUT = (float *)p->ws[__builtin_ctzl(ps[0])-5]; - X_8(((float *)out) + ps[1], ps[0], LUT); + int index = __builtin_ctzl(ps0)-4; + float *LUT = (float *)p->ws[__builtin_ctzl(ps0)-4]; + #ifdef __ARM_NEON__ + X_8_SPLIT(((float *)out) + ps1, ps0, LUT); + #else + X_8(((float *)out) + ps1, ps0, LUT); + #endif } - ps += 2; + } + #ifdef __ARM_NEON__ + if(p->N>32) + X_8_SPLIT_T((float *)out, p->N, p->lastlut); + #endif } +void ffts_free(ffts_plan_t *p) { + + size_t i; + + if(p->ws) { + for(i=0;i<p->n_luts;i++) { + FFTS_FREE(p->ws[i]); + } + free(p->ws); + } + if(p->is) free(p->is); + if(p->offsets) free(p->offsets); + free(p->transforms); + + free(p); +} ffts_plan_t *ffts_init(size_t N, int sign) { ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); - size_t leafN = 16; + size_t 
leafN = 8; size_t i; - if(sign < 0) MULI_SIGN = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f); - else MULI_SIGN = _mm_set_ps(0.0f, -0.0f, 0.0f, -0.0f); + if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); + else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); + + if(sign < 0) SCALAR_MULI_SIGN = -0.0f*I; + else SCALAR_MULI_SIGN = -0.0f; if(N > 32) { - init_offsets(p, N, leafN); - init_is(p, N, leafN, 2); - init_tree(p, N, leafN); + ffts_init_offsets(p, N, leafN); + ffts_init_is(p, N, leafN, 2); + ffts_init_tree(p, N, leafN); - if(N == 64) p->firstpass = &firstpass_64; - else if(__builtin_ctzl(N) & 1) p->firstpass = &firstpass_type_2; - else p->firstpass = &firstpass_type_1; - - LEAFLUT[0] = _mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941); - LEAFLUT[1] = _mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); - LEAFLUT[2] = _mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011); - LEAFLUT[3] = _mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0.38268343236508978177923268049199,-0.38268343236508978177923268049199); - LEAFLUT[4] = _mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981); - LEAFLUT[5] = _mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.92387953251128673848313610506011,-0.92387953251128673848313610506011); + // if(N == 64) p->firstpass = &firstpass_64; + if(__builtin_ctzl(N) & 1) p->firstpass = &firstpass_type_1; + else p->firstpass = &firstpass_type_2; + + LEAFLUT[0] = VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941); + LEAFLUT[1] = VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); + LEAFLUT[2] = VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011); + LEAFLUT[3] = VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0.38268343236508978177923268049199,-0.38268343236508978177923268049199); + LEAFLUT[4] = VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981); + LEAFLUT[5] = VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.92387953251128673848313610506011,-0.92387953251128673848313610506011); - LEAFLUT[6] = _mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1); - LEAFLUT[7] = _mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0); - LEAFLUT[8] = _mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1); - LEAFLUT[9] = _mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0); - LEAFLUT[10] = _mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941); - LEAFLUT[11] = 
_mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); + LEAFLUT[6] = VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1); + LEAFLUT[7] = VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0); + LEAFLUT[8] = VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1); + LEAFLUT[9] = VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0); + LEAFLUT[10] = VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941); + LEAFLUT[11] = VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); if(sign > 0) { - LEAFLUT[1] = _mm_xor_ps(LEAFLUT[1], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); - LEAFLUT[3] = _mm_xor_ps(LEAFLUT[3], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); - LEAFLUT[5] = _mm_xor_ps(LEAFLUT[5], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); - LEAFLUT[7] = _mm_xor_ps(LEAFLUT[7], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); - LEAFLUT[9] = _mm_xor_ps(LEAFLUT[9], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); - LEAFLUT[11] = _mm_xor_ps(LEAFLUT[11], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); + V neg = VLIT4(-0.0f, -0.0f, -0.0f, -0.0f); + LEAFLUT[1] = VXOR(LEAFLUT[1], neg); + LEAFLUT[3] = VXOR(LEAFLUT[3], neg); + LEAFLUT[5] = VXOR(LEAFLUT[5], neg); + LEAFLUT[7] = VXOR(LEAFLUT[7], neg); + LEAFLUT[9] = VXOR(LEAFLUT[9], neg); + LEAFLUT[11] = VXOR(LEAFLUT[11], neg); } p->i0 = N/leafN/3+1; @@ -223,11 +184,14 @@ ffts_plan_t *ffts_init(size_t N, int sign) { p->transforms[0] = 0; p->transforms[1] = 1; if(N == 2) p->firstpass = &firstpass_2; - else if(N == 4) p->firstpass = &firstpass_4; + else if(N == 4 && sign == -1) p->firstpass = &firstpass_4_f; + else if(N == 4 && sign == 1) p->firstpass = &firstpass_4_b; else if(N == 8) p->firstpass = &firstpass_8; else if(N == 16) p->firstpass = &firstpass_16; else if(N == 32) p->firstpass = &firstpass_32; + p->is = NULL; + p->offsets = NULL; } int hardcoded = 0; @@ -236,9 +200,14 @@ ffts_plan_t *ffts_init(size_t N, int sign) { size_t n_luts = __builtin_ctzl(N/leafN); if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } + if(n_luts >= 32) n_luts = 0; - //printf("n_luts = %zu\n", n_luts); - p->ws = malloc(n_luts * sizeof(data_t *)); +// fprintf(stderr, "n_luts = %zu\n", n_luts); + if(n_luts) + p->ws = malloc(n_luts * sizeof(data_t *)); + else + p->ws = NULL; + cdata_t *w; int n = leafN*2; @@ -246,45 +215,67 @@ ffts_plan_t *ffts_init(size_t N, int sign) { for(i=0;i<n_luts;i++) { - //printf("LUT[%zu] = %d\n", i, n); +// fprintf(stderr, "LUT[%zu] = %d\n", i, n); if(!i || hardcoded) { - - w = _mm_malloc(n/4 * 2 * sizeof(cdata_t), 32); - - cdata_t *w0 = _mm_malloc(n/4 * sizeof(cdata_t), 32); + cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); size_t j; for(j=0;j<n/4;j++) { w0[j] = W(n,j); } - __m128 temp0, temp1, temp2; - float *fw = (float *)w; float *fw0 = (float *)w0; + #ifdef __ARM_NEON__ + if(N <= 32) { + w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); + float *fw = (float *)w; + V temp0, temp1, temp2; + for(j=0;j<n/4;j+=2) { + temp0 = VLD(fw0 + j*2); + V re, im; + re = VDUPRE(temp0); + im = VDUPIM(temp0); + im = VXOR(im, MULI_SIGN); + VST(fw + j*4 , re); + VST(fw + j*4+4, im); + } + }else{ + w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); + float *fw = (float *)w; + VS temp0, temp1, temp2; + for(j=0;j<n/4;j+=4) { + 
temp0 = VLD2(fw0 + j*2); + STORESPR(fw + j*2, temp0); + } + } + #else + w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); + float *fw = (float *)w; + V temp0, temp1, temp2; for(j=0;j<n/4;j+=2) { - temp0 = _mm_load_ps(fw0 + j*2); - __m128 re, im; - re = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(2, 2, 0, 0)); - im = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(3, 3, 1, 1)); - im = _mm_xor_ps(im, MULI_SIGN); - _mm_store_ps(fw + j*4 , re); - _mm_store_ps(fw + j*4+4, im); + temp0 = VLD(fw0 + j*2); + V re, im; + re = VDUPRE(temp0); + im = VDUPIM(temp0); + im = VXOR(im, MULI_SIGN); + VST(fw + j*4 , re); + VST(fw + j*4+4, im); } + #endif // for(j=0;j<n/2;j++) { // printf("%f %f\n", creal(w[j]), cimag(w[j])); // } - _mm_free(w0); + FFTS_FREE(w0); }else{ - w = _mm_malloc(n/8 * 3 * 2 * sizeof(cdata_t), 32); - cdata_t *w0 = _mm_malloc(n/8 * sizeof(cdata_t), 32); - cdata_t *w1 = _mm_malloc(n/8 * sizeof(cdata_t), 32); - cdata_t *w2 = _mm_malloc(n/8 * sizeof(cdata_t), 32); + cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); + cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); + cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); size_t j; for(j=0;j<n/8;j++) { @@ -294,46 +285,69 @@ ffts_plan_t *ffts_init(size_t N, int sign) { } - __m128 temp0, temp1, temp2, re, im; - - float *fw = (float *)w; float *fw0 = (float *)w0; float *fw1 = (float *)w1; float *fw2 = (float *)w2; + #ifdef __ARM_NEON__ + w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32); + float *fw = (float *)w; + VS temp0, temp1, temp2; + for(j=0;j<n/8;j+=4) { + temp0 = VLD2(fw0 + j*2); + STORESPR(fw + j*2*3, temp0); + //VST(fw + j*2*3, temp0.val[0]); + //VST(fw + j*2*3 + 4, temp0.val[1]); + temp1 = VLD2(fw1 + j*2); + STORESPR(fw + j*2*3 + 8, temp1); + //VST(fw + j*2*3 + 8, temp1.val[0]); + //VST(fw + j*2*3 + 12, temp1.val[1]); + temp2 = VLD2(fw2 + j*2); + STORESPR(fw + j*2*3 + 16, temp2); + //VST(fw + j*2*3 + 16, temp2.val[0]); + //VST(fw + j*2*3 + 20, temp2.val[1]); + #else + w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32); + float *fw = (float *)w; + V temp0, temp1, temp2, re, im; for(j=0;j<n/8;j+=2) { - temp0 = _mm_load_ps(fw0 + j*2); - re = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(2, 2, 0, 0)); - im = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(3, 3, 1, 1)); - im = _mm_xor_ps(im, MULI_SIGN); - _mm_store_ps(fw + j*2*6 , re); - _mm_store_ps(fw + j*2*6+4, im); - - temp1 = _mm_load_ps(fw1 + j*2); - re = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(2, 2, 0, 0)); - im = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 3, 1, 1)); - im = _mm_xor_ps(im, MULI_SIGN); - _mm_store_ps(fw + j*2*6+8 , re); - _mm_store_ps(fw + j*2*6+12, im); - - temp2 = _mm_load_ps(fw2 + j*2); - re = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(2, 2, 0, 0)); - im = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 1, 1)); - im = _mm_xor_ps(im, MULI_SIGN); - _mm_store_ps(fw + j*2*6+16, re); - _mm_store_ps(fw + j*2*6+20, im); + temp0 = VLD(fw0 + j*2); + re = VDUPRE(temp0); + im = VDUPIM(temp0); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6 , re); + VST(fw + j*2*6+4, im); + + temp1 = VLD(fw1 + j*2); + re = VDUPRE(temp1); + im = VDUPIM(temp1); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6+8 , re); + VST(fw + j*2*6+12, im); + + temp2 = VLD(fw2 + j*2); + re = VDUPRE(temp2); + im = VDUPIM(temp2); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6+16, re); + VST(fw + j*2*6+20, im); + #endif } - _mm_free(w0); - _mm_free(w1); - _mm_free(w2); + FFTS_FREE(w0); + FFTS_FREE(w1); + FFTS_FREE(w2); } p->ws[i] = w; n *= 2; } + p->N = N; + p->lastlut = w; + p->n_luts = n_luts; + +// fprintf(stderr, "sizeof(size_t) == 
%lu\n", sizeof(size_t)); - return p; } /* @@ -342,8 +356,8 @@ int main(int argc, char *argv[]) { int count = atoi(argv[2]); ffts_plan_t *p = ffts_init(n); - cdata_t *in = _mm_malloc(n * sizeof(cdata_t), 32); - cdata_t *out = _mm_malloc(n * sizeof(cdata_t), 32); + cdata_t *in = FFTS_MALLOC(n * sizeof(cdata_t), 32); + cdata_t *out = FFTS_MALLOC(n * sizeof(cdata_t), 32); size_t i; for(i=0;i<n;i++) in[i] = i; diff --git a/src/cp_sse.h b/src/cp_sse.h index 2c6825f..f94055d 100644 --- a/src/cp_sse.h +++ b/src/cp_sse.h @@ -6,11 +6,11 @@ #include <math.h> #include <complex.h> #include <stddef.h> -#include <xmmintrin.h> #include <stdint.h> +#include <stdalign.h> typedef complex float cdata_t; -typedef float data_t; +typedef alignas(16) float data_t; #define W(N,k) (cexp(-2.0f * M_PI * I * (float)(k) / (float)(N))) @@ -20,9 +20,10 @@ struct _ffts_plan_t { ptrdiff_t *is; ptrdiff_t *offsets; void __attribute__ ((aligned(32))) **ws; - void (*firstpass)(const float * restrict, float * restrict, size_t, struct _ffts_plan_t * restrict); - size_t i0, i1; - + void (*firstpass)(const float * restrict, float * restrict, struct _ffts_plan_t * restrict); + size_t i0, i1, n_luts; + size_t N; + void *lastlut; transform_index_t *transforms; }; diff --git a/src/macros.h b/src/macros.h index b2f44e6..039ee40 100644 --- a/src/macros.h +++ b/src/macros.h @@ -1,125 +1,121 @@ #ifndef __MACROS_H__ #define __MACROS_H__ +#include "../config.h" + +#ifdef HAVE_NEON + #include "neon_float.h" +#else + #include "sse_float.h" +#endif + + #include "cp_sse.h" #define __INLINE static inline __attribute__((always_inline)) -#define VLIT4 _mm_set_ps - -__m128 MULI_SIGN; +cdata_t SCALAR_MULI_SIGN; +V MULI_SIGN; +V LEAFLUT[12]; -__INLINE __m128 IMULI(__m128 a) { - __m128 temp = _mm_xor_ps(a, MULI_SIGN);//_mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f)); - return _mm_shuffle_ps(temp, temp, _MM_SHUFFLE(2,3,0,1)); +__INLINE V IMULI(V a) { + return VSWAPPAIRS(VXOR(a, MULI_SIGN)); } __INLINE void -S_4(__m128 r0, __m128 r1, __m128 r2, __m128 r3, data_t * restrict o0, data_t * restrict o1, data_t * restrict o2, data_t * restrict o3) { - __m128 t0, t1, t2, t3; - _mm_store_ps(o0, r0); - _mm_store_ps(o1, r1); - _mm_store_ps(o2, r2); - _mm_store_ps(o3, r3); +S_4(V r0, V r1, V r2, V r3, data_t * restrict o0, data_t * restrict o1, data_t * restrict o2, data_t * restrict o3) { + V t0, t1, t2, t3; + VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3); } -__INLINE void S_2(__m128 r0, __m128 r1, data_t * restrict o0, data_t * restrict o1) { - _mm_store_ps(o0, r0); - _mm_store_ps(o1, r1); +__INLINE void S_2(V r0, V r1, data_t * restrict o0, data_t * restrict o1) { + VST(o0, r0); VST(o1, r1); } -__INLINE void L_S2(const data_t * restrict i0, const data_t * restrict i1, __m128 * restrict r0, __m128 * restrict r1) { - __m128 t0, t1; - t0 = _mm_load_ps(i0); - t1 = _mm_load_ps(i1); - *r0 = _mm_add_ps(t0, t1); - *r1 = _mm_sub_ps(t0, t1); +__INLINE void L_S2(const data_t * restrict i0, const data_t * restrict i1, V * restrict r0, V * restrict r1) { + V t0, t1; + t0 = VLD(i0); t1 = VLD(i1); + *r0 = VADD(t0, t1); + *r1 = VSUB(t0, t1); } __INLINE void L_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { - __m128 t0, t1, t2, t3; - t0 = _mm_load_ps(i0); - t1 = _mm_load_ps(i1); - t2 = _mm_load_ps(i2); - t3 = _mm_load_ps(i3); - *r0 = _mm_add_ps (t0, t1); - *r1 = _mm_sub_ps (t0, t1); - *r2 = _mm_add_ps (t2, t3); - *r3 = _mm_sub_ps (t2, t3); + V *r0, V *r1, V *r2, V 
*r3) { + V t0, t1, t2, t3; + t0 = VLD(i0); + t1 = VLD(i1); + t2 = VLD(i2); + t3 = VLD(i3); + *r0 = VADD (t0, t1); + *r1 = VSUB (t0, t1); + *r2 = VADD (t2, t3); + *r3 = VSUB (t2, t3); } __INLINE void L_4(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { - __m128 t0, t1, t2, t3, t4, t5, t6, t7; - t0 = _mm_load_ps(i0); - t1 = _mm_load_ps(i1); - t2 = _mm_load_ps(i2); - t3 = _mm_load_ps(i3); - t4 = _mm_add_ps (t0, t1); - t5 = _mm_sub_ps (t0, t1); - t6 = _mm_add_ps (t2, t3); - t7 = IMULI(_mm_sub_ps (t2, t3)); - *r0 = _mm_add_ps (t4, t6); - *r2 = _mm_sub_ps (t4, t6); - *r1 = _mm_sub_ps (t5, t7); - *r3 = _mm_add_ps (t5, t7); + V *r0, V *r1, V *r2, V *r3) { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); + t4 = VADD (t0, t1); + t5 = VSUB (t0, t1); + t6 = VADD (t2, t3); + t7 = IMULI(VSUB (t2, t3)); + *r0 = VADD (t4, t6); + *r2 = VSUB (t4, t6); + *r1 = VSUB (t5, t7); + *r3 = VADD (t5, t7); } __INLINE void -K_0(__m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { - __m128 uk, uk2, zk, zk_d; - uk = *r0; - uk2 = *r1; - zk = _mm_add_ps(*r2, *r3); - zk_d = IMULI(_mm_sub_ps(*r2, *r3)); - *r0 = _mm_add_ps(uk, zk); - *r2 = _mm_sub_ps(uk, zk); - *r1 = _mm_sub_ps(uk2, zk_d); - *r3 = _mm_add_ps(uk2, zk_d); +K_0(V *r0, V *r1, V *r2, V *r3) { + V uk, uk2, zk, zk_d; + uk = *r0; uk2 = *r1; + zk = VADD(*r2, *r3); + zk_d = IMULI(VSUB(*r2, *r3)); + *r0 = VADD(uk, zk); + *r2 = VSUB(uk, zk); + *r1 = VSUB(uk2, zk_d); + *r3 = VADD(uk2, zk_d); } -__INLINE __m128 IMUL(__m128 d, __m128 re, __m128 im) { - re = _mm_mul_ps(re, d); - im = _mm_mul_ps(im, _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,3,0,1))); - return _mm_sub_ps(re, im); +__INLINE V IMUL(V d, V re, V im) { + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VSUB(re, im); } -__INLINE __m128 IMULJ(__m128 d, __m128 re, __m128 im) { - re = _mm_mul_ps(re, d); - im = _mm_mul_ps(im, _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,3,0,1))); - return _mm_add_ps(re, im); +__INLINE V IMULJ(V d, V re, V im) { + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VADD(re, im); } __INLINE void -K_N(__m128 re, __m128 im, __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { - __m128 uk, uk2, zk_p, zk_n, zk, zk_d; - - uk = *r0; - uk2 = *r1; +K_N(V re, V im, V *r0, V *r1, V *r2, V *r3) { + V uk, uk2, zk_p, zk_n, zk, zk_d; + uk = *r0; uk2 = *r1; zk_p = IMUL(*r2, re, im); zk_n = IMULJ(*r3, re, im); + + zk = VADD(zk_p, zk_n); + zk_d = IMULI(VSUB(zk_p, zk_n)); - zk = _mm_add_ps(zk_p, zk_n); - zk_d = IMULI(_mm_sub_ps(zk_p, zk_n)); - - *r2 = _mm_sub_ps(uk, zk); - *r0 = _mm_add_ps(uk, zk); - *r3 = _mm_add_ps(uk2, zk_d); - *r1 = _mm_sub_ps(uk2, zk_d); + *r2 = VSUB(uk, zk); + *r0 = VADD(uk, zk); + *r3 = VADD(uk2, zk_d); + *r1 = VSUB(uk2, zk_d); } -__INLINE void TX2(__m128 *a, __m128 *b) { - __m128 TX2_t0 = _mm_shuffle_ps(*a, *b, _MM_SHUFFLE(1,0,1,0)); - __m128 TX2_t1 = _mm_shuffle_ps(*a, *b, _MM_SHUFFLE(3,2,3,2)); +__INLINE void TX2(V *a, V *b) { + V TX2_t0 = VUNPACKLO(*a, *b); + V TX2_t1 = VUNPACKHI(*a, *b); *a = TX2_t0; *b = TX2_t1; } -__m128 __attribute__((aligned(32))) LEAFLUT[12]; __INLINE void LEAF_EE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { - __m128 r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; + V r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; data_t *out0 = out + (*out_offsets)[0]; data_t *out1 = out + (*out_offsets)[1]; @@ -147,7 +143,7 @@ LEAF_EE(size_t 
** restrict is, const data_t * restrict in, size_t ** restrict ou __INLINE void LEAF_OO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { - __m128 r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; + V r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; data_t *out0 = out + (*out_offsets)[0]; data_t *out1 = out + (*out_offsets)[1]; @@ -170,32 +166,123 @@ LEAF_OO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict ou *is += 16; } +#ifdef __ARM_NEON__ +__INLINE void +S_4_1(V r0, V r1, V r2, V r3, data_t * restrict o0, data_t * restrict o1, data_t * restrict o2, data_t * restrict o3) { + register V p0 __asm__ ("q0") = r0; register V p1 __asm__ ("q1") = r1; register V p2 __asm__ ("q2") = r2; register V p3 __asm__ ("q3") = r3; + __asm__ __volatile__ ("vst4.32 {%q1,%q2}, [%0, :128]!\n\t" + "vst4.32 {%q3,%q4}, [%0, :128]!\n\t" + : + : "r" (o0), "w" (p0), "w" (p1), "w" (p2), "w" (p3) + : "memory"); +} +__INLINE void +S_4_2(V r0, V r1, V r2, V r3, data_t * restrict o0, data_t * restrict o1, data_t * restrict o2, data_t * restrict o3) { + register V p0 __asm__ ("q4") = r0; register V p1 __asm__ ("q5") = r1; register V p2 __asm__ ("q6") = r2; register V p3 __asm__ ("q7") = r3; + __asm__ __volatile__ ("vst4.32 {%q1,%q2}, [%0, :128]!\n\t" + "vst4.32 {%q3,%q4}, [%0, :128]!\n\t" + : + : "r" (o0), "w" (p0), "w" (p1), "w" (p2), "w" (p3) + : "memory"); +} +__INLINE void +LEAF_EE8(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { + V r0,r1,r2,r3,r4,r5,r6,r7; + data_t *out0 = out + (*out_offsets)[0]; + data_t *out1 = out + (*out_offsets)[1]; + *out_offsets += 2; + + L_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0,&r1,&r2,&r3); + L_2(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4,&r5,&r6,&r7); + K_0(&r0,&r2,&r4,&r6); + K_N(LEAFLUT[0], LEAFLUT[1],&r1,&r3,&r5,&r7); + + register V p0 __asm__ ("q0") = r0; + register V p1 __asm__ ("q1") = r2; + register V p2 __asm__ ("q2") = r4; + register V p3 __asm__ ("q3") = r6; + register V p4 __asm__ ("q4") = r1; + register V p5 __asm__ ("q5") = r3; + register V p6 __asm__ ("q6") = r5; + register V p7 __asm__ ("q7") = r7; + + __asm__ __volatile__ ("vswp %f1,%e6\n\t" + "vswp %f2,%e7\n\t" + "vswp %f3,%e8\n\t" + "vswp %f4,%e9\n\t" + "vst4.32 {%q1,%q2}, [%0, :128]!\n\t" + "vst4.32 {%q3,%q4}, [%0, :128]!\n\t" + "vst4.32 {%q6,%q7}, [%5, :128]!\n\t" + "vst4.32 {%q8,%q9}, [%5, :128]!\n\t" + : + : "r" (out0), "w" (p0), "w" (p1), "w" (p2), "w" (p3), + "r" (out1), "w" (p4), "w" (p5), "w" (p6), "w" (p7) + : "memory"); +//TX2(&r0,&r1); TX2(&r2,&r3); TX2(&r4,&r5); TX2(&r6,&r7); +//S_4_1(r0,r2,r4,r6,out0+0,out0+4,out0+8,out0+12); +//S_4_2(r1,r3,r5,r7,out1+0,out1+4,out1+8,out1+12); + *is += 8; +} +__INLINE void +LEAF_OO8(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { + V r0,r1,r2,r3,r4,r5,r6,r7; + data_t *out0 = out + (*out_offsets)[0]; + data_t *out1 = out + (*out_offsets)[1]; + *out_offsets += 2; + L_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0,&r1,&r2,&r3); + L_4(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4,&r5,&r6,&r7); + register V p0 __asm__ ("q0") = r0; + register V p1 __asm__ ("q1") = r2; + register V p2 __asm__ ("q2") = r4; + register V p3 __asm__ ("q3") = r6; + register V p4 __asm__ ("q4") = r1; + register V p5 __asm__ ("q5") = r3; + register V p6 __asm__ ("q6") = r5; + register V p7 __asm__ ("q7") = r7; + __asm__ __volatile__ ("vswp %f1,%e6\n\t" + "vswp 
%f2,%e7\n\t" + "vswp %f3,%e8\n\t" + "vswp %f4,%e9\n\t" + "vst4.32 {%q1,%q2}, [%0, :128]!\n\t" + "vst4.32 {%q3,%q4}, [%0, :128]!\n\t" + "vst4.32 {%q6,%q7}, [%5, :128]!\n\t" + "vst4.32 {%q8,%q9}, [%5, :128]!\n\t" + : + : "r" (out0), "w" (p0), "w" (p1), "w" (p2), "w" (p3), + "r" (out1), "w" (p4), "w" (p5), "w" (p6), "w" (p7) + : "memory"); +//TX2(&r0,&r1); TX2(&r2,&r3); TX2(&r4,&r5); TX2(&r6,&r7); +//S_4_1(r0,r2,r4,r6,out0+0,out0+4,out0+8,out0+12); +//S_4_2(r1,r3,r5,r7,out1+0,out1+4,out1+8,out1+12); + *is += 8; +} +#endif __INLINE void L_4_4(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { - __m128 t0, t1, t2, t3, t4, t5, t6, t7; - t0 = _mm_load_ps(i0); t1 = _mm_load_ps(i1); t2 = _mm_load_ps(i2); t3 = _mm_load_ps(i3); - t4 = _mm_add_ps(t0, t1); - t5 = _mm_sub_ps(t0, t1); - t6 = _mm_add_ps(t2, t3); - t7 = IMULI(_mm_sub_ps(t2, t3)); - t0 = _mm_add_ps(t4, t6); - t2 = _mm_sub_ps(t4, t6); - t1 = _mm_sub_ps(t5, t7); - t3 = _mm_add_ps(t5, t7); + V *r0, V *r1, V *r2, V *r3) { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); + t4 = VADD(t0, t1); + t5 = VSUB(t0, t1); + t6 = VADD(t2, t3); + t7 = IMULI(VSUB(t2, t3)); + t0 = VADD(t4, t6); + t2 = VSUB(t4, t6); + t1 = VSUB(t5, t7); + t3 = VADD(t5, t7); TX2(&t0,&t1); TX2(&t2,&t3); *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3; } __INLINE void L_2_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { - __m128 t0, t1, t2, t3, t4, t5, t6, t7; - t0 = _mm_load_ps(i0); t1 = _mm_load_ps(i1); t2 = _mm_load_ps(i2); t3 = _mm_load_ps(i3); t4 = _mm_add_ps(t0, t1); - t5 = _mm_sub_ps(t0, t1); - t6 = _mm_add_ps(t2, t3); - t7 = _mm_sub_ps(t2, t3); + V *r0, V *r1, V *r2, V *r3) { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); t4 = VADD(t0, t1); + t5 = VSUB(t0, t1); + t6 = VADD(t2, t3); + t7 = VSUB(t2, t3); TX2(&t4,&t5); TX2(&t6,&t7); *r0 = t4; *r2 = t5; *r1 = t6; *r3 = t7; @@ -203,52 +290,49 @@ L_2_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * res __INLINE void L_2_4(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { - __m128 t0, t1, t2, t3, t4, t5, t6, t7; - t0 = _mm_load_ps(i0); t1 = _mm_load_ps(i1); t2 = _mm_load_ps(i2); t3 = _mm_load_ps(i3); - t4 = _mm_add_ps(t0, t1); - t5 = _mm_sub_ps(t0, t1); - t6 = _mm_add_ps(t2, t3); - t7 = _mm_sub_ps(t2, t3); - *r0 = _mm_shuffle_ps(t4, t5, _MM_SHUFFLE(1,0,1,0)); - *r1 = _mm_shuffle_ps(t6, t7, _MM_SHUFFLE(1,0,1,0)); + V *r0, V *r1, V *r2, V *r3) { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); + t4 = VADD(t0, t1); + t5 = VSUB(t0, t1); + t6 = VADD(t2, t3); + t7 = VSUB(t2, t3); + *r0 = VUNPACKLO(t4, t5); + *r1 = VUNPACKLO(t6, t7); t5 = IMULI(t5); - t0 = _mm_add_ps(t6, t4); - t2 = _mm_sub_ps(t6, t4); - t1 = _mm_sub_ps(t7, t5); - t3 = _mm_add_ps(t7, t5); - *r3 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,2,3,2)); - *r2 = _mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,2,3,2)); + t0 = VADD(t6, t4); + t2 = VSUB(t6, t4); + t1 = VSUB(t7, t5); + t3 = VADD(t7, t5); + *r3 = VUNPACKHI(t0, t1); + *r2 = VUNPACKHI(t2, t3); } __INLINE void L_4_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - __m128 *r0, __m128 
*r1, __m128 *r2, __m128 *r3) { - __m128 t0, t1, t2, t3, t4, t5, t6, t7; - t0 = _mm_load_ps(i0); - t1 = _mm_load_ps(i1); - t6 = _mm_load_ps(i2); - t7 = _mm_load_ps(i3); - t2 = _mm_shuffle_ps(t6, t7, _MM_SHUFFLE(3,2,1,0)); - t3 = _mm_shuffle_ps(t7, t6, _MM_SHUFFLE(3,2,1,0)); - t4 = _mm_add_ps(t0, t1); - t5 = _mm_sub_ps(t0, t1); - t6 = _mm_add_ps(t2, t3); - t7 = _mm_sub_ps(t2, t3); - *r2 = _mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,2,3,2)); - *r3 = _mm_shuffle_ps(t6, t7, _MM_SHUFFLE(3,2,3,2)); + V *r0, V *r1, V *r2, V *r3) { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = VLD(i0); t1 = VLD(i1); t6 = VLD(i2); t7 = VLD(i3); + t2 = VBLEND(t6, t7); + t3 = VBLEND(t7, t6); + t4 = VADD(t0, t1); + t5 = VSUB(t0, t1); + t6 = VADD(t2, t3); + t7 = VSUB(t2, t3); + *r2 = VUNPACKHI(t4, t5); + *r3 = VUNPACKHI(t6, t7); t7 = IMULI(t7); - t0 = _mm_add_ps(t4, t6); - t2 = _mm_sub_ps(t4, t6); - t1 = _mm_sub_ps(t5, t7); - t3 = _mm_add_ps(t5, t7); - *r0 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(1,0,1,0)); - *r1 = _mm_shuffle_ps(t2, t3, _MM_SHUFFLE(1,0,1,0)); + t0 = VADD(t4, t6); + t2 = VSUB(t4, t6); + t1 = VSUB(t5, t7); + t3 = VADD(t5, t7); + *r0 = VUNPACKLO(t0, t1); + *r1 = VUNPACKLO(t2, t3); } __INLINE void LEAF_OE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { - __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; data_t *out0 = out + (*out_offsets)[0]; data_t *out1 = out + (*out_offsets)[1]; @@ -273,7 +357,7 @@ LEAF_OE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict ou __INLINE void LEAF_EO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { - __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; data_t *out0 = out + (*out_offsets)[0]; data_t *out1 = out + (*out_offsets)[1]; @@ -295,6 +379,180 @@ LEAF_EO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict ou *is += 16; } +#ifdef __ARM_NEON__ +__INLINE void +LEAF_OE8(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; + data_t *out0 = out + (*out_offsets)[0]; + data_t *out1 = out + (*out_offsets)[1]; + *out_offsets += 2; + + L_4_2(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0_1,&r2_3,&r12_13,&r14_15); + L_4_4(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4_5,&r6_7,&r8_9,&r10_11); + S_4_1(r0_1,r2_3,r4_5,r6_7,out0+0,out0+4,out0+8,out0+12); + K_N(LEAFLUT[6],LEAFLUT[7],&r8_9,&r10_11,&r12_13,&r14_15); + S_4_2(r8_9,r10_11,r12_13,r14_15,out1+0,out1+4,out1+8,out1+12); + *is += 8; +} +__INLINE void +LEAF_EO8(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; + data_t *out0 = out + (*out_offsets)[0]; + data_t *out1 = out + (*out_offsets)[1]; + *out_offsets += 2; + + L_4_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0_1,&r2_3,&r8_9,&r10_11); + L_2_4(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4_5,&r6_7,&r14_15,&r12_13); + S_4_1(r8_9,r10_11,r12_13,r14_15,out1+0,out1+4,out1+8,out1+12); + K_N(LEAFLUT[6],LEAFLUT[7],&r0_1,&r2_3,&r4_5,&r6_7); + 
S_4_2(r0_1,r2_3,r4_5,r6_7,out0+0,out0+4,out0+8,out0+12); + + *is += 8; +} +#endif +__INLINE void +firstpass_32(const data_t * restrict in, data_t * restrict out, ffts_plan_t * restrict p) { + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; + float *LUT8 = p->ws[0]; + float *LUT16 = p->ws[1]; + float *LUT32 = p->ws[2]; + + L_4_4(in+0,in+32,in+16,in+48,&r0_1,&r2_3,&r16_17,&r18_19); + L_2_2(in+8,in+40,in+56,in+24,&r4_5,&r6_7,&r20_21,&r22_23); + K_N(VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); + L_4_2(in+4,in+36,in+20,in+52,&r8_9,&r10_11,&r28_29,&r30_31); + L_4_4(in+60,in+28,in+12,in+44,&r12_13,&r14_15,&r24_25,&r26_27); + K_N(VLD(LUT16),VLD(LUT16+4),&r0_1,&r4_5,&r8_9,&r12_13); + K_N(VLD(LUT16+8),VLD(LUT16+12),&r2_3,&r6_7,&r10_11,&r14_15); + K_N(VLD(LUT8),VLD(LUT8+4),&r16_17,&r18_19,&r20_21,&r22_23); + K_N(VLD(LUT8),VLD(LUT8+4),&r24_25,&r26_27,&r28_29,&r30_31); + K_N(VLD(LUT32),VLD(LUT32+4),&r0_1,&r8_9,&r16_17,&r24_25); + S_4(r0_1,r8_9,r16_17,r24_25,out+0,out+16,out+32,out+48); + K_N(VLD(LUT32+8),VLD(LUT32+12),&r2_3,&r10_11,&r18_19,&r26_27); + S_4(r2_3,r10_11,r18_19,r26_27,out+4,out+20,out+36,out+52); + K_N(VLD(LUT32+16),VLD(LUT32+20),&r4_5,&r12_13,&r20_21,&r28_29); + S_4(r4_5,r12_13,r20_21,r28_29,out+8,out+24,out+40,out+56); + K_N(VLD(LUT32+24),VLD(LUT32+28),&r6_7,&r14_15,&r22_23,&r30_31); + S_4(r6_7,r14_15,r22_23,r30_31,out+12,out+28,out+44,out+60); + +} + +__INLINE void +firstpass_16(const data_t * restrict in, data_t * restrict out, ffts_plan_t * restrict p) { + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; + float *LUT8 = p->ws[0]; + float *LUT16 = p->ws[1]; + + L_4_4(in+0,in+16,in+8,in+24,&r0_1,&r2_3,&r8_9,&r10_11); + L_2_4(in+4,in+20,in+28,in+12,&r4_5,&r6_7,&r14_15,&r12_13); + K_N(VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); + K_N(VLD(LUT16),VLD(LUT16+4),&r0_1,&r4_5,&r8_9,&r12_13); + S_4(r0_1,r4_5,r8_9,r12_13,out+0,out+8,out+16,out+24); + K_N(VLD(LUT16+8),VLD(LUT16+12),&r2_3,&r6_7,&r10_11,&r14_15); + S_4(r2_3,r6_7,r10_11,r14_15,out+4,out+12,out+20,out+28); +} +__INLINE void +firstpass_8(const data_t * restrict in, data_t * restrict out, ffts_plan_t * restrict p) { + V r0_1,r2_3,r4_5,r6_7; + float *LUT8 = p->ws[0]; + L_4_2(in+0,in+8,in+4,in+12,&r0_1,&r2_3,&r4_5,&r6_7); + K_N(VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); + S_4(r0_1,r2_3,r4_5,r6_7,out+0,out+4,out+8,out+12); +} +__INLINE void +firstpass_4_f(const data_t * restrict in, data_t * restrict out, ffts_plan_t * restrict p) { + cdata_t *i = (cdata_t *)in, *o = (cdata_t *)out; + cdata_t t0, t1, t2, t3, t4, t5, t6, t7; + t0 = i[0]; t1 = i[2]; t2 = i[1]; t3 = i[3]; + t4 = t0 + t1; + t5 = t0 - t1; + t6 = t2 + t3; + t7 = (t2 - t3); + t7 = (creal(t7))*I - (cimag(t7)); + o[0] = t4 + t6; + o[2] = t4 - t6; + o[1] = t5 - t7; + o[3] = t5 + t7; +} +__INLINE void +firstpass_4_b(const data_t * restrict in, data_t * restrict out, ffts_plan_t * restrict p) { + cdata_t *i = (cdata_t *)in, *o = (cdata_t *)out; + cdata_t t0, t1, t2, t3, t4, t5, t6, t7; + t0 = i[0]; t1 = i[2]; t2 = i[1]; t3 = i[3]; + t4 = t0 + t1; + t5 = t0 - t1; + t6 = t2 + t3; + t7 = (t2 - t3); + t7 = -(creal(t7))*I + (cimag(t7)); + o[0] = t4 + t6; + o[2] = t4 - t6; + o[1] = t5 - t7; + o[3] = t5 + t7; +} +__INLINE void +firstpass_2(const data_t * restrict in, data_t * restrict out, ffts_plan_t * restrict p) { + cdata_t t0, t1, r0,r1; + t0 = ((cdata_t *)in)[0]; t1 = ((cdata_t *)in)[1]; + r0 = t0 + t1; r1 = t0 - t1; + ((cdata_t *)out)[0] = r0; + ((cdata_t *)out)[1] = r1; +} + +__INLINE void X_8(data_t * restrict data0, 
size_t N, const data_t * restrict LUT) { + data_t *data2 = data0 + 2*N/4; + data_t *data4 = data0 + 4*N/4; + data_t *data6 = data0 + 6*N/4; + data_t *data1 = data0 + 1*N/4; + data_t *data3 = data0 + 3*N/4; + data_t *data5 = data0 + 5*N/4; + data_t *data7 = data0 + 7*N/4; + size_t k, n4 = N/4; + + for(k=N/8/2;k>0;--k) { + V r0, r1, r2, r3, r4, r5, r6, r7; + r0 = VLD(data0); + r1 = VLD(data1); + r2 = VLD(data2); + r3 = VLD(data3); + K_N(VLD(LUT), VLD(LUT+4), &r0, &r1, &r2, &r3); + r4 = VLD(data4); + r6 = VLD(data6); + K_N(VLD(LUT+8), VLD(LUT+12), &r0, &r2, &r4, &r6); + r5 = VLD(data5); + r7 = VLD(data7); + K_N(VLD(LUT+16), VLD(LUT+20), &r1, &r3, &r5, &r7); + LUT += 24; + VST(data0, r0); data0 += 4; + VST(data1, r1); data1 += 4; + VST(data2, r2); data2 += 4; + VST(data3, r3); data3 += 4; + VST(data4, r4); data4 += 4; + VST(data5, r5); data5 += 4; + VST(data6, r6); data6 += 4; + VST(data7, r7); data7 += 4; + } +} + +__INLINE void X_4(data_t * restrict data, size_t N, const data_t * restrict LUT) { + + size_t i; + for(i=0;i<N/4/2;i++) { + V uk = VLD(data); + V uk2 = VLD(data + 2*N/4); + V zk_p = VLD(data + 4*N/4); + V zk_n = VLD(data + 6*N/4); + + K_N(VLD(LUT), VLD(LUT+4), &uk, &uk2, &zk_p, &zk_n); + + VST(data, uk); + VST(data + 2*N/4, uk2); + VST(data + 4*N/4, zk_p); + VST(data + 6*N/4, zk_n); + + LUT += 8; + data += 4; + } +} #endif diff --git a/src/neon_float.h b/src/neon_float.h new file mode 100644 index 0000000..220219f --- /dev/null +++ b/src/neon_float.h @@ -0,0 +1,1037 @@ +#ifndef __NEON_FLOAT_H__ +#define __NEON_FLOAT_H__ + +#include <arm_neon.h> + +//#define VL 4 +#define __INLINE static inline __attribute__((always_inline)) + +typedef float32x4_t V; + +typedef float32x4x2_t VS; + +#define ADD vaddq_f32 +#define SUB vsubq_f32 +#define MUL vmulq_f32 +#define VADD vaddq_f32 +#define VSUB vsubq_f32 +#define VMUL vmulq_f32 +#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y)))) +#define VST vst1q_f32 +#define VLD vld1q_f32 +#define VST2 vst2q_f32 +#define VLD2 vld2q_f32 + +#define VSWAPPAIRS(x) (vrev64q_f32(x)) + +#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b))) +#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b))) + +#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y))) + +static inline V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) { + data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3}; + return VLD(d); +} + +#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0)) +#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1)) + +#define FFTS_MALLOC(d,a) (valloc(d)) +#define FFTS_FREE(d) (free(d)) +__INLINE void FMA(V *Rd, V Rn, V Rm) { + __asm__ ("vmla.f32 %q0,%q1,%q2\n\t" + : "+w" (*Rd) + : "w" (Rn), "w" (Rm) + //: "0" + ); + +} +__INLINE void FMS(V *Rd, V Rn, V Rm) { + __asm__ ("vmls.f32 %q0,%q1,%q2\n\t" + : "+w" (*Rd) + : "w" (Rn), "w" (Rm) + // : "0" + ); +} + +__INLINE VS VSMUL(VS *d, VS *w) { + VS t; + t.val[0] = vmulq_f32(d->val[0], w->val[0]); + t.val[1] = vmulq_f32(d->val[0], w->val[1]); +// t.val[0] = vmlsq_f32(t.val[0], d->val[1], w->val[1]); +// t.val[1] = vmlaq_f32(t.val[1], d->val[1], w->val[0]); + FMS(&t.val[0], d->val[1], w->val[1]); + FMA(&t.val[1], d->val[1], w->val[0]); + return t; +} +__INLINE VS VSMULJ(VS *d, VS *w) { + VS t; + t.val[0] = vmulq_f32(d->val[0], w->val[0]); + t.val[1] = vmulq_f32(d->val[1], w->val[0]); +// t.val[0] = vmlaq_f32(t.val[0], d->val[1], 
w->val[1]); +// t.val[1] = vmlsq_f32(t.val[1], d->val[0], w->val[1]); + FMA(&t.val[0], d->val[1], w->val[1]); + FMS(&t.val[1], d->val[0], w->val[1]); + return t; +} +__INLINE VS VSADD(VS *a, VS *b) { + VS r; + r.val[0] = vaddq_f32(a->val[0], b->val[0]); + r.val[1] = vaddq_f32(a->val[1], b->val[1]); + return r; +} +__INLINE VS VSSUB(VS *a, VS *b) { + VS r; + r.val[0] = vsubq_f32(a->val[0], b->val[0]); + r.val[1] = vsubq_f32(a->val[1], b->val[1]); + return r; +} +__INLINE VS VSSUB_MULI(VS *a, VS *b) { + VS r; + r.val[0] = vaddq_f32(a->val[0], b->val[1]); + r.val[1] = vsubq_f32(a->val[1], b->val[0]); + return r; +} +__INLINE VS VSADD_MULI(VS *a, VS *b) { + VS r; + r.val[0] = vsubq_f32(a->val[0], b->val[1]); + r.val[1] = vaddq_f32(a->val[1], b->val[0]); + return r; +} + +__INLINE void VSK_N(VS w, VS *r0, VS *r1, VS *r2, VS *r3) { + VS uk, uk2, zk_p, zk_n, zk, zk_d; + uk = *r0; uk2 = *r1; + zk_p = VSMUL(r2, &w); + zk_n = VSMULJ(r3, &w); + + zk = VSADD(&zk_p, &zk_n); + zk_d = VSSUB(&zk_p, &zk_n); + + *r2 = VSSUB(&uk, &zk); + *r0 = VSADD(&uk, &zk); + *r3 = VSADD_MULI(&uk2, &zk_d); + *r1 = VSSUB_MULI(&uk2, &zk_d); +} + + +__INLINE float32x2x2_t HVS_ADD(float32x2x2_t a, float32x2x2_t b) { + float32x2x2_t rval; + rval.val[0] = vadd_f32(a.val[0], b.val[0]); + rval.val[1] = vadd_f32(a.val[1], b.val[1]); + return rval; +} +__INLINE float32x2x2_t HVS_SUB(float32x2x2_t a, float32x2x2_t b) { + float32x2x2_t rval; + rval.val[0] = vsub_f32(a.val[0], b.val[0]); + rval.val[1] = vsub_f32(a.val[1], b.val[1]); + return rval; +} +__INLINE float32x2x2_t HVS_SUB_MULI(float32x2x2_t a, float32x2x2_t b) { + float32x2x2_t rval; + rval.val[0] = vadd_f32(a.val[0], b.val[1]); + rval.val[1] = vsub_f32(a.val[1], b.val[0]); + return rval; +} +__INLINE float32x2x2_t HVS_ADD_MULI(float32x2x2_t a, float32x2x2_t b) { + float32x2x2_t rval; + rval.val[0] = vsub_f32(a.val[0], b.val[1]); + rval.val[1] = vadd_f32(a.val[1], b.val[0]); + return rval; +} +__INLINE float32x2x2_t HVS_MUL(float32x2x2_t d, float32x2x2_t w) { + float32x2x2_t t; + t.val[0] = vmul_f32(d.val[0], w.val[0]); + t.val[1] = vmul_f32(d.val[0], w.val[1]); + t.val[0] = vmls_f32(t.val[0], d.val[1], w.val[1]); + t.val[1] = vmla_f32(t.val[1], d.val[1], w.val[0]); + return t; +} +__INLINE float32x2x2_t HVS_MULJ(float32x2x2_t d, float32x2x2_t w) { + float32x2x2_t t; + t.val[0] = vmul_f32(d.val[0], w.val[0]); + t.val[1] = vmul_f32(d.val[1], w.val[0]); + t.val[0] = vmla_f32(t.val[0], d.val[1], w.val[1]); + t.val[1] = vmls_f32(t.val[1], d.val[0], w.val[1]); + return t; +} +__INLINE void HVS_K_N(float32x2x2_t w, float32x2x2_t *r0, float32x2x2_t *r1, float32x2x2_t *r2, float32x2x2_t *r3) { + float32x2x2_t uk, uk2, zk_p, zk_n, zk, zk_d; + uk = *r0; uk2 = *r1; + zk_p = HVS_MUL(*r2, w); + zk_n = HVS_MULJ(*r3, w); + zk = HVS_ADD(zk_p, zk_n); + zk_d = HVS_SUB(zk_p, zk_n); + + *r2 = HVS_SUB(uk, zk); + *r0 = HVS_ADD(uk, zk); + *r3 = HVS_ADD_MULI(uk2, zk_d); + *r1 = HVS_SUB_MULI(uk2, zk_d); +} + +typedef union { + float32x4_t f32x4; + float32x2x2_t f32x2x2; +} float_mixed_t; + +__INLINE void VSWP(float32x2x2_t *a, float32x2x2_t *b) { +//float32x2_t tmp = a->val[1]; +//a->val[1] = b->val[0]; +//b->val[0] = tmp; + __asm__ ("vswp %0,%1\n\t" + : "+w" (a->val[1]), "+w" (b->val[0]) + : + ); +} + +static const __attribute__ ((aligned(16))) float ee_w_data[4] = {0.70710678118654757273731092936941,0.70710678118654746171500846685376, + -0.70710678118654757273731092936941,-0.70710678118654746171500846685376}; +__INLINE void LEAF_EE8_SPLIT(size_t ** restrict is, const data_t * restrict in, size_t 
+__INLINE void LEAF_EE8_SPLIT(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) {
+	data_t *out0 = out + (*out_offsets)[0];
+	data_t *out1 = out + (*out_offsets)[1];
+	*out_offsets += 2;
+
+	float32x2x2_t r0, r1, r2, r3, r4, r5, r6, r7;
+	float32x2x2_t t0, t1, t2, t3, t4, t5, t6, t7;
+
+	t0 = vld2_f32(in + (*is)[0]); t1 = vld2_f32(in + (*is)[1]); t2 = vld2_f32(in + (*is)[2]); t3 = vld2_f32(in + (*is)[3]);
+
+	t4 = HVS_ADD (t0, t1);
+	t5 = HVS_SUB (t0, t1);
+	t6 = HVS_ADD (t2, t3);
+	t7 = HVS_SUB (t2, t3);
+	r0 = HVS_ADD (t4, t6);
+	r2 = HVS_SUB (t4, t6);
+	r1 = HVS_SUB_MULI(t5, t7);
+	r3 = HVS_ADD_MULI(t5, t7);
+
+	t0 = vld2_f32(in + (*is)[4]); t1 = vld2_f32(in + (*is)[5]); t2 = vld2_f32(in + (*is)[6]); t3 = vld2_f32(in + (*is)[7]);
+	r4 = HVS_ADD (t0, t1);
+	r5 = HVS_SUB (t0, t1);
+	r6 = HVS_ADD (t2, t3);
+	r7 = HVS_SUB (t2, t3);
+	t0 = r0; t1 = r2;
+	t2 = HVS_ADD(r4, r6);
+	t3 = HVS_SUB(r4, r6);
+	r0 = HVS_ADD(t0, t2);
+	r4 = HVS_SUB(t0, t2);
+	r2 = HVS_SUB_MULI(t1, t3);
+	r6 = HVS_ADD_MULI(t1, t3);
+
+	float32x4_t w = vld1q_f32(ee_w_data);
+	float32x2x2_t ww;
+	ww.val[0] = vget_low_f32(w);
+	ww.val[1] = vget_high_f32(w);
+
+	HVS_K_N(ww,&r1,&r3,&r5,&r7);
+
+//vst2_f32(out0, r0);
+//vst2_f32(out0+4, r2);
+//vst2_f32(out0+8, r4);
+//vst2_f32(out0+12, r6);
+
+//vst2_f32(out1, r1);
+//vst2_f32(out1+4, r3);
+//vst2_f32(out1+8, r5);
+//vst2_f32(out1+12, r7);
+
+	float32x2x2_t tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;
+
+	tt0 = vtrn_f32(r0.val[0], r0.val[1]);
+	tt1 = vtrn_f32(r1.val[0], r1.val[1]);
+	tt2 = vtrn_f32(r2.val[0], r2.val[1]);
+	tt3 = vtrn_f32(r3.val[0], r3.val[1]);
+	tt4 = vtrn_f32(r4.val[0], r4.val[1]);
+	tt5 = vtrn_f32(r5.val[0], r5.val[1]);
+	tt6 = vtrn_f32(r6.val[0], r6.val[1]);
+	tt7 = vtrn_f32(r7.val[0], r7.val[1]);
+
+//VSWP(&tt0.f32x2x2, &tt1.f32x2x2);
+//VSWP(&tt2.f32x2x2, &tt3.f32x2x2);
+//VSWP(&tt4.f32x2x2, &tt5.f32x2x2);
+//VSWP(&tt6.f32x2x2, &tt7.f32x2x2);
+
+	float32x4_t z0, z1, z2, z3, z4, z5, z6, z7;
+
+	z0 = vcombine_f32(tt0.val[0], tt1.val[0]);
+	z1 = vcombine_f32(tt0.val[1], tt1.val[1]);
+	z2 = vcombine_f32(tt2.val[0], tt3.val[0]);
+	z3 = vcombine_f32(tt2.val[1], tt3.val[1]);
+	z4 = vcombine_f32(tt4.val[0], tt5.val[0]);
+	z5 = vcombine_f32(tt4.val[1], tt5.val[1]);
+	z6 = vcombine_f32(tt6.val[0], tt7.val[0]);
+	z7 = vcombine_f32(tt6.val[1], tt7.val[1]);
+
+
+	vst1q_f32(out0, z0);
+	vst1q_f32(out0+4, z2);
+	vst1q_f32(out0+8, z4);
+	vst1q_f32(out0+12, z6);
+
+	vst1q_f32(out1, z1);
+	vst1q_f32(out1+4, z3);
+	vst1q_f32(out1+8, z5);
+	vst1q_f32(out1+12, z7);
+/*
+	vst1_f32(out0, tt0.val[0]);
+	vst1_f32(out0+2, tt1.val[0]);
+	vst1_f32(out0+4, tt2.val[0]);
+	vst1_f32(out0+6, tt3.val[0]);
+	vst1_f32(out0+8, tt4.val[0]);
+	vst1_f32(out0+10, tt5.val[0]);
+	vst1_f32(out0+12, tt6.val[0]);
+	vst1_f32(out0+14, tt7.val[0]);
+
+	vst1_f32(out1, tt0.val[1]);
+	vst1_f32(out1+2, tt1.val[1]);
+	vst1_f32(out1+4, tt2.val[1]);
+	vst1_f32(out1+6, tt3.val[1]);
+	vst1_f32(out1+8, tt4.val[1]);
+	vst1_f32(out1+10, tt5.val[1]);
+	vst1_f32(out1+12, tt6.val[1]);
+	vst1_f32(out1+14, tt7.val[1]);
+	*/
+/*
+	float32x4_t rr0 = vcombine_f32(r0.val[0], r0.val[1]);
+	float32x4_t rr1 = vcombine_f32(r1.val[0], r1.val[1]);
+	float32x4_t rr2 = vcombine_f32(r2.val[0], r2.val[1]);
+	float32x4_t rr3 = vcombine_f32(r3.val[0], r3.val[1]);
+
+	float32x4x2_t tmp0, tmp1, tmp2, tmp3;
+	tmp0 = vtrnq_f32(rr0, rr2);
+	tmp1 = vtrnq_f32(rr1, rr3);
+
+
+	float32x2x2_t v0, v1, v2, v3;
+	v0.val[0] = vget_low_f32(tmp0.val[0]);
+	v0.val[1] = vget_high_f32(tmp0.val[0]);
+	v1.val[0] = vget_low_f32(tmp0.val[1]);
+	v1.val[1] = vget_high_f32(tmp0.val[1]);
+	v2.val[0] = vget_low_f32(tmp1.val[0]);
+	v2.val[1] = vget_high_f32(tmp1.val[0]);
+	v3.val[0] = vget_low_f32(tmp1.val[1]);
+	v3.val[1] = vget_high_f32(tmp1.val[1]);
+
+	tmp2.val[0] = tmp0.val[0];
+	tmp2.val[1] = tmp1.val[0];
+	tmp3.val[0] = tmp0.val[1];
+	tmp3.val[1] = tmp1.val[1];
+
+//vst2q_f32(out0 , tmp2);
+//vst2q_f32(out1 , tmp3);
+	vst2_f32(out0, v0);
+	vst2_f32(out0+4, v1);
+	vst2_f32(out1, v2);
+	vst2_f32(out1+4, v3);
+
+	float32x4_t rr4 = vcombine_f32(r4.val[0], r4.val[1]);
+	float32x4_t rr5 = vcombine_f32(r5.val[0], r5.val[1]);
+	float32x4_t rr6 = vcombine_f32(r6.val[0], r6.val[1]);
+	float32x4_t rr7 = vcombine_f32(r7.val[0], r7.val[1]);
+
+	tmp0 = vtrnq_f32(rr4, rr6);
+	tmp1 = vtrnq_f32(rr5, rr7);
+
+	tmp2.val[0] = tmp0.val[0];
+	tmp2.val[1] = tmp1.val[0];
+	tmp3.val[0] = tmp0.val[1];
+	tmp3.val[1] = tmp1.val[1];
+	v0.val[0] = vget_low_f32(tmp0.val[0]);
+	v0.val[1] = vget_high_f32(tmp0.val[0]);
+	v1.val[0] = vget_low_f32(tmp0.val[1]);
+	v1.val[1] = vget_high_f32(tmp0.val[1]);
+	v2.val[0] = vget_low_f32(tmp1.val[0]);
+	v2.val[1] = vget_high_f32(tmp1.val[0]);
+	v3.val[0] = vget_low_f32(tmp1.val[1]);
+	v3.val[1] = vget_high_f32(tmp1.val[1]);
+	vst2_f32(out0+8, v0);
+	vst2_f32(out0+12, v1);
+	vst2_f32(out1+8, v2);
+	vst2_f32(out1+12, v3);
+
+//vst2q_f32(out0 + 8, tmp2);
+//vst2q_f32(out1 + 8, tmp3);
+//vst1q_f32(out0+8, tmp0.val[0]);
+//vst1q_f32(out0+12,tmp0.val[1]);
+//vst1q_f32(out1+8, tmp1.val[0]);
+//vst1q_f32(out1+12,tmp1.val[1]);
+	*/
+	*is += 8;
+}
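LEAF_EE8_SPLIT gathers eight complex inputs through the precomputed index list is, applies a size-8 butterfly, and scatters the results to two output blocks in split (separate real/imaginary) format. Its HVS_* call pattern is two radix-4 stages; stripped of the SIMD packing, one stage is (a sketch; the radix4 name is illustrative):

#include <complex.h>

/* One radix-4 DIT stage, as expressed by the HVS_ADD/HVS_SUB/*_MULI calls above. */
static void radix4(float complex *r0, float complex *r1,
                   float complex *r2, float complex *r3) {
	float complex t4 = *r0 + *r1, t5 = *r0 - *r1;
	float complex t6 = *r2 + *r3, t7 = *r2 - *r3;
	*r0 = t4 + t6;
	*r2 = t4 - t6;
	*r1 = t5 - I*t7;  /* HVS_SUB_MULI */
	*r3 = t5 + I*t7;  /* HVS_ADD_MULI */
}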
+
+__INLINE void STORESPR(data_t * addr, VS p) {
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+	                      :
+	                      : "r" (addr), "w" (p.val[0]), "w" (p.val[1])
+	                      : "memory");
+}
+__INLINE void STORESPRI(data_t * restrict * addr, V p0, V p1) {
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+}
+__INLINE void STORESPRI0(data_t * restrict *addr, VS r) {
+	register V p0 __asm__ ("q0") = r.val[0];
+	register V p1 __asm__ ("q1") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRI1(data_t **addr, VS r) {
+	register V p0 __asm__ ("q2") = r.val[0];
+	register V p1 __asm__ ("q3") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRI2(data_t **addr, VS r) {
+	register V p0 __asm__ ("q4") = r.val[0];
+	register V p1 __asm__ ("q5") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRI3(data_t **addr, VS r) {
+	register V p0 __asm__ ("q6") = r.val[0];
+	register V p1 __asm__ ("q7") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRIT0(data_t * restrict *addr, VS r) {
+	register V p0 __asm__ ("q0") = r.val[0];
+	register V p1 __asm__ ("q1") = r.val[1];
+	__asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRIT1(data_t **addr, VS r) {
+	register V p0 __asm__ ("q2") = r.val[0];
+	register V p1 __asm__ ("q3") = r.val[1];
+	__asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRIT2(data_t **addr, VS r) {
+	register V p0 __asm__ ("q4") = r.val[0];
+	register V p1 __asm__ ("q5") = r.val[1];
+	__asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRIT3(data_t **addr, VS r) {
+	register V p0 __asm__ ("q6") = r.val[0];
+	register V p1 __asm__ ("q7") = r.val[1];
+	__asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPR0(data_t *addr, VS r) {
+	register V p0 __asm__ ("q0") = r.val[0];
+	register V p1 __asm__ ("q1") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+	                      :
+	                      : "r" (addr), "w" (p0), "w" (p1)
+	                      : "memory");
+}
+__INLINE void STORESPR1(data_t *addr, VS r) {
+	register V p0 __asm__ ("q2") = r.val[0];
+	register V p1 __asm__ ("q3") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+	                      :
+	                      : "r" (addr), "w" (p0), "w" (p1)
+	                      : "memory");
+}
+__INLINE void STORESPR2(data_t *addr, VS r) {
+	register V p0 __asm__ ("q4") = r.val[0];
+	register V p1 __asm__ ("q5") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+	                      :
+	                      : "r" (addr), "w" (p0), "w" (p1)
+	                      : "memory");
+}
+__INLINE void STORESPR3(data_t *addr, VS r) {
+	register V p0 __asm__ ("q6") = r.val[0];
+	register V p1 __asm__ ("q7") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+	                      :
+	                      : "r" (addr), "w" (p0), "w" (p1)
+	                      : "memory");
+}
+__INLINE VS LOADSPR0(data_t *addr) {
+	VS r;
+	register V p0 __asm__ ("q8") ;
+	register V p1 __asm__ ("q9") ;
+	__asm__ __volatile__ ("vld1.32 {%q0,%q1}, [%2, :128]\n\t"
+	                      : "=&w" (p0), "=&w" (p1)
+	                      : "r" (addr)
+	                      );
+	r.val[0] = p0; r.val[1] = p1;
+	return r;
+}
+__INLINE VS LOADSPR1(data_t *addr) {
+	VS r;
+	register V p0 __asm__ ("q10") ;
+	register V p1 __asm__ ("q11") ;
+	__asm__ __volatile__ ("vld1.32 {%q0,%q1}, [%2, :128]\n\t"
+	                      : "=&w" (p0), "=&w" (p1)
+	                      : "r" (addr)
+	                      );
+	r.val[0] = p0; r.val[1] = p1;
+	return r;
+}
+__INLINE VS LOADSPR2(data_t *addr) {
+	VS r;
+	register V p0 __asm__ ("q12") ;
+	register V p1 __asm__ ("q13") ;
+	__asm__ __volatile__ ("vld1.32 {%q0,%q1}, [%2, :128]\n\t"
+	                      : "=&w" (p0), "=&w" (p1)
+	                      : "r" (addr)
+	                      );
+	r.val[0] = p0; r.val[1] = p1;
+	return r;
+}
+__INLINE VS LOADSPR3(data_t *addr) {
+	VS r;
+	register V p0 __asm__ ("q14") ;
+	register V p1 __asm__ ("q15") ;
+	__asm__ __volatile__ ("vld1.32 {%q0,%q1}, [%2, :128]\n\t"
+	                      : "=&w" (p0), "=&w" (p1)
+	                      : "r" (addr)
+	                      );
+	r.val[0] = p0; r.val[1] = p1;
+	return r;
+}
+__INLINE VS LOADSPRI(data_t * restrict * addr) {
+	VS r;
+	register V p0 __asm__ ("q2") ;
+	register V p1 __asm__ ("q3") ;
+	__asm__ __volatile__ ("vld1.32 {%q0,%q1}, [%2, :128]!\n\t"
+	                      : "=&w" (p0), "=&w" (p1), "+r" (*addr)
+	                      :
+	                      );
+	r.val[0] = p0; r.val[1] = p1;
+	return r;
+}
+
+__INLINE void X_4_SPLIT(data_t * restrict data, size_t N, data_t * restrict LUT) {
+
+//size_t i;
+//for(i=0;i<N/4/2/2;i++) {
+	VS uk   = LOADSPR0(data);
+	VS uk2  = LOADSPR1(data + 2*N/4);
+	VS zk_p = LOADSPR2(data + 4*N/4);
+	VS zk_n = LOADSPR3(data + 6*N/4);
+
+	VSK_N(LOADSPRI(&LUT), &uk, &uk2, &zk_p, &zk_n);
+
+	STORESPR0(data, uk);
+	STORESPR1(data + 2*N/4, uk2);
+	STORESPR2(data + 4*N/4, zk_p);
+	STORESPR3(data + 6*N/4, zk_n);
+
+//	LUT += 8;
+//	data += 8;
+//	}
+}
+
+__INLINE void X_8_SPLIT(data_t * restrict data0, size_t N, data_t * restrict LUT) {
+	data_t *data2 = data0 + 2*N/4;
+	data_t *data4 = data0 + 4*N/4;
+	data_t *data6 = data0 + 6*N/4;
+	data_t *data1 = data0 + 1*N/4;
+	data_t *data3 = data0 + 3*N/4;
+	data_t *data5 = data0 + 5*N/4;
+	data_t *data7 = data0 + 7*N/4;
+	size_t k, n4 = N/4;
+
+	for(k=N/8/2/2;k>0;--k) {
+		VS r0, r1, r2, r3, r4, r5, r6, r7,w;
+		r0 = LOADSPR0(data0);
+		r2 = LOADSPR1(data2);
+		r1 = LOADSPR2(data1);
+		r3 = LOADSPR3(data3);
+		VSK_N(LOADSPRI(&LUT), &r0, &r1, &r2, &r3);
+		STORESPR2(data1, r1);
+		STORESPR3(data3, r3);
+		r4 = LOADSPR2(data4);
+		r6 = LOADSPR3(data6);
+		VSK_N(LOADSPRI(&LUT), &r0, &r2, &r4, &r6);
+		STORESPRI0(&data0, r0); //data0 += 8;
+		STORESPRI1(&data2, r2); //data2 += 8;
+		STORESPRI2(&data4, r4); //data4 += 8;
+		STORESPRI3(&data6, r6); //data6 += 8;
+		r1 = LOADSPR0(data1);
+		r3 = LOADSPR1(data3);
+		r5 = LOADSPR2(data5);
+		r7 = LOADSPR3(data7);
+		VSK_N(LOADSPRI(&LUT), &r1, &r3, &r5, &r7);
+	//	LUT += 24;
+		STORESPRI0(&data1, r1); //data1 += 8;
+		STORESPRI1(&data3, r3); //data3 += 8;
+		STORESPRI2(&data5, r5); //data5 += 8;
+		STORESPRI3(&data7, r7); //data7 += 8;
+	}
+}
+
+__INLINE void X_8_SPLIT_T(data_t * restrict data0, size_t N, data_t * restrict LUT) {
+	data_t *data2 = data0 + 2*N/4;
+	data_t *data4 = data0 + 4*N/4;
+	data_t *data6 = data0 + 6*N/4;
+	data_t *data1 = data0 + 1*N/4;
+	data_t *data3 = data0 + 3*N/4;
+	data_t *data5 = data0 + 5*N/4;
+	data_t *data7 = data0 + 7*N/4;
+	size_t k, n4 = N/4;
+
+	for(k=N/8/2/2;k>0;--k) {
+		VS r0, r1, r2, r3, r4, r5, r6, r7,w;
+		r0 = LOADSPR0(data0);
+		r2 = LOADSPR1(data2);
+		r1 = LOADSPR2(data1);
+		r3 = LOADSPR3(data3);
+		VSK_N(LOADSPRI(&LUT), &r0, &r1, &r2, &r3);
+		STORESPR2(data1, r1);
+		STORESPR3(data3, r3);
+		r4 = LOADSPR2(data4);
+		r6 = LOADSPR3(data6);
+		VSK_N(LOADSPRI(&LUT), &r0, &r2, &r4, &r6);
+		STORESPRIT0(&data0, r0); //data0 += 8;
+		STORESPRIT1(&data2, r2); //data2 += 8;
+		STORESPRIT2(&data4, r4); //data4 += 8;
+		STORESPRIT3(&data6, r6); //data6 += 8;
+		r1 = LOADSPR0(data1);
+		r3 = LOADSPR1(data3);
+		r5 = LOADSPR2(data5);
+		r7 = LOADSPR3(data7);
+		VSK_N(LOADSPRI(&LUT), &r1, &r3, &r5, &r7);
+		STORESPRIT0(&data1, r1); //data1 += 8;
+		STORESPRIT1(&data3, r3); //data3 += 8;
+		STORESPRIT2(&data5, r5); //data5 += 8;
+		STORESPRIT3(&data7, r7); //data7 += 8;
+	}
+}
+__INLINE V LOAD2I(const data_t **addr) {
+	float32x4_t o;
+	__asm__ ("vld2.32 {%q0}, [%1, :128]!\n\t"
+	         : "=w" (o), "+r" (*addr)
+	         :
+	         );
+
+	return o;
+}
+__INLINE V LOADI(const data_t **addr) {
+	float32x2_t out0, out1;
+	float32x4_t o;
+
+	__asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t"
+	         : "=w" (o), "+r" (*addr)
+	         :
+	         );
+	return o;
+}
+__INLINE V HSP_MUL(V *d, const V *w) {
+	V t;
+	t = vcombine_f32(vmul_f32(vget_low_f32(*d), vget_low_f32(*w)),
+	                 vmul_f32(vget_low_f32(*d), vget_high_f32(*w)));
+	t = vcombine_f32(vmls_f32(vget_low_f32(t), vget_high_f32(*d), vget_high_f32(*w)),
+	                 vmla_f32(vget_high_f32(t), vget_high_f32(*d), vget_low_f32(*w)));
+	return t;
+}
+__INLINE V HSP_MULJ(V *d, const V *w) {
+	V t;
+	t = vcombine_f32(vmul_f32(vget_low_f32(*d), vget_low_f32(*w)),
+	                 vmul_f32(vget_high_f32(*d), vget_low_f32(*w)));
+	t = vcombine_f32(vmla_f32(vget_low_f32(t), vget_high_f32(*d), vget_high_f32(*w)),
+	                 vmls_f32(vget_high_f32(t), vget_low_f32(*d), vget_high_f32(*w)));
+	return t;
+}
+__INLINE V HSP_SUB_MULI(V *a, V *b) {
+	return vcombine_f32(vadd_f32(vget_low_f32(*a), vget_high_f32(*b)), vsub_f32(vget_high_f32(*a), vget_low_f32(*b)));
+}
+__INLINE V HSP_ADD_MULI(V *a, V *b) {
+	return vcombine_f32(vsub_f32(vget_low_f32(*a), vget_high_f32(*b)), vadd_f32(vget_high_f32(*a), vget_low_f32(*b)));
+}
+
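VSK_N and HVS_K_N above, and K_N_HSP below, are the same split-radix twiddle butterfly specialized for three register layouts; X_4_SPLIT and X_8_SPLIT just stream it over quarter-N strided blocks. Stripped of the SIMD packing, the step is (a scalar sketch; the k_n name is illustrative):

#include <complex.h>

/* Split-radix twiddle butterfly shared by VSK_N / HVS_K_N / K_N_HSP. */
static void k_n(float complex w, float complex *r0, float complex *r1,
                float complex *r2, float complex *r3) {
	float complex zk_p = *r2 * w;         /* MUL:  twiddled term            */
	float complex zk_n = *r3 * conjf(w);  /* MULJ: conjugate-twiddled term  */
	float complex zk   = zk_p + zk_n;
	float complex zk_d = zk_p - zk_n;
	float complex uk = *r0, uk2 = *r1;
	*r0 = uk + zk;
	*r2 = uk - zk;
	*r1 = uk2 - I*zk_d;  /* SUB_MULI */
	*r3 = uk2 + I*zk_d;  /* ADD_MULI */
}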
+__INLINE void K_N_HSP(const V *w, V *r0, V *r1, V *r2, V *r3) {
+	V uk, uk2, zk_p, zk_n, zk, zk_d;
+
+	uk = *r0;
+	uk2 = *r1;
+	zk_p = HSP_MUL(r2, w);
+	zk_n = HSP_MULJ(r3, w);
+	zk   = ADD(zk_p, zk_n);
+	zk_d = SUB(zk_p, zk_n);
+
+	*r2 = SUB(uk, zk);
+	*r0 = ADD(uk, zk);
+	*r3 = HSP_ADD_MULI(&uk2, &zk_d);
+	*r1 = HSP_SUB_MULI(&uk2, &zk_d);
+}
+
+__INLINE void neon_shl8_ee(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) {
+
+	V r0, r1, r2, r3, r4, r5, r6, r7;
+	V t0, t1, t2, t3, t4, t5, t6, t7;
+
+
+	t0 = LOAD2I(i0);
+	t1 = LOAD2I(i1);
+	t2 = LOAD2I(i2);
+	t3 = LOAD2I(i3);
+	t4 = ADD (t0, t1);
+	t5 = SUB (t0, t1);
+	t6 = ADD (t2, t3);
+	t7 = SUB (t2, t3);
+	r0 = ADD (t4, t6);
+	r2 = SUB (t4, t6);
+	r1 = HSP_SUB_MULI(&t5, &t7);
+	r3 = HSP_ADD_MULI(&t5, &t7);
+
+	t0 = LOAD2I(i4);
+	t1 = LOAD2I(i5);
+	t2 = LOAD2I(i6);
+	t3 = LOAD2I(i7);
+	r4 = ADD (t0, t1);
+	r5 = SUB (t0, t1);
+	r6 = ADD (t2, t3);
+	r7 = SUB (t2, t3);
+
+	t0 = r0; t1 = r2;
+	t2 = ADD(r4, r6);
+	t3 = SUB(r4, r6);
+	r0 = ADD(t0, t2);
+	r4 = SUB(t0, t2);
+	r2 = HSP_SUB_MULI(&t1, &t3);
+	r6 = HSP_ADD_MULI(&t1, &t3);
+
+	V w = vld1q_f32(ee_w_data);
+
+	K_N_HSP(&w,&r1,&r3,&r5,&r7);
+	V uk, uk2, zk, zk_d;
+
+	float32x4x2_t tmp1 = vtrnq_f32(r0, r2);
+	r0 = tmp1.val[0];
+	r2 = tmp1.val[1];
+	float32x4x2_t tmp4 = vtrnq_f32(r1, r3);
+	r1 = tmp4.val[0];
+	r3 = tmp4.val[1];
+	register V tt0 __asm__ ("q0") = r0;
+	register V tt1 __asm__ ("q1") = r1;
+	register V tt2 __asm__ ("q2") = r2;
+	register V tt3 __asm__ ("q3") = r3;
+	__asm__ __volatile__ ("vst2.32 {q0,q1}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt0), "w"(tt1) : "memory");
+	__asm__ __volatile__ ("vst2.32 {q2,q3}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt2), "w"(tt3) : "memory");
+
+	float32x4x2_t tmp2 = vtrnq_f32(r4, r6);
+	r4 = tmp2.val[0];
+	r6 = tmp2.val[1];
+	float32x4x2_t tmp3 = vtrnq_f32(r5, r7);
+	r5 = tmp3.val[0];
+	r7 = tmp3.val[1];
+	register V tt4 __asm__ ("q4") = r4;
+	register V tt5 __asm__ ("q5") = r5;
+	register V tt6 __asm__ ("q6") = r6;
+	register V tt7 __asm__ ("q7") = r7;
+
+	__asm__ __volatile__ ("vst2.32 {q4,q5}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt4), "w"(tt5) : "memory");
+	__asm__ __volatile__ ("vst2.32 {q6,q7}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt6), "w"(tt7) : "memory");
+
+}
+
+__INLINE void neon_shl8_oo(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) {
+
+	V r0, r1, r2, r3, r4, r5, r6, r7;
+	V t0, t1, t2, t3, t4, t5, t6, t7;
+
+	t0 = LOAD2I(i0);
+	t1 = LOAD2I(i1);
+	t2 = LOAD2I(i2);
+	t3 = LOAD2I(i3);
+	t4 = ADD (t0, t1);
+	t5 = SUB (t0, t1);
+	t6 = ADD (t2, t3);
+	t7 = SUB (t2, t3);
+	r0 = ADD (t4, t6);
+	r2 = SUB (t4, t6);
+	r1 = HSP_SUB_MULI(&t5, &t7);
+	r3 = HSP_ADD_MULI(&t5, &t7);
+
+	float32x4x2_t tmp1 = vtrnq_f32(r0, r2);
+	r0 = tmp1.val[0];
+	r2 = tmp1.val[1];
+	float32x4x2_t tmp4 = vtrnq_f32(r1, r3);
+	r1 = tmp4.val[0];
+	r3 = tmp4.val[1];
+	register V tt0 __asm__ ("q0") = r0;
+	register V tt1 __asm__ ("q1") = r1;
+	register V tt2 __asm__ ("q2") = r2;
+	register V tt3 __asm__ ("q3") = r3;
+	__asm__ __volatile__ ("vst2.32 {q0,q1}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt0), "w"(tt1) : "memory");
+	__asm__ __volatile__ ("vst2.32 {q2,q3}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt2), "w"(tt3) : "memory");
+
+
+
+	t0 = LOAD2I(i4);
+	t1 = LOAD2I(i5);
+	t2 = LOAD2I(i6);
+	t3 = LOAD2I(i7);
+	t4 = ADD (t0, t1);
+	t5 = SUB (t0, t1);
+	t6 = ADD (t2, t3);
+	t7 = SUB (t2, t3);
+	r4 = ADD (t4, t6);
+	r6 = SUB (t4, t6);
+	r5 = HSP_SUB_MULI(&t5, &t7);
+	r7 = HSP_ADD_MULI(&t5, &t7);
+
+	float32x4x2_t tmp2 = vtrnq_f32(r4, r6);
+	r4 = tmp2.val[0];
+	r6 = tmp2.val[1];
+	float32x4x2_t tmp3 = vtrnq_f32(r5, r7);
+	r5 = tmp3.val[0];
+	r7 = tmp3.val[1];
+
+
+	register V tt4 __asm__ ("q4") = r4;
+	register V tt5 __asm__ ("q5") = r5;
+	register V tt6 __asm__ ("q6") = r6;
+	register V tt7 __asm__ ("q7") = r7;
+
+	__asm__ __volatile__ ("vst2.32 {q4,q5}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt4), "w"(tt5) : "memory");
+	__asm__ __volatile__ ("vst2.32 {q6,q7}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt6), "w"(tt7) : "memory");
+
+
+
+}
+
+static const __attribute__ ((aligned(16))) data_t eo_w_data[4] = {1.0f,0.70710678118654757273731092936941f, 0.0f,-0.70710678118654746171500846685376};
+
+
+__INLINE void neon_shl8_eo(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) {
+	/*
+	register V r0_1 __asm__ ("q0");
+	register V r2_3 __asm__ ("q1");
+	register V r4_5 __asm__ ("q2");
+	register V r6_7 __asm__ ("q3");
+	*/
+	const V w = vld1q_f32(eo_w_data);
+
+	V r0_1, r2_3, r4_5, r6_7;
+
+	register V r8_9   __asm__ ("q4");
+	register V r10_11 __asm__ ("q5");
+	register V r12_13 __asm__ ("q6");
+	register V r14_15 __asm__ ("q7");
+
+	{
+	V t0, t1, t2, t3, t4, t5, t6, t7;
+	t0 = LOAD2I(i0);
+	t1 = LOAD2I(i1);
+	t2 = LOAD2I(i2);
+	t3 = LOAD2I(i3);
+	t4 = ADD(t0, t1);
+	t5 = SUB(t0, t1);
+	t6 = ADD(t2, t3);
+	t7 = SUB(t2, t3);
+
+	t0 = ADD(t4, t6);
+	t2 = SUB(t4, t6);
+	t1 = HSP_SUB_MULI(&t5, &t7);
+	t3 = HSP_ADD_MULI(&t5, &t7);
+
+	float32x4x2_t tmp1 = vtrnq_f32(t0, t1);
+	t0 = tmp1.val[0];
+	t1 = tmp1.val[1];
+	float32x4x2_t tmp2 = vtrnq_f32(t2, t3);
+	t2 = tmp2.val[0];
+	t3 = tmp2.val[1];
+
+	r0_1 = t0;
+	r2_3 = t2;
+	r8_9 = t1;
+	r10_11 = t3;
+	__asm__ __volatile__ ("vswp d9,d10\n\t"
+	                      "vst1.32 {d8,d9,d10,d11}, [%0, :128]!\n\t"
+//	                      "vst1.32 {d8,d9}, [%0, :128]!\n\t"
+//	                      "vst1.32 {d10,d11}, [%0, :128]!\n\t"
+	                      : "+&r" (out1)
+	                      : "w" (r8_9), "w" (r10_11)
+	                      : "memory");
+
+	}
+	{
+	V t0, t1, t2, t3, t4, t5, t6, t7;
+	t0 = LOAD2I(i4);
+	t1 = LOAD2I(i5);
+	t2 = LOAD2I(i6);
+	t3 = LOAD2I(i7);
+	//t2 = HALFBLEND(t6, t7);
+	//t3 = HALFBLEND(t7, t6);
+	t4 = ADD(t0, t1);
+	t5 = SUB(t0, t1);
+	t6 = ADD(t2, t3);
+	t7 = SUB(t2, t3);
+	float32x4x2_t tmp1 = vtrnq_f32(t4, t5);
+	r4_5 = tmp1.val[0];
+	float32x4x2_t tmp2 = vtrnq_f32(t6, t7);
+	r6_7 = tmp2.val[0];
+	//t5 = MULI(t5);
+	t0 = ADD(t6, t4);
+	t2 = SUB(t6, t4);
+	t1 = HSP_SUB_MULI(&t7, &t5);
+	t3 = HSP_ADD_MULI(&t7, &t5);
+
+	float32x4x2_t tmp3 = vtrnq_f32(t0, t1);
+	r12_13 = tmp3.val[1];
+	float32x4x2_t tmp4 = vtrnq_f32(t2, t3);
+	r14_15 = tmp4.val[1];
+
+
+	__asm__ __volatile__ ("vswp d13, d14\n\t"
+	                      "vst1.32 {d12,d13,d14,d15}, [%0, :128]!\n\t"
+//	                      "vst1.32 {d12,d13}, [%0, :128]!\n\t"
+//	                      "vst1.32 {d14,d15}, [%0, :128]!\n\t"
+	                      : "+&r" (out1)
+	                      : "w" (r12_13), "w" (r14_15)
+	                      : "memory");
+
+
+	}
+
+	K_N_HSP(&w,&r0_1,&r2_3,&r4_5,&r6_7);
+
+	register V t0 __asm__ ("q0") = r0_1;
+	register V t1 __asm__ ("q1") = r2_3;
+	register V t2 __asm__ ("q2") = r4_5;
+	register V t3 __asm__ ("q3") = r6_7;
+
("vswp d1, d2\n\t" + "vswp d5, d6\n\t" + "vstmia %0!, {q0-q3}\n\t" +// "vst1.32 {d0,d1}, [%0, :128]!\n\t" +// "vst1.32 {d2,d3}, [%0, :128]!\n\t" +// "vst1.32 {d4,d5}, [%0, :128]!\n\t" +// "vst1.32 {d6,d7}, [%0, :128]\n\t" + : "+&r" (out0) + : "w" (t0), "w" (t1), "w" (t2), "w" (t3) + : "memory"); + +} +static const __attribute__ ((aligned(16))) data_t oe_w_data[4] = {1.0f,0.70710678118654757273731092936941f, 0.0f,-0.70710678118654746171500846685376}; + +__INLINE void neon_shl8_oe(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) { + register V r0_1 __asm__ ("q0"); + register V r2_3 __asm__ ("q1"); + register V r4_5 __asm__ ("q2"); + register V r6_7 __asm__ ("q3"); + + V r8_9, r10_11, r12_13, r14_15; + const V w = vld1q_f32(oe_w_data); + + { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = LOAD2I(i0); + t1 = LOAD2I(i1); + t6 = LOADI(i2); + t7 = LOADI(i3); + + float32x2x2_t tmp0 = vtrn_f32(vget_low_f32(t6), vget_high_f32(t7)); + float32x2x2_t tmp1 = vtrn_f32(vget_low_f32(t7), vget_high_f32(t6)); + t2 = vcombine_f32(tmp0.val[0], tmp0.val[1]); + t3 = vcombine_f32(tmp1.val[0], tmp1.val[1]); + + t4 = ADD(t0, t1); + t5 = SUB(t0, t1); + t6 = ADD(t2, t3); + t7 = SUB(t2, t3); + float32x4x2_t tmp2 = vtrnq_f32(t4, t5); + r12_13 = tmp2.val[1]; + float32x4x2_t tmp3 = vtrnq_f32(t6, t7); + r14_15 = tmp3.val[1]; + + t0 = ADD(t4, t6); + t2 = SUB(t4, t6); + t1 = HSP_SUB_MULI(&t5, &t7); + t3 = HSP_ADD_MULI(&t5, &t7); + float32x4x2_t tmp4 = vtrnq_f32(t0, t1); + r0_1 = tmp4.val[0]; + float32x4x2_t tmp5 = vtrnq_f32(t2, t3); + r2_3 = tmp5.val[0]; + __asm__ __volatile__ ("vswp d1, d2\n\t" + "vst1.32 {q0, q1}, [%0, :128]!\n\t" +// "vst1.32 {q1}, [%0, :128]!\n\t" + : "+&r" (out0) + : "w" (r0_1), "w" (r2_3) + : "memory"); + } + { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = LOAD2I(i4); + t1 = LOAD2I(i5); + t2 = LOAD2I(i6); + t3 = LOAD2I(i7); + t4 = ADD(t0, t1); + t5 = SUB(t0, t1); + t6 = ADD(t2, t3); + t7 = SUB(t2, t3); + t0 = ADD(t4, t6); + t2 = SUB(t4, t6); + t1 = HSP_SUB_MULI(&t5, &t7); + t3 = HSP_ADD_MULI(&t5, &t7); + + float32x4x2_t tmp0 = vtrnq_f32(t0, t1); + r4_5 = tmp0.val[0]; + r8_9 = tmp0.val[1]; + float32x4x2_t tmp1 = vtrnq_f32(t2, t3); + r6_7 = tmp1.val[0]; + r10_11 = tmp1.val[1]; + + + __asm__ __volatile__ ("vswp d5, d6\n\t" + "vst1.32 {q2, q3}, [%0, :128]!\n\t" +// "vst1.32 {q3}, [%0, :128]!\n\t" + : "+&r" (out0) + : "w" (r4_5), "w" (r6_7) + : "memory"); + + } + + K_N_HSP(&w,&r8_9,&r10_11,&r12_13,&r14_15); + register V t0 __asm__ ("q4") = r8_9; + register V t1 __asm__ ("q5") = r10_11; + register V t2 __asm__ ("q6") = r12_13; + register V t3 __asm__ ("q7") = r14_15; + + __asm__ __volatile__ ("vswp d9, d10\n\t" + "vswp d13, d14\n\t" + "vstmia %0!, {q4-q7}\n\t" +// "vst1.32 {q4}, [%0, :128]!\n\t" +// "vst1.32 {q5}, [%0, :128]!\n\t" +// "vst1.32 {q6}, [%0, :128]!\n\t" +// "vst1.32 {q7}, [%0, :128]\n\t" + : "+&r" (out1) + : "w" (t0), "w" (t1), "w" (t2), "w" (t3) + : "memory"); + + +} +#endif diff --git a/src/patterns.c b/src/patterns.c index 12d9a4c..226a3b6 100644 --- a/src/patterns.c +++ b/src/patterns.c @@ -8,12 +8,12 @@ void permute_addr(int N, int offset, int stride, int *d) { } } -void hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int even, int VL) { +void ffts_hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int 
diff --git a/src/patterns.c b/src/patterns.c
index 12d9a4c..226a3b6 100644
--- a/src/patterns.c
+++ b/src/patterns.c
@@ -8,12 +8,12 @@ void permute_addr(int N, int offset, int stride, int *d) {
 	}
 }
 
-void hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int even, int VL) {
+void ffts_hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int even, int VL) {
 	if(N > 4) {
-		hardcodedleaf_is_rec(is, bigN, N/2, poffset, offset, stride + 1, even, VL);
-		if(N/4 >= 4) hardcodedleaf_is_rec(is, bigN, N/4, poffset+(1<<stride),offset+(N/2), stride + 2, 0, VL);
-		if(N/4 >= 4) hardcodedleaf_is_rec(is, bigN, N/4, poffset-(1<<stride),offset+(3*N/4), stride + 2, 0, VL);
+		ffts_hardcodedleaf_is_rec(is, bigN, N/2, poffset, offset, stride + 1, even, VL);
+		if(N/4 >= 4) ffts_hardcodedleaf_is_rec(is, bigN, N/4, poffset+(1<<stride),offset+(N/2), stride + 2, 0, VL);
+		if(N/4 >= 4) ffts_hardcodedleaf_is_rec(is, bigN, N/4, poffset-(1<<stride),offset+(3*N/4), stride + 2, 0, VL);
 		else {
 			int temp = poffset+(1<<stride);
 			if(temp < 0) temp += bigN;
@@ -43,7 +43,7 @@ hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offs
 	}
 }
 
-void init_is(ffts_plan_t *p, int N, int leafN, int VL) {
+void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL) {
 	int i, i0 = N/leafN/3+1, i1=N/leafN/3, i2 = N/leafN/3;
 	int stride = log(N/leafN)/log(2);
@@ -53,12 +53,12 @@ void init_is(ffts_plan_t *p, int N, int leafN, int VL) {
 
 	if((N/leafN) % 3 > 1) i1++;
 
-	for(i=0;i<i0;i++) hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
+	for(i=0;i<i0;i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
 	for(i=i0;i<i0+i1;i++) {
-		hardcodedleaf_is_rec(&is, N, leafN/2, i, 0, stride+1, 1, VL);
-		hardcodedleaf_is_rec(&is, N, leafN/2, i-(1<<stride), 0, stride+1, 1, VL);
+		ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i, 0, stride+1, 1, VL);
+		ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i-(1<<stride), 0, stride+1, 1, VL);
 	}
-	for(i=0-i2;i<0;i++) hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
+	for(i=0-i2;i<0;i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
 
 //for(i=0;i<N/VL;i++) {
@@ -69,15 +69,15 @@ void init_is(ffts_plan_t *p, int N, int leafN, int VL) {
 	p->i0 = i0; p->i1 = i1;
 }
 
-void elaborate_offsets(ptrdiff_t *offsets, int leafN, int N, int ioffset, int ooffset, int stride, int even) {
+void ffts_elaborate_offsets(ptrdiff_t *offsets, int leafN, int N, int ioffset, int ooffset, int stride, int even) {
 	if((even && N == leafN) || (!even && N <= leafN)) {
 		offsets[2*(ooffset/leafN)] = ioffset*2;
 		offsets[2*(ooffset/leafN)+1] = ooffset;
 	}else if(N > 4) {
-		elaborate_offsets(offsets, leafN, N/2, ioffset, ooffset, stride+1, even);
-		elaborate_offsets(offsets, leafN, N/4, ioffset+(1<<stride), ooffset+N/2, stride+2, 0);
+		ffts_elaborate_offsets(offsets, leafN, N/2, ioffset, ooffset, stride+1, even);
+		ffts_elaborate_offsets(offsets, leafN, N/4, ioffset+(1<<stride), ooffset+N/2, stride+2, 0);
 		if(N/4 >= leafN)
-			elaborate_offsets(offsets, leafN, N/4, ioffset-(1<<stride), ooffset+3*N/4, stride+2, 0);
+			ffts_elaborate_offsets(offsets, leafN, N/4, ioffset-(1<<stride), ooffset+3*N/4, stride+2, 0);
 	}
 }
 
@@ -86,11 +86,22 @@ int compare_offsets(const void *a, const void *b) {
 	return ((ptrdiff_t *)a)[0] - ((ptrdiff_t *)b)[0];
 }
 
-void init_offsets(ffts_plan_t *p, int N, int leafN) {
+uint32_t reverse_bits(uint32_t a, int n) {
+	uint32_t x = 0;
+
+	int i;
+	for(i=0;i<n;i++) {
+		if(a & (1 << i)) x |= 1 << (n-i-1);
+	}
+	return x;
+}
+
+
+void ffts_init_offsets(ffts_plan_t *p, int N, int leafN) {
 	ptrdiff_t *offsets = malloc(2 * N/leafN * sizeof(ptrdiff_t));
 
-	elaborate_offsets(offsets, leafN, N, 0, 0, 1, 1);
+	ffts_elaborate_offsets(offsets, leafN, N, 0, 0, 1, 1);
 
 	size_t i;
 	for(i=0;i<2*N/leafN;i+=2) {
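The new reverse_bits helper mirrors the low n bits of a; the hunk below enables a debug print that shows each leaf offset beside its bit-reversed index over log2(2N) bits (for power-of-two N, __builtin_ctzl(2*N) equals log2(2N)). A standalone check, with the helper duplicated here purely for illustration:

#include <stdio.h>
#include <stdint.h>

static uint32_t reverse_bits(uint32_t a, int n) {
	uint32_t x = 0;
	int i;
	for(i=0;i<n;i++) {
		if(a & (1 << i)) x |= 1 << (n-i-1);
	}
	return x;
}

int main(void) {
	printf("%u\n", reverse_bits(0x1, 4)); /* 8: 0001 -> 1000            */
	printf("%u\n", reverse_bits(0x6, 4)); /* 6: 0110 is a palindrome    */
	printf("%u\n", reverse_bits(0x3, 5)); /* 24: 00011 -> 11000         */
	return 0;
}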
@@ -103,9 +114,9 @@ void init_offsets(ffts_plan_t *p, int N, int leafN) {
 	for(i=0;i<N/leafN;i++) {
 		p->offsets[i] = offsets[i*2+1]*2;
 	}
-//for(i=0;i<N/leafN;i++) {
-//	printf("%td\n", p->offsets[i]);
-//}
+	for(i=0;i<N/leafN;i++) {
+		printf("%4td %4u\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N)));
+	}
 
 	free(offsets);
 
@@ -140,7 +151,7 @@ void elaborate_tree(transform_index_t **p, int N, int leafN, int offset) {
 	(*p)+=2;
 }
 
-void init_tree(ffts_plan_t *p, int N, int leafN) {
+void ffts_init_tree(ffts_plan_t *p, int N, int leafN) {
 	int count = tree_count(N, leafN, 0) + 1;
 	transform_index_t *ps = p->transforms = malloc(count * 2 * sizeof(transform_index_t));
@@ -148,11 +159,14 @@ void init_tree(ffts_plan_t *p, int N, int leafN) {
 	//printf("count = %d\n", count);
 	elaborate_tree(&ps, N, leafN, 0);
+	#ifdef __ARM_NEON__
+	ps -= 2;
+	#endif
 	ps[0] = 0;
 	ps[1] = 0;
 
 	//int i;
 	//for(i=0;i<count;i++) {
-//	printf("%d %d - %d\n", p->transforms[i*2], p->transforms[i*2+1],
+//	fprintf(stderr, "%lu %lu - %d\n", p->transforms[i*2], p->transforms[i*2+1],
 //		__builtin_ctzl(p->transforms[i*2]) - 5);
 //}
diff --git a/src/patterns.h b/src/patterns.h
index 298ca3e..fd93042 100644
--- a/src/patterns.h
+++ b/src/patterns.h
@@ -3,8 +3,8 @@
 
 #include "cp_sse.h"
 
-void init_is(ffts_plan_t *p, int N, int leafN, int VL);
-void init_offsets(ffts_plan_t *p, int N, int leafN);
-void init_tree(ffts_plan_t *p, int N, int leafN);
+void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL);
+void ffts_init_offsets(ffts_plan_t *p, int N, int leafN);
+void ffts_init_tree(ffts_plan_t *p, int N, int leafN);
 
 #endif
diff --git a/src/sse_float.h b/src/sse_float.h
new file mode 100644
index 0000000..9974b2b
--- /dev/null
+++ b/src/sse_float.h
@@ -0,0 +1,33 @@
+#ifndef __SSE_FLOAT_H__
+#define __SSE_FLOAT_H__
+
+#include <xmmintrin.h>
+
+//#define VL 4
+
+typedef __m128 V;
+
+#define VADD _mm_add_ps
+#define VSUB _mm_sub_ps
+#define VMUL _mm_mul_ps
+//#define VLIT4 _mm_set_ps
+#define VXOR _mm_xor_ps
+#define VST _mm_store_ps
+#define VLD _mm_load_ps
+
+#define VSWAPPAIRS(x) (_mm_shuffle_ps(x,x,_MM_SHUFFLE(2,3,0,1)))
+
+#define VUNPACKHI(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,3,2)))
+#define VUNPACKLO(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(1,0,1,0)))
+
+#define VBLEND(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,1,0)))
+
+#define VLIT4 _mm_set_ps
+
+#define VDUPRE(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(2,2,0,0)))
+#define VDUPIM(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(3,3,1,1)))
+
+#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
+#define FFTS_FREE(d) (_mm_free(d))
+
+#endif
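The new sse_float.h mirrors the NEON helpers for the SSE build (configure defaults SIMD to sse). VDUPRE, VDUPIM and VSWAPPAIRS are the pieces of a complex multiply on interleaved {re,im,re,im} vectors; one way they compose (a sketch only — the IMUL name is illustrative, not part of this patch):

#include "sse_float.h"

/* Complex multiply of two interleaved {re,im,re,im} vectors d and w. */
static inline V IMUL(V d, V w) {
	V re = VMUL(VDUPRE(w), d);              /* (d.re*w.re, d.im*w.re, ...) */
	V im = VMUL(VDUPIM(w), VSWAPPAIRS(d));  /* (d.im*w.im, d.re*w.im, ...) */
	/* flip the sign of the real-lane products of im, then add:
	   result = (d.re*w.re - d.im*w.im, d.im*w.re + d.re*w.im) */
	return VADD(re, VXOR(im, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
}

Note that VXOR against -0.0f toggles only the sign bit, which is why a literal mask rather than a subtraction suffices here.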