diff options
-rwxr-xr-x | build_iphone.sh | 2 | ||||
-rw-r--r-- | src/ffts_nd.c | 36 | ||||
-rw-r--r-- | src/ffts_nd.h | 6 |
3 files changed, 38 insertions, 6 deletions
diff --git a/build_iphone.sh b/build_iphone.sh index 84a95f1..5695007 100755 --- a/build_iphone.sh +++ b/build_iphone.sh @@ -7,7 +7,7 @@ INSTALL_DIR="`pwd`/build" export SDKVER="6.0" export DEVROOT="/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer" export SDKROOT="$DEVROOT/SDKs/iPhoneOS$SDKVER.sdk" -export CFLAGS="-O3 -Wreturn-type -Wparentheses -Wswitch -Wno-unused-parameter -Wno-unused-variable -Wunused-value -Wno-shorten-64-to-32 -Wno-trigraphs -fpascal-strings -miphoneos-version-min=5.0 -mcpu=cortex-a9 -arch armv7 -mfpu=neon -pipe -isysroot $SDKROOT -isystem $SDKROOT/usr/include -isystem $DEVROOT/usr/include -std=c99 -mno-thumb -no-integrated-as" +export CFLAGS="-Os -Wreturn-type -Wparentheses -Wswitch -Wno-unused-parameter -Wno-unused-variable -Wunused-value -Wno-shorten-64-to-32 -Wno-trigraphs -fpascal-strings -miphoneos-version-min=5.0 -mcpu=cortex-a9 -arch armv7 -mfpu=neon -pipe -isysroot $SDKROOT -isystem $SDKROOT/usr/include -isystem $DEVROOT/usr/include -std=c99 -mno-thumb -no-integrated-as" export AR="$DEVROOT/usr/bin/ar" export CC="clang" diff --git a/src/ffts_nd.c b/src/ffts_nd.c index 4e72e81..086b98c 100644 --- a/src/ffts_nd.c +++ b/src/ffts_nd.c @@ -48,13 +48,41 @@ void ffts_free_nd(ffts_plan_t *p) { free(p); } -void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h) { +inline void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h) { size_t i,j; - for(i=0;i<w;i++) { - for(j=0;j<h;j++) { - out[i*h + j] = in[j*w + i]; + for(i=0;i<w;i+=2) { + for(j=0;j<h;j+=2) { +#ifdef __ARM_NEON__ +// out[i*h + j] = in[j*w + i]; + float32x4_t Q0 = vld1q_f32(in + j*w + i); + float32x4_t Q1 = vld1q_f32(in + j*w + i + w); + + float32x2x2_t t0; + float32x2x2_t t1; + t0.val[0] = vget_low_f32(Q0); + t0.val[1] = vget_high_f32(Q0); + t1.val[0] = vget_low_f32(Q1); + t1.val[1] = vget_high_f32(Q1); + + __asm__ ("vswp %0,%1\n\t" + : "+w" (t0.val[1]), "+w" (t1.val[0]) + : + ); + + Q0 = vcombine_f32(t0.val[0], t0.val[1]); + Q1 = vcombine_f32(t1.val[0], t1.val[1]); + vst1q_f32(out + i*h + j, Q0); + vst1q_f32(out + i*h + j + h, Q1); +#else + __m128d q0 = _mm_load_pd(in + j*w + i); + __m128d q1 = _mm_load_pd(in + j*w + i + w); + __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); + __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); + _mm_store_pd(out + i*h + j, t0); + _mm_store_pd(out + i*h + j + h, t1); +#endif } } } diff --git a/src/ffts_nd.h b/src/ffts_nd.h index 2eef287..4d2474d 100644 --- a/src/ffts_nd.h +++ b/src/ffts_nd.h @@ -40,7 +40,11 @@ #include "ffts.h" - +#ifdef __ARM_NEON__ + #include <arm_neon.h> +#else + #include <xmmintrin.h> +#endif #endif |