summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Anthony Blake <anthonix@me.com>, 2012-11-06 10:42:47 +1300
committer: Anthony Blake <anthonix@me.com>, 2012-11-06 10:42:47 +1300
commit: 07a46c66535ce0f50c2d4706335d2093868a1ee9 (patch)
tree: e3193988ccc8b629f57b52ab57739c73926d956f
parent: 923a01fc8173b2c9be9e05e06c1bc24bf5abddb0 (diff)
download: ffts-07a46c66535ce0f50c2d4706335d2093868a1ee9.zip
ffts-07a46c66535ce0f50c2d4706335d2093868a1ee9.tar.gz
2D has SIMD transpose (still needs work though)
-rwxr-xr-x build_iphone.sh 2
-rw-r--r-- src/ffts_nd.c 36
-rw-r--r-- src/ffts_nd.h 6
3 files changed, 38 insertions, 6 deletions
diff --git a/build_iphone.sh b/build_iphone.sh
index 84a95f1..5695007 100755
--- a/build_iphone.sh
+++ b/build_iphone.sh
@@ -7,7 +7,7 @@ INSTALL_DIR="`pwd`/build"
export SDKVER="6.0"
export DEVROOT="/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer"
export SDKROOT="$DEVROOT/SDKs/iPhoneOS$SDKVER.sdk"
-export CFLAGS="-O3 -Wreturn-type -Wparentheses -Wswitch -Wno-unused-parameter -Wno-unused-variable -Wunused-value -Wno-shorten-64-to-32 -Wno-trigraphs -fpascal-strings -miphoneos-version-min=5.0 -mcpu=cortex-a9 -arch armv7 -mfpu=neon -pipe -isysroot $SDKROOT -isystem $SDKROOT/usr/include -isystem $DEVROOT/usr/include -std=c99 -mno-thumb -no-integrated-as"
+export CFLAGS="-Os -Wreturn-type -Wparentheses -Wswitch -Wno-unused-parameter -Wno-unused-variable -Wunused-value -Wno-shorten-64-to-32 -Wno-trigraphs -fpascal-strings -miphoneos-version-min=5.0 -mcpu=cortex-a9 -arch armv7 -mfpu=neon -pipe -isysroot $SDKROOT -isystem $SDKROOT/usr/include -isystem $DEVROOT/usr/include -std=c99 -mno-thumb -no-integrated-as"
export AR="$DEVROOT/usr/bin/ar"
export CC="clang"
diff --git a/src/ffts_nd.c b/src/ffts_nd.c
index 4e72e81..086b98c 100644
--- a/src/ffts_nd.c
+++ b/src/ffts_nd.c
@@ -48,13 +48,41 @@ void ffts_free_nd(ffts_plan_t *p) {
free(p);
}
-void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h) {
+inline void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h) { // Transposes in (h rows of w 64-bit elements) into out (w rows of h elements), one 2x2 tile per inner iteration. NOTE(review): plain C99 `inline` emits no external definition by itself -- confirm callers in other translation units still link.
size_t i,j;
- for(i=0;i<w;i++) {
- for(j=0;j<h;j++) {
- out[i*h + j] = in[j*w + i];
+ for(i=0;i<w;i+=2) { // i walks source columns two at a time. NOTE(review): with an odd w or h the second row/column of the tile lies outside the w*h matrix -- confirm callers only pass even sizes.
+ for(j=0;j<h;j+=2) { // j walks source rows two at a time
+#ifdef __ARM_NEON__
+// Scalar form this tile code replaces: out[i*h + j] = in[j*w + i];
+ float32x4_t Q0 = vld1q_f32(in + j*w + i); // source row j, elements i and i+1 (each 64-bit element occupies two float lanes)
+ float32x4_t Q1 = vld1q_f32(in + j*w + i + w); // source row j+1, elements i and i+1
+
+ float32x2x2_t t0;
+ float32x2x2_t t1;
+ t0.val[0] = vget_low_f32(Q0); // in[j*w + i]
+ t0.val[1] = vget_high_f32(Q0); // in[j*w + i + 1]
+ t1.val[0] = vget_low_f32(Q1); // in[(j+1)*w + i]
+ t1.val[1] = vget_high_f32(Q1); // in[(j+1)*w + i + 1]
+
+ __asm__ ("vswp %0,%1\n\t"
+ : "+w" (t0.val[1]), "+w" (t1.val[0])
+ :
+ ); // exchanges the two off-diagonal elements: t0 now holds source column i, t1 holds source column i+1
+
+ Q0 = vcombine_f32(t0.val[0], t0.val[1]); // in[j*w + i], in[(j+1)*w + i]
+ Q1 = vcombine_f32(t1.val[0], t1.val[1]); // in[j*w + i + 1], in[(j+1)*w + i + 1]
+ vst1q_f32(out + i*h + j, Q0); // destination row i, elements j and j+1
+ vst1q_f32(out + i*h + j + h, Q1); // destination row i+1, elements j and j+1
+#else
+ __m128d q0 = _mm_load_pd(in + j*w + i); // source row j, elements i and i+1. NOTE(review): __m128d and the _pd intrinsics are SSE2 (Intel lists them under <emmintrin.h>), while the header hunk of this patch includes <xmmintrin.h> -- confirm that suffices on every target compiler.
+ __m128d q1 = _mm_load_pd(in + j*w + i + w); // source row j+1, elements i and i+1. NOTE(review): _mm_load_pd/_mm_store_pd are the aligned variants -- confirm in, out and the row strides are 16-byte aligned.
+ __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); // low lane of each row: source column i
+ __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); // high lane of each row: source column i+1
+ _mm_store_pd(out + i*h + j, t0); // destination row i, elements j and j+1
+ _mm_store_pd(out + i*h + j + h, t1); // destination row i+1, elements j and j+1
+#endif
}
}
}
diff --git a/src/ffts_nd.h b/src/ffts_nd.h
index 2eef287..4d2474d 100644
--- a/src/ffts_nd.h
+++ b/src/ffts_nd.h
@@ -40,7 +40,11 @@
#include "ffts.h"
-
+#ifdef __ARM_NEON__
+ #include <arm_neon.h>
+#else
+ #include <xmmintrin.h>
+#endif
#endif
OpenPOWER on IntegriCloud