summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/codegen.c 5
-rw-r--r-- src/ffts.c 15
-rw-r--r-- src/neon.s 16
-rw-r--r-- src/vfp.s 38
4 files changed, 45 insertions, 29 deletions
diff --git a/src/codegen.c b/src/codegen.c
index 01ca3e8..08274c3 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -302,6 +302,11 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
fp += (neon_oo - neon_ee) / 4;
#else
memcpy(fp, vfp_e, vfp_o - vfp_e);
+ if(sign > 0) {
+ fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
+ fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
+ fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
+ }
fp += (vfp_o - vfp_e) / 4;
#endif
#else
diff --git a/src/ffts.c b/src/ffts.c
index a50360a..fe4b590 100644
--- a/src/ffts.c
+++ b/src/ffts.c
@@ -88,11 +88,20 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
size_t leafN = 8;
size_t i;
+#ifdef __arm__
+#ifdef HAVE_NEON
V MULI_SIGN;
if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
+#endif
+#else
+ V MULI_SIGN;
+ if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
+ else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
+#endif
+
p->transform = NULL;
p->transform_base = NULL;
p->transforms = NULL;
@@ -164,7 +173,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
for(i=0;i<n_luts;i++) {
if(!i || hardcoded) {
- #ifdef HAVE_NEON
+ #ifdef __arm__
if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
else lut_size += n/4 * sizeof(cdata_t);
#else
@@ -172,7 +181,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
#endif
n *= 2;
} else {
- #ifdef HAVE_NEON
+ #ifdef __arm__
lut_size += n/8 * 3 * sizeof(cdata_t);
#else
lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
@@ -221,6 +230,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
float *fw = (float *)w;
V temp0, temp1, temp2;
for(j=0;j<n/4;j+=2) {
+ #ifdef HAVE_NEON
temp0 = VLD(fw0 + j*2);
V re, im;
re = VDUPRE(temp0);
@@ -228,6 +238,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
im = VXOR(im, MULI_SIGN);
VST(fw + j*4 , re);
VST(fw + j*4+4, im);
+ #endif
}
w += n/4 * 2;
}else{
diff --git a/src/neon.s b/src/neon.s
index e30fd05..6995066 100644
--- a/src/neon.s
+++ b/src/neon.s
@@ -378,15 +378,15 @@ _neon_ee_loop:
vsub.f32 d31, d5, d2 @
vsub.f32 d28, d4, d3 @
vadd.f32 d30, d4, d3 @
- vadd.f32 d5, d19, d14 @
- vadd.f32 d7, d31, d26 @
+ vadd.f32 d5, d19, d14 @-
+ vadd.f32 d7, d31, d26 @-
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
- vsub.f32 d6, d30, d27 @
- vsub.f32 d4, d18, d15 @
- vsub.f32 d13, d19, d14 @
- vadd.f32 d12, d18, d15 @
- vsub.f32 d15, d31, d26 @
+ vsub.f32 d6, d30, d27 @-
+ vsub.f32 d4, d18, d15 @-
+ vsub.f32 d13, d19, d14 @-
+ vadd.f32 d12, d18, d15 @-
+ vsub.f32 d15, d31, d26 @-
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
@@ -395,7 +395,7 @@ _neon_ee_loop:
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
- vadd.f32 d14, d30, d27 @
+ vadd.f32 d14, d30, d27 @-
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
diff --git a/src/vfp.s b/src/vfp.s
index cd865b8..16bc37f 100644
--- a/src/vfp.s
+++ b/src/vfp.s
@@ -107,41 +107,41 @@ _vfp_e_loop:
vadd.f32 s1, s4, s0
vadd.f32 s9, s2, s18
vsub.f32 s11, s2, s18
- vadd.f32 s15, s22, s12
- vadd.f32 s7, s5, s14
- vsub.f32 s18, s5, s14
+ vadd.f32 s15, s22, s12 @*
+ vadd.f32 s7, s5, s14 @*
+ vsub.f32 s18, s5, s14 @*
vsub.f32 s5, s4, s0
- vsub.f32 s22, s22, s12
+ vsub.f32 s22, s22, s12 @*
vadd.f32 s12, s10, s8
vadd.f32 s2, s6, s3
vsub.f32 s0, s6, s3
vsub.f32 s8, s10, s8
- vsub.f32 s4, s16, s13
+ vsub.f32 s4, s16, s13
vadd.f32 s14, s16, s13
vadd.f32 s3, s28, s26
- vsub.f32 s6, s28, s26
+ vsub.f32 s6, s28, s26
vadd.f32 s20, s9, s1
- vsub.f32 s16, s9, s1
+ vsub.f32 s16, s9, s1
vadd.f32 s10, s2, s12
- vadd.f32 s24, s15, s7
- vsub.f32 s13, s15, s7
- vadd.f32 s28, s11, s18
- vsub.f32 s7, s11, s18
- vadd.f32 s9, s0, s6
+ vadd.f32 s24, s15, s7
+ vsub.f32 s13, s15, s7
+ vadd.f32 s28, s11, s18 @
+ vsub.f32 s7, s11, s18 @
+ vadd.f32 s9, s0, s6 @
ldr lr, [r12], #4
add lr, r0, lr, lsl #2
subs r11, r11, #1
vstr s20, [lr]
- vsub.f32 s6, s0, s6
- vsub.f32 s26, s22, s5
- vadd.f32 s1, s22, s5
+ vsub.f32 s6, s0, s6 @
+ vsub.f32 s26, s22, s5 @
+ vadd.f32 s1, s22, s5 @
vstr s24, [lr, #4]
vadd.f32 s5, s4, s3
- vsub.f32 s11, s14, s8
+ vsub.f32 s11, s14, s8 @
vstr s28, [lr, #16]
- vsub.f32 s2, s2, s12
- vsub.f32 s4, s4, s3
- vadd.f32 s0, s14, s8
+ vsub.f32 s2, s2, s12
+ vsub.f32 s4, s4, s3
+ vadd.f32 s0, s14, s8 @
vstr s26, [lr, #20]
vstr s16, [lr, #32]
vstr s13, [lr, #36]
OpenPOWER on IntegriCloud