diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/codegen.c | 5 | ||||
-rw-r--r-- | src/ffts.c | 15 | ||||
-rw-r--r-- | src/neon.s | 16 | ||||
-rw-r--r-- | src/vfp.s | 38 |
4 files changed, 45 insertions, 29 deletions
diff --git a/src/codegen.c b/src/codegen.c index 01ca3e8..08274c3 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -302,6 +302,11 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) { fp += (neon_oo - neon_ee) / 4; #else memcpy(fp, vfp_e, vfp_o - vfp_e); + if(sign > 0) { + fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000; + fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000; + fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; + } fp += (vfp_o - vfp_e) / 4; #endif #else @@ -88,11 +88,20 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) { size_t leafN = 8; size_t i; +#ifdef __arm__ +#ifdef HAVE_NEON V MULI_SIGN; if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); +#endif +#else + V MULI_SIGN; + if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); + else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); +#endif + p->transform = NULL; p->transform_base = NULL; p->transforms = NULL; @@ -164,7 +173,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) { for(i=0;i<n_luts;i++) { if(!i || hardcoded) { - #ifdef HAVE_NEON + #ifdef __arm__ if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t); else lut_size += n/4 * sizeof(cdata_t); #else @@ -172,7 +181,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) { #endif n *= 2; } else { - #ifdef HAVE_NEON + #ifdef __arm__ lut_size += n/8 * 3 * sizeof(cdata_t); #else lut_size += n/8 * 3 * 2 * sizeof(cdata_t); @@ -221,6 +230,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) { float *fw = (float *)w; V temp0, temp1, temp2; for(j=0;j<n/4;j+=2) { + #ifdef HAVE_NEON temp0 = VLD(fw0 + j*2); V re, im; re = VDUPRE(temp0); @@ -228,6 +238,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) { im = VXOR(im, MULI_SIGN); VST(fw + j*4 , re); VST(fw + j*4+4, im); + #endif } w += n/4 * 2; }else{ @@ -378,15 +378,15 @@ _neon_ee_loop: vsub.f32 d31, d5, d2 @ vsub.f32 d28, d4, d3 @ vadd.f32 d30, d4, d3 @ - vadd.f32 d5, d19, d14 @ - vadd.f32 d7, d31, d26 @ + vadd.f32 d5, d19, d14 @- + vadd.f32 d7, d31, d26 @- vadd.f32 q1, q14, q5 vadd.f32 q0, q11, q10 - vsub.f32 d6, d30, d27 @ - vsub.f32 d4, d18, d15 @ - vsub.f32 d13, d19, d14 @ - vadd.f32 d12, d18, d15 @ - vsub.f32 d15, d31, d26 @ + vsub.f32 d6, d30, d27 @- + vsub.f32 d4, d18, d15 @- + vsub.f32 d13, d19, d14 @- + vadd.f32 d12, d18, d15 @- + vsub.f32 d15, d31, d26 @- ldr r2, [r12], #4 vtrn.32 q1, q3 ldr lr, [r12], #4 @@ -395,7 +395,7 @@ _neon_ee_loop: vsub.f32 q4, q11, q10 add lr, r0, lr, lsl #2 vsub.f32 q5, q14, q5 - vadd.f32 d14, d30, d27 @ + vadd.f32 d14, d30, d27 @- vst2.32 {q0,q1}, [r2, :128]! vst2.32 {q2,q3}, [lr, :128]! vtrn.32 q4, q6 @@ -107,41 +107,41 @@ _vfp_e_loop: vadd.f32 s1, s4, s0 vadd.f32 s9, s2, s18 vsub.f32 s11, s2, s18 - vadd.f32 s15, s22, s12 - vadd.f32 s7, s5, s14 - vsub.f32 s18, s5, s14 + vadd.f32 s15, s22, s12 @* + vadd.f32 s7, s5, s14 @* + vsub.f32 s18, s5, s14 @* vsub.f32 s5, s4, s0 - vsub.f32 s22, s22, s12 + vsub.f32 s22, s22, s12 @* vadd.f32 s12, s10, s8 vadd.f32 s2, s6, s3 vsub.f32 s0, s6, s3 vsub.f32 s8, s10, s8 - vsub.f32 s4, s16, s13 + vsub.f32 s4, s16, s13 vadd.f32 s14, s16, s13 vadd.f32 s3, s28, s26 - vsub.f32 s6, s28, s26 + vsub.f32 s6, s28, s26 vadd.f32 s20, s9, s1 - vsub.f32 s16, s9, s1 + vsub.f32 s16, s9, s1 vadd.f32 s10, s2, s12 - vadd.f32 s24, s15, s7 - vsub.f32 s13, s15, s7 - vadd.f32 s28, s11, s18 - vsub.f32 s7, s11, s18 - vadd.f32 s9, s0, s6 + vadd.f32 s24, s15, s7 + vsub.f32 s13, s15, s7 + vadd.f32 s28, s11, s18 @ + vsub.f32 s7, s11, s18 @ + vadd.f32 s9, s0, s6 @ ldr lr, [r12], #4 add lr, r0, lr, lsl #2 subs r11, r11, #1 vstr s20, [lr] - vsub.f32 s6, s0, s6 - vsub.f32 s26, s22, s5 - vadd.f32 s1, s22, s5 + vsub.f32 s6, s0, s6 @ + vsub.f32 s26, s22, s5 @ + vadd.f32 s1, s22, s5 @ vstr s24, [lr, #4] vadd.f32 s5, s4, s3 - vsub.f32 s11, s14, s8 + vsub.f32 s11, s14, s8 @ vstr s28, [lr, #16] - vsub.f32 s2, s2, s12 - vsub.f32 s4, s4, s3 - vadd.f32 s0, s14, s8 + vsub.f32 s2, s2, s12 + vsub.f32 s4, s4, s3 + vadd.f32 s0, s14, s8 @ vstr s26, [lr, #20] vstr s16, [lr, #32] vstr s13, [lr, #36] |