diff options
-rw-r--r-- | src/codegen.c | 144 | ||||
-rw-r--r-- | src/cp_sse.c | 85 | ||||
-rw-r--r-- | src/cp_sse.h | 1 | ||||
-rw-r--r-- | src/neon.s | 33 | ||||
-rw-r--r-- | src/neon_float.h | 22 |
5 files changed, 135 insertions, 150 deletions
diff --git a/src/codegen.c b/src/codegen.c index 72daf89..d1cce37 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -32,22 +32,9 @@ void elaborate_tree(size_t **p, int N, int leafN, int offset) { (*p)+=2; } -void -ffts_x_8(float *data, size_t N, float *LUT) { - X_8_SPLIT(data, N, LUT); -} -void -ffts_x_8_t(float *data, size_t N, float *LUT) { - X_8_SPLIT_T(data, N, LUT); -} -void -ffts_x_4(float *data, size_t N, float *LUT) { - //fprintf(stderr, "X_4 %zu\n", N); - X_4_SPLIT(data, N, LUT); -} -void -dummy(float *data, size_t N, float *LUT) { +void neon_X_8(data_t * restrict data0, size_t N, data_t * restrict LUT) { + X_8_SPLIT(data0, N, LUT); } uint32_t BL(void *pos, void *target) { @@ -86,7 +73,7 @@ void ADDI(uint32_t **p, uint8_t dst, uint8_t src, int32_t imm) { *(*p)++ = 0xe2800000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff); - if(imm > 255) ADDI(p, dst, src, (oimm + ((imm & 0xff) << (32-shamt*2)))); + if(imm > 255) ADDI(p, dst, src, (oimm - ((imm & 0xff) << (32-shamt*2)))); } } @@ -95,8 +82,15 @@ uint32_t LDRI(uint8_t dst, uint8_t base, uint32_t offset) { | ((base & 0xf) << 16) | (offset & 0xfff) ; } -uint32_t MOVI(uint8_t dst, uint16_t val) { - return 0xe3a00000 | ((dst & 0xf) << 12) | (val & 0xffff) ; +uint32_t MOVI(uint32_t **p, uint8_t dst, uint32_t imm) { + uint32_t oimm = imm; + + uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm); + if(shamt & 1) shamt -= 1; + imm >>= shamt; + shamt = (32 - shamt)/2; + *(*p)++ = 0xe3a00000 | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff) ; + if(imm > 255) ADDI(p, dst, dst, (oimm - ((imm & 0xff) << (32-shamt*2)))); } uint32_t PUSH_LR() { return 0xe92d4ff0; } //0xe92d4000; } @@ -151,11 +145,22 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) uint32_t *func = p->transform_base;//valloc(8192); uint32_t *fp = func; + fprintf(stderr, "Allocating %d bytes \n", p->transform_size); + if(!func) { fprintf(stderr, "NOMEM\n"); exit(1); } + if(N < 32) { + + + + + + } + + uint32_t *x_8_addr = fp; memcpy(fp, neon_x8, neon_x8_t - neon_x8); fp += (neon_x8_t - neon_x8) / 4; @@ -168,22 +173,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) uint32_t *start = fp; -//fprintf(stderr, "X_4: %08x START: %08x\n", x_4_addr, start); -//fprintf(stderr, "X_8: %08x\n", x_8_addr, start); -//fprintf(stderr, "X_8_T: %08x\n", x_8_t_addr, start); - - fprintf(stderr, "LUT: %08x\n", p->ws); - fprintf(stderr, "offsets: %08x\n", p->offsets); *fp++ = PUSH_LR(); -// *fp++ = MOV(2, 1); -// *fp++ = BL(fp+2, start); - - - - - -//ADDI(0, 1, 0); // mov r1 -> r0 -//ADDI(1, 2, 0); // mov r2 -> r1 ADDI(&fp, 3, 1, 0); ADDI(&fp, 7, 1, N); @@ -204,34 +194,93 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) p->ee_ws = ee_w_data; p->eo_ws = eo_w_data; - fprintf(stderr, "p = %08x\n", p); + fprintf(stderr, "p = %08x i0 = %d i1 = %d\n", p, p->i0, p->i1); fprintf(stderr, "start of ee %08x\n", fp); *fp++ = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); - + MOVI(&fp, 11, p->i0); + + fprintf(stderr, "p->i0 insn = %d - %08x %08x\n", p->i0, fp[-2], fp[-1]); + //fp++; memcpy(fp, neon_ee, neon_oo - neon_ee); fp += (neon_oo - neon_ee) / 4; + if(__builtin_ctzl(N) & 1){ + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); + + if(p->i1) { + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_oo, neon_eo - neon_oo); + fp += (neon_eo - neon_oo) / 4; + } + + *fp++ = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); - ADDI(&fp, 2, 7, 0); - ADDI(&fp, 7, 9, 0); - ADDI(&fp, 9, 2, 0); + memcpy(fp, neon_oe, neon_end - neon_oe); + fp += (neon_end - neon_oe) / 4; + + }else{ + + *fp++ = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); + + memcpy(fp, neon_eo, neon_oe - neon_eo); + fp += (neon_oe - neon_eo) / 4; + + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); - ADDI(&fp, 2, 8, 0); - ADDI(&fp, 8, 10, 0); - ADDI(&fp, 10, 2, 0); + if(p->i1) { + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_oo, neon_eo - neon_oo); + fp += (neon_eo - neon_oo) / 4; + } + + } + - *fp++ = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); + if(p->i1) { + ADDI(&fp, 2, 3, 0); + ADDI(&fp, 3, 7, 0); + ADDI(&fp, 7, 2, 0); - fprintf(stderr, "start of oe %08x\n", fp); - memcpy(fp, neon_oe, neon_end - neon_oe); - fp += (neon_end - neon_oe) / 4; + ADDI(&fp, 2, 4, 0); + ADDI(&fp, 4, 8, 0); + ADDI(&fp, 8, 2, 0); + + ADDI(&fp, 2, 5, 0); + ADDI(&fp, 5, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 6, 0); + ADDI(&fp, 6, 10, 0); + ADDI(&fp, 10, 2, 0); + + ADDI(&fp, 2, 9, 0); + ADDI(&fp, 9, 10, 0); + ADDI(&fp, 10, 2, 0); + *fp++ = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_ee, neon_oo - neon_ee); + fp += (neon_oo - neon_ee) / 4; + + } *fp++ = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); // load offsets into r12 //ADDI(&fp, 2, 1, 0); - *fp++ = MOVI(1, 0); + MOVI(&fp, 1, 0); // args: r0 - out // r1 - N @@ -246,7 +295,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) // fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr); if(!pN) { - *fp++ = MOVI(1, pps[0]); + MOVI(&fp, 1, pps[0]); }else{ if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr); if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN); @@ -282,7 +331,6 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) // fprintf(stderr, "%08x\n", x_4_addr[i]); //fprintf(stderr, "\n"); //for(int i=0;i<count;i++) -// fprintf(stderr, "%08x\n", start[i]); free(ps); diff --git a/src/cp_sse.c b/src/cp_sse.c index f36f90b..d32155f 100644 --- a/src/cp_sse.c +++ b/src/cp_sse.c @@ -2,74 +2,6 @@ #include "macros.h" #include "patterns.h" -__INLINE void -firstpass_type_1(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) { - size_t i, ii0 = p->i0, ii1 = p->i1; - size_t *offsets = (size_t *)p->offsets; - size_t *is = (size_t *)p->is; -#ifdef __ARM_NEON__ - const data_t *i0=in+is[0],*i1=in+is[1],*i2=in+is[2],*i3=in+is[3],*i4=in+is[4],*i5=in+is[5],*i6=in+is[6],*i7=in+is[7]; - for(i=ii0;i>0;--i) { - neon_shl8_ee(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7); - offsets += 2; - } - for(i=ii1;i>0;--i) { - neon_shl8_oo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5); - offsets += 2; - } - neon_shl8_oe(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5); - offsets += 2; - for(i=ii1;i>0;--i) { - neon_shl8_ee(out+offsets[0],out+offsets[1],&i6,&i7,&i4,&i5,&i0,&i1,&i3,&i2); - offsets += 2; - } - -#else - for(i=ii0;i>0;--i) LEAF_EE(&is, in, &offsets, out); - for(i=ii1;i>0;--i) LEAF_OO(&is, in, &offsets, out); - LEAF_OE(&is, in, &offsets, out); - for(i=ii1;i>0;--i) LEAF_EE(&is, in, &offsets, out); -#endif -} - -__INLINE void -firstpass_type_2(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) { - size_t i, ii0 = p->i0, ii1 = p->i1; - size_t *offsets = (size_t *)p->offsets; - size_t *is = (size_t *)p->is; -#ifdef __ARM_NEON__ - const data_t *i0=in+is[0],*i1=in+is[1],*i2=in+is[2],*i3=in+is[3],*i4=in+is[4],*i5=in+is[5],*i6=in+is[6],*i7=in+is[7]; - - for(i=ii0;i>0;--i) { - neon_shl8_ee(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7); - offsets+=2; - } - neon_shl8_eo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7); - offsets += 2; - for(i=ii1;i>0;--i) { - neon_shl8_oo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5); - offsets += 2; - } - for(i=ii1;i>0;--i) { - neon_shl8_ee(out+offsets[0],out+offsets[1],&i6,&i7,&i4,&i5,&i0,&i1,&i3,&i2); - offsets += 2; - } - -#else - for(i=ii0;i>0;--i) LEAF_EE(&is, in, &offsets, out); - LEAF_EO(&is, in, &offsets, out); - for(i=ii1;i>0;--i) LEAF_OO(&is, in, &offsets, out); - for(i=ii1;i>0;--i) LEAF_EE(&is, in, &offsets, out); -#endif -} - -__INLINE void -firstpass_64(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) { - size_t *offsets = (size_t *)p->offsets; - size_t *is = (size_t *)p->is; - LEAF_EE(&is, in, &offsets, out); - LEAF_OE(&is, in, &offsets, out); -} void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out) { transform_index_t *ps = p->transforms; @@ -119,8 +51,6 @@ ffts_plan_t *ffts_init(size_t N, int sign) { // ffts_init_tree(p, N, leafN); // if(N == 64) p->firstpass = &firstpass_64; - if(__builtin_ctzl(N) & 1) p->firstpass = &firstpass_type_1; - else p->firstpass = &firstpass_type_2; LEAFLUT[0] = VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941); LEAFLUT[1] = VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); @@ -157,12 +87,12 @@ ffts_plan_t *ffts_init(size_t N, int sign) { p->transforms = malloc(2 * sizeof(transform_index_t)); p->transforms[0] = 0; p->transforms[1] = 1; - if(N == 2) p->firstpass = &firstpass_2; - else if(N == 4 && sign == -1) p->firstpass = &firstpass_4_f; - else if(N == 4 && sign == 1) p->firstpass = &firstpass_4_b; - else if(N == 8) p->firstpass = &firstpass_8; - else if(N == 16) p->firstpass = &firstpass_16; - else if(N == 32) p->firstpass = &firstpass_32; +// if(N == 2) p->firstpass = &firstpass_2; +// else if(N == 4 && sign == -1) p->firstpass = &firstpass_4_f; +// else if(N == 4 && sign == 1) p->firstpass = &firstpass_4_b; +// else if(N == 8) p->firstpass = &firstpass_8; +// else if(N == 16) p->firstpass = &firstpass_16; +// else if(N == 32) p->firstpass = &firstpass_32; p->is = NULL; p->offsets = NULL; @@ -362,7 +292,8 @@ ffts_plan_t *ffts_init(size_t N, int sign) { // tmp[4], tmp[5], tmp[6], tmp[7]); // tmp += 8; //} - + + fprintf(stderr, "p0 %d p1 %d\n", p->i0, p->i1); p->N = N; p->lastlut = w; p->n_luts = n_luts; diff --git a/src/cp_sse.h b/src/cp_sse.h index 7729eb8..0136173 100644 --- a/src/cp_sse.h +++ b/src/cp_sse.h @@ -27,7 +27,6 @@ struct _ffts_plan_t { void __attribute__ ((aligned(32))) *oe_ws, *eo_ws, *ee_ws; ptrdiff_t *is; size_t *ws_is; - void (*firstpass)(const float * restrict, float * restrict, struct _ffts_plan_t * restrict); size_t i0, i1, n_luts; size_t N; void *lastlut; @@ -266,10 +266,6 @@ neon_x8_t_loop: _neon_ee: vld1.32 {d16, d17}, [r2, :128] _neon_ee_loop: - ldr r2, [r12], #4 - ldr lr, [r12], #4 - add r2, r0, r2, lsl #2 - add lr, r0, lr, lsl #2 vld2.32 {q15}, [r10, :128]! vld2.32 {q13}, [r8, :128]! vld2.32 {q14}, [r7, :128]! @@ -277,9 +273,9 @@ _neon_ee_loop: vld2.32 {q10}, [r3, :128]! vld2.32 {q11}, [r6, :128]! vld2.32 {q12}, [r5, :128]! - subs r11, r11, #1 vsub.f32 q1, q14, q13 vld2.32 {q0}, [r9, :128]! + subs r11, r11, #1 vsub.f32 q2, q0, q15 vadd.f32 q0, q0, q15 vmul.f32 d10, d2, d17 @@ -318,8 +314,12 @@ _neon_ee_loop: vadd.f32 d13, d19, d14 vsub.f32 d12, d18, d15 vadd.f32 d15, d31, d26 + ldr r2, [r12], #4 + ldr lr, [r12], #4 vtrn.32 q1, q3 vtrn.32 q0, q2 + add r2, r0, r2, lsl #2 + add lr, r0, lr, lsl #2 vsub.f32 q4, q11, q10 vsub.f32 q5, q14, q5 vsub.f32 d14, d30, d27 @@ -329,6 +329,7 @@ _neon_ee_loop: vtrn.32 q5, q7 vst2.32 {q4,q5}, [r2, :128]! vst2.32 {q6,q7}, [lr, :128]! + bne _neon_ee_loop @ assumes r0 = out @ @@ -339,6 +340,8 @@ _neon_ee_loop: .globl _neon_oo .align 4 _neon_oo: + +_neon_oo_loop: vld2.32 {q8}, [r6, :128]! vld2.32 {q9}, [r5, :128]! vld2.32 {q10}, [r4, :128]! @@ -347,6 +350,7 @@ _neon_oo: vsub.f32 q8, q9, q8 vsub.f32 q9, q13, q10 vadd.f32 q12, q13, q10 + subs r11, r11, #1 vld2.32 {q10}, [r7, :128]! vld2.32 {q13}, [r9, :128]! vsub.f32 q2, q12, q11 @@ -364,8 +368,8 @@ _neon_oo: vsub.f32 q6, q12, q11 vadd.f32 q4, q12, q11 vtrn.32 q0, q2 - ldr r2, [r12]! - ldr lr, [r12]! + ldr r2, [r12], #4 + ldr lr, [r12], #4 vadd.f32 d15, d19, d16 vsub.f32 d11, d19, d16 vsub.f32 d14, d18, d17 @@ -379,6 +383,7 @@ _neon_oo: vtrn.32 q5, q7 vst2.32 {q4,q5}, [r2, :128]! vst2.32 {q6,q7}, [lr, :128]! + bne _neon_oo_loop @ assumes r0 = out @ @@ -404,15 +409,15 @@ _neon_eo: vsub.f32 q8, q12, q8 vadd.f32 d8, d22, d21 vsub.f32 d10, d22, d21 - ldr r2, [r12]! - ldr lr, [r12]! + ldr r2, [r12], #4 + ldr lr, [r12], #4 vld1.32 {d20, d21}, [r11, :128] vtrn.32 q9, q4 - vtrn.32 q8, q5 - vswp d9,d10 add r2, r0, r2, lsl #2 add lr, r0, lr, lsl #2 - vst1.32 {d8,d9,d10,d11}, [r2, :128]! + vtrn.32 q8, q5 + vswp d9,d10 + vst1.32 {d8,d9,d10,d11}, [lr, :128]! vld2.32 {q13}, [r10, :128]! @tag7 vld2.32 {q15}, [r9, :128]! @tag6 vld2.32 {q11}, [r8, :128]! @tag5 @@ -429,7 +434,7 @@ _neon_eo: vsub.f32 q15, q13, q11 vtrn.32 q15, q7 vswp d13, d14 - vst1.32 {d12,d13,d14,d15}, [r2, :128]! + vst1.32 {d12,d13,d14,d15}, [lr, :128]! vtrn.32 q13, q14 vtrn.32 q11, q12 vmul.f32 d24, d26, d21 @@ -454,7 +459,7 @@ _neon_eo: vsub.f32 d6, d16, d21 vswp d1, d2 vswp d5, d6 - vstmia lr!, {q0-q3} + vstmia r2!, {q0-q3} @ assumes r0 = out diff --git a/src/neon_float.h b/src/neon_float.h index 41c9ecf..d793691 100644 --- a/src/neon_float.h +++ b/src/neon_float.h @@ -46,19 +46,21 @@ static inline V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) { #define FFTS_MALLOC(d,a) (valloc(d)) #define FFTS_FREE(d) (free(d)) __INLINE void FMA(V *Rd, V Rn, V Rm) { - __asm__ ("vmla.f32 %q0,%q1,%q2\n\t" - : "+w" (*Rd) - : "w" (Rn), "w" (Rm) - //: "0" - ); + *Rd = vmlaq_f32(*Rd, Rn, Rm); +// __asm__ ("vmla.f32 %q0,%q1,%q2\n\t" +// : "+w" (*Rd) +// : "w" (Rn), "w" (Rm) +// //: "0" +// ); } __INLINE void FMS(V *Rd, V Rn, V Rm) { - __asm__ ("vmls.f32 %q0,%q1,%q2\n\t" - : "+w" (*Rd) - : "w" (Rn), "w" (Rm) - // : "0" - ); + *Rd = vmlsq_f32(*Rd, Rn, Rm); +// __asm__ ("vmls.f32 %q0,%q1,%q2\n\t" +// : "+w" (*Rd) +// : "w" (Rn), "w" (Rm) +// // : "0" +// ); } __INLINE VS VSMUL(VS *d, VS *w) { |