diff options
author | Anthony Blake <anthonix@me.com> | 2012-08-20 15:26:47 +1200 |
---|---|---|
committer | Anthony Blake <anthonix@me.com> | 2012-08-20 15:26:47 +1200 |
commit | 7877c36f6fc90bb88c8b81a05e66c0f48bf3ca02 (patch) | |
tree | 1d36e795f6fce05e40fa00b380c6f7688b8cc5ba | |
parent | 81023395c68710f8732a22ac46b511bce64e012b (diff) | |
download | ffts-7877c36f6fc90bb88c8b81a05e66c0f48bf3ca02.zip ffts-7877c36f6fc90bb88c8b81a05e66c0f48bf3ca02.tar.gz |
Full custom FFT32 works
-rw-r--r-- | include/ffts.h | 18 | ||||
-rw-r--r-- | src/codegen.c | 158 | ||||
-rw-r--r-- | src/cp_sse.c | 35 | ||||
-rw-r--r-- | src/cp_sse.h | 9 | ||||
-rw-r--r-- | src/neon.s | 304 | ||||
-rw-r--r-- | src/neon_float.h | 124 | ||||
-rw-r--r-- | src/patterns.c | 6 |
7 files changed, 545 insertions, 109 deletions
diff --git a/include/ffts.h b/include/ffts.h index 9bd0dbe..ba3d858 100644 --- a/include/ffts.h +++ b/include/ffts.h @@ -41,18 +41,24 @@ typedef size_t transform_index_t; + struct _ffts_plan_t { - ptrdiff_t *is; ptrdiff_t *offsets; - void __attribute__ ((aligned(32))) **ws; + void __attribute__ ((aligned(32))) *ws; + void __attribute__ ((aligned(32))) *other_ws; + ptrdiff_t *is; + size_t *ws_is; void (*firstpass)(const float * restrict, float * restrict, struct _ffts_plan_t * restrict); - size_t i0, i1, i2; - uint64_t n_bits, leaftime; - + size_t i0, i1, n_luts; + size_t N; + void *lastlut; transform_index_t *transforms; + //transform_func_t transform; + void (*transform)(struct _ffts_plan_t * restrict, const float * restrict, float * restrict); + void *transform_base; + size_t transform_size; }; - typedef struct _ffts_plan_t ffts_plan_t; void ffts_execute(ffts_plan_t * restrict, const void * restrict, const void * restrict); diff --git a/src/codegen.c b/src/codegen.c index 8593f12..72daf89 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -2,6 +2,7 @@ #include "macros.h" #include "neon_float.h" #include "neon.h" +#include <libkern/OSCacheControl.h> int tree_count(int N, int leafN, int offset) { @@ -61,33 +62,45 @@ uint32_t MOV(uint8_t dst, uint8_t src) { return 0xe1a00000 | (src & 0xf) | ((dst & 0xf) << 12); } -uint32_t ADDI(uint8_t dst, uint8_t src, int32_t imm) { +void ADDI(uint32_t **p, uint8_t dst, uint8_t src, int32_t imm) { + int32_t oimm = imm; if(imm < 0) { imm = -imm; uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm); if(shamt & 1) shamt -= 1; imm >>= shamt; shamt = (32 - shamt)/2; - return 0xe2400000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff); + + // if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm); + *(*p)++ = 0xe2400000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff); + + if(imm > 255) ADDI(p, dst, src, (oimm + ((imm & 0xff) << (32-shamt*2)))); + + }else{ + uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm); + if(shamt & 1) shamt -= 1; + imm >>= shamt; + shamt = (32 - shamt)/2; + +// if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm); + *(*p)++ = 0xe2800000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff); + + if(imm > 255) ADDI(p, dst, src, (oimm + ((imm & 0xff) << (32-shamt*2)))); } - uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm); - fprintf(stderr, "pre ADDI shamt:%d imm:%d\n", shamt, imm); - if(shamt & 1) shamt -= 1; - fprintf(stderr, "ADDI shamt:%d imm:%d\n", shamt, imm); - imm >>= shamt; - shamt = (32 - shamt)/2; - - - return 0xe2800000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff); +} + +uint32_t LDRI(uint8_t dst, uint8_t base, uint32_t offset) { + return 0xe5900000 | ((dst & 0xf) << 12) + | ((base & 0xf) << 16) | (offset & 0xfff) ; } uint32_t MOVI(uint8_t dst, uint16_t val) { return 0xe3a00000 | ((dst & 0xf) << 12) | (val & 0xffff) ; } -uint32_t PUSH_LR() { return 0xe92d4000; } -uint32_t POP_LR() { return 0xe8bd8000; } +uint32_t PUSH_LR() { return 0xe92d4ff0; } //0xe92d4000; } +uint32_t POP_LR() { return 0xe8bd8ff0; } //0xe8bd8000; } uint32_t LUT_offset(size_t N, size_t leafN) { int i; @@ -131,52 +144,125 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) pps = ps; - uint32_t *func = valloc(8192); + if(N < 8192) p->transform_size = 8192; + else p->transform_size = N; + + p->transform_base = valloc(p->transform_size);//(void *)func; + uint32_t *func = p->transform_base;//valloc(8192); uint32_t *fp = func; - //p->transform_base = func; + if(!func) { + fprintf(stderr, "NOMEM\n"); + exit(1); + } uint32_t *x_8_addr = fp; memcpy(fp, neon_x8, neon_x8_t - neon_x8); fp += (neon_x8_t - neon_x8) / 4; - uint32_t *x_8_t_addr = fp; - memcpy(fp, neon_x8_t, neon_end - neon_x8_t); - fp += (neon_end - neon_x8_t) / 4; +//uint32_t *x_8_t_addr = fp; +//memcpy(fp, neon_x8_t, neon_end - neon_x8_t); +//fp += (neon_end - neon_x8_t) / 4; uint32_t *x_4_addr = fp; memcpy(fp, neon_x4, neon_x8 - neon_x4); fp += (neon_x8 - neon_x4) / 4; uint32_t *start = fp; - fprintf(stderr, "X_4: %08x START: %08x\n", x_4_addr, start); - fprintf(stderr, "X_8: %08x\n", x_8_addr, start); - fprintf(stderr, "X_8_T: %08x\n", x_8_t_addr, start); +//fprintf(stderr, "X_4: %08x START: %08x\n", x_4_addr, start); +//fprintf(stderr, "X_8: %08x\n", x_8_addr, start); +//fprintf(stderr, "X_8_T: %08x\n", x_8_t_addr, start); + fprintf(stderr, "LUT: %08x\n", p->ws); + fprintf(stderr, "offsets: %08x\n", p->offsets); *fp++ = PUSH_LR(); // *fp++ = MOV(2, 1); // *fp++ = BL(fp+2, start); + + + + +//ADDI(0, 1, 0); // mov r1 -> r0 +//ADDI(1, 2, 0); // mov r2 -> r1 + + ADDI(&fp, 3, 1, 0); + ADDI(&fp, 7, 1, N); + ADDI(&fp, 5, 1, 2*N); + ADDI(&fp, 10, 7, 2*N); + ADDI(&fp, 4, 5, 2*N); + ADDI(&fp, 8, 10, 2*N); + ADDI(&fp, 6, 4, 2*N); + ADDI(&fp, 9, 8, 2*N); + + *fp++ = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); // load offsets into r12 +// *fp++ = LDRI(1, 0, 4); // load ws into r1 + ADDI(&fp, 1, 0, 0); + + ADDI(&fp, 0, 2, 0), // mov out into r0 + + p->oe_ws = oe_w_data; + p->ee_ws = ee_w_data; + p->eo_ws = eo_w_data; + + fprintf(stderr, "p = %08x\n", p); + + + fprintf(stderr, "start of ee %08x\n", fp); + *fp++ = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); + + memcpy(fp, neon_ee, neon_oo - neon_ee); + fp += (neon_oo - neon_ee) / 4; + + + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); + + *fp++ = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); + + fprintf(stderr, "start of oe %08x\n", fp); + memcpy(fp, neon_oe, neon_end - neon_oe); + fp += (neon_end - neon_oe) / 4; + + + *fp++ = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); // load offsets into r12 + //ADDI(&fp, 2, 1, 0); + *fp++ = MOVI(1, 0); + + // args: r0 - out + // r1 - N + // r2 - ws +// ADDI(&fp, 3, 1, 0); // put N into r3 for counter + int32_t pAddr = 0; int32_t pN = 0; int32_t pLUT = 0; count = 2; while(pps[0]) { - fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr); +// fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr); if(!pN) { *fp++ = MOVI(1, pps[0]); }else{ - *fp++ = ADDI(0, 0, (pps[1] * 4)- pAddr); - *fp++ = ADDI(1, 1, pps[0] - pN); + if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr); + if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN); } - //*fp++ = ADDI(2, 2, LUT_offset(pps[0], leafN) - pLUT); - *fp++ = ADDI(2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); + + if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT) + ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); if(pps[0] == 2*leafN) { *fp++ = BL(fp+2, x_4_addr); }else if(!pps[2]){ - *fp++ = BL(fp+2, x_8_t_addr); + //uint32_t *x_8_t_addr = fp; + memcpy(fp, neon_x8_t, neon_ee - neon_x8_t); + fp += (neon_ee - neon_x8_t) / 4; + //*fp++ = BL(fp+2, x_8_t_addr); }else{ *fp++ = BL(fp+2, x_8_addr); } @@ -184,7 +270,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) pAddr = pps[1] * 4; pN = pps[0]; pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN); - fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); +// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); count += 4; pps += 2; } @@ -192,19 +278,23 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) *fp++ = POP_LR(); count++; // *fp++ = B(14); count++; - for(int i=0;i<(neon_x8 - neon_x4)/4;i++) - fprintf(stderr, "%08x\n", x_4_addr[i]); - fprintf(stderr, "\n"); - for(int i=0;i<count;i++) - fprintf(stderr, "%08x\n", start[i]); +//for(int i=0;i<(neon_x8 - neon_x4)/4;i++) +// fprintf(stderr, "%08x\n", x_4_addr[i]); +//fprintf(stderr, "\n"); +//for(int i=0;i<count;i++) +// fprintf(stderr, "%08x\n", start[i]); free(ps); - if (mprotect(func, 8192, PROT_READ | PROT_EXEC)) { + if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) { perror("Couldn't mprotect"); return NULL; } + sys_icache_invalidate(func, p->transform_size); + + + fprintf(stderr, "size of transform = %d\n", (fp-func)*4); return (transform_func_t)start; } diff --git a/src/cp_sse.c b/src/cp_sse.c index 1356c7b..f36f90b 100644 --- a/src/cp_sse.c +++ b/src/cp_sse.c @@ -73,8 +73,9 @@ firstpass_64(const float * restrict in, float * restrict out, ffts_plan_t * rest void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out) { transform_index_t *ps = p->transforms; - p->firstpass((const float *)in, (float *)out, p); - if(p->transform) p->transform(out, p->N, p->ws); + //p->firstpass((const float *)in, (float *)out, p); + p->transform(p, (const float *)in, (float *)out); + //if(p->transform) p->transform(out, p->N, p->ws); } void ffts_free(ffts_plan_t *p) { @@ -88,8 +89,13 @@ void ffts_free(ffts_plan_t *p) { if(p->offsets) free(p->offsets); //free(p->transforms); -// if(p->transform_base) free(p->transform_base); - + if(p->transform_base) { + if (mprotect(p->transform_base, p->transform_size, PROT_READ | PROT_WRITE)) { + perror("Couldn't mprotect"); + exit(errno); + } + free(p->transform_base); + } free(p); } @@ -197,7 +203,10 @@ ffts_plan_t *ffts_init(size_t N, int sign) { } n *= 2; } - + +// lut_size *= 16; + + // fprintf(stderr, "lut size = %zu\n", lut_size); if(n_luts) { p->ws = FFTS_MALLOC(lut_size,32); p->ws_is = malloc(n_luts * sizeof(size_t)); @@ -213,7 +222,7 @@ ffts_plan_t *ffts_init(size_t N, int sign) { for(i=0;i<n_luts;i++) { p->ws_is[i] = w - (cdata_t *)p->ws; - fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]); + //fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]); if(!i || hardcoded) { cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); @@ -346,13 +355,13 @@ ffts_plan_t *ffts_init(size_t N, int sign) { } float *tmp = (float *)p->ws; - for(i=0;i<lut_size*2;i+=8) { - fprintf(stderr, "%08x %f %f %f %f - %f %f %f %f\n", - tmp, - tmp[0], tmp[1], tmp[2], tmp[3], - tmp[4], tmp[5], tmp[6], tmp[7]); - tmp += 8; - } +//for(i=0;i<lut_size*2;i+=8) { +// fprintf(stderr, "%08x %f %f %f %f - %f %f %f %f\n", +// tmp, +// tmp[0], tmp[1], tmp[2], tmp[3], +// tmp[4], tmp[5], tmp[6], tmp[7]); +// tmp += 8; +//} p->N = N; p->lastlut = w; diff --git a/src/cp_sse.h b/src/cp_sse.h index 6f793d8..7729eb8 100644 --- a/src/cp_sse.h +++ b/src/cp_sse.h @@ -18,19 +18,24 @@ typedef alignas(16) float data_t; typedef size_t transform_index_t; +//typedef void (*transform_func_t)(float *data, size_t N, float *LUT); typedef void (*transform_func_t)(float *data, size_t N, float *LUT); struct _ffts_plan_t { - ptrdiff_t *is; ptrdiff_t *offsets; void __attribute__ ((aligned(32))) *ws; + void __attribute__ ((aligned(32))) *oe_ws, *eo_ws, *ee_ws; + ptrdiff_t *is; size_t *ws_is; void (*firstpass)(const float * restrict, float * restrict, struct _ffts_plan_t * restrict); size_t i0, i1, n_luts; size_t N; void *lastlut; transform_index_t *transforms; - transform_func_t transform, transform_base; + //transform_func_t transform; + void (*transform)(struct _ffts_plan_t * restrict, const float * restrict, float * restrict); + void *transform_base; + size_t transform_size; }; typedef struct _ffts_plan_t ffts_plan_t; @@ -1,8 +1,7 @@ .globl _neon_x4 - .align 2 + .align 4 _neon_x4: - push {r4,r5,r6} add r3, r0, #0 add r4, r3, r1, lsl #1 add r5, r3, r1, lsl #2 @@ -40,13 +39,11 @@ _neon_x4: vst1.32 {q2,q3}, [r4, :128] vst1.32 {q4,q5}, [r5, :128] vst1.32 {q6,q7}, [r6, :128] - pop {r4,r5,r6} bx lr .globl _neon_x8 - .align 2 + .align 4 _neon_x8: - push {r4,r5,r6,r7,r8,r9,r10,r11} mov r11, #0 add r3, r0, #0 @ data0 add r5, r0, r1, lsl #1 @ data2 @@ -59,7 +56,7 @@ _neon_x8: add r12, r2, #0 @ LUT sub r11, r11, r1, lsr #5 - + nop neon_x8_loop: vld1.32 {q10,q11}, [r5, :128] vld1.32 {q12,q13}, [r4, :128] @@ -149,14 +146,11 @@ neon_x8_loop: vst1.32 {q6,q7}, [r10, :128]! bne neon_x8_loop - pop {r4,r5,r6,r7,r8,r9,r10,r11} bx lr .globl _neon_x8_t - .align 2 + .align 4 _neon_x8_t: - push {r4,r5,r6,r7,r8,r9,r10,r11} - mov r11, #0 add r3, r0, #0 @ data0 add r5, r0, r1, lsl #1 @ data2 @@ -169,6 +163,7 @@ _neon_x8_t: add r12, r2, #0 @ LUT sub r11, r11, r1, lsr #5 + nop neon_x8_t_loop: vld1.32 {q10,q11}, [r5, :128] vld1.32 {q12,q13}, [r4, :128] @@ -258,10 +253,293 @@ neon_x8_t_loop: vst2.32 {q6,q7}, [r10, :128]! bne neon_x8_t_loop - pop {r4,r5,r6,r7,r8,r9,r10,r11} - bx lr + @bx lr + +@ assumes r0 = out +@ +@ r12 = offsets +@ r3-r10 = data pointers +@ r11 = loop iterations +@ r2 & lr = temps + .globl _neon_ee + .align 4 +_neon_ee: + vld1.32 {d16, d17}, [r2, :128] +_neon_ee_loop: + ldr r2, [r12], #4 + ldr lr, [r12], #4 + add r2, r0, r2, lsl #2 + add lr, r0, lr, lsl #2 + vld2.32 {q15}, [r10, :128]! + vld2.32 {q13}, [r8, :128]! + vld2.32 {q14}, [r7, :128]! + vld2.32 {q9}, [r4, :128]! + vld2.32 {q10}, [r3, :128]! + vld2.32 {q11}, [r6, :128]! + vld2.32 {q12}, [r5, :128]! + subs r11, r11, #1 + vsub.f32 q1, q14, q13 + vld2.32 {q0}, [r9, :128]! + vsub.f32 q2, q0, q15 + vadd.f32 q0, q0, q15 + vmul.f32 d10, d2, d17 + vmul.f32 d11, d3, d16 + vmul.f32 d12, d3, d17 + vmul.f32 d6, d4, d17 + vmul.f32 d7, d5, d16 + vmul.f32 d8, d4, d16 + vmul.f32 d9, d5, d17 + vmul.f32 d13, d2, d16 + vsub.f32 d7, d7, d6 + vadd.f32 d11, d11, d10 + vsub.f32 q1, q12, q11 + vsub.f32 q2, q10, q9 + vadd.f32 d6, d9, d8 + vadd.f32 q4, q14, q13 + vadd.f32 q11, q12, q11 + vadd.f32 q12, q10, q9 + vsub.f32 d10, d13, d12 + vsub.f32 q7, q4, q0 + vsub.f32 q9, q12, q11 + vsub.f32 q13, q5, q3 + vsub.f32 d29, d5, d2 + vadd.f32 q5, q5, q3 + vadd.f32 q10, q4, q0 + vadd.f32 q11, q12, q11 + vadd.f32 d31, d5, d2 + vadd.f32 d28, d4, d3 + vsub.f32 d30, d4, d3 + vsub.f32 d5, d19, d14 + vsub.f32 d7, d31, d26 + vadd.f32 q1, q14, q5 + vadd.f32 q0, q11, q10 + vadd.f32 d6, d30, d27 + vadd.f32 d4, d18, d15 + vadd.f32 d13, d19, d14 + vsub.f32 d12, d18, d15 + vadd.f32 d15, d31, d26 + vtrn.32 q1, q3 + vtrn.32 q0, q2 + vsub.f32 q4, q11, q10 + vsub.f32 q5, q14, q5 + vsub.f32 d14, d30, d27 + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! + +@ assumes r0 = out +@ +@ r12 = offsets +@ r3-r10 = data pointers +@ r11 = loop iterations +@ r2 & lr = temps + .globl _neon_oo + .align 4 +_neon_oo: + vld2.32 {q8}, [r6, :128]! + vld2.32 {q9}, [r5, :128]! + vld2.32 {q10}, [r4, :128]! + vld2.32 {q13}, [r3, :128]! + vadd.f32 q11, q9, q8 + vsub.f32 q8, q9, q8 + vsub.f32 q9, q13, q10 + vadd.f32 q12, q13, q10 + vld2.32 {q10}, [r7, :128]! + vld2.32 {q13}, [r9, :128]! + vsub.f32 q2, q12, q11 + vadd.f32 d7, d19, d16 + vsub.f32 d3, d19, d16 + vsub.f32 d6, d18, d17 + vadd.f32 d2, d18, d17 + vld2.32 {q9}, [r8, :128]! + vld2.32 {q8}, [r10, :128]! + vadd.f32 q0, q12, q11 + vadd.f32 q11, q13, q8 + vadd.f32 q12, q10, q9 + vsub.f32 q8, q13, q8 + vsub.f32 q9, q10, q9 + vsub.f32 q6, q12, q11 + vadd.f32 q4, q12, q11 + vtrn.32 q0, q2 + ldr r2, [r12]! + ldr lr, [r12]! + vadd.f32 d15, d19, d16 + vsub.f32 d11, d19, d16 + vsub.f32 d14, d18, d17 + vadd.f32 d10, d18, d17 + add r2, r0, r2, lsl #2 + add lr, r0, lr, lsl #2 + vtrn.32 q1, q3 + vst2.32 {q0,q1}, [r2, :128]! + vst2.32 {q2,q3}, [lr, :128]! + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vst2.32 {q4,q5}, [r2, :128]! + vst2.32 {q6,q7}, [lr, :128]! + +@ assumes r0 = out +@ +@ r12 = offsets +@ r3-r10 = data pointers +@ r11 = addr of twiddle +@ r2 & lr = temps + .globl _neon_eo + .align 4 +_neon_eo: + vld2.32 {q9}, [r5, :128]! @tag2 + vld2.32 {q13}, [r3, :128]! @tag0 + vld2.32 {q12}, [r4, :128]! @tag1 + vld2.32 {q0}, [r7, :128]! @tag4 + vsub.f32 q11, q13, q12 + vld2.32 {q8}, [r6, :128]! @tag3 + vadd.f32 q12, q13, q12 + vsub.f32 q10, q9, q8 + vadd.f32 q8, q9, q8 + vadd.f32 q9, q12, q8 + vsub.f32 d9, d23, d20 + vadd.f32 d11, d23, d20 + vsub.f32 q8, q12, q8 + vadd.f32 d8, d22, d21 + vsub.f32 d10, d22, d21 + ldr r2, [r12]! + ldr lr, [r12]! + vld1.32 {d20, d21}, [r11, :128] + vtrn.32 q9, q4 + vtrn.32 q8, q5 + vswp d9,d10 + add r2, r0, r2, lsl #2 + add lr, r0, lr, lsl #2 + vst1.32 {d8,d9,d10,d11}, [r2, :128]! + vld2.32 {q13}, [r10, :128]! @tag7 + vld2.32 {q15}, [r9, :128]! @tag6 + vld2.32 {q11}, [r8, :128]! @tag5 + vsub.f32 q14, q15, q13 + vsub.f32 q12, q0, q11 + vadd.f32 q11, q0, q11 + vadd.f32 q13, q15, q13 + vsub.f32 d13, d29, d24 + vadd.f32 q15, q13, q11 + vadd.f32 d12, d28, d25 + vadd.f32 d15, d29, d24 + vsub.f32 d14, d28, d25 + vtrn.32 q15, q6 + vsub.f32 q15, q13, q11 + vtrn.32 q15, q7 + vswp d13, d14 + vst1.32 {d12,d13,d14,d15}, [r2, :128]! + vtrn.32 q13, q14 + vtrn.32 q11, q12 + vmul.f32 d24, d26, d21 + vmul.f32 d28, d27, d20 + vmul.f32 d25, d26, d20 + vmul.f32 d26, d27, d21 + vmul.f32 d27, d22, d21 + vmul.f32 d30, d23, d20 + vmul.f32 d29, d23, d21 + vmul.f32 d22, d22, d20 + vsub.f32 d21, d28, d24 + vadd.f32 d20, d26, d25 + vadd.f32 d25, d30, d27 + vsub.f32 d24, d22, d29 + vadd.f32 q11, q12, q10 + vsub.f32 q10, q12, q10 + vadd.f32 q0, q9, q11 + vsub.f32 q2, q9, q11 + vsub.f32 d3, d17, d20 + vadd.f32 d7, d17, d20 + vadd.f32 d2, d16, d21 + vsub.f32 d6, d16, d21 + vswp d1, d2 + vswp d5, d6 + vstmia lr!, {q0-q3} + + +@ assumes r0 = out +@ +@ r12 = offsets +@ r3-r10 = data pointers +@ r11 = addr of twiddle +@ r2 & lr = temps + .globl _neon_oe + .align 4 +_neon_oe: + vld1.32 {q8}, [r5, :128]! + vld1.32 {q10}, [r6, :128]! + vld2.32 {q11}, [r4, :128]! + vld2.32 {q13}, [r3, :128]! + vld2.32 {q15}, [r10, :128]! + vorr d25, d17, d17 + vorr d24, d20, d20 + vorr d20, d16, d16 + vsub.f32 q9, q13, q11 + vadd.f32 q11, q13, q11 + ldr r2, [r12], #4 + ldr lr, [r12], #4 + vtrn.32 d24, d25 + vtrn.32 d20, d21 + add r2, r0, r2, lsl #2 + add lr, r0, lr, lsl #2 + vsub.f32 q8, q10, q12 + vadd.f32 q10, q10, q12 + vadd.f32 q0, q11, q10 + vsub.f32 d25, d19, d16 + vadd.f32 d27, d19, d16 + vsub.f32 q1, q11, q10 + vadd.f32 d24, d18, d17 + vsub.f32 d26, d18, d17 + vtrn.32 q0, q12 + vtrn.32 q1, q13 + vld1.32 {d24, d25}, [r11, :128] + vswp d1, d2 + vst1.32 {q0, q1}, [r2, :128]! + vld2.32 {q0}, [r9, :128]! + vadd.f32 q1, q0, q15 + vld2.32 {q13}, [r8, :128]! + vld2.32 {q14}, [r7, :128]! + vsub.f32 q15, q0, q15 + vsub.f32 q0, q14, q13 + vadd.f32 q3, q14, q13 + vadd.f32 q2, q3, q1 + vsub.f32 d29, d1, d30 + vadd.f32 d27, d1, d30 + vsub.f32 q3, q3, q1 + vadd.f32 d28, d0, d31 + vsub.f32 d26, d0, d31 + vtrn.32 q2, q14 + vtrn.32 q3, q13 + vswp d5, d6 + vst1.32 {q2, q3}, [r2, :128]! + vtrn.32 q11, q9 + vtrn.32 q10, q8 + vmul.f32 d20, d18, d25 + vmul.f32 d22, d19, d24 + vmul.f32 d21, d19, d25 + vmul.f32 d18, d18, d24 + vmul.f32 d19, d16, d25 + vmul.f32 d30, d17, d24 + vmul.f32 d23, d16, d24 + vmul.f32 d24, d17, d25 + vadd.f32 d17, d22, d20 + vsub.f32 d16, d18, d21 + vsub.f32 d21, d30, d19 + vadd.f32 d20, d24, d23 + vadd.f32 q9, q8, q10 + vsub.f32 q8, q8, q10 + vadd.f32 q4, q14, q9 + vsub.f32 q6, q14, q9 + vsub.f32 d11, d27, d16 + vadd.f32 d15, d27, d16 + vadd.f32 d10, d26, d17 + vsub.f32 d14, d26, d17 + vswp d9, d10 + vswp d13, d14 + vstmia lr!, {q4-q7} + .globl _neon_end - .align 2 + .align 4 _neon_end: bx lr diff --git a/src/neon_float.h b/src/neon_float.h index 0e192a1..41c9ecf 100644 --- a/src/neon_float.h +++ b/src/neon_float.h @@ -644,14 +644,62 @@ __INLINE V LOAD2I(const data_t **addr) { return o; } +__INLINE V LOAD2I_0(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag0\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOAD2I_1(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag1\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOAD2I_2(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag2\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOAD2I_3(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag3\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOAD2I_4(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag4\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOAD2I_5(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag5\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOAD2I_6(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag6\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOAD2I_7(const data_t **addr) { + float32x4_t o; + __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag7\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} + + + __INLINE V LOADI(const data_t **addr) { - float32x2_t out0, out1; float32x4_t o; - - __asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t" - : "=w" (o), "+r" (*addr) - : - ); + __asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOADI_2(const data_t **addr) { + float32x4_t o; + __asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t @tag2" : "=w" (o), "+r" (*addr) : ); + return o; +} +__INLINE V LOADI_3(const data_t **addr) { + float32x4_t o; + __asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t @tag3" : "=w" (o), "+r" (*addr) : ); return o; } __INLINE V HSP_MUL(V *d, const V *w) { @@ -699,10 +747,10 @@ __INLINE void neon_shl8_ee(data_t *restrict out0, data_t *restrict out1,const da V t0, t1, t2, t3, t4, t5, t6, t7; - t0 = LOAD2I(i0); - t1 = LOAD2I(i1); - t2 = LOAD2I(i2); - t3 = LOAD2I(i3); + t0 = LOAD2I_0(i0); + t1 = LOAD2I_1(i1); + t2 = LOAD2I_2(i2); + t3 = LOAD2I_3(i3); t4 = ADD (t0, t1); t5 = SUB (t0, t1); t6 = ADD (t2, t3); @@ -712,10 +760,10 @@ __INLINE void neon_shl8_ee(data_t *restrict out0, data_t *restrict out1,const da r1 = HSP_SUB_MULI(&t5, &t7); r3 = HSP_ADD_MULI(&t5, &t7); - t0 = LOAD2I(i4); - t1 = LOAD2I(i5); - t2 = LOAD2I(i6); - t3 = LOAD2I(i7); + t0 = LOAD2I_4(i4); + t1 = LOAD2I_5(i5); + t2 = LOAD2I_6(i6); + t3 = LOAD2I_7(i7); r4 = ADD (t0, t1); r5 = SUB (t0, t1); r6 = ADD (t2, t3); @@ -768,10 +816,10 @@ __INLINE void neon_shl8_oo(data_t *restrict out0, data_t *restrict out1,const da V r0, r1, r2, r3, r4, r5, r6, r7; V t0, t1, t2, t3, t4, t5, t6, t7; - t0 = LOAD2I(i0); - t1 = LOAD2I(i1); - t2 = LOAD2I(i2); - t3 = LOAD2I(i3); + t0 = LOAD2I_0(i0); + t1 = LOAD2I_1(i1); + t2 = LOAD2I_2(i2); + t3 = LOAD2I_3(i3); t4 = ADD (t0, t1); t5 = SUB (t0, t1); t6 = ADD (t2, t3); @@ -796,10 +844,10 @@ __INLINE void neon_shl8_oo(data_t *restrict out0, data_t *restrict out1,const da - t0 = LOAD2I(i4); - t1 = LOAD2I(i5); - t2 = LOAD2I(i6); - t3 = LOAD2I(i7); + t0 = LOAD2I_4(i4); + t1 = LOAD2I_5(i5); + t2 = LOAD2I_6(i6); + t3 = LOAD2I_7(i7); t4 = ADD (t0, t1); t5 = SUB (t0, t1); t6 = ADD (t2, t3); @@ -850,10 +898,10 @@ __INLINE void neon_shl8_eo(data_t *restrict out0, data_t *restrict out1,const da { V t0, t1, t2, t3, t4, t5, t6, t7; - t0 = LOAD2I(i0); - t1 = LOAD2I(i1); - t2 = LOAD2I(i2); - t3 = LOAD2I(i3); + t0 = LOAD2I_0(i0); + t1 = LOAD2I_1(i1); + t2 = LOAD2I_2(i2); + t3 = LOAD2I_3(i3); t4 = ADD(t0, t1); t5 = SUB(t0, t1); t6 = ADD(t2, t3); @@ -886,10 +934,10 @@ __INLINE void neon_shl8_eo(data_t *restrict out0, data_t *restrict out1,const da } { V t0, t1, t2, t3, t4, t5, t6, t7; - t0 = LOAD2I(i4); - t1 = LOAD2I(i5); - t2 = LOAD2I(i6); - t3 = LOAD2I(i7); + t0 = LOAD2I_4(i4); + t1 = LOAD2I_5(i5); + t2 = LOAD2I_6(i6); + t3 = LOAD2I_7(i7); //t2 = HALFBLEND(t6, t7); //t3 = HALFBLEND(t7, t6); t4 = ADD(t0, t1); @@ -955,10 +1003,10 @@ __INLINE void neon_shl8_oe(data_t *restrict out0, data_t *restrict out1,const da { V t0, t1, t2, t3, t4, t5, t6, t7; - t0 = LOAD2I(i0); - t1 = LOAD2I(i1); - t6 = LOADI(i2); - t7 = LOADI(i3); + t0 = LOAD2I_0(i0); + t1 = LOAD2I_1(i1); + t6 = LOADI_2(i2); + t7 = LOADI_3(i3); float32x2x2_t tmp0 = vtrn_f32(vget_low_f32(t6), vget_high_f32(t7)); float32x2x2_t tmp1 = vtrn_f32(vget_low_f32(t7), vget_high_f32(t6)); @@ -991,10 +1039,10 @@ __INLINE void neon_shl8_oe(data_t *restrict out0, data_t *restrict out1,const da } { V t0, t1, t2, t3, t4, t5, t6, t7; - t0 = LOAD2I(i4); - t1 = LOAD2I(i5); - t2 = LOAD2I(i6); - t3 = LOAD2I(i7); + t0 = LOAD2I_4(i4); + t1 = LOAD2I_5(i5); + t2 = LOAD2I_6(i6); + t3 = LOAD2I_7(i7); t4 = ADD(t0, t1); t5 = SUB(t0, t1); t6 = ADD(t2, t3); diff --git a/src/patterns.c b/src/patterns.c index 29fa5ae..664f20e 100644 --- a/src/patterns.c +++ b/src/patterns.c @@ -114,9 +114,9 @@ void ffts_init_offsets(ffts_plan_t *p, int N, int leafN) { for(i=0;i<N/leafN;i++) { p->offsets[i] = offsets[i*2+1]*2; } - for(i=0;i<N/leafN;i++) { - printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N))); - } +//for(i=0;i<N/leafN;i++) { +// printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N))); +//} free(offsets); |