summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/codegen.c144
-rw-r--r--src/cp_sse.c85
-rw-r--r--src/cp_sse.h1
-rw-r--r--src/neon.s33
-rw-r--r--src/neon_float.h22
5 files changed, 135 insertions, 150 deletions
diff --git a/src/codegen.c b/src/codegen.c
index 72daf89..d1cce37 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -32,22 +32,9 @@ void elaborate_tree(size_t **p, int N, int leafN, int offset) {
(*p)+=2;
}
-void
-ffts_x_8(float *data, size_t N, float *LUT) {
- X_8_SPLIT(data, N, LUT);
-}
-void
-ffts_x_8_t(float *data, size_t N, float *LUT) {
- X_8_SPLIT_T(data, N, LUT);
-}
-void
-ffts_x_4(float *data, size_t N, float *LUT) {
- //fprintf(stderr, "X_4 %zu\n", N);
- X_4_SPLIT(data, N, LUT);
-}
-void
-dummy(float *data, size_t N, float *LUT) {
+void neon_X_8(data_t * restrict data0, size_t N, data_t * restrict LUT) {
+ X_8_SPLIT(data0, N, LUT);
}
uint32_t BL(void *pos, void *target) {
@@ -86,7 +73,7 @@ void ADDI(uint32_t **p, uint8_t dst, uint8_t src, int32_t imm) {
*(*p)++ = 0xe2800000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
- if(imm > 255) ADDI(p, dst, src, (oimm + ((imm & 0xff) << (32-shamt*2))));
+ if(imm > 255) ADDI(p, dst, src, (oimm - ((imm & 0xff) << (32-shamt*2))));
}
}
@@ -95,8 +82,15 @@ uint32_t LDRI(uint8_t dst, uint8_t base, uint32_t offset) {
| ((base & 0xf) << 16) | (offset & 0xfff) ;
}
-uint32_t MOVI(uint8_t dst, uint16_t val) {
- return 0xe3a00000 | ((dst & 0xf) << 12) | (val & 0xffff) ;
+uint32_t MOVI(uint32_t **p, uint8_t dst, uint32_t imm) {
+ uint32_t oimm = imm;
+
+ uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm);
+ if(shamt & 1) shamt -= 1;
+ imm >>= shamt;
+ shamt = (32 - shamt)/2;
+ *(*p)++ = 0xe3a00000 | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff) ;
+ if(imm > 255) ADDI(p, dst, dst, (oimm - ((imm & 0xff) << (32-shamt*2))));
}
uint32_t PUSH_LR() { return 0xe92d4ff0; } //0xe92d4000; }
@@ -151,11 +145,22 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN)
uint32_t *func = p->transform_base;//valloc(8192);
uint32_t *fp = func;
+ fprintf(stderr, "Allocating %d bytes \n", p->transform_size);
+
if(!func) {
fprintf(stderr, "NOMEM\n");
exit(1);
}
+ if(N < 32) {
+
+
+
+
+
+ }
+
+
uint32_t *x_8_addr = fp;
memcpy(fp, neon_x8, neon_x8_t - neon_x8);
fp += (neon_x8_t - neon_x8) / 4;
@@ -168,22 +173,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN)
uint32_t *start = fp;
-//fprintf(stderr, "X_4: %08x START: %08x\n", x_4_addr, start);
-//fprintf(stderr, "X_8: %08x\n", x_8_addr, start);
-//fprintf(stderr, "X_8_T: %08x\n", x_8_t_addr, start);
-
- fprintf(stderr, "LUT: %08x\n", p->ws);
- fprintf(stderr, "offsets: %08x\n", p->offsets);
*fp++ = PUSH_LR();
-// *fp++ = MOV(2, 1);
-// *fp++ = BL(fp+2, start);
-
-
-
-
-
-//ADDI(0, 1, 0); // mov r1 -> r0
-//ADDI(1, 2, 0); // mov r2 -> r1
ADDI(&fp, 3, 1, 0);
ADDI(&fp, 7, 1, N);
@@ -204,34 +194,93 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN)
p->ee_ws = ee_w_data;
p->eo_ws = eo_w_data;
- fprintf(stderr, "p = %08x\n", p);
+ fprintf(stderr, "p = %08x i0 = %d i1 = %d\n", p, p->i0, p->i1);
fprintf(stderr, "start of ee %08x\n", fp);
*fp++ = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
-
+ MOVI(&fp, 11, p->i0);
+
+ fprintf(stderr, "p->i0 insn = %d - %08x %08x\n", p->i0, fp[-2], fp[-1]);
+ //fp++;
memcpy(fp, neon_ee, neon_oo - neon_ee);
fp += (neon_oo - neon_ee) / 4;
+ if(__builtin_ctzl(N) & 1){
+ ADDI(&fp, 2, 7, 0);
+ ADDI(&fp, 7, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 8, 0);
+ ADDI(&fp, 8, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ if(p->i1) {
+ MOVI(&fp, 11, p->i1);
+ memcpy(fp, neon_oo, neon_eo - neon_oo);
+ fp += (neon_eo - neon_oo) / 4;
+ }
+
+ *fp++ = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p));
- ADDI(&fp, 2, 7, 0);
- ADDI(&fp, 7, 9, 0);
- ADDI(&fp, 9, 2, 0);
+ memcpy(fp, neon_oe, neon_end - neon_oe);
+ fp += (neon_end - neon_oe) / 4;
+
+ }else{
+
+ *fp++ = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p));
+
+ memcpy(fp, neon_eo, neon_oe - neon_eo);
+ fp += (neon_oe - neon_eo) / 4;
+
+ ADDI(&fp, 2, 7, 0);
+ ADDI(&fp, 7, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 8, 0);
+ ADDI(&fp, 8, 10, 0);
+ ADDI(&fp, 10, 2, 0);
- ADDI(&fp, 2, 8, 0);
- ADDI(&fp, 8, 10, 0);
- ADDI(&fp, 10, 2, 0);
+ if(p->i1) {
+ MOVI(&fp, 11, p->i1);
+ memcpy(fp, neon_oo, neon_eo - neon_oo);
+ fp += (neon_eo - neon_oo) / 4;
+ }
+
+ }
+
- *fp++ = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p));
+ if(p->i1) {
+ ADDI(&fp, 2, 3, 0);
+ ADDI(&fp, 3, 7, 0);
+ ADDI(&fp, 7, 2, 0);
- fprintf(stderr, "start of oe %08x\n", fp);
- memcpy(fp, neon_oe, neon_end - neon_oe);
- fp += (neon_end - neon_oe) / 4;
+ ADDI(&fp, 2, 4, 0);
+ ADDI(&fp, 4, 8, 0);
+ ADDI(&fp, 8, 2, 0);
+
+ ADDI(&fp, 2, 5, 0);
+ ADDI(&fp, 5, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 6, 0);
+ ADDI(&fp, 6, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ ADDI(&fp, 2, 9, 0);
+ ADDI(&fp, 9, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+ *fp++ = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
+ MOVI(&fp, 11, p->i1);
+ memcpy(fp, neon_ee, neon_oo - neon_ee);
+ fp += (neon_oo - neon_ee) / 4;
+
+ }
*fp++ = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); // load offsets into r12
//ADDI(&fp, 2, 1, 0);
- *fp++ = MOVI(1, 0);
+ MOVI(&fp, 1, 0);
// args: r0 - out
// r1 - N
@@ -246,7 +295,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN)
// fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
if(!pN) {
- *fp++ = MOVI(1, pps[0]);
+ MOVI(&fp, 1, pps[0]);
}else{
if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
@@ -282,7 +331,6 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN)
// fprintf(stderr, "%08x\n", x_4_addr[i]);
//fprintf(stderr, "\n");
//for(int i=0;i<count;i++)
-// fprintf(stderr, "%08x\n", start[i]);
free(ps);
diff --git a/src/cp_sse.c b/src/cp_sse.c
index f36f90b..d32155f 100644
--- a/src/cp_sse.c
+++ b/src/cp_sse.c
@@ -2,74 +2,6 @@
#include "macros.h"
#include "patterns.h"
-__INLINE void
-firstpass_type_1(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) {
- size_t i, ii0 = p->i0, ii1 = p->i1;
- size_t *offsets = (size_t *)p->offsets;
- size_t *is = (size_t *)p->is;
-#ifdef __ARM_NEON__
- const data_t *i0=in+is[0],*i1=in+is[1],*i2=in+is[2],*i3=in+is[3],*i4=in+is[4],*i5=in+is[5],*i6=in+is[6],*i7=in+is[7];
- for(i=ii0;i>0;--i) {
- neon_shl8_ee(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7);
- offsets += 2;
- }
- for(i=ii1;i>0;--i) {
- neon_shl8_oo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5);
- offsets += 2;
- }
- neon_shl8_oe(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5);
- offsets += 2;
- for(i=ii1;i>0;--i) {
- neon_shl8_ee(out+offsets[0],out+offsets[1],&i6,&i7,&i4,&i5,&i0,&i1,&i3,&i2);
- offsets += 2;
- }
-
-#else
- for(i=ii0;i>0;--i) LEAF_EE(&is, in, &offsets, out);
- for(i=ii1;i>0;--i) LEAF_OO(&is, in, &offsets, out);
- LEAF_OE(&is, in, &offsets, out);
- for(i=ii1;i>0;--i) LEAF_EE(&is, in, &offsets, out);
-#endif
-}
-
-__INLINE void
-firstpass_type_2(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) {
- size_t i, ii0 = p->i0, ii1 = p->i1;
- size_t *offsets = (size_t *)p->offsets;
- size_t *is = (size_t *)p->is;
-#ifdef __ARM_NEON__
- const data_t *i0=in+is[0],*i1=in+is[1],*i2=in+is[2],*i3=in+is[3],*i4=in+is[4],*i5=in+is[5],*i6=in+is[6],*i7=in+is[7];
-
- for(i=ii0;i>0;--i) {
- neon_shl8_ee(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7);
- offsets+=2;
- }
- neon_shl8_eo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7);
- offsets += 2;
- for(i=ii1;i>0;--i) {
- neon_shl8_oo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5);
- offsets += 2;
- }
- for(i=ii1;i>0;--i) {
- neon_shl8_ee(out+offsets[0],out+offsets[1],&i6,&i7,&i4,&i5,&i0,&i1,&i3,&i2);
- offsets += 2;
- }
-
-#else
- for(i=ii0;i>0;--i) LEAF_EE(&is, in, &offsets, out);
- LEAF_EO(&is, in, &offsets, out);
- for(i=ii1;i>0;--i) LEAF_OO(&is, in, &offsets, out);
- for(i=ii1;i>0;--i) LEAF_EE(&is, in, &offsets, out);
-#endif
-}
-
-__INLINE void
-firstpass_64(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) {
- size_t *offsets = (size_t *)p->offsets;
- size_t *is = (size_t *)p->is;
- LEAF_EE(&is, in, &offsets, out);
- LEAF_OE(&is, in, &offsets, out);
-}
void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out) {
transform_index_t *ps = p->transforms;
@@ -119,8 +51,6 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
// ffts_init_tree(p, N, leafN);
// if(N == 64) p->firstpass = &firstpass_64;
- if(__builtin_ctzl(N) & 1) p->firstpass = &firstpass_type_1;
- else p->firstpass = &firstpass_type_2;
LEAFLUT[0] = VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941);
LEAFLUT[1] = VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376);
@@ -157,12 +87,12 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
p->transforms = malloc(2 * sizeof(transform_index_t));
p->transforms[0] = 0;
p->transforms[1] = 1;
- if(N == 2) p->firstpass = &firstpass_2;
- else if(N == 4 && sign == -1) p->firstpass = &firstpass_4_f;
- else if(N == 4 && sign == 1) p->firstpass = &firstpass_4_b;
- else if(N == 8) p->firstpass = &firstpass_8;
- else if(N == 16) p->firstpass = &firstpass_16;
- else if(N == 32) p->firstpass = &firstpass_32;
+// if(N == 2) p->firstpass = &firstpass_2;
+// else if(N == 4 && sign == -1) p->firstpass = &firstpass_4_f;
+// else if(N == 4 && sign == 1) p->firstpass = &firstpass_4_b;
+// else if(N == 8) p->firstpass = &firstpass_8;
+// else if(N == 16) p->firstpass = &firstpass_16;
+// else if(N == 32) p->firstpass = &firstpass_32;
p->is = NULL;
p->offsets = NULL;
@@ -362,7 +292,8 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
// tmp[4], tmp[5], tmp[6], tmp[7]);
// tmp += 8;
//}
-
+
+ fprintf(stderr, "p0 %d p1 %d\n", p->i0, p->i1);
p->N = N;
p->lastlut = w;
p->n_luts = n_luts;
diff --git a/src/cp_sse.h b/src/cp_sse.h
index 7729eb8..0136173 100644
--- a/src/cp_sse.h
+++ b/src/cp_sse.h
@@ -27,7 +27,6 @@ struct _ffts_plan_t {
void __attribute__ ((aligned(32))) *oe_ws, *eo_ws, *ee_ws;
ptrdiff_t *is;
size_t *ws_is;
- void (*firstpass)(const float * restrict, float * restrict, struct _ffts_plan_t * restrict);
size_t i0, i1, n_luts;
size_t N;
void *lastlut;
diff --git a/src/neon.s b/src/neon.s
index b998c38..e9e300f 100644
--- a/src/neon.s
+++ b/src/neon.s
@@ -266,10 +266,6 @@ neon_x8_t_loop:
_neon_ee:
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_loop:
- ldr r2, [r12], #4
- ldr lr, [r12], #4
- add r2, r0, r2, lsl #2
- add lr, r0, lr, lsl #2
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
@@ -277,9 +273,9 @@ _neon_ee_loop:
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
- subs r11, r11, #1
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
+ subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
@@ -318,8 +314,12 @@ _neon_ee_loop:
vadd.f32 d13, d19, d14
vsub.f32 d12, d18, d15
vadd.f32 d15, d31, d26
+ ldr r2, [r12], #4
+ ldr lr, [r12], #4
vtrn.32 q1, q3
vtrn.32 q0, q2
+ add r2, r0, r2, lsl #2
+ add lr, r0, lr, lsl #2
vsub.f32 q4, q11, q10
vsub.f32 q5, q14, q5
vsub.f32 d14, d30, d27
@@ -329,6 +329,7 @@ _neon_ee_loop:
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_ee_loop
@ assumes r0 = out
@
@@ -339,6 +340,8 @@ _neon_ee_loop:
.globl _neon_oo
.align 4
_neon_oo:
+
+_neon_oo_loop:
vld2.32 {q8}, [r6, :128]!
vld2.32 {q9}, [r5, :128]!
vld2.32 {q10}, [r4, :128]!
@@ -347,6 +350,7 @@ _neon_oo:
vsub.f32 q8, q9, q8
vsub.f32 q9, q13, q10
vadd.f32 q12, q13, q10
+ subs r11, r11, #1
vld2.32 {q10}, [r7, :128]!
vld2.32 {q13}, [r9, :128]!
vsub.f32 q2, q12, q11
@@ -364,8 +368,8 @@ _neon_oo:
vsub.f32 q6, q12, q11
vadd.f32 q4, q12, q11
vtrn.32 q0, q2
- ldr r2, [r12]!
- ldr lr, [r12]!
+ ldr r2, [r12], #4
+ ldr lr, [r12], #4
vadd.f32 d15, d19, d16
vsub.f32 d11, d19, d16
vsub.f32 d14, d18, d17
@@ -379,6 +383,7 @@ _neon_oo:
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
+ bne _neon_oo_loop
@ assumes r0 = out
@
@@ -404,15 +409,15 @@ _neon_eo:
vsub.f32 q8, q12, q8
vadd.f32 d8, d22, d21
vsub.f32 d10, d22, d21
- ldr r2, [r12]!
- ldr lr, [r12]!
+ ldr r2, [r12], #4
+ ldr lr, [r12], #4
vld1.32 {d20, d21}, [r11, :128]
vtrn.32 q9, q4
- vtrn.32 q8, q5
- vswp d9,d10
add r2, r0, r2, lsl #2
add lr, r0, lr, lsl #2
- vst1.32 {d8,d9,d10,d11}, [r2, :128]!
+ vtrn.32 q8, q5
+ vswp d9,d10
+ vst1.32 {d8,d9,d10,d11}, [lr, :128]!
vld2.32 {q13}, [r10, :128]! @tag7
vld2.32 {q15}, [r9, :128]! @tag6
vld2.32 {q11}, [r8, :128]! @tag5
@@ -429,7 +434,7 @@ _neon_eo:
vsub.f32 q15, q13, q11
vtrn.32 q15, q7
vswp d13, d14
- vst1.32 {d12,d13,d14,d15}, [r2, :128]!
+ vst1.32 {d12,d13,d14,d15}, [lr, :128]!
vtrn.32 q13, q14
vtrn.32 q11, q12
vmul.f32 d24, d26, d21
@@ -454,7 +459,7 @@ _neon_eo:
vsub.f32 d6, d16, d21
vswp d1, d2
vswp d5, d6
- vstmia lr!, {q0-q3}
+ vstmia r2!, {q0-q3}
@ assumes r0 = out
diff --git a/src/neon_float.h b/src/neon_float.h
index 41c9ecf..d793691 100644
--- a/src/neon_float.h
+++ b/src/neon_float.h
@@ -46,19 +46,21 @@ static inline V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) {
#define FFTS_MALLOC(d,a) (valloc(d))
#define FFTS_FREE(d) (free(d))
__INLINE void FMA(V *Rd, V Rn, V Rm) {
- __asm__ ("vmla.f32 %q0,%q1,%q2\n\t"
- : "+w" (*Rd)
- : "w" (Rn), "w" (Rm)
- //: "0"
- );
+ *Rd = vmlaq_f32(*Rd, Rn, Rm);
+// __asm__ ("vmla.f32 %q0,%q1,%q2\n\t"
+// : "+w" (*Rd)
+// : "w" (Rn), "w" (Rm)
+// //: "0"
+// );
}
__INLINE void FMS(V *Rd, V Rn, V Rm) {
- __asm__ ("vmls.f32 %q0,%q1,%q2\n\t"
- : "+w" (*Rd)
- : "w" (Rn), "w" (Rm)
- // : "0"
- );
+ *Rd = vmlsq_f32(*Rd, Rn, Rm);
+// __asm__ ("vmls.f32 %q0,%q1,%q2\n\t"
+// : "+w" (*Rd)
+// : "w" (Rn), "w" (Rm)
+// // : "0"
+// );
}
__INLINE VS VSMUL(VS *d, VS *w) {
OpenPOWER on IntegriCloud