author    Anthony Blake <anthonix@me.com>    2012-08-20 15:26:47 +1200
committer Anthony Blake <anthonix@me.com>    2012-08-20 15:26:47 +1200
commit    7877c36f6fc90bb88c8b81a05e66c0f48bf3ca02 (patch)
tree      1d36e795f6fce05e40fa00b380c6f7688b8cc5ba
parent    81023395c68710f8732a22ac46b511bce64e012b (diff)
download  ffts-7877c36f6fc90bb88c8b81a05e66c0f48bf3ca02.zip
          ffts-7877c36f6fc90bb88c8b81a05e66c0f48bf3ca02.tar.gz
Full custom FFT32 works
-rw-r--r--  include/ffts.h     18
-rw-r--r--  src/codegen.c     158
-rw-r--r--  src/cp_sse.c       35
-rw-r--r--  src/cp_sse.h        9
-rw-r--r--  src/neon.s        304
-rw-r--r--  src/neon_float.h  124
-rw-r--r--  src/patterns.c      6
7 files changed, 545 insertions, 109 deletions
diff --git a/include/ffts.h b/include/ffts.h
index 9bd0dbe..ba3d858 100644
--- a/include/ffts.h
+++ b/include/ffts.h
@@ -41,18 +41,24 @@
typedef size_t transform_index_t;
+
struct _ffts_plan_t {
- ptrdiff_t *is;
ptrdiff_t *offsets;
- void __attribute__ ((aligned(32))) **ws;
+ void __attribute__ ((aligned(32))) *ws;
+ void __attribute__ ((aligned(32))) *other_ws;
+ ptrdiff_t *is;
+ size_t *ws_is;
void (*firstpass)(const float * restrict, float * restrict, struct _ffts_plan_t * restrict);
- size_t i0, i1, i2;
- uint64_t n_bits, leaftime;
-
+ size_t i0, i1, n_luts;
+ size_t N;
+ void *lastlut;
transform_index_t *transforms;
+ //transform_func_t transform;
+ void (*transform)(struct _ffts_plan_t * restrict, const float * restrict, float * restrict);
+ void *transform_base;
+ size_t transform_size;
};
-
typedef struct _ffts_plan_t ffts_plan_t;
void ffts_execute(ffts_plan_t * restrict, const void * restrict, const void * restrict);
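With this patch the plan carries the generated code directly: transform is now a function pointer taking the plan itself plus input and output buffers, and transform_base / transform_size track the executable allocation behind it. A minimal usage sketch of the reworked execute path (assuming a plan built by ffts_init, as in the rest of the tree):

#include "ffts.h"

/* Sketch only: ffts_execute() now forwards straight to the generated
 * code via p->transform(p, in, out); see the src/cp_sse.c hunk below. */
void run_transform(ffts_plan_t *p, const float *in, float *out) {
    ffts_execute(p, in, out);
}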
diff --git a/src/codegen.c b/src/codegen.c
index 8593f12..72daf89 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -2,6 +2,7 @@
#include "macros.h"
#include "neon_float.h"
#include "neon.h"
+#include <libkern/OSCacheControl.h>
int tree_count(int N, int leafN, int offset) {
@@ -61,33 +62,45 @@ uint32_t MOV(uint8_t dst, uint8_t src) {
return 0xe1a00000 | (src & 0xf) | ((dst & 0xf) << 12);
}
-uint32_t ADDI(uint8_t dst, uint8_t src, int32_t imm) {
+void ADDI(uint32_t **p, uint8_t dst, uint8_t src, int32_t imm) {
+ int32_t oimm = imm;
if(imm < 0) {
imm = -imm;
uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm);
if(shamt & 1) shamt -= 1;
imm >>= shamt;
shamt = (32 - shamt)/2;
- return 0xe2400000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
+
+ // if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm);
+ *(*p)++ = 0xe2400000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
+
+ if(imm > 255) ADDI(p, dst, src, (oimm + ((imm & 0xff) << (32-shamt*2))));
+
+ }else{
+ uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm);
+ if(shamt & 1) shamt -= 1;
+ imm >>= shamt;
+ shamt = (32 - shamt)/2;
+
+// if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm);
+ *(*p)++ = 0xe2800000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
+
+ if(imm > 255) ADDI(p, dst, src, (oimm + ((imm & 0xff) << (32-shamt*2))));
}
- uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm);
- fprintf(stderr, "pre ADDI shamt:%d imm:%d\n", shamt, imm);
- if(shamt & 1) shamt -= 1;
- fprintf(stderr, "ADDI shamt:%d imm:%d\n", shamt, imm);
- imm >>= shamt;
- shamt = (32 - shamt)/2;
-
-
- return 0xe2800000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
+}
+
+uint32_t LDRI(uint8_t dst, uint8_t base, uint32_t offset) {
+ return 0xe5900000 | ((dst & 0xf) << 12)
+ | ((base & 0xf) << 16) | (offset & 0xfff) ;
}
uint32_t MOVI(uint8_t dst, uint16_t val) {
return 0xe3a00000 | ((dst & 0xf) << 12) | (val & 0xffff) ;
}
-uint32_t PUSH_LR() { return 0xe92d4000; }
-uint32_t POP_LR() { return 0xe8bd8000; }
+uint32_t PUSH_LR() { return 0xe92d4ff0; } //0xe92d4000; }
+uint32_t POP_LR() { return 0xe8bd8ff0; } //0xe8bd8000; }
uint32_t LUT_offset(size_t N, size_t leafN) {
int i;
@@ -131,52 +144,125 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN)
pps = ps;
- uint32_t *func = valloc(8192);
+ if(N < 8192) p->transform_size = 8192;
+ else p->transform_size = N;
+
+ p->transform_base = valloc(p->transform_size);//(void *)func;
+ uint32_t *func = p->transform_base;//valloc(8192);
uint32_t *fp = func;
- //p->transform_base = func;
+ if(!func) {
+ fprintf(stderr, "NOMEM\n");
+ exit(1);
+ }
uint32_t *x_8_addr = fp;
memcpy(fp, neon_x8, neon_x8_t - neon_x8);
fp += (neon_x8_t - neon_x8) / 4;
- uint32_t *x_8_t_addr = fp;
- memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
- fp += (neon_end - neon_x8_t) / 4;
+//uint32_t *x_8_t_addr = fp;
+//memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
+//fp += (neon_end - neon_x8_t) / 4;
uint32_t *x_4_addr = fp;
memcpy(fp, neon_x4, neon_x8 - neon_x4);
fp += (neon_x8 - neon_x4) / 4;
uint32_t *start = fp;
- fprintf(stderr, "X_4: %08x START: %08x\n", x_4_addr, start);
- fprintf(stderr, "X_8: %08x\n", x_8_addr, start);
- fprintf(stderr, "X_8_T: %08x\n", x_8_t_addr, start);
+//fprintf(stderr, "X_4: %08x START: %08x\n", x_4_addr, start);
+//fprintf(stderr, "X_8: %08x\n", x_8_addr, start);
+//fprintf(stderr, "X_8_T: %08x\n", x_8_t_addr, start);
+
fprintf(stderr, "LUT: %08x\n", p->ws);
+ fprintf(stderr, "offsets: %08x\n", p->offsets);
*fp++ = PUSH_LR();
// *fp++ = MOV(2, 1);
// *fp++ = BL(fp+2, start);
+
+
+
+
+//ADDI(0, 1, 0); // mov r1 -> r0
+//ADDI(1, 2, 0); // mov r2 -> r1
+
+ ADDI(&fp, 3, 1, 0);
+ ADDI(&fp, 7, 1, N);
+ ADDI(&fp, 5, 1, 2*N);
+ ADDI(&fp, 10, 7, 2*N);
+ ADDI(&fp, 4, 5, 2*N);
+ ADDI(&fp, 8, 10, 2*N);
+ ADDI(&fp, 6, 4, 2*N);
+ ADDI(&fp, 9, 8, 2*N);
+
+ *fp++ = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); // load offsets into r12
+// *fp++ = LDRI(1, 0, 4); // load ws into r1
+ ADDI(&fp, 1, 0, 0);
+
+ ADDI(&fp, 0, 2, 0), // mov out into r0
+
+ p->oe_ws = oe_w_data;
+ p->ee_ws = ee_w_data;
+ p->eo_ws = eo_w_data;
+
+ fprintf(stderr, "p = %08x\n", p);
+
+
+ fprintf(stderr, "start of ee %08x\n", fp);
+ *fp++ = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
+
+ memcpy(fp, neon_ee, neon_oo - neon_ee);
+ fp += (neon_oo - neon_ee) / 4;
+
+
+ ADDI(&fp, 2, 7, 0);
+ ADDI(&fp, 7, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 8, 0);
+ ADDI(&fp, 8, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ *fp++ = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p));
+
+ fprintf(stderr, "start of oe %08x\n", fp);
+ memcpy(fp, neon_oe, neon_end - neon_oe);
+ fp += (neon_end - neon_oe) / 4;
+
+
+ *fp++ = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); // load ws into r2
+ //ADDI(&fp, 2, 1, 0);
+ *fp++ = MOVI(1, 0);
+
+ // args: r0 - out
+ // r1 - N
+ // r2 - ws
+// ADDI(&fp, 3, 1, 0); // put N into r3 for counter
+
int32_t pAddr = 0;
int32_t pN = 0;
int32_t pLUT = 0;
count = 2;
while(pps[0]) {
- fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
+// fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
if(!pN) {
*fp++ = MOVI(1, pps[0]);
}else{
- *fp++ = ADDI(0, 0, (pps[1] * 4)- pAddr);
- *fp++ = ADDI(1, 1, pps[0] - pN);
+ if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
+ if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
}
- //*fp++ = ADDI(2, 2, LUT_offset(pps[0], leafN) - pLUT);
- *fp++ = ADDI(2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT);
+
+ if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
+ ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT);
if(pps[0] == 2*leafN) {
*fp++ = BL(fp+2, x_4_addr);
}else if(!pps[2]){
- *fp++ = BL(fp+2, x_8_t_addr);
+ //uint32_t *x_8_t_addr = fp;
+ memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
+ fp += (neon_ee - neon_x8_t) / 4;
+ //*fp++ = BL(fp+2, x_8_t_addr);
}else{
*fp++ = BL(fp+2, x_8_addr);
}
@@ -184,7 +270,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN)
pAddr = pps[1] * 4;
pN = pps[0];
pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
- fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
+// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
count += 4;
pps += 2;
}
@@ -192,19 +278,23 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN)
*fp++ = POP_LR(); count++;
// *fp++ = B(14); count++;
- for(int i=0;i<(neon_x8 - neon_x4)/4;i++)
- fprintf(stderr, "%08x\n", x_4_addr[i]);
- fprintf(stderr, "\n");
- for(int i=0;i<count;i++)
- fprintf(stderr, "%08x\n", start[i]);
+//for(int i=0;i<(neon_x8 - neon_x4)/4;i++)
+// fprintf(stderr, "%08x\n", x_4_addr[i]);
+//fprintf(stderr, "\n");
+//for(int i=0;i<count;i++)
+// fprintf(stderr, "%08x\n", start[i]);
free(ps);
- if (mprotect(func, 8192, PROT_READ | PROT_EXEC)) {
+ if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) {
perror("Couldn't mprotect");
return NULL;
}
+ sys_icache_invalidate(func, p->transform_size);
+
+
+ fprintf(stderr, "size of transform = %d\n", (fp-func)*4);
return (transform_func_t)start;
}
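The most intricate change above is ADDI: it no longer returns a single instruction word but emits through the stream pointer, and because an ARM data-processing immediate is just an 8-bit value rotated right by an even amount, it splits immediates that do not fit into chunks and recurses on the remainder. A standalone model of that encoding arithmetic (illustrative code, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Standalone model of ARM's rotated-immediate encoding as ADDI uses it:
 * the instruction carries an 8-bit value plus a 4-bit field giving a
 * right-rotation of 2*rot bits. encode_chunk() mirrors the patch: align
 * the chunk on an even bit position, keep the low 8 bits, and leave the
 * caller to recurse on whatever remains. */
static uint32_t ror32(uint32_t x, unsigned r) {
    r &= 31;
    return r ? (x >> r) | (x << (32 - r)) : x;
}

static uint32_t encode_chunk(uint32_t imm, uint32_t *rot) {
    uint32_t shamt = (__builtin_ctz(imm) > 15) ? 15 : __builtin_ctz(imm);
    if (shamt & 1) shamt -= 1;        /* rotations come in even steps */
    *rot = ((32 - shamt) / 2) & 0xf;  /* 4-bit rotate field */
    return (imm >> shamt) & 0xff;     /* 8-bit immediate field */
}

int main(void) {
    uint32_t rot;
    uint32_t imm8 = encode_chunk(0x1100, &rot);
    /* decoding rotates the 8-bit field back: prints 0x11 / 12 / 0x1100 */
    printf("imm8=%#x rot=%u -> %#x\n", (unsigned)imm8, (unsigned)rot,
           (unsigned)ror32(imm8, 2 * rot));
    return 0;
}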
diff --git a/src/cp_sse.c b/src/cp_sse.c
index 1356c7b..f36f90b 100644
--- a/src/cp_sse.c
+++ b/src/cp_sse.c
@@ -73,8 +73,9 @@ firstpass_64(const float * restrict in, float * restrict out, ffts_plan_t * rest
void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out) {
transform_index_t *ps = p->transforms;
- p->firstpass((const float *)in, (float *)out, p);
- if(p->transform) p->transform(out, p->N, p->ws);
+ //p->firstpass((const float *)in, (float *)out, p);
+ p->transform(p, (const float *)in, (float *)out);
+ //if(p->transform) p->transform(out, p->N, p->ws);
}
void ffts_free(ffts_plan_t *p) {
@@ -88,8 +89,13 @@ void ffts_free(ffts_plan_t *p) {
if(p->offsets) free(p->offsets);
//free(p->transforms);
-// if(p->transform_base) free(p->transform_base);
-
+ if(p->transform_base) {
+ if (mprotect(p->transform_base, p->transform_size, PROT_READ | PROT_WRITE)) {
+ perror("Couldn't mprotect");
+ exit(errno);
+ }
+ free(p->transform_base);
+ }
free(p);
}
@@ -197,7 +203,10 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
}
n *= 2;
}
-
+
+// lut_size *= 16;
+
+ // fprintf(stderr, "lut size = %zu\n", lut_size);
if(n_luts) {
p->ws = FFTS_MALLOC(lut_size,32);
p->ws_is = malloc(n_luts * sizeof(size_t));
@@ -213,7 +222,7 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
for(i=0;i<n_luts;i++) {
p->ws_is[i] = w - (cdata_t *)p->ws;
- fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]);
+ //fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]);
if(!i || hardcoded) {
cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
@@ -346,13 +355,13 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
}
float *tmp = (float *)p->ws;
- for(i=0;i<lut_size*2;i+=8) {
- fprintf(stderr, "%08x %f %f %f %f - %f %f %f %f\n",
- tmp,
- tmp[0], tmp[1], tmp[2], tmp[3],
- tmp[4], tmp[5], tmp[6], tmp[7]);
- tmp += 8;
- }
+//for(i=0;i<lut_size*2;i+=8) {
+// fprintf(stderr, "%08x %f %f %f %f - %f %f %f %f\n",
+// tmp,
+// tmp[0], tmp[1], tmp[2], tmp[3],
+// tmp[4], tmp[5], tmp[6], tmp[7]);
+// tmp += 8;
+//}
p->N = N;
p->lastlut = w;
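The matching ffts_free() change is the other half of the JIT lifecycle: codegen.c seals the generated buffer with mprotect(PROT_READ | PROT_EXEC) and sys_icache_invalidate(), so the pages must be made writable again before the allocator touches them in free(). A condensed sketch of that lifecycle (assuming the same Darwin-only cache call codegen.c now includes):

#include <stdlib.h>
#include <sys/mman.h>
#include <libkern/OSCacheControl.h>   /* Darwin-only, as in codegen.c */

/* Condensed model of the generated-code lifecycle this patch sets up:
 * emit into a page-aligned valloc() buffer, seal it read+execute,
 * flush the instruction cache, and unseal before freeing. */
static void *seal(void *buf, size_t size) {
    if (mprotect(buf, size, PROT_READ | PROT_EXEC)) return NULL;
    sys_icache_invalidate(buf, size);  /* no stale I-cache lines */
    return buf;
}

static void unseal_and_free(void *buf, size_t size) {
    /* the allocator needs write access to its own bookkeeping here */
    if (!mprotect(buf, size, PROT_READ | PROT_WRITE)) free(buf);
}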
diff --git a/src/cp_sse.h b/src/cp_sse.h
index 6f793d8..7729eb8 100644
--- a/src/cp_sse.h
+++ b/src/cp_sse.h
@@ -18,19 +18,24 @@ typedef alignas(16) float data_t;
typedef size_t transform_index_t;
+//typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
struct _ffts_plan_t {
- ptrdiff_t *is;
ptrdiff_t *offsets;
void __attribute__ ((aligned(32))) *ws;
+ void __attribute__ ((aligned(32))) *oe_ws, *eo_ws, *ee_ws;
+ ptrdiff_t *is;
size_t *ws_is;
void (*firstpass)(const float * restrict, float * restrict, struct _ffts_plan_t * restrict);
size_t i0, i1, n_luts;
size_t N;
void *lastlut;
transform_index_t *transforms;
- transform_func_t transform, transform_base;
+ //transform_func_t transform;
+ void (*transform)(struct _ffts_plan_t * restrict, const float * restrict, float * restrict);
+ void *transform_base;
+ size_t transform_size;
};
typedef struct _ffts_plan_t ffts_plan_t;
diff --git a/src/neon.s b/src/neon.s
index bbcd0c7..b998c38 100644
--- a/src/neon.s
+++ b/src/neon.s
@@ -1,8 +1,7 @@
.globl _neon_x4
- .align 2
+ .align 4
_neon_x4:
- push {r4,r5,r6}
add r3, r0, #0
add r4, r3, r1, lsl #1
add r5, r3, r1, lsl #2
@@ -40,13 +39,11 @@ _neon_x4:
vst1.32 {q2,q3}, [r4, :128]
vst1.32 {q4,q5}, [r5, :128]
vst1.32 {q6,q7}, [r6, :128]
- pop {r4,r5,r6}
bx lr
.globl _neon_x8
- .align 2
+ .align 4
_neon_x8:
- push {r4,r5,r6,r7,r8,r9,r10,r11}
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
@@ -59,7 +56,7 @@ _neon_x8:
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #5
-
+ nop
neon_x8_loop:
vld1.32 {q10,q11}, [r5, :128]
vld1.32 {q12,q13}, [r4, :128]
@@ -149,14 +146,11 @@ neon_x8_loop:
vst1.32 {q6,q7}, [r10, :128]!
bne neon_x8_loop
- pop {r4,r5,r6,r7,r8,r9,r10,r11}
bx lr
.globl _neon_x8_t
- .align 2
+ .align 4
_neon_x8_t:
- push {r4,r5,r6,r7,r8,r9,r10,r11}
-
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
@@ -169,6 +163,7 @@ _neon_x8_t:
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #5
+ nop
neon_x8_t_loop:
vld1.32 {q10,q11}, [r5, :128]
vld1.32 {q12,q13}, [r4, :128]
@@ -258,10 +253,293 @@ neon_x8_t_loop:
vst2.32 {q6,q7}, [r10, :128]!
bne neon_x8_t_loop
- pop {r4,r5,r6,r7,r8,r9,r10,r11}
- bx lr
+ @bx lr
+
+@ assumes r0 = out
+@
+@ r12 = offsets
+@ r3-r10 = data pointers
+@ r11 = loop iterations
+@ r2 & lr = temps
+ .globl _neon_ee
+ .align 4
+_neon_ee:
+ vld1.32 {d16, d17}, [r2, :128]
+_neon_ee_loop:
+ ldr r2, [r12], #4
+ ldr lr, [r12], #4
+ add r2, r0, r2, lsl #2
+ add lr, r0, lr, lsl #2
+ vld2.32 {q15}, [r10, :128]!
+ vld2.32 {q13}, [r8, :128]!
+ vld2.32 {q14}, [r7, :128]!
+ vld2.32 {q9}, [r4, :128]!
+ vld2.32 {q10}, [r3, :128]!
+ vld2.32 {q11}, [r6, :128]!
+ vld2.32 {q12}, [r5, :128]!
+ subs r11, r11, #1
+ vsub.f32 q1, q14, q13
+ vld2.32 {q0}, [r9, :128]!
+ vsub.f32 q2, q0, q15
+ vadd.f32 q0, q0, q15
+ vmul.f32 d10, d2, d17
+ vmul.f32 d11, d3, d16
+ vmul.f32 d12, d3, d17
+ vmul.f32 d6, d4, d17
+ vmul.f32 d7, d5, d16
+ vmul.f32 d8, d4, d16
+ vmul.f32 d9, d5, d17
+ vmul.f32 d13, d2, d16
+ vsub.f32 d7, d7, d6
+ vadd.f32 d11, d11, d10
+ vsub.f32 q1, q12, q11
+ vsub.f32 q2, q10, q9
+ vadd.f32 d6, d9, d8
+ vadd.f32 q4, q14, q13
+ vadd.f32 q11, q12, q11
+ vadd.f32 q12, q10, q9
+ vsub.f32 d10, d13, d12
+ vsub.f32 q7, q4, q0
+ vsub.f32 q9, q12, q11
+ vsub.f32 q13, q5, q3
+ vsub.f32 d29, d5, d2
+ vadd.f32 q5, q5, q3
+ vadd.f32 q10, q4, q0
+ vadd.f32 q11, q12, q11
+ vadd.f32 d31, d5, d2
+ vadd.f32 d28, d4, d3
+ vsub.f32 d30, d4, d3
+ vsub.f32 d5, d19, d14
+ vsub.f32 d7, d31, d26
+ vadd.f32 q1, q14, q5
+ vadd.f32 q0, q11, q10
+ vadd.f32 d6, d30, d27
+ vadd.f32 d4, d18, d15
+ vadd.f32 d13, d19, d14
+ vsub.f32 d12, d18, d15
+ vadd.f32 d15, d31, d26
+ vtrn.32 q1, q3
+ vtrn.32 q0, q2
+ vsub.f32 q4, q11, q10
+ vsub.f32 q5, q14, q5
+ vsub.f32 d14, d30, d27
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+
+@ assumes r0 = out
+@
+@ r12 = offsets
+@ r3-r10 = data pointers
+@ r11 = loop iterations
+@ r2 & lr = temps
+ .globl _neon_oo
+ .align 4
+_neon_oo:
+ vld2.32 {q8}, [r6, :128]!
+ vld2.32 {q9}, [r5, :128]!
+ vld2.32 {q10}, [r4, :128]!
+ vld2.32 {q13}, [r3, :128]!
+ vadd.f32 q11, q9, q8
+ vsub.f32 q8, q9, q8
+ vsub.f32 q9, q13, q10
+ vadd.f32 q12, q13, q10
+ vld2.32 {q10}, [r7, :128]!
+ vld2.32 {q13}, [r9, :128]!
+ vsub.f32 q2, q12, q11
+ vadd.f32 d7, d19, d16
+ vsub.f32 d3, d19, d16
+ vsub.f32 d6, d18, d17
+ vadd.f32 d2, d18, d17
+ vld2.32 {q9}, [r8, :128]!
+ vld2.32 {q8}, [r10, :128]!
+ vadd.f32 q0, q12, q11
+ vadd.f32 q11, q13, q8
+ vadd.f32 q12, q10, q9
+ vsub.f32 q8, q13, q8
+ vsub.f32 q9, q10, q9
+ vsub.f32 q6, q12, q11
+ vadd.f32 q4, q12, q11
+ vtrn.32 q0, q2
+ ldr r2, [r12]!
+ ldr lr, [r12]!
+ vadd.f32 d15, d19, d16
+ vsub.f32 d11, d19, d16
+ vsub.f32 d14, d18, d17
+ vadd.f32 d10, d18, d17
+ add r2, r0, r2, lsl #2
+ add lr, r0, lr, lsl #2
+ vtrn.32 q1, q3
+ vst2.32 {q0,q1}, [r2, :128]!
+ vst2.32 {q2,q3}, [lr, :128]!
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vst2.32 {q4,q5}, [r2, :128]!
+ vst2.32 {q6,q7}, [lr, :128]!
+
+@ assumes r0 = out
+@
+@ r12 = offsets
+@ r3-r10 = data pointers
+@ r11 = addr of twiddle
+@ r2 & lr = temps
+ .globl _neon_eo
+ .align 4
+_neon_eo:
+ vld2.32 {q9}, [r5, :128]! @tag2
+ vld2.32 {q13}, [r3, :128]! @tag0
+ vld2.32 {q12}, [r4, :128]! @tag1
+ vld2.32 {q0}, [r7, :128]! @tag4
+ vsub.f32 q11, q13, q12
+ vld2.32 {q8}, [r6, :128]! @tag3
+ vadd.f32 q12, q13, q12
+ vsub.f32 q10, q9, q8
+ vadd.f32 q8, q9, q8
+ vadd.f32 q9, q12, q8
+ vsub.f32 d9, d23, d20
+ vadd.f32 d11, d23, d20
+ vsub.f32 q8, q12, q8
+ vadd.f32 d8, d22, d21
+ vsub.f32 d10, d22, d21
+ ldr r2, [r12]!
+ ldr lr, [r12]!
+ vld1.32 {d20, d21}, [r11, :128]
+ vtrn.32 q9, q4
+ vtrn.32 q8, q5
+ vswp d9,d10
+ add r2, r0, r2, lsl #2
+ add lr, r0, lr, lsl #2
+ vst1.32 {d8,d9,d10,d11}, [r2, :128]!
+ vld2.32 {q13}, [r10, :128]! @tag7
+ vld2.32 {q15}, [r9, :128]! @tag6
+ vld2.32 {q11}, [r8, :128]! @tag5
+ vsub.f32 q14, q15, q13
+ vsub.f32 q12, q0, q11
+ vadd.f32 q11, q0, q11
+ vadd.f32 q13, q15, q13
+ vsub.f32 d13, d29, d24
+ vadd.f32 q15, q13, q11
+ vadd.f32 d12, d28, d25
+ vadd.f32 d15, d29, d24
+ vsub.f32 d14, d28, d25
+ vtrn.32 q15, q6
+ vsub.f32 q15, q13, q11
+ vtrn.32 q15, q7
+ vswp d13, d14
+ vst1.32 {d12,d13,d14,d15}, [r2, :128]!
+ vtrn.32 q13, q14
+ vtrn.32 q11, q12
+ vmul.f32 d24, d26, d21
+ vmul.f32 d28, d27, d20
+ vmul.f32 d25, d26, d20
+ vmul.f32 d26, d27, d21
+ vmul.f32 d27, d22, d21
+ vmul.f32 d30, d23, d20
+ vmul.f32 d29, d23, d21
+ vmul.f32 d22, d22, d20
+ vsub.f32 d21, d28, d24
+ vadd.f32 d20, d26, d25
+ vadd.f32 d25, d30, d27
+ vsub.f32 d24, d22, d29
+ vadd.f32 q11, q12, q10
+ vsub.f32 q10, q12, q10
+ vadd.f32 q0, q9, q11
+ vsub.f32 q2, q9, q11
+ vsub.f32 d3, d17, d20
+ vadd.f32 d7, d17, d20
+ vadd.f32 d2, d16, d21
+ vsub.f32 d6, d16, d21
+ vswp d1, d2
+ vswp d5, d6
+ vstmia lr!, {q0-q3}
+
+
+@ assumes r0 = out
+@
+@ r12 = offsets
+@ r3-r10 = data pointers
+@ r11 = addr of twiddle
+@ r2 & lr = temps
+ .globl _neon_oe
+ .align 4
+_neon_oe:
+ vld1.32 {q8}, [r5, :128]!
+ vld1.32 {q10}, [r6, :128]!
+ vld2.32 {q11}, [r4, :128]!
+ vld2.32 {q13}, [r3, :128]!
+ vld2.32 {q15}, [r10, :128]!
+ vorr d25, d17, d17
+ vorr d24, d20, d20
+ vorr d20, d16, d16
+ vsub.f32 q9, q13, q11
+ vadd.f32 q11, q13, q11
+ ldr r2, [r12], #4
+ ldr lr, [r12], #4
+ vtrn.32 d24, d25
+ vtrn.32 d20, d21
+ add r2, r0, r2, lsl #2
+ add lr, r0, lr, lsl #2
+ vsub.f32 q8, q10, q12
+ vadd.f32 q10, q10, q12
+ vadd.f32 q0, q11, q10
+ vsub.f32 d25, d19, d16
+ vadd.f32 d27, d19, d16
+ vsub.f32 q1, q11, q10
+ vadd.f32 d24, d18, d17
+ vsub.f32 d26, d18, d17
+ vtrn.32 q0, q12
+ vtrn.32 q1, q13
+ vld1.32 {d24, d25}, [r11, :128]
+ vswp d1, d2
+ vst1.32 {q0, q1}, [r2, :128]!
+ vld2.32 {q0}, [r9, :128]!
+ vadd.f32 q1, q0, q15
+ vld2.32 {q13}, [r8, :128]!
+ vld2.32 {q14}, [r7, :128]!
+ vsub.f32 q15, q0, q15
+ vsub.f32 q0, q14, q13
+ vadd.f32 q3, q14, q13
+ vadd.f32 q2, q3, q1
+ vsub.f32 d29, d1, d30
+ vadd.f32 d27, d1, d30
+ vsub.f32 q3, q3, q1
+ vadd.f32 d28, d0, d31
+ vsub.f32 d26, d0, d31
+ vtrn.32 q2, q14
+ vtrn.32 q3, q13
+ vswp d5, d6
+ vst1.32 {q2, q3}, [r2, :128]!
+ vtrn.32 q11, q9
+ vtrn.32 q10, q8
+ vmul.f32 d20, d18, d25
+ vmul.f32 d22, d19, d24
+ vmul.f32 d21, d19, d25
+ vmul.f32 d18, d18, d24
+ vmul.f32 d19, d16, d25
+ vmul.f32 d30, d17, d24
+ vmul.f32 d23, d16, d24
+ vmul.f32 d24, d17, d25
+ vadd.f32 d17, d22, d20
+ vsub.f32 d16, d18, d21
+ vsub.f32 d21, d30, d19
+ vadd.f32 d20, d24, d23
+ vadd.f32 q9, q8, q10
+ vsub.f32 q8, q8, q10
+ vadd.f32 q4, q14, q9
+ vsub.f32 q6, q14, q9
+ vsub.f32 d11, d27, d16
+ vadd.f32 d15, d27, d16
+ vadd.f32 d10, d26, d17
+ vsub.f32 d14, d26, d17
+ vswp d9, d10
+ vswp d13, d14
+ vstmia lr!, {q4-q7}
+
.globl _neon_end
- .align 2
+ .align 4
_neon_end:
bx lr
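Note that the new leaf bodies (_neon_ee, _neon_oo, _neon_eo, _neon_oe) intentionally end without bx lr; the generator copies them inline and stitches them together. Their store addresses come from the offsets table in r12: ldr r2, [r12], #4 followed by add r2, r0, r2, lsl #2, so each entry is an element index scaled to bytes. A hypothetical C model of that consumption:

#include <stdint.h>

/* Hypothetical model of the leaf kernels' offset consumption: each
 * iteration pulls two entries from the offsets table and turns them
 * into store pointers. The lsl #2 in the asm is the same *4 scaling
 * that float pointer arithmetic performs implicitly here. */
static void next_outputs(float *out, const uint32_t **offsets,
                         float **lo, float **hi) {
    *lo = out + *(*offsets)++;
    *hi = out + *(*offsets)++;
}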
diff --git a/src/neon_float.h b/src/neon_float.h
index 0e192a1..41c9ecf 100644
--- a/src/neon_float.h
+++ b/src/neon_float.h
@@ -644,14 +644,62 @@ __INLINE V LOAD2I(const data_t **addr) {
return o;
}
+__INLINE V LOAD2I_0(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag0\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOAD2I_1(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag1\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOAD2I_2(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag2\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOAD2I_3(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag3\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOAD2I_4(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag4\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOAD2I_5(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag5\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOAD2I_6(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag6\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOAD2I_7(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld2.32 {%q0}, [%1, :128]! @tag7\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+
+
+
__INLINE V LOADI(const data_t **addr) {
- float32x2_t out0, out1;
float32x4_t o;
-
- __asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t"
- : "=w" (o), "+r" (*addr)
- :
- );
+ __asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOADI_2(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t @tag2" : "=w" (o), "+r" (*addr) : );
+ return o;
+}
+__INLINE V LOADI_3(const data_t **addr) {
+ float32x4_t o;
+ __asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t @tag3" : "=w" (o), "+r" (*addr) : );
return o;
}
__INLINE V HSP_MUL(V *d, const V *w) {
@@ -699,10 +747,10 @@ __INLINE void neon_shl8_ee(data_t *restrict out0, data_t *restrict out1,const da
V t0, t1, t2, t3, t4, t5, t6, t7;
- t0 = LOAD2I(i0);
- t1 = LOAD2I(i1);
- t2 = LOAD2I(i2);
- t3 = LOAD2I(i3);
+ t0 = LOAD2I_0(i0);
+ t1 = LOAD2I_1(i1);
+ t2 = LOAD2I_2(i2);
+ t3 = LOAD2I_3(i3);
t4 = ADD (t0, t1);
t5 = SUB (t0, t1);
t6 = ADD (t2, t3);
@@ -712,10 +760,10 @@ __INLINE void neon_shl8_ee(data_t *restrict out0, data_t *restrict out1,const da
r1 = HSP_SUB_MULI(&t5, &t7);
r3 = HSP_ADD_MULI(&t5, &t7);
- t0 = LOAD2I(i4);
- t1 = LOAD2I(i5);
- t2 = LOAD2I(i6);
- t3 = LOAD2I(i7);
+ t0 = LOAD2I_4(i4);
+ t1 = LOAD2I_5(i5);
+ t2 = LOAD2I_6(i6);
+ t3 = LOAD2I_7(i7);
r4 = ADD (t0, t1);
r5 = SUB (t0, t1);
r6 = ADD (t2, t3);
@@ -768,10 +816,10 @@ __INLINE void neon_shl8_oo(data_t *restrict out0, data_t *restrict out1,const da
V r0, r1, r2, r3, r4, r5, r6, r7;
V t0, t1, t2, t3, t4, t5, t6, t7;
- t0 = LOAD2I(i0);
- t1 = LOAD2I(i1);
- t2 = LOAD2I(i2);
- t3 = LOAD2I(i3);
+ t0 = LOAD2I_0(i0);
+ t1 = LOAD2I_1(i1);
+ t2 = LOAD2I_2(i2);
+ t3 = LOAD2I_3(i3);
t4 = ADD (t0, t1);
t5 = SUB (t0, t1);
t6 = ADD (t2, t3);
@@ -796,10 +844,10 @@ __INLINE void neon_shl8_oo(data_t *restrict out0, data_t *restrict out1,const da
- t0 = LOAD2I(i4);
- t1 = LOAD2I(i5);
- t2 = LOAD2I(i6);
- t3 = LOAD2I(i7);
+ t0 = LOAD2I_4(i4);
+ t1 = LOAD2I_5(i5);
+ t2 = LOAD2I_6(i6);
+ t3 = LOAD2I_7(i7);
t4 = ADD (t0, t1);
t5 = SUB (t0, t1);
t6 = ADD (t2, t3);
@@ -850,10 +898,10 @@ __INLINE void neon_shl8_eo(data_t *restrict out0, data_t *restrict out1,const da
{
V t0, t1, t2, t3, t4, t5, t6, t7;
- t0 = LOAD2I(i0);
- t1 = LOAD2I(i1);
- t2 = LOAD2I(i2);
- t3 = LOAD2I(i3);
+ t0 = LOAD2I_0(i0);
+ t1 = LOAD2I_1(i1);
+ t2 = LOAD2I_2(i2);
+ t3 = LOAD2I_3(i3);
t4 = ADD(t0, t1);
t5 = SUB(t0, t1);
t6 = ADD(t2, t3);
@@ -886,10 +934,10 @@ __INLINE void neon_shl8_eo(data_t *restrict out0, data_t *restrict out1,const da
}
{
V t0, t1, t2, t3, t4, t5, t6, t7;
- t0 = LOAD2I(i4);
- t1 = LOAD2I(i5);
- t2 = LOAD2I(i6);
- t3 = LOAD2I(i7);
+ t0 = LOAD2I_4(i4);
+ t1 = LOAD2I_5(i5);
+ t2 = LOAD2I_6(i6);
+ t3 = LOAD2I_7(i7);
//t2 = HALFBLEND(t6, t7);
//t3 = HALFBLEND(t7, t6);
t4 = ADD(t0, t1);
@@ -955,10 +1003,10 @@ __INLINE void neon_shl8_oe(data_t *restrict out0, data_t *restrict out1,const da
{
V t0, t1, t2, t3, t4, t5, t6, t7;
- t0 = LOAD2I(i0);
- t1 = LOAD2I(i1);
- t6 = LOADI(i2);
- t7 = LOADI(i3);
+ t0 = LOAD2I_0(i0);
+ t1 = LOAD2I_1(i1);
+ t6 = LOADI_2(i2);
+ t7 = LOADI_3(i3);
float32x2x2_t tmp0 = vtrn_f32(vget_low_f32(t6), vget_high_f32(t7));
float32x2x2_t tmp1 = vtrn_f32(vget_low_f32(t7), vget_high_f32(t6));
@@ -991,10 +1039,10 @@ __INLINE void neon_shl8_oe(data_t *restrict out0, data_t *restrict out1,const da
}
{
V t0, t1, t2, t3, t4, t5, t6, t7;
- t0 = LOAD2I(i4);
- t1 = LOAD2I(i5);
- t2 = LOAD2I(i6);
- t3 = LOAD2I(i7);
+ t0 = LOAD2I_4(i4);
+ t1 = LOAD2I_5(i5);
+ t2 = LOAD2I_6(i6);
+ t3 = LOAD2I_7(i7);
t4 = ADD(t0, t1);
t5 = SUB(t0, t1);
t6 = ADD(t2, t3);
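The LOAD2I_0 through LOAD2I_7 wrappers are byte-for-byte the old LOAD2I with an @tagN comment appended to the asm string; the same tags appear on vld2.32 lines in src/neon.s, which suggests they let the code generator identify individual load instructions (an inference, the patch does not say so). Functionally each remains a post-incremented de-interleaving load; an intrinsics model:

#include <arm_neon.h>

/* Intrinsics model of one LOAD2I_n: vld2.32 {qN}, [addr, :128]! reads
 * four floats de-interleaved (even lanes into val[0], odd lanes into
 * val[1]) and writes the advanced pointer back. */
static inline float32x2x2_t load2i_model(const float **addr) {
    float32x2x2_t o = vld2_f32(*addr);
    *addr += 4;   /* post-increment writeback: four floats consumed */
    return o;
}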
diff --git a/src/patterns.c b/src/patterns.c
index 29fa5ae..664f20e 100644
--- a/src/patterns.c
+++ b/src/patterns.c
@@ -114,9 +114,9 @@ void ffts_init_offsets(ffts_plan_t *p, int N, int leafN) {
for(i=0;i<N/leafN;i++) {
p->offsets[i] = offsets[i*2+1]*2;
}
- for(i=0;i<N/leafN;i++) {
- printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N)));
- }
+//for(i=0;i<N/leafN;i++) {
+// printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N)));
+//}
free(offsets);
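The silenced loop printed each leaf offset beside its bit-reversed index over log2(2*N) bits. For reference, a hedged sketch of the reverse_bits helper it calls (assumed semantics; the actual definition lives elsewhere in the file):

/* Assumed semantics of reverse_bits(x, bits): reverse the low
 * `bits` bits of x, as used by the silenced debug print above. */
static unsigned reverse_bits(unsigned x, int bits) {
    unsigned r = 0;
    for (int i = 0; i < bits; i++) {
        r = (r << 1) | (x & 1);   /* shift in bit i from the bottom */
        x >>= 1;
    }
    return r;
}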