summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/codegen.c 1
-rw-r--r-- src/cp_sse.c 54
-rw-r--r-- src/cp_sse.h 2
-rw-r--r-- src/macros.h 2
4 files changed, 13 insertions, 46 deletions
diff --git a/src/codegen.c b/src/codegen.c
index ab1f87a..8593f12 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -134,6 +134,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN)
uint32_t *func = valloc(8192);
uint32_t *fp = func;
+ //p->transform_base = func;
uint32_t *x_8_addr = fp;
memcpy(fp, neon_x8, neon_x8_t - neon_x8);
diff --git a/src/cp_sse.c b/src/cp_sse.c
index b35605c..1356c7b 100644
--- a/src/cp_sse.c
+++ b/src/cp_sse.c
@@ -73,45 +73,8 @@ firstpass_64(const float * restrict in, float * restrict out, ffts_plan_t * rest
void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out) {
transform_index_t *ps = p->transforms;
- int leafN = 8;
p->firstpass((const float *)in, (float *)out, p);
- p->transform(out, p->N, p->ws);
-
- /*
- size_t ps0_next = ps[0];
- while(ps0_next) {
- size_t ps0 = ps0_next;
- size_t ps1 = ps[1];
- ps0_next = ps[2];
- ps += 2;
-
- if(ps0 == 2*leafN) {
- float *LUT = (float *)p->ws[0];
- float *data = (float *)(out) + ps1;
- #ifdef __ARM_NEON__
- X_4_SPLIT(data, 16, LUT);
- #else
- X_4(data, 16, LUT);
- #endif
-
- }else{
- int index = __builtin_ctzl(ps0)-4;
- float *LUT = (float *)p->ws[__builtin_ctzl(ps0)-4];
- #ifdef __ARM_NEON__
- X_8_SPLIT(((float *)out) + ps1, ps0, LUT);
- #else
- X_8(((float *)out) + ps1, ps0, LUT);
- #endif
- }
-
-
- }
-
- #ifdef __ARM_NEON__
- if(p->N>32)
- X_8_SPLIT_T((float *)out, p->N, p->lastlut);
- #endif
-*/
+ if(p->transform) p->transform(out, p->N, p->ws);
}
void ffts_free(ffts_plan_t *p) {
@@ -119,15 +82,14 @@ void ffts_free(ffts_plan_t *p) {
size_t i;
if(p->ws) {
-// for(i=0;i<p->n_luts;i++) {
-// FFTS_FREE(p->ws[i]);
-// }
FFTS_FREE(p->ws);
}
if(p->is) free(p->is);
if(p->offsets) free(p->offsets);
//free(p->transforms);
+// if(p->transform_base) free(p->transform_base);
+
free(p);
}
@@ -143,8 +105,9 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
else SCALAR_MULI_SIGN = -0.0f;
p->transform = NULL;
+ p->transform_base = NULL;
- if(N > 32) {
+ if(N >= 32) {
ffts_init_offsets(p, N, leafN);
ffts_init_is(p, N, leafN, 2);
// ffts_init_tree(p, N, leafN);
@@ -203,7 +166,7 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
/* LUTS */
size_t n_luts = __builtin_ctzl(N/leafN);
- if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
+ if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
if(n_luts >= 32) n_luts = 0;
@@ -245,6 +208,9 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
w = p->ws;
n = leafN*2;
+ if(hardcoded) n = 8;
+
+
for(i=0;i<n_luts;i++) {
p->ws_is[i] = w - (cdata_t *)p->ws;
fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]);
@@ -391,7 +357,7 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
p->N = N;
p->lastlut = w;
p->n_luts = n_luts;
- if(N>32)
+ if(N>=32)
p->transform = ffts_generate_func_code(p, N, leafN);
// fprintf(stderr, "sizeof(size_t) == %lu\n", sizeof(size_t));
diff --git a/src/cp_sse.h b/src/cp_sse.h
index d692fc9..6f793d8 100644
--- a/src/cp_sse.h
+++ b/src/cp_sse.h
@@ -30,7 +30,7 @@ struct _ffts_plan_t {
size_t N;
void *lastlut;
transform_index_t *transforms;
- transform_func_t transform;
+ transform_func_t transform, transform_base;
};
typedef struct _ffts_plan_t ffts_plan_t;
diff --git a/src/macros.h b/src/macros.h
index 8dc2a8a..fcb2258 100644
--- a/src/macros.h
+++ b/src/macros.h
@@ -442,7 +442,7 @@ __INLINE void
firstpass_16(const data_t * restrict in, data_t * restrict out, ffts_plan_t * restrict p) {
V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
float *LUT8 = p->ws + p->ws_is[0];
- float *LUT16 = p->ws + p->ws_is[1];
+ float *LUT16 = ((float *)p->ws) + 8;//(p->ws_is[1]*4);
L_4_4(in+0,in+16,in+8,in+24,&r0_1,&r2_3,&r8_9,&r10_11);
L_2_4(in+4,in+20,in+28,in+12,&r4_5,&r6_7,&r14_15,&r12_13);
OpenPOWER on IntegriCloud