summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/ffts.h1
-rw-r--r--src/codegen.c3
-rw-r--r--src/codegen_sse.h1
-rw-r--r--src/ffts.c63
-rw-r--r--src/macros.h2
-rw-r--r--src/sse.s22
6 files changed, 17 insertions, 75 deletions
diff --git a/include/ffts.h b/include/ffts.h
index 4ea454e..75392eb 100644
--- a/include/ffts.h
+++ b/include/ffts.h
@@ -63,6 +63,7 @@ struct _ffts_plan_t {
};
*/
+struct _ffts_plan_t;
typedef struct _ffts_plan_t ffts_plan_t;
void ffts_execute(ffts_plan_t * , const void * , const void * );
diff --git a/src/codegen.c b/src/codegen.c
index 1f1a26a..107f41b 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -156,7 +156,8 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
size_t *pps = ps;
#ifdef __x86_64__
- p->constants = sse_constants;
+ if(sign < 0) p->constants = sse_constants;
+ else p->constants = sse_constants_inv;
#endif
elaborate_tree(&pps, N, leafN, 0);
diff --git a/src/codegen_sse.h b/src/codegen_sse.h
index be0de12..ec85667 100644
--- a/src/codegen_sse.h
+++ b/src/codegen_sse.h
@@ -50,6 +50,7 @@ void x8_soft();
void x8_hard();
void sse_constants();
+void sse_constants_inv();
// typedef uint8_t insns_t;
diff --git a/src/ffts.c b/src/ffts.c
index 20a32d5..6a9258b 100644
--- a/src/ffts.c
+++ b/src/ffts.c
@@ -46,9 +46,7 @@
void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out) {
transform_index_t *ps = p->transforms;
- //p->firstpass((const float *)in, (float *)out, p);
p->transform(p, (const float *)in, (float *)out);
- //if(p->transform) p->transform(out, p->N, p->ws);
}
void ffts_free(ffts_plan_t *p) {
@@ -100,34 +98,6 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
if(N >= 32) {
ffts_init_offsets(p, N, leafN);
ffts_init_is(p, N, leafN, 2);
- // ffts_init_tree(p, N, leafN);
-
- // if(N == 64) p->firstpass = &firstpass_64;
-
- LEAFLUT[0] = VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941);
- LEAFLUT[1] = VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376);
- LEAFLUT[2] = VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011);
- LEAFLUT[3] = VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0.38268343236508978177923268049199,-0.38268343236508978177923268049199);
- LEAFLUT[4] = VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981);
- LEAFLUT[5] = VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.92387953251128673848313610506011,-0.92387953251128673848313610506011);
-
- LEAFLUT[6] = VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1);
- LEAFLUT[7] = VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0);
- LEAFLUT[8] = VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1);
- LEAFLUT[9] = VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0);
- LEAFLUT[10] = VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941);
- LEAFLUT[11] = VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376);
-
-
- if(sign > 0) {
- V neg = VLIT4(-0.0f, -0.0f, -0.0f, -0.0f);
- LEAFLUT[1] = VXOR(LEAFLUT[1], neg);
- LEAFLUT[3] = VXOR(LEAFLUT[3], neg);
- LEAFLUT[5] = VXOR(LEAFLUT[5], neg);
- LEAFLUT[7] = VXOR(LEAFLUT[7], neg);
- LEAFLUT[9] = VXOR(LEAFLUT[9], neg);
- LEAFLUT[11] = VXOR(LEAFLUT[11], neg);
- }
p->i0 = N/leafN/3+1;
p->i1 = N/leafN/3;
@@ -144,7 +114,6 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
else if(N == 4 && sign == 1) p->transform = &firstpass_4_b;
else if(N == 8) p->transform = &firstpass_8;
else if(N == 16) p->transform = &firstpass_16;
- else if(N == 32) p->transform = &firstpass_32;
p->is = NULL;
p->offsets = NULL;
@@ -261,10 +230,6 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
}
w += n/4 * 2;
#endif
- // for(j=0;j<n/2;j++) {
- // printf("%f %f\n", creal(w[j]), cimag(w[j]));
-
- // }
FFTS_FREE(w0);
}else{
@@ -296,18 +261,12 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
temp0 = VLD2(fw0 + j*2);
temp0.val[1] = VXOR(temp0.val[1], neg);
STORESPR(fw + j*2*3, temp0);
- //VST(fw + j*2*3, temp0.val[0]);
- //VST(fw + j*2*3 + 4, temp0.val[1]);
temp1 = VLD2(fw1 + j*2);
temp1.val[1] = VXOR(temp1.val[1], neg);
STORESPR(fw + j*2*3 + 8, temp1);
- //VST(fw + j*2*3 + 8, temp1.val[0]);
- //VST(fw + j*2*3 + 12, temp1.val[1]);
temp2 = VLD2(fw2 + j*2);
temp2.val[1] = VXOR(temp2.val[1], neg);
STORESPR(fw + j*2*3 + 16, temp2);
- //VST(fw + j*2*3 + 16, temp2.val[0]);
- //VST(fw + j*2*3 + 20, temp2.val[1]);
}
w += n/8 * 3;
#else
@@ -371,28 +330,6 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
p->lastlut = w;
p->n_luts = n_luts;
if(N>=32) ffts_generate_func_code(p, N, leafN, sign);
-#ifdef __x86_64__
- float *temp_consts = (float *)p->constants;
- if(sign > 0 && N>=32) {
- temp_consts[0] = -0.0f;
- temp_consts[1] = 0.0f;
- temp_consts[2] = -0.0f;
- temp_consts[3] = 0.0f;
-
- float temp;
- temp = temp_consts[9];
- temp_consts[9] = temp_consts[8];
- temp_consts[8] = temp;
- temp = temp_consts[11];
- temp_consts[11] = temp_consts[10];
- temp_consts[10] = temp;
-
- temp = temp_consts[18];
- temp_consts[18] = temp_consts[19];
- temp_consts[19] = temp;
- }
-#endif
-// fprintf(stderr, "sizeof(size_t) == %lu\n", sizeof(size_t));
return p;
}
diff --git a/src/macros.h b/src/macros.h
index 712a6ba..d0907b6 100644
--- a/src/macros.h
+++ b/src/macros.h
@@ -70,7 +70,7 @@ typedef float32x4x2_t VS;
#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y)))
-static inline V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) {
+inline V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) {
data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3};
return VLD(d);
}
diff --git a/src/sse.s b/src/sse.s
index aca6e88..79dd6ec 100644
--- a/src/sse.s
+++ b/src/sse.s
@@ -859,18 +859,20 @@ _sse_constants:
.globl sse_constants
sse_constants:
#endif
-L_sse_constants:
-L_2il0floatpacket.719:
.long 0x00000000,0x80000000,0x00000000,0x80000000
- .align 4
-L_2il0floatpacket.720:
.long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
- .align 4
-L_2il0floatpacket.721:
.long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
- .align 4
-L_2il0floatpacket.722:
.long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
- .align 4
-L_2il0floatpacket.723:
.long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
+#ifdef __APPLE__
+ .globl _sse_constants_inv
+_sse_constants_inv:
+#else
+ .globl sse_constants_inv
+sse_constants_inv:
+#endif
+ .long 0x80000000,0x00000000,0x80000000,0x00000000
+ .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
+ .long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3
+ .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
+ .long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3
OpenPOWER on IntegriCloud