summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/codegen_sse.h 9
-rw-r--r-- src/ffts.c 39
2 files changed, 31 insertions, 17 deletions
diff --git a/src/codegen_sse.h b/src/codegen_sse.h
index c518481..c0a34fe 100644
--- a/src/codegen_sse.h
+++ b/src/codegen_sse.h
@@ -38,7 +38,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "arch/x64/x64-codegen.h"
#include <assert.h>
-#include <string.h>
static const FFTS_ALIGN(16) unsigned int sse_constants[20] = {
/* 0.0, -0.0, 0.0, -0.0 */
@@ -741,12 +740,12 @@ generate_leaf_eo(insns_t **fp, uint32_t *offsets)
insns_t *ins = *fp;
#ifdef _M_X64
- x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[0], X64_RAX, 2);
- x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[2], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[0], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[2], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM9);
- x64_sse_movaps_reg_memindex(ins, X64_XMM5, X64_RDX, offsets[3], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM5, X64_RDX, offsets[3], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7);
- x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[1], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[1], X64_RAX, 2);
x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM5);
x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM4);
x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM4);
diff --git a/src/ffts.c b/src/ffts.c
index 2b6b647..41df886 100644
--- a/src/ffts.c
+++ b/src/ffts.c
@@ -205,7 +205,9 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
V4SF MULI_SIGN;
size_t n_luts;
ffts_cpx_32f *w;
- size_t i, n;
+ ffts_cpx_32f *tmp;
+ size_t i, j, m, n;
+ int stride;
if (sign < 0) {
MULI_SIGN = V4SF_LIT4(-0.0f, 0.0f, -0.0f, 0.0f);
@@ -246,19 +248,28 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
V4SF neg = (sign < 0) ? V4SF_LIT4(0.0f, 0.0f, 0.0f, 0.0f) : V4SF_LIT4(-0.0f, -0.0f, -0.0f, -0.0f);
#endif
+ /* calculate factors */
+ m = leaf_N << (n_luts - 2);
+ tmp = FFTS_MALLOC(m * sizeof(ffts_cpx_32f), 32);
+
+ for (i = 0; i < m; i++) {
+ tmp[i][0] = W_re(4*m, i);
+ tmp[i][1] = W_im(4*m, i);
+ }
+
+ /* generate lookup tables */
+ stride = 1 << (n_luts - 1);
for (i = 0; i < n_luts; i++) {
p->ws_is[i] = w - (ffts_cpx_32f*) p->ws;
- //fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]);
if (!i) {
ffts_cpx_32f *w0 = FFTS_MALLOC(n/4 * sizeof(ffts_cpx_32f), 32);
float *fw0 = (float*) w0;
float *fw = (float*) w;
- size_t j;
for (j = 0; j < n/4; j++) {
- w0[j][0] = W_re(n, j);
- w0[j][1] = W_im(n, j);
+ w0[j][0] = tmp[j * stride][0];
+ w0[j][1] = tmp[j * stride][1];
}
#if defined(__arm__) && !defined(DYNAMIC_DISABLED)
@@ -301,14 +312,15 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
float *fw = (float *)w;
- size_t j;
for (j = 0; j < n/8; j++) {
- w0[j][0] = W_re((float) n, (float) 2*j);
- w0[j][1] = W_im((float) n, (float) 2*j);
- w1[j][0] = W_re((float) n, (float) j);
- w1[j][1] = W_im((float) n, (float) j);
- w2[j][0] = W_re((float) n, (float) (j + (n/8)));
- w2[j][1] = W_im((float) n, (float) (j + (n/8)));
+ w0[j][0] = tmp[2 * j * stride][0];
+ w0[j][1] = tmp[2 * j * stride][1];
+
+ w1[j][0] = tmp[j * stride][0];
+ w1[j][1] = tmp[j * stride][1];
+
+ w2[j][0] = tmp[(j + (n/8)) * stride][0];
+ w2[j][1] = tmp[(j + (n/8)) * stride][1];
}
#if defined(__arm__) && !defined(DYNAMIC_DISABLED)
@@ -374,6 +386,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
}
n *= 2;
+ stride >>= 1;
}
#if defined(__arm__) && !defined(DYNAMIC_DISABLED)
@@ -388,6 +401,8 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
}
#endif
+ FFTS_FREE(tmp);
+
p->lastlut = w;
p->n_luts = n_luts;
return 0;
OpenPOWER on IntegriCloud