diff options
-rw-r--r-- | src/codegen_sse.h | 9 | ||||
-rw-r--r-- | src/ffts.c | 39 |
2 files changed, 31 insertions, 17 deletions
diff --git a/src/codegen_sse.h b/src/codegen_sse.h index c518481..c0a34fe 100644 --- a/src/codegen_sse.h +++ b/src/codegen_sse.h @@ -38,7 +38,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "arch/x64/x64-codegen.h" #include <assert.h> -#include <string.h> static const FFTS_ALIGN(16) unsigned int sse_constants[20] = { /* 0.0, -0.0, 0.0, -0.0 */ @@ -741,12 +740,12 @@ generate_leaf_eo(insns_t **fp, uint32_t *offsets) insns_t *ins = *fp; #ifdef _M_X64 - x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[0], X64_RAX, 2); - x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[2], X64_RAX, 2); + x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[0], X64_RAX, 2); + x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[2], X64_RAX, 2); x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM9); - x64_sse_movaps_reg_memindex(ins, X64_XMM5, X64_RDX, offsets[3], X64_RAX, 2); + x64_sse_movaps_reg_memindex(ins, X64_XMM5, X64_RDX, offsets[3], X64_RAX, 2); x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7); - x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[1], X64_RAX, 2); + x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[1], X64_RAX, 2); x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM5); x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM4); x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM4); @@ -205,7 +205,9 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) V4SF MULI_SIGN; size_t n_luts; ffts_cpx_32f *w; - size_t i, n; + ffts_cpx_32f *tmp; + size_t i, j, m, n; + int stride; if (sign < 0) { MULI_SIGN = V4SF_LIT4(-0.0f, 0.0f, -0.0f, 0.0f); @@ -246,19 +248,28 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) V4SF neg = (sign < 0) ? V4SF_LIT4(0.0f, 0.0f, 0.0f, 0.0f) : V4SF_LIT4(-0.0f, -0.0f, -0.0f, -0.0f); #endif + /* calculate factors */ + m = leaf_N << (n_luts - 2); + tmp = FFTS_MALLOC(m * sizeof(ffts_cpx_32f), 32); + + for (i = 0; i < m; i++) { + tmp[i][0] = W_re(4*m, i); + tmp[i][1] = W_im(4*m, i); + } + + /* generate lookup tables */ + stride = 1 << (n_luts - 1); for (i = 0; i < n_luts; i++) { p->ws_is[i] = w - (ffts_cpx_32f*) p->ws; - //fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]); if (!i) { ffts_cpx_32f *w0 = FFTS_MALLOC(n/4 * sizeof(ffts_cpx_32f), 32); float *fw0 = (float*) w0; float *fw = (float*) w; - size_t j; for (j = 0; j < n/4; j++) { - w0[j][0] = W_re(n, j); - w0[j][1] = W_im(n, j); + w0[j][0] = tmp[j * stride][0]; + w0[j][1] = tmp[j * stride][1]; } #if defined(__arm__) && !defined(DYNAMIC_DISABLED) @@ -301,14 +312,15 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) float *fw = (float *)w; - size_t j; for (j = 0; j < n/8; j++) { - w0[j][0] = W_re((float) n, (float) 2*j); - w0[j][1] = W_im((float) n, (float) 2*j); - w1[j][0] = W_re((float) n, (float) j); - w1[j][1] = W_im((float) n, (float) j); - w2[j][0] = W_re((float) n, (float) (j + (n/8))); - w2[j][1] = W_im((float) n, (float) (j + (n/8))); + w0[j][0] = tmp[2 * j * stride][0]; + w0[j][1] = tmp[2 * j * stride][1]; + + w1[j][0] = tmp[j * stride][0]; + w1[j][1] = tmp[j * stride][1]; + + w2[j][0] = tmp[(j + (n/8)) * stride][0]; + w2[j][1] = tmp[(j + (n/8)) * stride][1]; } #if defined(__arm__) && !defined(DYNAMIC_DISABLED) @@ -374,6 +386,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) } n *= 2; + stride >>= 1; } #if defined(__arm__) && !defined(DYNAMIC_DISABLED) @@ -388,6 +401,8 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) } #endif + FFTS_FREE(tmp); + p->lastlut = w; p->n_luts = n_luts; return 0; |