From 36e24f0144c8f44dc282642c962b4d7003e74909 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Tue, 11 Nov 2014 13:48:47 +0200 Subject: generate_leaf_init, generate_leaf_ee, generate_leaf_eo, generate_leaf_oe and generate_leaf_oo Multiply offset constants by 4, and remove multiply by 4 from "offset fixing" loops. --- src/codegen.c | 131 ++------------ src/codegen_sse.h | 509 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 521 insertions(+), 119 deletions(-) diff --git a/src/codegen.c b/src/codegen.c index 6c6c887..86c7369 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -100,8 +100,8 @@ static void ffts_elaborate_tree(size_t **p, int N, int leaf_N, int offset) transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) { - uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4}; - uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4}; + uint32_t offsets[8] = {0, 4*N, 2*N, 6*N, N, 5*N, 7*N, 3*N}; + uint32_t offsets_o[8] = {0, 4*N, 2*N, 6*N, 7*N, 3*N, N, 5*N}; int32_t pAddr = 0; int32_t pN = 0; @@ -189,128 +189,33 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N fp += (vfp_o - vfp_e) / 4; #endif #else - /* generate function */ + /* generate functions */ start = generate_prologue(&fp, p); - loop_count = 4 * p->i0; - -#ifdef _M_X64 - /* set loop counter */ - x86_mov_reg_imm(fp, X86_EBX, loop_count); - - /* clear */ - x86_clear_reg(fp, X86_EAX); - - /* set "pointer" to offsets */ - x64_mov_reg_membase(fp, X64_R9, X64_RCX, 0x0, 8); - - /* set "pointer" to constants */ - x64_mov_reg_membase(fp, X64_RSI, X64_RCX, 0xE0, 8); -#else - /* set loop counter */ - x86_mov_reg_imm(fp, X86_ECX, loop_count); - - /* copy function */ - assert((char*) leaf_ee > (char*) leaf_ee_init); - len = (char*) leaf_ee - (char*) leaf_ee_init; - memcpy(fp, leaf_ee_init, (size_t) len); - fp += len; - - ffts_align_mem16(&fp, 9); -#endif - - /* copy function */ - assert((char*) leaf_oo 
> (char*) leaf_ee); - len = (char*) leaf_oo - (char*) leaf_ee; - memcpy(fp, leaf_ee, (size_t) len); - - /* patch offsets */ - for (i = 0; i < 8; i++) { - IMM32_NI(fp + sse_leaf_ee_offsets[i], 4 * offsets[i]); - } - - fp += len; + + loop_count = 4 * p->i0; + generate_leaf_init(&fp, loop_count); + generate_leaf_ee(&fp, offsets); if (ffts_ctzl(N) & 1) { if (p->i1) { loop_count += 4 * p->i1; - - /* align loop/jump destination */ -#ifdef _M_X64 - x86_mov_reg_imm(fp, X86_EBX, loop_count); - ffts_align_mem16(&fp, 3); -#else - x86_mov_reg_imm(fp, X86_ECX, loop_count); - ffts_align_mem16(&fp, 4); -#endif - - /* copy function */ - assert((char*) leaf_eo > (char*) leaf_oo); - len = (char*) leaf_eo - (char*) leaf_oo; - memcpy(fp, leaf_oo, len); - - /* patch offsets */ - for (i = 0; i < 8; i++) { - IMM32_NI(fp + sse_leaf_oo_offsets[i], 4 * offsets_o[i]); - } - - fp += len; + generate_leaf_oo(&fp, loop_count, offsets_o); } - loop_count += 4; - - /* copy function */ - assert((char*) leaf_end > (char*) leaf_oe); - len = (char*) leaf_end - (char*) leaf_oe; - memcpy(fp, leaf_oe, len); - - /* patch offsets */ - for (i = 0; i < 8; i++) { - IMM32_NI(fp + sse_leaf_oe_offsets[i], 4 * offsets_o[i]); - } - - fp += len; + loop_count += 4; + generate_leaf_oe(&fp, offsets_o); } else { loop_count += 4; - - /* copy function */ - assert((char*) leaf_oe > (char*) leaf_eo); - len = (char*) leaf_oe - (char*) leaf_eo; - memcpy(fp, leaf_eo, len); - - /* patch offsets */ - for (i = 0; i < 8; i++) { - IMM32_NI(fp + sse_leaf_eo_offsets[i], 4 * offsets[i]); - } - - fp += len; + generate_leaf_eo(&fp, offsets); if (p->i1) { loop_count += 4 * p->i1; - - /* align loop/jump destination */ -#ifdef _M_X64 - x86_mov_reg_imm(fp, X86_EBX, loop_count); - ffts_align_mem16(&fp, 3); -#else - x86_mov_reg_imm(fp, X86_ECX, loop_count); - ffts_align_mem16(&fp, 4); -#endif - - /* copy function */ - assert((char*) leaf_eo > (char*) leaf_oo); - len = (char*) leaf_eo - (char*) leaf_oo; - memcpy(fp, leaf_oo, len); - - for (i = 
0; i < 8; i++) { - IMM32_NI(fp + sse_leaf_oo_offsets[i], 4 * offsets_o[i]); - } - - fp += len; + generate_leaf_oo(&fp, loop_count, offsets_o); } } if (p->i1) { - uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2}; + uint32_t offsets_oe[8] = {7*N, 3*N, N, 5*N, 0, 4*N, 6*N, 2*N}; loop_count += 4 * p->i1; @@ -323,15 +228,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N ffts_align_mem16(&fp, 9); #endif - assert((char*) leaf_oo > (char*) leaf_ee); - len = (char*) leaf_oo - (char*) leaf_ee; - memcpy(fp, leaf_ee, len); - - for (i = 0; i < 8; i++) { - IMM32_NI(fp + sse_leaf_ee_offsets[i], 4 * offsets_oe[i]); - } - - fp += len; + generate_leaf_ee(&fp, offsets_oe); } generate_transform_init(&fp); diff --git a/src/codegen_sse.h b/src/codegen_sse.h index 40bfa3f..20c0f00 100644 --- a/src/codegen_sse.h +++ b/src/codegen_sse.h @@ -60,9 +60,9 @@ void sse_constants_inv(); // typedef uint8_t insns_t; extern const uint32_t sse_leaf_ee_offsets[8]; -extern const uint32_t sse_leaf_oo_offsets[8]; extern const uint32_t sse_leaf_eo_offsets[8]; extern const uint32_t sse_leaf_oe_offsets[8]; +extern const uint32_t sse_leaf_oo_offsets[8]; #define P(x) (*(*p)++ = x) @@ -153,7 +153,7 @@ static FFTS_INLINE void ffts_insert_nops(uint8_t **p, uint32_t count) static FFTS_INLINE void ffts_align_mem16(uint8_t **p, uint32_t offset) { - int r = (16 - (offset & 0xf)) - ((uintptr_t)(*p) & 0xf); + int r = (16 - (offset & 0xf)) - (int) ((uintptr_t)(*p) & 0xf); r = (16 + r) & 0xf; ffts_insert_nops(p, r); } @@ -345,6 +345,509 @@ static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign) return x4_addr; } +static FFTS_INLINE void generate_leaf_init(insns_t **fp, uint32_t loop_count) +{ +#ifndef _M_X64 + size_t len; +#endif + + /* to avoid deferring */ + insns_t *ins = *fp; + +#ifdef _M_X64 + /* set loop counter */ + x86_mov_reg_imm(ins, X86_EBX, loop_count); + + /* generate function */ + + /* clear */ + x86_clear_reg(ins, X86_EAX); + + 
/* set "pointer" to offsets */ + x64_mov_reg_membase(ins, X64_R9, X64_RCX, 0x0, 8); + + /* set "pointer" to constants */ + x64_mov_reg_membase(ins, X64_RSI, X64_RCX, 0xE0, 8); +#else + /* set loop counter */ + x86_mov_reg_imm(ins, X86_ECX, loop_count); + + /* copy function */ + assert((char*) leaf_ee > (char*) leaf_ee_init); + len = (char*) leaf_ee - (char*) leaf_ee_init; + memcpy(ins, leaf_ee_init, (size_t) len); + ins += len; + + /* align loop/jump destination */ + ffts_align_mem16(&ins, 9); +#endif + + *fp = ins; +} + +static FFTS_INLINE void generate_leaf_ee(insns_t **fp, uint32_t *offsets) +{ +#ifdef _M_X64 + insns_t *leaf_ee_loop; +#else + size_t len; + int i; +#endif + + /* to avoid deferring */ + insns_t *ins = *fp; + +#ifdef _M_X64 + x64_sse_movaps_reg_membase(ins, X64_XMM0, X64_RSI, 32); + x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_RSI, 0); + + /* beginning of the loop (make sure it's 16 byte aligned) */ + leaf_ee_loop = ins; + assert(!(((uintptr_t) leaf_ee_loop) & 0xF)); + + x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[0], X64_RAX, 2); + x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[2], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7); + x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[3], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12); + x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM10); + x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM10); + x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM8); + x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[1], X64_RAX, 2); + x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[4], X64_RAX, 2); + x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM9); + x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM9); + x64_sse_movaps_reg_memindex(ins, X64_XMM13, X64_RDX, offsets[5], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10); + x64_sse_movaps_reg_memindex(ins, X64_XMM3, X64_RDX, offsets[6], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, 
X64_XMM5, X64_XMM6); + x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[7], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM3); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1); + x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM7); + x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2); + x64_sse_subps_reg_reg(ins, X64_XMM10, X64_XMM13); + x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM14); + x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM11); + x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM11); + x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM12); + x64_sse_addps_reg_reg(ins, X64_XMM7, X64_XMM12); + x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM13); + x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14); + x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_RSI, 16); + x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9); + + /* TODO?? */ + x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_RSI, 16); + + x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM5); + x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10); + x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM15); + x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15); + x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM3); + x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM1); + x64_sse_subps_reg_reg(ins, X64_XMM5, X64_XMM1); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1); + x64_sse_xorps_reg_reg(ins, X64_XMM9, X64_XMM8); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM3, 0xB1); + x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM6); + x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0); + x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM4); + x64_sse_mulps_reg_reg(ins, X64_XMM3, X64_XMM0); + x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM10); + x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM3); + x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM12); + x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM7); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM9, X64_XMM9, 0xB1); + x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM11); + 
x64_sse_addps_reg_reg(ins, X64_XMM3, X64_XMM11); + x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM9); + x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM9); + x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM3); + x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM3); + x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM8); + x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM2); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1); + x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM6); + x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2); + x64_sse_movlhps_reg_reg(ins, X64_XMM3, X64_XMM4); + x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE); + x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM1); + x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12); + x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM12); + x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE); + x64_sse_movaps_reg_reg(ins, X64_XMM7, X64_XMM5); + x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13); + x64_sse_movlhps_reg_reg(ins, X64_XMM9, X64_XMM14); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM13, 0xEE); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM14, 0xEE); + x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM3); + x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM4); + x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM7); + x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM9); + x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM2); + x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM1); + x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM5); + x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM6); + + /* loop condition */ + x64_alu_reg_reg_size(ins, X86_CMP, X64_RBX, X64_RAX, 8); + x64_branch_size(ins, X86_CC_NE, leaf_ee_loop, 0, 4); +#else + /* copy function */ + assert((char*) leaf_oo > 
(char*) leaf_ee); + len = (char*) leaf_oo - (char*) leaf_ee; + memcpy(ins, leaf_ee, (size_t) len); + + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(ins + sse_leaf_ee_offsets[i], offsets[i]); + } + + ins += len; +#endif + + *fp = ins; +} + +static FFTS_INLINE void generate_leaf_eo(insns_t **fp, uint32_t *offsets) +{ +#ifndef _M_X64 + size_t len; + int i; +#endif + + /* to avoid deferring */ + insns_t *ins = *fp; + +#ifdef _M_X64 + x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[0], X64_RAX, 2); + x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[2], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM9); + x64_sse_movaps_reg_memindex(ins, X64_XMM5, X64_RDX, offsets[3], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7); + x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[1], X64_RAX, 2); + x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM5); + x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM4); + x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM4); + x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM5); + x64_sse_movaps_reg_membase(ins, X64_XMM3, X64_RSI, 0); + x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM11); + x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM3); + x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM9); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1); + x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM6); + x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM6); + x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM7); + x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM7); + x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM10); + x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM11); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM8, 0xEE); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM9, 0xEE); + x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM10); + 
x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM11); + x64_sse_movaps_reg_memindex(ins, X64_XMM15, X64_RDX, offsets[4], X64_RAX, 2); + x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[5], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM15); + x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[6], X64_RAX, 2); + x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM12); + x64_sse_subps_reg_reg(ins, X64_XMM15, X64_XMM12); + x64_sse_movaps_reg_memindex(ins, X64_XMM13, X64_RDX, offsets[7], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM4); + x64_sse_movaps_reg_reg(ins, X64_XMM7, X64_XMM14); + x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM13); + x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM13); + x64_sse_movlhps_reg_reg(ins, X64_XMM2, X64_XMM8); + x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM5); + x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM15); + x64_sse_xorps_reg_reg(ins, X64_XMM15, X64_XMM3); + x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM5); + x64_sse_subps_reg_reg(ins, X64_XMM5, X64_XMM14); + x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM14); + x64_sse_movlhps_reg_reg(ins, X64_XMM1, X64_XMM9); + x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM4); + x64_sse_movlhps_reg_reg(ins, X64_XMM8, X64_XMM4); + x64_sse_movaps_reg_reg(ins, X64_XMM12, X64_XMM1); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM15, X64_XMM15, 0xB1); + x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_RSI, 48); + x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8); + x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM15); + x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM7); + x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM15); + + /* TODO? 
*/ + x64_sse_movaps_reg_membase(ins, X64_XMM9, X64_RSI, 48); + + x64_sse_movaps_reg_membase(ins, X64_XMM15, X64_RSI, 64); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1); + x64_sse_mulps_reg_reg(ins, X64_XMM9, X64_XMM8); + x64_sse_mulps_reg_reg(ins, X64_XMM7, X64_XMM15); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1); + x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM7); + x64_sse_mulps_reg_reg(ins, X64_XMM8, X64_XMM15); + x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM11); + x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM14, 0xEE); + x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM9); + x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM9); + x64_sse_xorps_reg_reg(ins, X64_XMM11, X64_XMM3); + x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM2); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1); + x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM10); + x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM10); + x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM11); + x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM11); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM4, 0xEE); + x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM5); + x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM6); + x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM2); + x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM1); + x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM3); + x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM12); +#else + /* copy function */ + assert((char*) leaf_oe > (char*) leaf_eo); + len = (char*) leaf_oe - (char*) leaf_eo; + memcpy(ins, leaf_eo, len); + + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(ins + sse_leaf_eo_offsets[i], offsets[i]); + } + + ins += len; +#endif + + *fp = ins; +} + +static FFTS_INLINE void generate_leaf_oe(insns_t **fp, uint32_t *offsets) +{ +#ifndef _M_X64 + size_t len; + int i; +#endif + + /* to 
avoid deferring */ + insns_t *ins = *fp; + +#ifdef _M_X64 + x64_sse_movaps_reg_membase(ins, X64_XMM0, X64_RSI, 0); + x64_sse_movaps_reg_memindex(ins, X64_XMM6, X64_RDX, offsets[2], X64_RAX, 2); + x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM6); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM8, 0xE4); + x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM6, 0xE4); + x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[0], X64_RAX, 2); + x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[1], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM12); + x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2); + x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8); + x64_sse_subps_reg_reg(ins, X64_XMM10, X64_XMM8); + x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM7); + x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM7); + x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM9); + x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM14); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM4, X64_XMM10, 0xEE); + x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM0); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1); + x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12); + x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM14); + x64_sse_addps_reg_reg(ins, X64_XMM13, X64_XMM9); + x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM10); + x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM9); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM12, 0xEE); + x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM10); + x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2); + x64_sse_movlhps_reg_reg(ins, X64_XMM13, X64_XMM11); + x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM13); + x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_RSI, 48); + x64_sse_movlhps_reg_reg(ins, X64_XMM14, X64_XMM12); + x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_RSI, 64); + 
x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM5); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM5, 0xB1); + x64_sse_mulps_reg_reg(ins, X64_XMM5, X64_XMM12); + x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM14); + x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM5); + + /* TODO? */ + x64_sse_movaps_reg_membase(ins, X64_XMM5, X64_RSI, 48); + + x64_sse_mulps_reg_reg(ins, X64_XMM5, X64_XMM4); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM4, X64_XMM4, 0xB1); + x64_sse_mulps_reg_reg(ins, X64_XMM4, X64_XMM12); + x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[4], X64_RAX, 2); + x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM4); + x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[6], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM9); + x64_sse_movaps_reg_memindex(ins, X64_XMM2, X64_RDX, offsets[7], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7); + x64_sse_movaps_reg_memindex(ins, X64_XMM15, X64_RDX, offsets[5], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM13); + x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM2); + x64_sse_addps_reg_reg(ins, X64_XMM3, X64_XMM15); + x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM15); + x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM2); + x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM5); + x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM5); + x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM0); + x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8); + x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM3); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1); + x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM9); + x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM0); + x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM6); + x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM7); + x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM6); + x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM7); + x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM2); + x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM3); + x64_sse_shufps_reg_reg_imm(ins, 
X64_XMM2, X64_XMM8, 0xEE); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM9, 0xEE); + x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM2); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM13, X64_XMM13, 0xB1); + x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM4); + x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM4); + x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM3); + x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM13); + x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM13); + x64_sse_movlhps_reg_reg(ins, X64_XMM10, X64_XMM8); + x64_sse_movlhps_reg_reg(ins, X64_XMM11, X64_XMM9); + x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM10); + x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM11); + x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM2); + x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM3); + x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM14); + x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM4); +#else + /* copy function */ + assert((char*) leaf_end > (char*) leaf_oe); + len = (char*) leaf_end - (char*) leaf_oe; + memcpy(ins, leaf_oe, len); + + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(ins + sse_leaf_oe_offsets[i], offsets[i]); + } + + ins += len; +#endif + + *fp = ins; +} + +static FFTS_INLINE void generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets) +{ +#ifdef _M_X64 + insns_t *leaf_oo_loop; +#else + size_t len; + int i; +#endif + + /* to avoid deferring */ + insns_t *ins = *fp; + +#ifdef _M_X64 + /* align loop/jump destination */ + x86_mov_reg_imm(ins, X86_EBX, loop_count); + ffts_align_mem16(&ins, 3); + + x64_sse_movaps_reg_membase(ins, X64_XMM5, X64_RSI, 0); + + /* beginning of the loop (make sure it's 16 byte aligned) */ + leaf_oo_loop = ins; + assert(!(((uintptr_t) leaf_oo_loop) & 0xF)); + + x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[0], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM4); + 
x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[1], X64_RAX, 2); + x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[2], X64_RAX, 2); + x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM7); + x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM7); + x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10); + x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RDX, offsets[4], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM6); + x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RDX, offsets[5], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM1); + x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[6], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM4); + x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[7], X64_RAX, 2); + x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM14); + x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2); + x64_sse_subps_reg_reg(ins, X64_XMM10, X64_XMM8); + x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8); + x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM11); + x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM12); + x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM11); + x64_sse_addps_reg_reg(ins, X64_XMM13, X64_XMM12); + x64_sse_addps_reg_reg(ins, X64_XMM3, X64_XMM9); + x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM9); + x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM5); + x64_sse_xorps_reg_reg(ins, X64_XMM14, X64_XMM5); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1); + x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1); + x64_sse_movaps_reg_reg(ins, X64_XMM7, X64_XMM6); + x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2); + x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8); + x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM10); + x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM13); + x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM13); + 
x64_sse_subps_reg_reg(ins, X64_XMM15, X64_XMM10); + x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1); + x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2); + x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM4); + x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14); + x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM14); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM4, 0xEE); + x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM3); + x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM9); + x64_sse_movlhps_reg_reg(ins, X64_XMM14, X64_XMM15); + x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM13); + x64_sse_movlhps_reg_reg(ins, X64_XMM8, X64_XMM1); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM15, 0xEE); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM9, X64_XMM13, 0xEE); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM1, 0xEE); + x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM14); + x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM7); + x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM4); + x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM8); + x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM3); + x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM6); + x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM9); + x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM2); + + /* loop condition */ + x64_alu_reg_reg_size(ins, X86_CMP, X64_RBX, X64_RAX, 8); + x64_branch_size(ins, X86_CC_NE, leaf_oo_loop, 0, 4); +#else + /* align loop/jump destination */ + x86_mov_reg_imm(ins, X86_ECX, loop_count); + ffts_align_mem16(&ins, 4); + + /* copy function */ + assert((char*) leaf_eo > (char*) leaf_oo); + len = (char*) leaf_eo - (char*) leaf_oo; + memcpy(ins, leaf_oo, len); + + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(ins + sse_leaf_oo_offsets[i], offsets[i]); + } + + ins += len; +#endif + + *fp = ins; +} + static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int 
sign) { insns_t *ins; @@ -370,6 +873,8 @@ static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign) #endif #ifdef _M_X64 + /* generate function */ + /* input */ x64_mov_reg_reg(ins, X64_RAX, X64_R9, 8); -- cgit v1.1