diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/codegen.c | 12 | ||||
-rw-r--r-- | src/codegen_sse.h | 203 |
2 files changed, 126 insertions, 89 deletions
diff --git a/src/codegen.c b/src/codegen.c index e1ed11f..9f2921a 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -194,23 +194,26 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N loop_count = 4 * p->i0; generate_leaf_init(&fp, loop_count); - generate_leaf_ee(&fp, offsets); if (ffts_ctzl(N) & 1) { + generate_leaf_ee(&fp, offsets, p->i1 ? 6 : 0); + if (p->i1) { loop_count += 4 * p->i1; - generate_leaf_oo(&fp, loop_count, offsets_o); + generate_leaf_oo(&fp, loop_count, offsets_o, 7); } loop_count += 4; generate_leaf_oe(&fp, offsets_o); } else { + generate_leaf_ee(&fp, offsets, N >= 256 ? 2 : 8); + loop_count += 4; generate_leaf_eo(&fp, offsets); if (p->i1) { loop_count += 4 * p->i1; - generate_leaf_oo(&fp, loop_count, offsets_o); + generate_leaf_oo(&fp, loop_count, offsets_o, N >= 256 ? 4 : 7); } } @@ -222,13 +225,12 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N /* align loop/jump destination */ #ifdef _M_X64 x86_mov_reg_imm(fp, X86_EBX, loop_count); - ffts_align_mem16(&fp, 8); #else x86_mov_reg_imm(fp, X86_ECX, loop_count); ffts_align_mem16(&fp, 9); #endif - generate_leaf_ee(&fp, offsets_oe); + generate_leaf_ee(&fp, offsets_oe, 0); } generate_transform_init(&fp); diff --git a/src/codegen_sse.h b/src/codegen_sse.h index 739335f..558a015 100644 --- a/src/codegen_sse.h +++ b/src/codegen_sse.h @@ -404,6 +404,9 @@ static FFTS_INLINE void generate_leaf_init(insns_t **fp, uint32_t loop_count) /* set "pointer" to constants */ x64_mov_reg_membase(ins, X64_RSI, X64_RCX, 0xE0, 8); + + /* use XMM3 for sign change */ + x64_sse_movaps_reg_membase(ins, X64_XMM3, X64_RSI, 0); #else /* set loop counter */ x86_mov_reg_imm(ins, X86_ECX, loop_count); @@ -421,7 +424,7 @@ static FFTS_INLINE void generate_leaf_init(insns_t **fp, uint32_t loop_count) *fp = ins; } -static FFTS_INLINE void generate_leaf_ee(insns_t **fp, uint32_t *offsets) +static FFTS_INLINE void generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend) { #ifdef _M_X64 insns_t *leaf_ee_loop; @@ -434,39 +437,47 @@ static FFTS_INLINE void generate_leaf_ee(insns_t **fp, uint32_t *offsets) insns_t *ins = *fp; #ifdef _M_X64 - x64_sse_movaps_reg_membase(ins, X64_XMM0, X64_RSI, 32); - x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_RSI, 0); - + x64_sse_movaps_reg_membase_size(ins, X64_XMM0, X64_RSI, 32, 1); + /* beginning of the loop (make sure it's 16 byte aligned) */ leaf_ee_loop = ins; assert(!(((uintptr_t) leaf_ee_loop) & 0xF)); x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[0], X64_RAX, 2); x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[2], X64_RAX, 2); - x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7); + + x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0); + extend--; + x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[3], X64_RAX, 2); x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12); x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM10); x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM10); - + /* change sign */ - x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM8); - + x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM3); + x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[1], X64_RAX, 2); x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[4], X64_RAX, 2); x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM9); x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM9); x64_sse_movaps_reg_memindex(ins, X64_XMM13, X64_RDX, offsets[5], X64_RAX, 2); x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10); - x64_sse_movaps_reg_memindex(ins, X64_XMM3, X64_RDX, offsets[6], X64_RAX, 2); - x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM6); + x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[6], X64_RAX, 2); + + x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0); + extend--; + x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[7], X64_RAX, 2); - x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM3); + x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM8); x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1); - x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM7); + + x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0); + extend--; + x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2); x64_sse_subps_reg_reg(ins, X64_XMM10, X64_XMM13); - x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM14); + x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM14); x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM11); x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM11); x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM12); @@ -475,60 +486,74 @@ static FFTS_INLINE void generate_leaf_ee(insns_t **fp, uint32_t *offsets) x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14); x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_RSI, 16); x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9); + x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12); - /* TODO?? */ - x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_RSI, 16); + x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0); + extend--; - x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM5); x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10); x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM15); x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15); - x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM3); - x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM1); - x64_sse_subps_reg_reg(ins, X64_XMM5, X64_XMM1); + x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM8); + + x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0); + extend--; + + x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0); + extend--; + x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1); - + /* change sign */ - x64_sse_xorps_reg_reg(ins, X64_XMM9, X64_XMM8); - - x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM3, 0xB1); - x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM6); + x64_sse_xorps_reg_reg(ins, X64_XMM9, X64_XMM3); + + x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1); + + x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0); + extend--; + x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0); x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM4); - x64_sse_mulps_reg_reg(ins, X64_XMM3, X64_XMM0); + x64_sse_mulps_reg_reg(ins, X64_XMM8, X64_XMM0); x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM10); - x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM3); - x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM12); + x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM8); + x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM12); x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM7); x64_sse_shufps_reg_reg_imm(ins, X64_XMM9, X64_XMM9, 0xB1); x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM11); - x64_sse_addps_reg_reg(ins, X64_XMM3, X64_XMM11); + x64_sse_addps_reg_reg(ins, X64_XMM8, X64_XMM11); x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM9); x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM9); - x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM3); - x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM3); + x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM8); + x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM8); /* change sign */ - x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM8); + x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM3); - x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM2); + x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2); x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1); x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM6); x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2); - x64_sse_movlhps_reg_reg(ins, X64_XMM3, X64_XMM4); + x64_sse_movlhps_reg_reg(ins, X64_XMM8, X64_XMM4); x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8); x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE); - x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM1); + + x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0); + extend--; + x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12); x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM12); x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7); x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE); - x64_sse_movaps_reg_reg(ins, X64_XMM7, X64_XMM5); + + x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0); + extend--; + x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13); x64_sse_movlhps_reg_reg(ins, X64_XMM9, X64_XMM14); x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM13, 0xEE); x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM14, 0xEE); - x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM3); + x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM8); x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM4); x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM7); x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM9); @@ -578,7 +603,6 @@ static FFTS_INLINE void generate_leaf_eo(insns_t **fp, uint32_t *offsets) x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM4); x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM4); x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM5); - x64_sse_movaps_reg_membase(ins, X64_XMM3, X64_RSI, 0); x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM11); /* change sign */ @@ -627,12 +651,9 @@ static FFTS_INLINE void generate_leaf_eo(insns_t **fp, uint32_t *offsets) x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_RSI, 48); x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8); x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM15); + x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM11); x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM7); x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM15); - - /* TODO? */ - x64_sse_movaps_reg_membase(ins, X64_XMM9, X64_RSI, 48); - x64_sse_movaps_reg_membase(ins, X64_XMM15, X64_RSI, 64); x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1); x64_sse_mulps_reg_reg(ins, X64_XMM9, X64_XMM8); @@ -649,9 +670,9 @@ static FFTS_INLINE void generate_leaf_eo(insns_t **fp, uint32_t *offsets) /* change sign */ x64_sse_xorps_reg_reg(ins, X64_XMM11, X64_XMM3); - x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM2); + x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM2); x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1); - x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM10); + x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM10); x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM10); x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM11); x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM11); @@ -660,7 +681,7 @@ static FFTS_INLINE void generate_leaf_eo(insns_t **fp, uint32_t *offsets) x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM6); x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM2); x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM1); - x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM3); + x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM0); x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM12); #else /* copy function */ @@ -690,7 +711,6 @@ static FFTS_INLINE void generate_leaf_oe(insns_t **fp, uint32_t *offsets) insns_t *ins = *fp; #ifdef _M_X64 - x64_sse_movaps_reg_membase(ins, X64_XMM0, X64_RSI, 0); x64_sse_movaps_reg_memindex(ins, X64_XMM6, X64_RDX, offsets[2], X64_RAX, 2); x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2); x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM6); @@ -710,7 +730,7 @@ static FFTS_INLINE void generate_leaf_oe(insns_t **fp, uint32_t *offsets) x64_sse_shufps_reg_reg_imm(ins, X64_XMM4, X64_XMM10, 0xEE); /* change sign */ - x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM0); + x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM3); x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1); x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12); @@ -725,66 +745,63 @@ static FFTS_INLINE void generate_leaf_oe(insns_t **fp, uint32_t *offsets) x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM13); x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_RSI, 48); x64_sse_movlhps_reg_reg(ins, X64_XMM14, X64_XMM12); + x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM13); x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_RSI, 64); x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM5); x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM5, 0xB1); x64_sse_mulps_reg_reg(ins, X64_XMM5, X64_XMM12); x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM14); x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM5); - - /* TODO? */ - x64_sse_movaps_reg_membase(ins, X64_XMM5, X64_RSI, 48); - - x64_sse_mulps_reg_reg(ins, X64_XMM5, X64_XMM4); + x64_sse_mulps_reg_reg(ins, X64_XMM1, X64_XMM4); x64_sse_shufps_reg_reg_imm(ins, X64_XMM4, X64_XMM4, 0xB1); x64_sse_mulps_reg_reg(ins, X64_XMM4, X64_XMM12); x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[4], X64_RAX, 2); - x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM4); + x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM4); x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[6], X64_RAX, 2); - x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM9); + x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM9); x64_sse_movaps_reg_memindex(ins, X64_XMM2, X64_RDX, offsets[7], X64_RAX, 2); x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7); x64_sse_movaps_reg_memindex(ins, X64_XMM15, X64_RDX, offsets[5], X64_RAX, 2); x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM13); x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM2); - x64_sse_addps_reg_reg(ins, X64_XMM3, X64_XMM15); + x64_sse_addps_reg_reg(ins, X64_XMM0, X64_XMM15); x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM15); x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM2); - x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM5); - x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM5); + x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM1); + x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM1); /* change sign */ - x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM0); + x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM3); x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8); - x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM3); + x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM0); x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1); x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM9); /* change sign */ - x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM0); + x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM3); x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM6); x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM7); - x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM6); + x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM6); x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM7); x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM2); - x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM3); + x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM0); x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM8, 0xEE); - x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM9, 0xEE); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM9, 0xEE); x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM2); x64_sse_shufps_reg_reg_imm(ins, X64_XMM13, X64_XMM13, 0xB1); x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM4); x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM4); - x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM3); - x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM13); + x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM0); + x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM13); x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM13); x64_sse_movlhps_reg_reg(ins, X64_XMM10, X64_XMM8); x64_sse_movlhps_reg_reg(ins, X64_XMM11, X64_XMM9); x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM10); x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM11); x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM2); - x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM3); + x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM0); x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM14); x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM4); #else @@ -804,7 +821,7 @@ static FFTS_INLINE void generate_leaf_oe(insns_t **fp, uint32_t *offsets) *fp = ins; } -static FFTS_INLINE void generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets) +static FFTS_INLINE void generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int extend) { #ifdef _M_X64 insns_t *leaf_oo_loop; @@ -819,26 +836,37 @@ static FFTS_INLINE void generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint #ifdef _M_X64 /* align loop/jump destination */ x86_mov_reg_imm(ins, X86_EBX, loop_count); - ffts_align_mem16(&ins, 3); - x64_sse_movaps_reg_membase(ins, X64_XMM5, X64_RSI, 0); - /* beginning of the loop (make sure it's 16 byte aligned) */ leaf_oo_loop = ins; assert(!(((uintptr_t) leaf_oo_loop) & 0xF)); x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[0], X64_RAX, 2); - x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM4); + + x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0); + extend--; + x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[1], X64_RAX, 2); x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[2], X64_RAX, 2); - x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM7); - x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM7); + + x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0); + extend--; + + x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0); + extend--; + x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2); x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10); x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RDX, offsets[4], X64_RAX, 2); - x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM6); + + x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0); + extend--; + x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RDX, offsets[5], X64_RAX, 2); - x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM1); + + x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0); + extend--; + x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[6], X64_RAX, 2); x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM4); x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[7], X64_RAX, 2); @@ -850,19 +878,23 @@ static FFTS_INLINE void generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM12); x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM11); x64_sse_addps_reg_reg(ins, X64_XMM13, X64_XMM12); - x64_sse_addps_reg_reg(ins, X64_XMM3, X64_XMM9); - x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM9); - + /* change sign */ - x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM5); + x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM3); + + x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM9); + x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM9); /* change sign */ - x64_sse_xorps_reg_reg(ins, X64_XMM14, X64_XMM5); + x64_sse_xorps_reg_reg(ins, X64_XMM14, X64_XMM3); x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1); x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2); x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1); - x64_sse_movaps_reg_reg(ins, X64_XMM7, X64_XMM6); + + x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0); + extend--; + x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2); x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8); x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM10); @@ -871,23 +903,26 @@ static FFTS_INLINE void generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint x64_sse_subps_reg_reg(ins, X64_XMM15, X64_XMM10); x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1); x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2); - x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM4); + + x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0); + extend--; + x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14); x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM14); x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM4, 0xEE); - x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM3); + x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM5); x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM9); x64_sse_movlhps_reg_reg(ins, X64_XMM14, X64_XMM15); x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM13); x64_sse_movlhps_reg_reg(ins, X64_XMM8, X64_XMM1); - x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM15, 0xEE); + x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM15, 0xEE); x64_sse_shufps_reg_reg_imm(ins, X64_XMM9, X64_XMM13, 0xEE); x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM1, 0xEE); x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM14); x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM7); x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM4); x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM8); - x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM3); + x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM5); x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM6); x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM9); x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM2); |