diff options
-rw-r--r-- | src/codegen_sse.h | 48 |
1 files changed, 24 insertions, 24 deletions
diff --git a/src/codegen_sse.h b/src/codegen_sse.h index fcab9f3..6cf33bd 100644 --- a/src/codegen_sse.h +++ b/src/codegen_sse.h @@ -278,26 +278,26 @@ static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign) x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM7); x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM4); x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_RDI, 16); - x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM6); - x64_sse_mulps_reg_reg(ins, X64_XMM1, X64_XMM4); + x64_sse_mulps_reg_reg(ins, X64_XMM6, X64_XMM0); + x64_sse_mulps_reg_reg(ins, X64_XMM4, X64_XMM1); x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM0, 0xB1); x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM1, 0xB1); - x64_sse_mulps_reg_reg(ins, X64_XMM2, X64_XMM0); - x64_sse_mulps_reg_reg(ins, X64_XMM1, X64_XMM2); - x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM6); - x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM4); + x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM2); + x64_sse_mulps_reg_reg(ins, X64_XMM2, X64_XMM1); + x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM0); + x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM2); x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM6); - x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM6); - x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM5); + x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM4); + x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM4); x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_R8, 32); x64_sse_xorps_reg_reg(ins, X64_XMM6, X64_XMM3); x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM6, 0xB1); x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM8); x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_R8, 112); - x64_sse_subps_reg_reg(ins, X64_XMM5, X64_XMM9); - x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM7); - x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM10); - x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM8); + x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM5); + x64_sse_addps_reg_reg(ins, X64_XMM7, X64_XMM5); + x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM6); + x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM6); x64_sse_movaps_membase_reg(ins, X64_R8, 0, X64_XMM7); x64_sse_movaps_membase_reg(ins, X64_R8, 32, X64_XMM8); x64_sse_movaps_membase_reg(ins, X64_R8, 64, X64_XMM9); @@ -306,27 +306,27 @@ static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign) x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_R8, 80); x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM14); x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_RDI, 48); - x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM0); - x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM14); + x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM11); + x64_sse_mulps_reg_reg(ins, X64_XMM14, X64_XMM12); x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1); x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1); - x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM11); - x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM13); - x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM0); - x64_sse_addps_reg_reg(ins, X64_XMM13, X64_XMM14); + x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM13); + x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM12); + x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM11); + x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM13); x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM0); - x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM0); - x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM15); + x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM14); + x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14); x64_sse_xorps_reg_reg(ins, X64_XMM0, X64_XMM3); x64_sse_movaps_reg_membase(ins, X64_XMM1, X64_R8, 16); x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_R8, 48); x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM1); x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM0, 0xB1); x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM2); - x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM1); - x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM2); - x64_sse_subps_reg_reg(ins, X64_XMM15, X64_XMM4); - x64_sse_addps_reg_reg(ins, X64_XMM0, X64_XMM5); + x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15); + x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM0); + x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM15); + x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM0); x64_sse_movaps_membase_reg(ins, X64_R8, 16, X64_XMM1); x64_sse_movaps_membase_reg(ins, X64_R8, 48, X64_XMM2); x64_sse_movaps_membase_reg(ins, X64_R8, 80, X64_XMM4); |