summary | refs | log | tree | commit | diff | stats
path: root/src/codegen_sse.h
diff options
context:
space:
mode:
author Jukka Ojanen <jukka.ojanen@linkotec.net> 2014-11-11 18:16:33 +0200
committer Jukka Ojanen <jukka.ojanen@linkotec.net> 2014-11-11 18:16:33 +0200
commit ff7ee6bfd6126b0ed3c273be60fb791671189bb9 (patch)
tree a900f9887a0503c69dee03192338e53943b380d6 /src/codegen_sse.h
parent 36e24f0144c8f44dc282642c962b4d7003e74909 (diff)
parent 332e68112344e53c31d0fd94bbe8d308d9292b16 (diff)
download ffts-ff7ee6bfd6126b0ed3c273be60fb791671189bb9.zip
ffts-ff7ee6bfd6126b0ed3c273be60fb791671189bb9.tar.gz
benchFFTS is computing the correct answer with these
Diffstat (limited to 'src/codegen_sse.h')
-rw-r--r-- src/codegen_sse.h 48
1 files changed, 24 insertions, 24 deletions
diff --git a/src/codegen_sse.h b/src/codegen_sse.h
index 20c0f00..7fdb3da 100644
--- a/src/codegen_sse.h
+++ b/src/codegen_sse.h
@@ -279,26 +279,26 @@ static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign)
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM7);
x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM4);
x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_R9, 16);
- x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM6);
- x64_sse_mulps_reg_reg(ins, X64_XMM1, X64_XMM4);
+ x64_sse_mulps_reg_reg(ins, X64_XMM6, X64_XMM0);
+ x64_sse_mulps_reg_reg(ins, X64_XMM4, X64_XMM1);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM0, 0xB1);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM1, 0xB1);
- x64_sse_mulps_reg_reg(ins, X64_XMM2, X64_XMM0);
- x64_sse_mulps_reg_reg(ins, X64_XMM1, X64_XMM2);
- x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM6);
- x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM4);
+ x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM2);
+ x64_sse_mulps_reg_reg(ins, X64_XMM2, X64_XMM1);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM0);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM2);
x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM6);
- x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM6);
- x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM5);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM4);
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM4);
x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_R8, 32);
x64_sse_xorps_reg_reg(ins, X64_XMM6, X64_XMM3);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM6, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM8);
x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_R8, 112);
- x64_sse_subps_reg_reg(ins, X64_XMM5, X64_XMM9);
- x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM7);
- x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM10);
- x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM8);
+ x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM5);
+ x64_sse_addps_reg_reg(ins, X64_XMM7, X64_XMM5);
+ x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM6);
+ x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM6);
x64_sse_movaps_membase_reg(ins, X64_R8, 0, X64_XMM7);
x64_sse_movaps_membase_reg(ins, X64_R8, 32, X64_XMM8);
x64_sse_movaps_membase_reg(ins, X64_R8, 64, X64_XMM9);
@@ -307,27 +307,27 @@ static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign)
x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_R8, 80);
x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM14);
x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_R9, 48);
- x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM0);
- x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM14);
+ x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM11);
+ x64_sse_mulps_reg_reg(ins, X64_XMM14, X64_XMM12);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
- x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM11);
- x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM13);
- x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM0);
- x64_sse_addps_reg_reg(ins, X64_XMM13, X64_XMM14);
+ x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM13);
+ x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM12);
+ x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM11);
+ x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM13);
x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM0);
- x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM0);
- x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM15);
+ x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM14);
+ x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14);
x64_sse_xorps_reg_reg(ins, X64_XMM0, X64_XMM3);
x64_sse_movaps_reg_membase(ins, X64_XMM1, X64_R8, 16);
x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_R8, 48);
x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM1);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM0, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM2);
- x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM1);
- x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM2);
- x64_sse_subps_reg_reg(ins, X64_XMM15, X64_XMM4);
- x64_sse_addps_reg_reg(ins, X64_XMM0, X64_XMM5);
+ x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
+ x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM0);
+ x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM15);
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM0);
x64_sse_movaps_membase_reg(ins, X64_R8, 16, X64_XMM1);
x64_sse_movaps_membase_reg(ins, X64_R8, 48, X64_XMM2);
x64_sse_movaps_membase_reg(ins, X64_R8, 80, X64_XMM4);
OpenPOWER on IntegriCloud