summary refs log tree commit diff stats
path: root/src/codegen_sse.h
diff options
context:
space:
mode:
author Jukka Ojanen <jukka.ojanen@linkotec.net> 2014-11-16 14:05:24 +0200
committer Jukka Ojanen <jukka.ojanen@linkotec.net> 2014-11-16 14:05:24 +0200
commit 804a00300bc62594714ffba61bd1f87fb2bc28f6 (patch)
tree c968f7a11b538f2da99de9b114c70a9ca633504e /src/codegen_sse.h
parent da5fcc7eafbcca83c447efe57838c669d399b45a (diff)
download ffts-804a00300bc62594714ffba61bd1f87fb2bc28f6.zip
ffts-804a00300bc62594714ffba61bd1f87fb2bc28f6.tar.gz
Add some comments to macro assembly
Diffstat (limited to 'src/codegen_sse.h')
-rw-r--r-- src/codegen_sse.h 49
1 file changed, 47 insertions, 2 deletions
diff --git a/src/codegen_sse.h b/src/codegen_sse.h
index 0191b1d..8a03ae4 100644
--- a/src/codegen_sse.h
+++ b/src/codegen_sse.h
@@ -292,7 +292,10 @@ static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign)
x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM4);
x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM4);
x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_R8, 32);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM6, X64_XMM3);
+
x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM6, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM8);
x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_R8, 112);
@@ -319,7 +322,10 @@ static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign)
x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM0);
x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM14);
x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM0, X64_XMM3);
+
x64_sse_movaps_reg_membase(ins, X64_XMM1, X64_R8, 16);
x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_R8, 48);
x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM1);
@@ -413,7 +419,10 @@ static FFTS_INLINE void generate_leaf_ee(insns_t **fp, uint32_t *offsets)
x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM10);
x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM10);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM8);
+
x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[1], X64_RAX, 2);
x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[4], X64_RAX, 2);
x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM9);
@@ -449,7 +458,10 @@ static FFTS_INLINE void generate_leaf_ee(insns_t **fp, uint32_t *offsets)
x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM1);
x64_sse_subps_reg_reg(ins, X64_XMM5, X64_XMM1);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM9, X64_XMM8);
+
x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM3, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM6);
x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0);
@@ -466,7 +478,10 @@ static FFTS_INLINE void generate_leaf_ee(insns_t **fp, uint32_t *offsets)
x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM9);
x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM3);
x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM3);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM8);
+
x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM6);
@@ -536,7 +551,10 @@ static FFTS_INLINE void generate_leaf_eo(insns_t **fp, uint32_t *offsets)
x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM5);
x64_sse_movaps_reg_membase(ins, X64_XMM3, X64_RSI, 0);
x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM11);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM3);
+
x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM9);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM6);
@@ -565,7 +583,10 @@ static FFTS_INLINE void generate_leaf_eo(insns_t **fp, uint32_t *offsets)
x64_sse_movlhps_reg_reg(ins, X64_XMM2, X64_XMM8);
x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM5);
x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM15);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM15, X64_XMM3);
+
x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM5);
x64_sse_subps_reg_reg(ins, X64_XMM5, X64_XMM14);
x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM14);
@@ -595,7 +616,10 @@ static FFTS_INLINE void generate_leaf_eo(insns_t **fp, uint32_t *offsets)
x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM14, 0xEE);
x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM9);
x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM9);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM11, X64_XMM3);
+
x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1);
x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM10);
@@ -603,8 +627,8 @@ static FFTS_INLINE void generate_leaf_eo(insns_t **fp, uint32_t *offsets)
x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM11);
x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM11);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM4, 0xEE);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM5);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM6);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM5);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM6);
x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM2);
x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM1);
x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM3);
@@ -655,7 +679,10 @@ static FFTS_INLINE void generate_leaf_oe(insns_t **fp, uint32_t *offsets)
x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM9);
x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM14);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM4, X64_XMM10, 0xEE);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM0);
+
x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM14);
@@ -696,12 +723,18 @@ static FFTS_INLINE void generate_leaf_oe(insns_t **fp, uint32_t *offsets)
x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM2);
x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM5);
x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM5);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM0);
+
x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM3);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM9);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM0);
+
x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM6);
x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM7);
x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM6);
@@ -790,8 +823,13 @@ static FFTS_INLINE void generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint
x64_sse_addps_reg_reg(ins, X64_XMM13, X64_XMM12);
x64_sse_addps_reg_reg(ins, X64_XMM3, X64_XMM9);
x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM9);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM5);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM14, X64_XMM5);
+
x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
@@ -944,6 +982,7 @@ static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign)
/* load [output + 6 * output_stride] */
x64_sse_movaps_reg_memindex(ins, X64_XMM13, X64_RCX, 0, X64_RSI, 1);
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM11, X64_XMM3);
/* load [input + 3 * input_stride] */
@@ -988,7 +1027,10 @@ static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign)
x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM12);
x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM12);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM6, X64_XMM3);
+
x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM12, X64_XMM2);
x64_sse_mulps_reg_reg(ins, X64_XMM7, X64_XMM9);
@@ -1011,7 +1053,10 @@ static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign)
x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM1);
x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM10);
x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM10);
+
+ /* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM3);
+
x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM11);
x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM11);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM13, X64_XMM13, 0xB1);
OpenPOWER on IntegriCloud