summaryrefslogtreecommitdiffstats
path: root/src/codegen_sse.h
diff options
context:
space:
mode:
authorJukka Ojanen <jukka.ojanen@linkotec.net>2014-11-17 12:15:41 +0200
committerJukka Ojanen <jukka.ojanen@linkotec.net>2014-11-17 12:15:41 +0200
commitd77be00aaadd7772b364c10bff41a38e0112f59d (patch)
treea0d80bcb62d2e51a41742543512d809d3e4dc416 /src/codegen_sse.h
parent3a44c5ebd57ea10d69b883b212ebd7a773fe7a43 (diff)
downloadffts-d77be00aaadd7772b364c10bff41a38e0112f59d.zip
ffts-d77be00aaadd7772b364c10bff41a38e0112f59d.tar.gz
Don't use long NOPs, instead add extra prefix to extend op codes to align branch targets
Diffstat (limited to 'src/codegen_sse.h')
-rw-r--r--src/codegen_sse.h203
1 files changed, 119 insertions, 84 deletions
diff --git a/src/codegen_sse.h b/src/codegen_sse.h
index 739335f..558a015 100644
--- a/src/codegen_sse.h
+++ b/src/codegen_sse.h
@@ -404,6 +404,9 @@ static FFTS_INLINE void generate_leaf_init(insns_t **fp, uint32_t loop_count)
/* set "pointer" to constants */
x64_mov_reg_membase(ins, X64_RSI, X64_RCX, 0xE0, 8);
+
+ /* use XMM3 for sign change */
+ x64_sse_movaps_reg_membase(ins, X64_XMM3, X64_RSI, 0);
#else
/* set loop counter */
x86_mov_reg_imm(ins, X86_ECX, loop_count);
@@ -421,7 +424,7 @@ static FFTS_INLINE void generate_leaf_init(insns_t **fp, uint32_t loop_count)
*fp = ins;
}
-static FFTS_INLINE void generate_leaf_ee(insns_t **fp, uint32_t *offsets)
+static FFTS_INLINE void generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
{
#ifdef _M_X64
insns_t *leaf_ee_loop;
@@ -434,39 +437,47 @@ static FFTS_INLINE void generate_leaf_ee(insns_t **fp, uint32_t *offsets)
insns_t *ins = *fp;
#ifdef _M_X64
- x64_sse_movaps_reg_membase(ins, X64_XMM0, X64_RSI, 32);
- x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_RSI, 0);
-
+ x64_sse_movaps_reg_membase_size(ins, X64_XMM0, X64_RSI, 32, 1);
+
/* beginning of the loop (make sure it's 16 byte aligned) */
leaf_ee_loop = ins;
assert(!(((uintptr_t) leaf_ee_loop) & 0xF));
x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[0], X64_RAX, 2);
x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[2], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+ extend--;
+
x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[3], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM10);
x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM10);
-
+
/* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM8);
-
+ x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM3);
+
x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[1], X64_RAX, 2);
x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[4], X64_RAX, 2);
x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM9);
x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM9);
x64_sse_movaps_reg_memindex(ins, X64_XMM13, X64_RDX, offsets[5], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
- x64_sse_movaps_reg_memindex(ins, X64_XMM3, X64_RDX, offsets[6], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM6);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[6], X64_RAX, 2);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
+ extend--;
+
x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[7], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM3);
+ x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM8);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
- x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM7);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+ extend--;
+
x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2);
x64_sse_subps_reg_reg(ins, X64_XMM10, X64_XMM13);
- x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM14);
+ x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM14);
x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM11);
x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM11);
x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM12);
@@ -475,60 +486,74 @@ static FFTS_INLINE void generate_leaf_ee(insns_t **fp, uint32_t *offsets)
x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14);
x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_RSI, 16);
x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
- /* TODO?? */
- x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_RSI, 16);
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0);
+ extend--;
- x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM5);
x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10);
x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM15);
x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
- x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM3);
- x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM1);
- x64_sse_subps_reg_reg(ins, X64_XMM5, X64_XMM1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM8);
+
+ x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+ extend--;
+
+ x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0);
+ extend--;
+
x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
-
+
/* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM9, X64_XMM8);
-
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM3, 0xB1);
- x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM6);
+ x64_sse_xorps_reg_reg(ins, X64_XMM9, X64_XMM3);
+
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0);
+ extend--;
+
x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0);
x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM4);
- x64_sse_mulps_reg_reg(ins, X64_XMM3, X64_XMM0);
+ x64_sse_mulps_reg_reg(ins, X64_XMM8, X64_XMM0);
x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM10);
- x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM3);
- x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM12);
+ x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM8);
+ x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM12);
x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM7);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM9, X64_XMM9, 0xB1);
x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM11);
- x64_sse_addps_reg_reg(ins, X64_XMM3, X64_XMM11);
+ x64_sse_addps_reg_reg(ins, X64_XMM8, X64_XMM11);
x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM9);
x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM9);
- x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM3);
- x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM3);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM8);
+ x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM8);
/* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM8);
+ x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM3);
- x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM6);
x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2);
- x64_sse_movlhps_reg_reg(ins, X64_XMM3, X64_XMM4);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM8, X64_XMM4);
x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE);
- x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM1);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0);
+ extend--;
+
x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12);
x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM12);
x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE);
- x64_sse_movaps_reg_reg(ins, X64_XMM7, X64_XMM5);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0);
+ extend--;
+
x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13);
x64_sse_movlhps_reg_reg(ins, X64_XMM9, X64_XMM14);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM13, 0xEE);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM14, 0xEE);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM3);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM8);
x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM4);
x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM7);
x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM9);
@@ -578,7 +603,6 @@ static FFTS_INLINE void generate_leaf_eo(insns_t **fp, uint32_t *offsets)
x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM4);
x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM4);
x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM5);
- x64_sse_movaps_reg_membase(ins, X64_XMM3, X64_RSI, 0);
x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM11);
/* change sign */
@@ -627,12 +651,9 @@ static FFTS_INLINE void generate_leaf_eo(insns_t **fp, uint32_t *offsets)
x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_RSI, 48);
x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM15);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM11);
x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM7);
x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM15);
-
- /* TODO? */
- x64_sse_movaps_reg_membase(ins, X64_XMM9, X64_RSI, 48);
-
x64_sse_movaps_reg_membase(ins, X64_XMM15, X64_RSI, 64);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
x64_sse_mulps_reg_reg(ins, X64_XMM9, X64_XMM8);
@@ -649,9 +670,9 @@ static FFTS_INLINE void generate_leaf_eo(insns_t **fp, uint32_t *offsets)
/* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM11, X64_XMM3);
- x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1);
- x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM10);
+ x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM10);
x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM10);
x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM11);
x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM11);
@@ -660,7 +681,7 @@ static FFTS_INLINE void generate_leaf_eo(insns_t **fp, uint32_t *offsets)
x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM6);
x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM2);
x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM1);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM3);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM0);
x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM12);
#else
/* copy function */
@@ -690,7 +711,6 @@ static FFTS_INLINE void generate_leaf_oe(insns_t **fp, uint32_t *offsets)
insns_t *ins = *fp;
#ifdef _M_X64
- x64_sse_movaps_reg_membase(ins, X64_XMM0, X64_RSI, 0);
x64_sse_movaps_reg_memindex(ins, X64_XMM6, X64_RDX, offsets[2], X64_RAX, 2);
x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM6);
@@ -710,7 +730,7 @@ static FFTS_INLINE void generate_leaf_oe(insns_t **fp, uint32_t *offsets)
x64_sse_shufps_reg_reg_imm(ins, X64_XMM4, X64_XMM10, 0xEE);
/* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM0);
+ x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM3);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
@@ -725,66 +745,63 @@ static FFTS_INLINE void generate_leaf_oe(insns_t **fp, uint32_t *offsets)
x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM13);
x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_RSI, 48);
x64_sse_movlhps_reg_reg(ins, X64_XMM14, X64_XMM12);
+ x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM13);
x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_RSI, 64);
x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM5);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM5, 0xB1);
x64_sse_mulps_reg_reg(ins, X64_XMM5, X64_XMM12);
x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM14);
x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM5);
-
- /* TODO? */
- x64_sse_movaps_reg_membase(ins, X64_XMM5, X64_RSI, 48);
-
- x64_sse_mulps_reg_reg(ins, X64_XMM5, X64_XMM4);
+ x64_sse_mulps_reg_reg(ins, X64_XMM1, X64_XMM4);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM4, X64_XMM4, 0xB1);
x64_sse_mulps_reg_reg(ins, X64_XMM4, X64_XMM12);
x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[4], X64_RAX, 2);
- x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM4);
+ x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM4);
x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[6], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM9);
+ x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM9);
x64_sse_movaps_reg_memindex(ins, X64_XMM2, X64_RDX, offsets[7], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7);
x64_sse_movaps_reg_memindex(ins, X64_XMM15, X64_RDX, offsets[5], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM13);
x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM2);
- x64_sse_addps_reg_reg(ins, X64_XMM3, X64_XMM15);
+ x64_sse_addps_reg_reg(ins, X64_XMM0, X64_XMM15);
x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM15);
x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM2);
- x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM5);
- x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM5);
+ x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM1);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM1);
/* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM0);
+ x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM3);
x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
- x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM3);
+ x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM0);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM9);
/* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM0);
+ x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM3);
x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM6);
x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM7);
- x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM6);
+ x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM6);
x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM7);
x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM2);
- x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM3);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM0);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM8, 0xEE);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM9, 0xEE);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM9, 0xEE);
x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM13, X64_XMM13, 0xB1);
x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM4);
x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM4);
- x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM3);
- x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM13);
+ x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM0);
+ x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM13);
x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM13);
x64_sse_movlhps_reg_reg(ins, X64_XMM10, X64_XMM8);
x64_sse_movlhps_reg_reg(ins, X64_XMM11, X64_XMM9);
x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM10);
x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM11);
x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM2);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM3);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM0);
x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM14);
x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM4);
#else
@@ -804,7 +821,7 @@ static FFTS_INLINE void generate_leaf_oe(insns_t **fp, uint32_t *offsets)
*fp = ins;
}
-static FFTS_INLINE void generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets)
+static FFTS_INLINE void generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int extend)
{
#ifdef _M_X64
insns_t *leaf_oo_loop;
@@ -819,26 +836,37 @@ static FFTS_INLINE void generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint
#ifdef _M_X64
/* align loop/jump destination */
x86_mov_reg_imm(ins, X86_EBX, loop_count);
- ffts_align_mem16(&ins, 3);
- x64_sse_movaps_reg_membase(ins, X64_XMM5, X64_RSI, 0);
-
/* beginning of the loop (make sure it's 16 byte aligned) */
leaf_oo_loop = ins;
assert(!(((uintptr_t) leaf_oo_loop) & 0xF));
x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[0], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM4);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0);
+ extend--;
+
x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[1], X64_RAX, 2);
x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[2], X64_RAX, 2);
- x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM7);
- x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM7);
+
+ x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+ extend--;
+
+ x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+ extend--;
+
x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RDX, offsets[4], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM6);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
+ extend--;
+
x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RDX, offsets[5], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM1);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+ extend--;
+
x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[6], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM4);
x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[7], X64_RAX, 2);
@@ -850,19 +878,23 @@ static FFTS_INLINE void generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint
x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM12);
x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM11);
x64_sse_addps_reg_reg(ins, X64_XMM13, X64_XMM12);
- x64_sse_addps_reg_reg(ins, X64_XMM3, X64_XMM9);
- x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM9);
-
+
/* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM5);
+ x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM3);
+
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM9);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM9);
/* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM14, X64_XMM5);
+ x64_sse_xorps_reg_reg(ins, X64_XMM14, X64_XMM3);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
- x64_sse_movaps_reg_reg(ins, X64_XMM7, X64_XMM6);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0);
+ extend--;
+
x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2);
x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM10);
@@ -871,23 +903,26 @@ static FFTS_INLINE void generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint
x64_sse_subps_reg_reg(ins, X64_XMM15, X64_XMM10);
x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1);
x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);
- x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM4);
+
+ x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0);
+ extend--;
+
x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14);
x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM14);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM4, 0xEE);
- x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM3);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM5);
x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM9);
x64_sse_movlhps_reg_reg(ins, X64_XMM14, X64_XMM15);
x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM13);
x64_sse_movlhps_reg_reg(ins, X64_XMM8, X64_XMM1);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM15, 0xEE);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM15, 0xEE);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM9, X64_XMM13, 0xEE);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM1, 0xEE);
x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM14);
x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM7);
x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM4);
x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM8);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM3);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM5);
x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM6);
x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM9);
x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM2);
OpenPOWER on IntegriCloud