summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorJukka Ojanen <jukka.ojanen@linkotec.net>2015-03-18 14:13:26 +0200
committerJukka Ojanen <jukka.ojanen@linkotec.net>2015-03-18 14:13:26 +0200
commitdeb54fd909ce5dcb2a74c33ffa05ee54500a5aa1 (patch)
tree23ff2379615e2bc814a18a1702c2df3843b90108 /src
parent8dc312e88784ef67419f16bfb86defb7f6cc71c1 (diff)
downloadffts-deb54fd909ce5dcb2a74c33ffa05ee54500a5aa1.zip
ffts-deb54fd909ce5dcb2a74c33ffa05ee54500a5aa1.tar.gz
Always run-time generate x64 dynamic code
Diffstat (limited to 'src')
-rw-r--r--src/codegen_sse.h2176
1 files changed, 1335 insertions, 841 deletions
diff --git a/src/codegen_sse.h b/src/codegen_sse.h
index 558a015..c518481 100644
--- a/src/codegen_sse.h
+++ b/src/codegen_sse.h
@@ -1,33 +1,33 @@
/*
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato
-
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the organization nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
@@ -40,66 +40,36 @@
#include <assert.h>
#include <string.h>
-#ifdef SSE_DEFINE_CONSTANTS
static const FFTS_ALIGN(16) unsigned int sse_constants[20] = {
- /* 0.0, -0.0, 0.0, -0.0 */
- 0x00000000, 0x80000000, 0x00000000, 0x80000000,
- /* 0.707, 0.707, 0.707, 0.707 */
- 0x3f3504f3, 0x3f3504f3, 0x3f3504f3, 0x3f3504f3,
- /* -0.707, 0.707, -0.707, 0.707 */
- 0xbf3504f3, 0x3f3504f3, 0xbf3504f3, 0x3f3504f3,
- /* 1.0, 1.0, 0.707, 0.707 */
- 0x3f800000, 0x3f800000, 0x3f3504f3, 0x3f3504f3,
- /* 0.0, 0.0, -.707, 0.707 */
- 0x00000000, 0x00000000, 0xbf3504f3, 0x3f3504f3
+ /* 0.0, -0.0, 0.0, -0.0 */
+ 0x00000000, 0x80000000, 0x00000000, 0x80000000,
+ /* 0.707, 0.707, 0.707, 0.707 */
+ 0x3f3504f3, 0x3f3504f3, 0x3f3504f3, 0x3f3504f3,
+ /* -0.707, 0.707, -0.707, 0.707 */
+ 0xbf3504f3, 0x3f3504f3, 0xbf3504f3, 0x3f3504f3,
+ /* 1.0, 1.0, 0.707, 0.707 */
+ 0x3f800000, 0x3f800000, 0x3f3504f3, 0x3f3504f3,
+ /* 0.0, 0.0, -.707, 0.707 */
+ 0x00000000, 0x00000000, 0xbf3504f3, 0x3f3504f3
};
static const FFTS_ALIGN(16) unsigned int sse_constants_inv[20] = {
- /* -0.0, 0.0, -0.0, 0.0 */
- 0x80000000, 0x00000000, 0x80000000, 0x00000000,
- /* 0.707, 0.707, 0.707, 0.707 */
+ /* -0.0, 0.0, -0.0, 0.0 */
+ 0x80000000, 0x00000000, 0x80000000, 0x00000000,
+ /* 0.707, 0.707, 0.707, 0.707 */
0x3f3504f3, 0x3f3504f3, 0x3f3504f3, 0x3f3504f3,
- /* 0.707, -0.707, 0.707, -0.707 */
+ /* 0.707, -0.707, 0.707, -0.707 */
0x3f3504f3, 0xbf3504f3, 0x3f3504f3, 0xbf3504f3,
- /* 1.0, 1.0, 0.707, 0.707 */
+ /* 1.0, 1.0, 0.707, 0.707 */
0x3f800000, 0x3f800000, 0x3f3504f3, 0x3f3504f3,
- /* 0.0, 0.0, 0.707, -0.707 */
+ /* 0.0, 0.0, 0.707, -0.707 */
0x00000000, 0x00000000, 0x3f3504f3, 0xbf3504f3
};
-#else
-extern void leaf_ee_init();
-extern void leaf_ee();
-extern void leaf_eo();
-extern void leaf_oe();
-extern void leaf_oo();
-extern void leaf_end();
-
-extern void sse_constants();
-extern void sse_constants_inv();
-
-extern const uint32_t sse_leaf_ee_offsets[8];
-extern const uint32_t sse_leaf_eo_offsets[8];
-extern const uint32_t sse_leaf_oe_offsets[8];
-extern const uint32_t sse_leaf_oo_offsets[8];
-
-extern void x_init();
-extern void x4();
-extern void x8_soft();
-extern void x8_soft_end();
-#endif
#define P(x) (*(*p)++ = x)
-static void IMM32_NI(uint8_t *p, int32_t imm)
-{
- int i;
-
- for (i = 0; i < 4; i++) {
- *(p+i) = (imm & (0xff << (8 * i))) >> (8 * i);
- }
-}
-
-static FFTS_INLINE void ffts_insert_nops(uint8_t **p, uint32_t count)
+static FFTS_INLINE void
+ffts_insert_nops(uint8_t **p, uint32_t count)
{
if (count >= 9) {
P(0x66);
@@ -175,36 +145,38 @@ static FFTS_INLINE void ffts_insert_nops(uint8_t **p, uint32_t count)
}
}
-static FFTS_INLINE void ffts_align_mem16(uint8_t **p, uint32_t offset)
+static FFTS_INLINE void
+ffts_align_mem16(uint8_t **p, uint32_t offset)
{
int r = (16 - (offset & 0xf)) - (int) ((uintptr_t)(*p) & 0xf);
r = (16 + r) & 0xf;
ffts_insert_nops(p, r);
}
-static FFTS_INLINE void generate_epilogue(insns_t **fp)
+static FFTS_INLINE void
+generate_epilogue(insns_t **fp)
{
#ifdef _M_X64
/* restore nonvolatile registers */
- x64_mov_reg_membase(*fp, X64_RBX, X64_RSP, -64, 8);
- x64_mov_reg_membase(*fp, X64_RSI, X64_RSP, -56, 8);
-
- x64_sse_movaps_reg_membase(*fp, X64_XMM6, X64_RSP, -48);
- x64_sse_movaps_reg_membase(*fp, X64_XMM7, X64_RSP, -32);
- x64_sse_movaps_reg_membase(*fp, X64_XMM8, X64_RSP, -16);
- x64_sse_movaps_reg_membase(*fp, X64_XMM9, X64_RSP, 0);
- x64_sse_movaps_reg_membase(*fp, X64_XMM10, X64_RSP, 16);
- x64_sse_movaps_reg_membase(*fp, X64_XMM11, X64_RSP, 32);
- x64_sse_movaps_reg_membase(*fp, X64_XMM12, X64_RSP, 48);
- x64_sse_movaps_reg_membase(*fp, X64_XMM13, X64_RSP, 64);
+ x64_mov_reg_membase(*fp, X64_RBX, X64_RSP, -64, 8);
+ x64_mov_reg_membase(*fp, X64_RSI, X64_RSP, -56, 8);
+
+ x64_sse_movaps_reg_membase(*fp, X64_XMM6, X64_RSP, -48);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM7, X64_RSP, -32);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM8, X64_RSP, -16);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM9, X64_RSP, 0);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM10, X64_RSP, 16);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM11, X64_RSP, 32);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM12, X64_RSP, 48);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM13, X64_RSP, 64);
/* restore the last 2 registers from the shadow space */
- x64_sse_movaps_reg_membase(*fp, X64_XMM14, X64_RSP, 96);
- x64_sse_movaps_reg_membase(*fp, X64_XMM15, X64_RSP, 112);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM14, X64_RSP, 96);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM15, X64_RSP, 112);
/* restore stack */
- x64_alu_reg_imm_size(*fp, X86_ADD, X64_RSP, 88, 8);
-#else
+ x64_alu_reg_imm_size(*fp, X86_ADD, X64_RSP, 88, 8);
+#else
x64_pop_reg(*fp, X64_R15);
x64_pop_reg(*fp, X64_R14);
x64_pop_reg(*fp, X64_R13);
@@ -218,12 +190,13 @@ static FFTS_INLINE void generate_epilogue(insns_t **fp)
x64_ret(*fp);
}
-static FFTS_INLINE insns_t* generate_prologue(insns_t **fp, ffts_plan_t *p)
+static FFTS_INLINE insns_t*
+generate_prologue(insns_t **fp, ffts_plan_t *p)
{
insns_t *start;
-
- /* unreferenced parameter */
- (void) p;
+
+ /* unreferenced parameter */
+ (void) p;
/* align call destination */
ffts_align_mem16(fp, 0);
@@ -232,740 +205,1151 @@ static FFTS_INLINE insns_t* generate_prologue(insns_t **fp, ffts_plan_t *p)
/* save nonvolatile registers */
#ifdef _M_X64
/* reserve space to save XMM6-XMM15 registers */
- x64_alu_reg_imm_size(*fp, X86_SUB, X64_RSP, 88, 8);
-
- x64_mov_membase_reg(*fp, X64_RSP, -64, X64_RBX, 8);
- x64_mov_membase_reg(*fp, X64_RSP, -56, X64_RSI, 8);
-
- x64_sse_movaps_membase_reg(*fp, X64_RSP, -48, X64_XMM6);
- x64_sse_movaps_membase_reg(*fp, X64_RSP, -32, X64_XMM7);
- x64_sse_movaps_membase_reg(*fp, X64_RSP, -16, X64_XMM8);
- x64_sse_movaps_membase_reg(*fp, X64_RSP, 0, X64_XMM9);
- x64_sse_movaps_membase_reg(*fp, X64_RSP, 16, X64_XMM10);
- x64_sse_movaps_membase_reg(*fp, X64_RSP, 32, X64_XMM11);
- x64_sse_movaps_membase_reg(*fp, X64_RSP, 48, X64_XMM12);
- x64_sse_movaps_membase_reg(*fp, X64_RSP, 64, X64_XMM13);
-
- /* use the shadow space to save last 2 registers */
- x64_sse_movaps_membase_reg(*fp, X64_RSP, 96, X64_XMM14);
- x64_sse_movaps_membase_reg(*fp, X64_RSP, 112, X64_XMM15);
+ x64_alu_reg_imm_size(*fp, X86_SUB, X64_RSP, 88, 8);
+
+ x64_mov_membase_reg(*fp, X64_RSP, -64, X64_RBX, 8);
+ x64_mov_membase_reg(*fp, X64_RSP, -56, X64_RSI, 8);
+
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, -48, X64_XMM6);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, -32, X64_XMM7);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, -16, X64_XMM8);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, 0, X64_XMM9);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, 16, X64_XMM10);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, 32, X64_XMM11);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, 48, X64_XMM12);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, 64, X64_XMM13);
+
+ /* use the shadow space to save last 2 registers */
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, 96, X64_XMM14);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, 112, X64_XMM15);
#else
- x64_push_reg(*fp, X64_RBP);
- x64_push_reg(*fp, X64_RBX);
- x64_push_reg(*fp, X64_R10);
- x64_push_reg(*fp, X64_R11);
- x64_push_reg(*fp, X64_R12);
- x64_push_reg(*fp, X64_R13);
- x64_push_reg(*fp, X64_R14);
- x64_push_reg(*fp, X64_R15);
+ x64_push_reg(*fp, X64_RBP);
+ x64_push_reg(*fp, X64_RBX);
+ x64_push_reg(*fp, X64_R10);
+ x64_push_reg(*fp, X64_R11);
+ x64_push_reg(*fp, X64_R12);
+ x64_push_reg(*fp, X64_R13);
+ x64_push_reg(*fp, X64_R14);
+ x64_push_reg(*fp, X64_R15);
#endif
return start;
}
-static FFTS_INLINE void generate_transform_init(insns_t **fp)
+static FFTS_INLINE void
+generate_transform_init(insns_t **fp)
{
#ifdef _M_X64
/* generate function */
- x64_sse_movaps_reg_membase(*fp, X64_XMM3, X64_RSI, 0);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM3, X64_RSI, 0);
/* set "pointer" to twiddle factors */
- x64_mov_reg_membase(*fp, X64_R9, X64_RCX, 0x20, 8);
+ x64_mov_reg_membase(*fp, X64_R9, X64_RCX, 0x20, 8);
#else
- size_t len;
+ /* generate function */
+ x64_sse_movaps_reg_membase(*fp, X64_XMM3, X64_R9, 0);
- /* copy function */
- assert((char*) x4 > (char*) x_init);
- len = (char*) x4 - (char*) x_init;
- memcpy(*fp, x_init, len);
- *fp += len;
+ /* set "pointer" to twiddle factors */
+ x64_mov_reg_membase(*fp, X64_R8, X64_RDI, 0x20, 8);
#endif
}
-static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign)
+static FFTS_INLINE insns_t*
+generate_size4_base_case(insns_t **fp, int sign)
{
- insns_t *ins;
+ insns_t *ins;
insns_t *x4_addr;
-#ifndef _M_X64
- size_t len;
-#endif
- /* unreferenced parameter */
- (void) sign;
+ /* unreferenced parameter */
+ (void) sign;
- /* to avoid deferring */
- ins = *fp;
+ /* to avoid deferring */
+ ins = *fp;
/* align call destination */
ffts_align_mem16(&ins, 0);
x4_addr = ins;
#ifdef _M_X64
- /* generate function */
- x64_sse_movaps_reg_membase(ins, X64_XMM0, X64_R8, 64);
- x64_sse_movaps_reg_membase(ins, X64_XMM1, X64_R8, 96);
- x64_sse_movaps_reg_membase(ins, X64_XMM7, X64_R8, 0);
- x64_sse_movaps_reg_membase(ins, X64_XMM4, X64_R9, 0);
- x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM7);
- x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM4);
- x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_R9, 16);
- x64_sse_mulps_reg_reg(ins, X64_XMM6, X64_XMM0);
- x64_sse_mulps_reg_reg(ins, X64_XMM4, X64_XMM1);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM0, 0xB1);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM1, 0xB1);
- x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM2);
- x64_sse_mulps_reg_reg(ins, X64_XMM2, X64_XMM1);
- x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM0);
- x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM2);
- x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM6);
- x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM4);
- x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM4);
- x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_R8, 32);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM6, X64_XMM3);
-
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM6, 0xB1);
- x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM8);
- x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_R8, 112);
- x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM5);
- x64_sse_addps_reg_reg(ins, X64_XMM7, X64_XMM5);
- x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM6);
- x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM6);
- x64_sse_movaps_membase_reg(ins, X64_R8, 0, X64_XMM7);
- x64_sse_movaps_membase_reg(ins, X64_R8, 32, X64_XMM8);
- x64_sse_movaps_membase_reg(ins, X64_R8, 64, X64_XMM9);
- x64_sse_movaps_membase_reg(ins, X64_R8, 96, X64_XMM10);
- x64_sse_movaps_reg_membase(ins, X64_XMM14, X64_R9, 32);
- x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_R8, 80);
- x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM14);
- x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_R9, 48);
- x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM11);
- x64_sse_mulps_reg_reg(ins, X64_XMM14, X64_XMM12);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
- x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM13);
- x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM12);
- x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM11);
- x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM13);
- x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM0);
- x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM14);
- x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM0, X64_XMM3);
-
- x64_sse_movaps_reg_membase(ins, X64_XMM1, X64_R8, 16);
- x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_R8, 48);
- x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM1);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM0, 0xB1);
- x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM2);
- x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
- x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM0);
- x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM15);
- x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM0);
- x64_sse_movaps_membase_reg(ins, X64_R8, 16, X64_XMM1);
- x64_sse_movaps_membase_reg(ins, X64_R8, 48, X64_XMM2);
- x64_sse_movaps_membase_reg(ins, X64_R8, 80, X64_XMM4);
- x64_sse_movaps_membase_reg(ins, X64_R8, 112, X64_XMM5);
- x64_ret(ins);
+ /* generate function */
+ x64_sse_movaps_reg_membase(ins, X64_XMM0, X64_R8, 64);
+ x64_sse_movaps_reg_membase(ins, X64_XMM1, X64_R8, 96);
+ x64_sse_movaps_reg_membase(ins, X64_XMM7, X64_R8, 0);
+ x64_sse_movaps_reg_membase(ins, X64_XMM4, X64_R9, 0);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM7);
+ x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM4);
+ x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_R9, 16);
+ x64_sse_mulps_reg_reg(ins, X64_XMM6, X64_XMM0);
+ x64_sse_mulps_reg_reg(ins, X64_XMM4, X64_XMM1);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM0, 0xB1);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM1, 0xB1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM2);
+ x64_sse_mulps_reg_reg(ins, X64_XMM2, X64_XMM1);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM0);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM6);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM4);
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM4);
+ x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_R8, 32);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM6, X64_XMM3);
+
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM6, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM8);
+ x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_R8, 112);
+ x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM5);
+ x64_sse_addps_reg_reg(ins, X64_XMM7, X64_XMM5);
+ x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM6);
+ x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM6);
+ x64_sse_movaps_membase_reg(ins, X64_R8, 0, X64_XMM7);
+ x64_sse_movaps_membase_reg(ins, X64_R8, 32, X64_XMM8);
+ x64_sse_movaps_membase_reg(ins, X64_R8, 64, X64_XMM9);
+ x64_sse_movaps_membase_reg(ins, X64_R8, 96, X64_XMM10);
+ x64_sse_movaps_reg_membase(ins, X64_XMM14, X64_R9, 32);
+ x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_R8, 80);
+ x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM14);
+ x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_R9, 48);
+ x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM11);
+ x64_sse_mulps_reg_reg(ins, X64_XMM14, X64_XMM12);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM13);
+ x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM12);
+ x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM11);
+ x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM13);
+ x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM0);
+ x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM14);
+ x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM0, X64_XMM3);
+
+ x64_sse_movaps_reg_membase(ins, X64_XMM1, X64_R8, 16);
+ x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_R8, 48);
+ x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM1);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM0, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM2);
+ x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
+ x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM0);
+ x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM15);
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM0);
+ x64_sse_movaps_membase_reg(ins, X64_R8, 16, X64_XMM1);
+ x64_sse_movaps_membase_reg(ins, X64_R8, 48, X64_XMM2);
+ x64_sse_movaps_membase_reg(ins, X64_R8, 80, X64_XMM4);
+ x64_sse_movaps_membase_reg(ins, X64_R8, 112, X64_XMM5);
+ x64_ret(ins);
#else
- /* copy function */
- assert((char*) x8_soft > (char*) x4);
- len = (char*) x8_soft - (char*) x4;
- memcpy(ins, x4, len);
- ins += len;
+ /* generate function */
+ x64_sse_movaps_reg_membase(ins, X64_XMM0, X64_RDX, 64);
+ x64_sse_movaps_reg_membase(ins, X64_XMM1, X64_RDX, 96);
+ x64_sse_movaps_reg_membase(ins, X64_XMM7, X64_RDX, 0);
+ x64_sse_movaps_reg_membase(ins, X64_XMM4, X64_R8, 0);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM7);
+ x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM4);
+ x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_R8, 16);
+ x64_sse_mulps_reg_reg(ins, X64_XMM6, X64_XMM0);
+ x64_sse_mulps_reg_reg(ins, X64_XMM4, X64_XMM1);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM0, 0xB1);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM1, 0xB1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM2);
+ x64_sse_mulps_reg_reg(ins, X64_XMM2, X64_XMM1);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM0);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM6);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM4);
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM4);
+ x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_RDX, 32);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM6, X64_XMM3);
+
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM6, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM8);
+ x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_RDX, 112);
+ x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM5);
+ x64_sse_addps_reg_reg(ins, X64_XMM7, X64_XMM5);
+ x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM6);
+ x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM6);
+ x64_sse_movaps_membase_reg(ins, X64_RDX, 0, X64_XMM7);
+ x64_sse_movaps_membase_reg(ins, X64_RDX, 32, X64_XMM8);
+ x64_sse_movaps_membase_reg(ins, X64_RDX, 64, X64_XMM9);
+ x64_sse_movaps_membase_reg(ins, X64_RDX, 96, X64_XMM10);
+ x64_sse_movaps_reg_membase(ins, X64_XMM14, X64_R8, 32);
+ x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_RDX, 80);
+ x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM14);
+ x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_R8, 48);
+ x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM11);
+ x64_sse_mulps_reg_reg(ins, X64_XMM14, X64_XMM12);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM13);
+ x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM12);
+ x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM11);
+ x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM13);
+ x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM0);
+ x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM14);
+ x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM0, X64_XMM3);
+
+ x64_sse_movaps_reg_membase(ins, X64_XMM1, X64_RDX, 16);
+ x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_RDX, 48);
+ x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM1);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM0, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM2);
+ x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
+ x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM0);
+ x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM15);
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM0);
+ x64_sse_movaps_membase_reg(ins, X64_RDX, 16, X64_XMM1);
+ x64_sse_movaps_membase_reg(ins, X64_RDX, 48, X64_XMM2);
+ x64_sse_movaps_membase_reg(ins, X64_RDX, 80, X64_XMM4);
+ x64_sse_movaps_membase_reg(ins, X64_RDX, 112, X64_XMM5);
+ x64_ret(ins);
#endif
- *fp = ins;
+ *fp = ins;
return x4_addr;
}
-static FFTS_INLINE void generate_leaf_init(insns_t **fp, uint32_t loop_count)
+static FFTS_INLINE void
+generate_leaf_init(insns_t **fp, uint32_t loop_count)
{
-#ifndef _M_X64
- size_t len;
-#endif
-
- /* to avoid deferring */
- insns_t *ins = *fp;
+ /* to avoid deferring */
+ insns_t *ins = *fp;
#ifdef _M_X64
- /* set loop counter */
- x86_mov_reg_imm(ins, X86_EBX, loop_count);
+ /* set loop counter */
+ x86_mov_reg_imm(ins, X86_EBX, loop_count);
- /* generate function */
+ /* generate function */
/* clear */
- x86_clear_reg(ins, X86_EAX);
+ x86_clear_reg(ins, X86_EAX);
/* set "pointer" to offsets */
- x64_mov_reg_membase(ins, X64_R9, X64_RCX, 0x0, 8);
+ x64_mov_reg_membase(ins, X64_R9, X64_RCX, 0x0, 8);
/* set "pointer" to constants */
- x64_mov_reg_membase(ins, X64_RSI, X64_RCX, 0xE0, 8);
-
- /* use XMM3 for sign change */
- x64_sse_movaps_reg_membase(ins, X64_XMM3, X64_RSI, 0);
+ x64_mov_reg_membase(ins, X64_RSI, X64_RCX, 0xE0, 8);
+
+ /* use XMM3 for sign change */
+ x64_sse_movaps_reg_membase(ins, X64_XMM3, X64_RSI, 0);
#else
- /* set loop counter */
- x86_mov_reg_imm(ins, X86_ECX, loop_count);
+ /* set loop counter */
+ x86_mov_reg_imm(ins, X86_ECX, loop_count);
- /* copy function */
- assert((char*) leaf_ee > (char*) leaf_ee_init);
- len = (char*) leaf_ee - (char*) leaf_ee_init;
- memcpy(ins, leaf_ee_init, (size_t) len);
- ins += len;
+ /* generate function */
- /* align loop/jump destination */
+ /* clear */
+ x86_clear_reg(ins, X86_EAX);
+
+ /* set "pointer" to offsets */
+ x64_mov_reg_membase(ins, X64_R8, X64_RDI, 0x0, 8);
+
+ /* set "pointer" to constants */
+ x64_mov_reg_membase(ins, X64_R9, X64_RDI, 0xE0, 8);
+
+ /* align loop/jump destination */
ffts_align_mem16(&ins, 9);
#endif
- *fp = ins;
+ *fp = ins;
}
-static FFTS_INLINE void generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
+static FFTS_INLINE void
+generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
{
-#ifdef _M_X64
insns_t *leaf_ee_loop;
-#else
- size_t len;
- int i;
-#endif
- /* to avoid deferring */
- insns_t *ins = *fp;
+ /* to avoid deferring */
+ insns_t *ins = *fp;
#ifdef _M_X64
- x64_sse_movaps_reg_membase_size(ins, X64_XMM0, X64_RSI, 32, 1);
+ x64_sse_movaps_reg_membase_size(ins, X64_XMM0, X64_RSI, 32, 1);
- /* beginning of the loop (make sure it's 16 byte aligned) */
+ /* beginning of the loop (make sure it's 16 byte aligned) */
leaf_ee_loop = ins;
assert(!(((uintptr_t) leaf_ee_loop) & 0xF));
- x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[0], X64_RAX, 2);
- x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[2], X64_RAX, 2);
-
- x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
- extend--;
-
- x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[3], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
- x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM10);
- x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM10);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM3);
-
- x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[1], X64_RAX, 2);
- x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[4], X64_RAX, 2);
- x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM9);
- x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM9);
- x64_sse_movaps_reg_memindex(ins, X64_XMM13, X64_RDX, offsets[5], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
- x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[6], X64_RAX, 2);
-
- x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
- extend--;
-
- x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[7], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM8);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
-
- x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
- extend--;
-
- x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2);
- x64_sse_subps_reg_reg(ins, X64_XMM10, X64_XMM13);
- x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM14);
- x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM11);
- x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM11);
- x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM12);
- x64_sse_addps_reg_reg(ins, X64_XMM7, X64_XMM12);
- x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM13);
- x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14);
- x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_RSI, 16);
- x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9);
- x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
-
- x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0);
- extend--;
-
- x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10);
- x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM15);
- x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
- x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM8);
-
- x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
- extend--;
-
- x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0);
- extend--;
-
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM9, X64_XMM3);
-
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1);
-
- x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0);
- extend--;
-
- x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0);
- x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM4);
- x64_sse_mulps_reg_reg(ins, X64_XMM8, X64_XMM0);
- x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM10);
- x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM8);
- x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM12);
- x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM7);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM9, X64_XMM9, 0xB1);
- x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM11);
- x64_sse_addps_reg_reg(ins, X64_XMM8, X64_XMM11);
- x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM9);
- x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM9);
- x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM8);
- x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM8);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM3);
-
- x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
- x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM6);
- x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2);
- x64_sse_movlhps_reg_reg(ins, X64_XMM8, X64_XMM4);
- x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE);
-
- x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0);
- extend--;
-
- x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12);
- x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM12);
- x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE);
-
- x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0);
- extend--;
-
- x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13);
- x64_sse_movlhps_reg_reg(ins, X64_XMM9, X64_XMM14);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM13, 0xEE);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM14, 0xEE);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM8);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM4);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM7);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM9);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM2);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM1);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM5);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM6);
-
- /* loop condition */
- x64_alu_reg_reg_size(ins, X86_CMP, X64_RBX, X64_RAX, 8);
- x64_branch_size(ins, X86_CC_NE, leaf_ee_loop, 0, 4);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[0], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[2], X64_RAX, 2);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+ extend--;
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[3], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
+ x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM10);
+ x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM10);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM3);
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[1], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[4], X64_RAX, 2);
+ x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM9);
+ x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM9);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM13, X64_RDX, offsets[5], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[6], X64_RAX, 2);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
+ extend--;
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[7], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM8);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+ extend--;
+
+ x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2);
+ x64_sse_subps_reg_reg(ins, X64_XMM10, X64_XMM13);
+ x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM14);
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM11);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM11);
+ x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM12);
+ x64_sse_addps_reg_reg(ins, X64_XMM7, X64_XMM12);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM13);
+ x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14);
+ x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_RSI, 16);
+ x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0);
+ extend--;
+
+ x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10);
+ x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM15);
+ x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
+ x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM8);
+
+ x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+ extend--;
+
+ x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0);
+ extend--;
+
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM9, X64_XMM3);
+
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0);
+ extend--;
+
+ x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0);
+ x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM4);
+ x64_sse_mulps_reg_reg(ins, X64_XMM8, X64_XMM0);
+ x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM10);
+ x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM8);
+ x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM12);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM7);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM9, X64_XMM9, 0xB1);
+ x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM11);
+ x64_sse_addps_reg_reg(ins, X64_XMM8, X64_XMM11);
+ x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM9);
+ x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM9);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM8);
+ x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM8);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM3);
+
+ x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM6);
+ x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM8, X64_XMM4);
+ x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0);
+ extend--;
+
+ x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12);
+ x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM12);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0);
+ extend--;
+
+ x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM9, X64_XMM14);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM13, 0xEE);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM14, 0xEE);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM8);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM4);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM7);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM9);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM2);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM1);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM5);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM6);
+
+ /* loop condition */
+ x64_alu_reg_reg_size(ins, X86_CMP, X64_RBX, X64_RAX, 8);
+ x64_branch_size(ins, X86_CC_NE, leaf_ee_loop, 0, 4);
#else
- /* copy function */
- assert((char*) leaf_oo > (char*) leaf_ee);
- len = (char*) leaf_oo - (char*) leaf_ee;
- memcpy(ins, leaf_ee, (size_t) len);
-
- /* patch offsets */
- for (i = 0; i < 8; i++) {
- IMM32_NI(ins + sse_leaf_ee_offsets[i], offsets[i]);
- }
+ x64_sse_movaps_reg_membase_size(ins, X64_XMM0, X64_R9, 32, 1);
- ins += len;
-#endif
+ /* use XMM8 for sign change */
+ x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_R9, 0);
- *fp = ins;
-}
+ /* beginning of the loop (make sure it's 16 byte aligned) */
+ leaf_ee_loop = ins;
+ assert(!(((uintptr_t) leaf_ee_loop) & 0xF));
-static FFTS_INLINE void generate_leaf_eo(insns_t **fp, uint32_t *offsets)
-{
-#ifndef _M_X64
- size_t len;
- int i;
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RSI, offsets[0], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RSI, offsets[2], X64_RAX, 2);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+ extend--;
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RSI, offsets[3], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
+ x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM10);
+ x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM10);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM8);
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RSI, offsets[1], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RSI, offsets[4], X64_RAX, 2);
+ x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM9);
+ x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM9);
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM13, X64_RSI, offsets[5], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM3, X64_RSI, offsets[6], X64_RAX, 2);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
+ extend--;
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RSI, offsets[7], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM3);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+ extend--;
+
+ x64_movsxd_reg_memindex(ins, X64_R11, X64_R8, 0, X64_RAX, 2);
+ x64_sse_subps_reg_reg(ins, X64_XMM10, X64_XMM13);
+ x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM14);
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM11);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM11);
+ x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM12);
+ x64_sse_addps_reg_reg(ins, X64_XMM7, X64_XMM12);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM13);
+ x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14);
+ x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_R9, 16);
+ x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0);
+ extend--;
+
+ x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10);
+ x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM15);
+ x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
+ x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM3);
+
+ x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+ extend--;
+
+ x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0);
+ extend--;
+
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM9, X64_XMM8);
+
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM3, 0xB1);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0);
+ extend--;
+
+ x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0);
+ x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM4);
+ x64_sse_mulps_reg_reg(ins, X64_XMM3, X64_XMM0);
+ x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM10);
+ x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM3);
+ x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM12);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM7);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM9, X64_XMM9, 0xB1);
+ x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM11);
+ x64_sse_addps_reg_reg(ins, X64_XMM3, X64_XMM11);
+ x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM9);
+ x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM9);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM3);
+ x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM3);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM12, X64_XMM8);
+
+ x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM2);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM6);
+ x64_movsxd_reg_memindex(ins, X64_R12, X64_R8, 8, X64_RAX, 2);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM3, X64_XMM4);
+ x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0);
+ extend--;
+
+ x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12);
+ x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM12);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0);
+ extend--;
+
+ x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM9, X64_XMM14);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM13, 0xEE);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM14, 0xEE);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 0, X64_R11, 2, X64_XMM3);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 16, X64_R11, 2, X64_XMM4);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 32, X64_R11, 2, X64_XMM7);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 48, X64_R11, 2, X64_XMM9);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 0, X64_R12, 2, X64_XMM2);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 16, X64_R12, 2, X64_XMM1);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 32, X64_R12, 2, X64_XMM5);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 48, X64_R12, 2, X64_XMM6);
+
+ /* loop condition */
+ x64_alu_reg_reg_size(ins, X86_CMP, X64_RCX, X64_RAX, 8);
+ x64_branch_size(ins, X86_CC_NE, leaf_ee_loop, 0, 4);
#endif
- /* to avoid deferring */
- insns_t *ins = *fp;
+ *fp = ins;
+}
+
+static FFTS_INLINE void
+generate_leaf_eo(insns_t **fp, uint32_t *offsets)
+{
+ /* to avoid deferring */
+ insns_t *ins = *fp;
#ifdef _M_X64
- x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[0], X64_RAX, 2);
- x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[2], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM9);
- x64_sse_movaps_reg_memindex(ins, X64_XMM5, X64_RDX, offsets[3], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7);
- x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[1], X64_RAX, 2);
- x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM5);
- x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM4);
- x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM4);
- x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM5);
- x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM11);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM3);
-
- x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM9);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
- x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM6);
- x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM6);
- x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM7);
- x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM7);
- x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM10);
- x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM11);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM8, 0xEE);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM9, 0xEE);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM10);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM11);
- x64_sse_movaps_reg_memindex(ins, X64_XMM15, X64_RDX, offsets[4], X64_RAX, 2);
- x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[5], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM15);
- x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[6], X64_RAX, 2);
- x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM12);
- x64_sse_subps_reg_reg(ins, X64_XMM15, X64_XMM12);
- x64_sse_movaps_reg_memindex(ins, X64_XMM13, X64_RDX, offsets[7], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM4);
- x64_sse_movaps_reg_reg(ins, X64_XMM7, X64_XMM14);
- x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM13);
- x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM13);
- x64_sse_movlhps_reg_reg(ins, X64_XMM2, X64_XMM8);
- x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM5);
- x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM15);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM15, X64_XMM3);
-
- x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM5);
- x64_sse_subps_reg_reg(ins, X64_XMM5, X64_XMM14);
- x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM14);
- x64_sse_movlhps_reg_reg(ins, X64_XMM1, X64_XMM9);
- x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM4);
- x64_sse_movlhps_reg_reg(ins, X64_XMM8, X64_XMM4);
- x64_sse_movaps_reg_reg(ins, X64_XMM12, X64_XMM1);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM15, X64_XMM15, 0xB1);
- x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_RSI, 48);
- x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
- x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM15);
- x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM11);
- x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM7);
- x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM15);
- x64_sse_movaps_reg_membase(ins, X64_XMM15, X64_RSI, 64);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
- x64_sse_mulps_reg_reg(ins, X64_XMM9, X64_XMM8);
- x64_sse_mulps_reg_reg(ins, X64_XMM7, X64_XMM15);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1);
- x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM7);
- x64_sse_mulps_reg_reg(ins, X64_XMM8, X64_XMM15);
- x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM11);
- x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM14, 0xEE);
- x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM9);
- x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM9);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM11, X64_XMM3);
-
- x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM2);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1);
- x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM10);
- x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM10);
- x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM11);
- x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM11);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM4, 0xEE);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM5);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM6);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM2);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM1);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM0);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM12);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[0], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[2], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM9);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM5, X64_RDX, offsets[3], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[1], X64_RAX, 2);
+ x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM5);
+ x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM4);
+ x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM4);
+ x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM5);
+ x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM11);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM3);
+
+ x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM9);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
+ x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM6);
+ x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM6);
+ x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM7);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM7);
+ x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM10);
+ x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM11);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM8, 0xEE);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM9, 0xEE);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM10);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM11);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM15, X64_RDX, offsets[4], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[5], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM15);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[6], X64_RAX, 2);
+ x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM12);
+ x64_sse_subps_reg_reg(ins, X64_XMM15, X64_XMM12);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM13, X64_RDX, offsets[7], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM4);
+ x64_sse_movaps_reg_reg(ins, X64_XMM7, X64_XMM14);
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM13);
+ x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM13);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM2, X64_XMM8);
+ x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM5);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM15);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM15, X64_XMM3);
+
+ x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM5);
+ x64_sse_subps_reg_reg(ins, X64_XMM5, X64_XMM14);
+ x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM14);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM1, X64_XMM9);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM4);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM8, X64_XMM4);
+ x64_sse_movaps_reg_reg(ins, X64_XMM12, X64_XMM1);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM15, X64_XMM15, 0xB1);
+ x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_RSI, 48);
+ x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
+ x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM15);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM11);
+ x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM7);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM15);
+ x64_sse_movaps_reg_membase(ins, X64_XMM15, X64_RSI, 64);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM9, X64_XMM8);
+ x64_sse_mulps_reg_reg(ins, X64_XMM7, X64_XMM15);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1);
+ x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM7);
+ x64_sse_mulps_reg_reg(ins, X64_XMM8, X64_XMM15);
+ x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM11);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM14, 0xEE);
+ x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM9);
+ x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM9);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM11, X64_XMM3);
+
+ x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM2);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1);
+ x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM10);
+ x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM10);
+ x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM11);
+ x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM11);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM4, 0xEE);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM5);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM6);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM2);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM1);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM0);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM12);
#else
- /* copy function */
- assert((char*) leaf_oe > (char*) leaf_eo);
- len = (char*) leaf_oe - (char*) leaf_eo;
- memcpy(ins, leaf_eo, len);
-
- /* patch offsets */
- for (i = 0; i < 8; i++) {
- IMM32_NI(ins + sse_leaf_eo_offsets[i], offsets[i]);
- }
-
- ins += len;
+ x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RSI, offsets[0], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RSI, offsets[2], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM9);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM5, X64_RSI, offsets[3], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RSI, offsets[1], X64_RAX, 2);
+ x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM5);
+ x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM4);
+ x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM4);
+ x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM5);
+ x64_sse_movaps_reg_membase(ins, X64_XMM3, X64_R9, 0);
+ x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM11);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM3);
+
+ x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM9);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
+ x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM6);
+ x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM6);
+ x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM7);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM7);
+ x64_movsxd_reg_memindex(ins, X64_R12, X64_R8, 8, X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM10);
+ x64_movsxd_reg_memindex(ins, X64_R11, X64_R8, 0, X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM11);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM8, 0xEE);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM9, 0xEE);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 0, X64_R12, 2, X64_XMM10);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 16, X64_R12, 2, X64_XMM11);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM15, X64_RSI, offsets[4], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RSI, offsets[5], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM15);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RSI, offsets[6], X64_RAX, 2);
+ x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM12);
+ x64_sse_subps_reg_reg(ins, X64_XMM15, X64_XMM12);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM13, X64_RSI, offsets[7], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM4);
+ x64_sse_movaps_reg_reg(ins, X64_XMM7, X64_XMM14);
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM13);
+ x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM13);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM2, X64_XMM8);
+ x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM5);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM15);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM15, X64_XMM3);
+
+ x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM5);
+ x64_sse_subps_reg_reg(ins, X64_XMM5, X64_XMM14);
+ x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM14);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM1, X64_XMM9);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM4);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM8, X64_XMM4);
+ x64_sse_movaps_reg_reg(ins, X64_XMM12, X64_XMM1);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM15, X64_XMM15, 0xB1);
+ x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_R9, 48);
+ x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
+ x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM15);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM11);
+ x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM7);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM15);
+ x64_sse_movaps_reg_membase(ins, X64_XMM15, X64_R9, 64);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM9, X64_XMM8);
+ x64_sse_mulps_reg_reg(ins, X64_XMM7, X64_XMM15);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1);
+ x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM7);
+ x64_sse_mulps_reg_reg(ins, X64_XMM8, X64_XMM15);
+ x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM11);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM14, 0xEE);
+ x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM9);
+ x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM9);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM11, X64_XMM3);
+
+ x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM2);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1);
+ x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM10);
+ x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM10);
+ x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM11);
+ x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM11);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM4, 0xEE);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 48, X64_R12, 2, X64_XMM5);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 32, X64_R12, 2, X64_XMM6);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 0, X64_R11, 2, X64_XMM2);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 16, X64_R11, 2, X64_XMM1);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 32, X64_R11, 2, X64_XMM3);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 48, X64_R11, 2, X64_XMM12);
#endif
- *fp = ins;
+ *fp = ins;
}
-static FFTS_INLINE void generate_leaf_oe(insns_t **fp, uint32_t *offsets)
+static FFTS_INLINE void
+generate_leaf_oe(insns_t **fp, uint32_t *offsets)
{
-#ifndef _M_X64
- size_t len;
- int i;
-#endif
-
- /* to avoid deferring */
- insns_t *ins = *fp;
+ /* to avoid deferring */
+ insns_t *ins = *fp;
#ifdef _M_X64
- x64_sse_movaps_reg_memindex(ins, X64_XMM6, X64_RDX, offsets[2], X64_RAX, 2);
- x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM6);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM8, 0xE4);
- x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM6, 0xE4);
- x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[0], X64_RAX, 2);
- x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[1], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM12);
- x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2);
- x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8);
- x64_sse_subps_reg_reg(ins, X64_XMM10, X64_XMM8);
- x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM7);
- x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM7);
- x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM9);
- x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM14);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM4, X64_XMM10, 0xEE);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM3);
-
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
- x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
- x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM14);
- x64_sse_addps_reg_reg(ins, X64_XMM13, X64_XMM9);
- x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM10);
- x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM9);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM12, 0xEE);
- x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM10);
- x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2);
- x64_sse_movlhps_reg_reg(ins, X64_XMM13, X64_XMM11);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM13);
- x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_RSI, 48);
- x64_sse_movlhps_reg_reg(ins, X64_XMM14, X64_XMM12);
- x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM13);
- x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_RSI, 64);
- x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM5);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM5, 0xB1);
- x64_sse_mulps_reg_reg(ins, X64_XMM5, X64_XMM12);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM14);
- x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM5);
- x64_sse_mulps_reg_reg(ins, X64_XMM1, X64_XMM4);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM4, X64_XMM4, 0xB1);
- x64_sse_mulps_reg_reg(ins, X64_XMM4, X64_XMM12);
- x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[4], X64_RAX, 2);
- x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM4);
- x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[6], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM9);
- x64_sse_movaps_reg_memindex(ins, X64_XMM2, X64_RDX, offsets[7], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7);
- x64_sse_movaps_reg_memindex(ins, X64_XMM15, X64_RDX, offsets[5], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM13);
- x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM2);
- x64_sse_addps_reg_reg(ins, X64_XMM0, X64_XMM15);
- x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM15);
- x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM2);
- x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM1);
- x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM1);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM3);
-
- x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
- x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM0);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
- x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM9);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM3);
-
- x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM6);
- x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM7);
- x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM6);
- x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM7);
- x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM2);
- x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM0);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM8, 0xEE);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM9, 0xEE);
- x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM2);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM13, X64_XMM13, 0xB1);
- x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM4);
- x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM4);
- x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM0);
- x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM13);
- x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM13);
- x64_sse_movlhps_reg_reg(ins, X64_XMM10, X64_XMM8);
- x64_sse_movlhps_reg_reg(ins, X64_XMM11, X64_XMM9);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM10);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM11);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM2);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM0);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM14);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM4);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM6, X64_RDX, offsets[2], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM6);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM8, 0xE4);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM6, 0xE4);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[0], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[1], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM12);
+ x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8);
+ x64_sse_subps_reg_reg(ins, X64_XMM10, X64_XMM8);
+ x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM7);
+ x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM7);
+ x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM9);
+ x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM14);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM4, X64_XMM10, 0xEE);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM3);
+
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
+ x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM14);
+ x64_sse_addps_reg_reg(ins, X64_XMM13, X64_XMM9);
+ x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM10);
+ x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM9);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM12, 0xEE);
+ x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM10);
+ x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM13, X64_XMM11);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM13);
+ x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_RSI, 48);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM14, X64_XMM12);
+ x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM13);
+ x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_RSI, 64);
+ x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM5);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM5, 0xB1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM5, X64_XMM12);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM14);
+ x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM5);
+ x64_sse_mulps_reg_reg(ins, X64_XMM1, X64_XMM4);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM4, X64_XMM4, 0xB1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM4, X64_XMM12);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RDX, offsets[4], X64_RAX, 2);
+ x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM4);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[6], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM9);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM2, X64_RDX, offsets[7], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM15, X64_RDX, offsets[5], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM13);
+ x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM2);
+ x64_sse_addps_reg_reg(ins, X64_XMM0, X64_XMM15);
+ x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM15);
+ x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM2);
+ x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM1);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM1);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM3);
+
+ x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
+ x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM0);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM9);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM3);
+
+ x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM6);
+ x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM7);
+ x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM6);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM7);
+ x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM0);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM8, 0xEE);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM9, 0xEE);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM2);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM13, X64_XMM13, 0xB1);
+ x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM4);
+ x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM4);
+ x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM0);
+ x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM13);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM13);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM10, X64_XMM8);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM11, X64_XMM9);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM10);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM11);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM2);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM0);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM14);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM4);
#else
- /* copy function */
- assert((char*) leaf_end > (char*) leaf_oe);
- len = (char*) leaf_end - (char*) leaf_oe;
- memcpy(ins, leaf_oe, len);
-
- /* patch offsets */
- for (i = 0; i < 8; i++) {
- IMM32_NI(ins + sse_leaf_oe_offsets[i], offsets[i]);
- }
-
- ins += len;
+ x64_sse_movaps_reg_membase(ins, X64_XMM0, X64_R9, 0);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM6, X64_RSI, offsets[2], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RSI, offsets[3], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM6);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM8, 0xE4);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM6, 0xE4);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RSI, offsets[0], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RSI, offsets[1], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM12);
+ x64_movsxd_reg_memindex(ins, X64_R11, X64_R8, 0, X64_RAX, 2);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8);
+ x64_sse_subps_reg_reg(ins, X64_XMM10, X64_XMM8);
+ x64_sse_addps_reg_reg(ins, X64_XMM14, X64_XMM7);
+ x64_sse_subps_reg_reg(ins, X64_XMM12, X64_XMM7);
+ x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM9);
+ x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM14);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM4, X64_XMM10, 0xEE);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM0);
+
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
+ x64_sse_movaps_reg_reg(ins, X64_XMM5, X64_XMM14);
+ x64_sse_addps_reg_reg(ins, X64_XMM13, X64_XMM9);
+ x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM10);
+ x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM9);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM12, 0xEE);
+ x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM10);
+ x64_movsxd_reg_memindex(ins, X64_R12, X64_R8, 8, X64_RAX, 2);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM13, X64_XMM11);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 0, X64_R11, 2, X64_XMM13);
+ x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_R9, 48);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM14, X64_XMM12);
+ x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM13);
+ x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_R9, 64);
+ x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM5);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM5, 0xB1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM5, X64_XMM12);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 16, X64_R11, 2, X64_XMM14);
+ x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM5);
+ x64_sse_mulps_reg_reg(ins, X64_XMM1, X64_XMM4);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM4, X64_XMM4, 0xB1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM4, X64_XMM12);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM9, X64_RSI, offsets[4], X64_RAX, 2);
+ x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM4);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RSI, offsets[6], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM3, X64_XMM9);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM2, X64_RSI, offsets[7], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM7);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM15, X64_RSI, offsets[5], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM13);
+ x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM2);
+ x64_sse_addps_reg_reg(ins, X64_XMM3, X64_XMM15);
+ x64_sse_subps_reg_reg(ins, X64_XMM9, X64_XMM15);
+ x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM2);
+ x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM1);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM1);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM0);
+
+ x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
+ x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM3);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM9);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM0);
+
+ x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM6);
+ x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM7);
+ x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM6);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM7);
+ x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM3);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM8, 0xEE);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM9, 0xEE);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM2);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM13, X64_XMM13, 0xB1);
+ x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM4);
+ x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM4);
+ x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM3);
+ x64_sse_subps_reg_reg(ins, X64_XMM3, X64_XMM13);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM13);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM10, X64_XMM8);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM11, X64_XMM9);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 32, X64_R11, 2, X64_XMM10);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 48, X64_R11, 2, X64_XMM11);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 0, X64_R12, 2, X64_XMM2);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 16, X64_R12, 2, X64_XMM3);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 32, X64_R12, 2, X64_XMM14);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 48, X64_R12, 2, X64_XMM4);
#endif
- *fp = ins;
+ *fp = ins;
}
-static FFTS_INLINE void generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int extend)
+static FFTS_INLINE void
+generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int extend)
{
-#ifdef _M_X64
insns_t *leaf_oo_loop;
-#else
- size_t len;
- int i;
-#endif
- /* to avoid deferring */
- insns_t *ins = *fp;
+ /* to avoid deferring */
+ insns_t *ins = *fp;
#ifdef _M_X64
- /* align loop/jump destination */
- x86_mov_reg_imm(ins, X86_EBX, loop_count);
+ /* align loop/jump destination */
+ x86_mov_reg_imm(ins, X86_EBX, loop_count);
- /* beginning of the loop (make sure it's 16 byte aligned) */
- leaf_oo_loop = ins;
+ /* beginning of the loop (make sure it's 16 byte aligned) */
+ leaf_oo_loop = ins;
assert(!(((uintptr_t) leaf_oo_loop) & 0xF));
- x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[0], X64_RAX, 2);
-
- x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0);
- extend--;
-
- x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[1], X64_RAX, 2);
- x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[2], X64_RAX, 2);
-
- x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
- extend--;
-
- x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
- extend--;
-
- x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
- x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RDX, offsets[4], X64_RAX, 2);
-
- x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
- extend--;
-
- x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RDX, offsets[5], X64_RAX, 2);
-
- x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
- extend--;
-
- x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[6], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM4);
- x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[7], X64_RAX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM14);
- x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2);
- x64_sse_subps_reg_reg(ins, X64_XMM10, X64_XMM8);
- x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8);
- x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM11);
- x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM12);
- x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM11);
- x64_sse_addps_reg_reg(ins, X64_XMM13, X64_XMM12);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM3);
-
- x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM9);
- x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM9);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM14, X64_XMM3);
-
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
- x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
-
- x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0);
- extend--;
-
- x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2);
- x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
- x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM10);
- x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM13);
- x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM13);
- x64_sse_subps_reg_reg(ins, X64_XMM15, X64_XMM10);
- x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1);
- x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);
-
- x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0);
- extend--;
-
- x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14);
- x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM14);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM4, 0xEE);
- x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM5);
- x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM9);
- x64_sse_movlhps_reg_reg(ins, X64_XMM14, X64_XMM15);
- x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM13);
- x64_sse_movlhps_reg_reg(ins, X64_XMM8, X64_XMM1);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM15, 0xEE);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM9, X64_XMM13, 0xEE);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM1, 0xEE);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM14);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM7);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM4);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM8);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM5);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM6);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM9);
- x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM2);
-
- /* loop condition */
- x64_alu_reg_reg_size(ins, X86_CMP, X64_RBX, X64_RAX, 8);
- x64_branch_size(ins, X86_CC_NE, leaf_oo_loop, 0, 4);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[0], X64_RAX, 2);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0);
+ extend--;
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[1], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[2], X64_RAX, 2);
+
+ x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+ extend--;
+
+ x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+ extend--;
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RDX, offsets[4], X64_RAX, 2);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
+ extend--;
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RDX, offsets[5], X64_RAX, 2);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+ extend--;
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[6], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM4);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[7], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM14);
+ x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2);
+ x64_sse_subps_reg_reg(ins, X64_XMM10, X64_XMM8);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8);
+ x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM11);
+ x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM12);
+ x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM11);
+ x64_sse_addps_reg_reg(ins, X64_XMM13, X64_XMM12);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM3);
+
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM9);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM9);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM14, X64_XMM3);
+
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0);
+ extend--;
+
+ x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2);
+ x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM10);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM13);
+ x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM13);
+ x64_sse_subps_reg_reg(ins, X64_XMM15, X64_XMM10);
+ x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);
+
+ x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0);
+ extend--;
+
+ x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14);
+ x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM14);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM4, 0xEE);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM5);
+ x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM9);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM14, X64_XMM15);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM13);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM8, X64_XMM1);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM5, X64_XMM15, 0xEE);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM9, X64_XMM13, 0xEE);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM1, 0xEE);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R10, 2, X64_XMM14);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R10, 2, X64_XMM7);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R10, 2, X64_XMM4);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R10, 2, X64_XMM8);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 0, X64_R11, 2, X64_XMM5);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 16, X64_R11, 2, X64_XMM6);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 32, X64_R11, 2, X64_XMM9);
+ x64_sse_movaps_memindex_reg(ins, X64_R8, 48, X64_R11, 2, X64_XMM2);
+
+ /* loop condition */
+ x64_alu_reg_reg_size(ins, X86_CMP, X64_RBX, X64_RAX, 8);
+ x64_branch_size(ins, X86_CC_NE, leaf_oo_loop, 0, 4);
#else
- /* align loop/jump destination */
- x86_mov_reg_imm(ins, X86_ECX, loop_count);
- ffts_align_mem16(&ins, 4);
+ /* align loop/jump destination */
+ x86_mov_reg_imm(ins, X86_ECX, loop_count);
+ ffts_align_mem16(&ins, 4);
- /* copy function */
- assert((char*) leaf_eo > (char*) leaf_oo);
- len = (char*) leaf_eo - (char*) leaf_oo;
- memcpy(ins, leaf_oo, len);
+ x64_sse_movaps_reg_membase(ins, X64_XMM5, X64_R9, 0);
- /* patch offsets */
- for (i = 0; i < 8; i++) {
- IMM32_NI(ins + sse_leaf_oo_offsets[i], offsets[i]);
- }
+ /* beginning of the loop (make sure it's 16 byte aligned) */
+ leaf_oo_loop = ins;
+ assert(!(((uintptr_t) leaf_oo_loop) & 0xF));
- ins += len;
+ x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RSI, offsets[0], X64_RAX, 2);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0);
+ extend--;
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RSI, offsets[1], X64_RAX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RSI, offsets[2], X64_RAX, 2);
+
+ x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+ extend--;
+
+ x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+ extend--;
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RSI, offsets[3], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RSI, offsets[4], X64_RAX, 2);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM3, X64_XMM6, extend > 0);
+ extend--;
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RSI, offsets[5], X64_RAX, 2);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+ extend--;
+
+ x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RSI, offsets[6], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM4);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RSI, offsets[7], X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM14);
+ x64_movsxd_reg_memindex(ins, X64_R11, X64_R8, 0, X64_RAX, 2);
+ x64_sse_subps_reg_reg(ins, X64_XMM10, X64_XMM8);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8);
+ x64_sse_addps_reg_reg(ins, X64_XMM2, X64_XMM11);
+ x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM12);
+ x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM11);
+ x64_sse_addps_reg_reg(ins, X64_XMM13, X64_XMM12);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM10, X64_XMM5);
+
+ x64_sse_addps_reg_reg(ins, X64_XMM3, X64_XMM9);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM9);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM14, X64_XMM5);
+
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
+
+ x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0);
+ extend--;
+
+ x64_movsxd_reg_memindex(ins, X64_R12, X64_R8, 8, X64_RAX, 2);
+ x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM10);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM13);
+ x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM13);
+ x64_sse_subps_reg_reg(ins, X64_XMM15, X64_XMM10);
+ x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);
+
+ x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0);
+ extend--;
+
+ x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14);
+ x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM14);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM4, 0xEE);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM3);
+ x64_sse_movaps_reg_reg(ins, X64_XMM4, X64_XMM9);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM14, X64_XMM15);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM13);
+ x64_sse_movlhps_reg_reg(ins, X64_XMM8, X64_XMM1);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM15, 0xEE);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM9, X64_XMM13, 0xEE);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM1, 0xEE);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 0, X64_R11, 2, X64_XMM14);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 16, X64_R11, 2, X64_XMM7);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 32, X64_R11, 2, X64_XMM4);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 48, X64_R11, 2, X64_XMM8);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 0, X64_R12, 2, X64_XMM3);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 16, X64_R12, 2, X64_XMM6);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 32, X64_R12, 2, X64_XMM9);
+ x64_sse_movaps_memindex_reg(ins, X64_RDX, 48, X64_R12, 2, X64_XMM2);
+
+ /* loop condition */
+ x64_alu_reg_reg_size(ins, X86_CMP, X64_RCX, X64_RAX, 8);
+ x64_branch_size(ins, X86_CC_NE, leaf_oo_loop, 0, 4);
#endif
- *fp = ins;
+ *fp = ins;
}
-static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign)
+static FFTS_INLINE insns_t*
+generate_size8_base_case(insns_t **fp, int sign)
{
- insns_t *ins;
+ insns_t *ins;
insns_t *x8_addr;
-#ifdef _M_X64
insns_t *x8_soft_loop;
-#else
- size_t len;
-#endif
- /* unreferenced parameter */
- (void) sign;
+ /* unreferenced parameter */
+ (void) sign;
- /* to avoid deferring */
- ins = *fp;
+ /* to avoid deferring */
+ ins = *fp;
/* align call destination */
ffts_align_mem16(&ins, 0);
@@ -982,193 +1366,303 @@ static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign)
/* generate function */
/* input */
- x64_mov_reg_reg(ins, X64_RAX, X64_R9, 8);
+ x64_mov_reg_reg(ins, X64_RAX, X64_R9, 8);
/* output */
- x64_mov_reg_reg(ins, X64_RCX, X64_R8, 8);
+ x64_mov_reg_reg(ins, X64_RCX, X64_R8, 8);
/* loop stop (RDX = output + output_stride) */
- x64_lea_memindex(ins, X64_RDX, X64_R8, 0, X64_RBX, 0);
+ x64_lea_memindex(ins, X64_RDX, X64_R8, 0, X64_RBX, 0);
/* RSI = 3 * output_stride */
- x64_lea_memindex(ins, X64_RSI, X64_RBX, 0, X64_RBX, 1);
+ x64_lea_memindex(ins, X64_RSI, X64_RBX, 0, X64_RBX, 1);
/* R10 = 5 * output_stride */
- x64_lea_memindex(ins, X64_R10, X64_RBX, 0, X64_RBX, 2);
+ x64_lea_memindex(ins, X64_R10, X64_RBX, 0, X64_RBX, 2);
/* R11 = 7 * output_stride */
- x64_lea_memindex(ins, X64_R11, X64_RSI, 0, X64_RBX, 2);
+ x64_lea_memindex(ins, X64_R11, X64_RSI, 0, X64_RBX, 2);
- /* beginning of the loop (make sure it's 16 byte aligned) */
+ /* beginning of the loop (make sure it's 16 byte aligned) */
x8_soft_loop = ins;
assert(!(((uintptr_t) x8_soft_loop) & 0xF));
/* load [input + 0 * input_stride] */
- x64_sse_movaps_reg_membase(ins, X64_XMM9, X64_RAX, 0);
+ x64_sse_movaps_reg_membase(ins, X64_XMM9, X64_RAX, 0);
/* load [output + 2 * output_stride] */
- x64_sse_movaps_reg_memindex(ins, X64_XMM6, X64_RCX, 0, X64_RBX, 1);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM6, X64_RCX, 0, X64_RBX, 1);
- x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM9);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM9);
/* load [output + 3 * output_stride] */
- x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RCX, 0, X64_RSI, 0);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RCX, 0, X64_RSI, 0);
/* load [input + 1 * input_stride] */
- x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_RAX, 16);
-
- x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM6);
- x64_sse_mulps_reg_reg(ins, X64_XMM9, X64_XMM7);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM6, 0xB1);
- x64_sse_mulps_reg_reg(ins, X64_XMM6, X64_XMM8);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
- x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM6);
- x64_sse_mulps_reg_reg(ins, X64_XMM8, X64_XMM7);
- x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM11);
- x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8);
+ x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_RAX, 16);
+
+ x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM6);
+ x64_sse_mulps_reg_reg(ins, X64_XMM9, X64_XMM7);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM6, 0xB1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM6, X64_XMM8);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
+ x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM6);
+ x64_sse_mulps_reg_reg(ins, X64_XMM8, X64_XMM7);
+ x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM11);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8);
/* load [input + 2 * input_stride] */
- x64_sse_movaps_reg_membase(ins, X64_XMM15, X64_RAX, 32);
+ x64_sse_movaps_reg_membase(ins, X64_XMM15, X64_RAX, 32);
- x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM9);
- x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM9);
+ x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM9);
+ x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM9);
/* load [output + 0 * output_stride] */
- x64_sse_movaps_reg_membase(ins, X64_XMM5, X64_RCX, 0);
+ x64_sse_movaps_reg_membase(ins, X64_XMM5, X64_RCX, 0);
- x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM15);
+ x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM15);
/* load [output + 4 * output_stride] */
- x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RCX, 0, X64_RBX, 2);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RCX, 0, X64_RBX, 2);
- x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM5);
+ x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM5);
/* load [output + 6 * output_stride] */
- x64_sse_movaps_reg_memindex(ins, X64_XMM13, X64_RCX, 0, X64_RSI, 1);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM13, X64_RCX, 0, X64_RSI, 1);
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM11, X64_XMM3);
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM11, X64_XMM3);
/* load [input + 3 * input_stride] */
- x64_sse_movaps_reg_membase(ins, X64_XMM14, X64_RAX, 48);
+ x64_sse_movaps_reg_membase(ins, X64_XMM14, X64_RAX, 48);
- x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM10);
- x64_sse_mulps_reg_reg(ins, X64_XMM6, X64_XMM12);
- x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM10);
- x64_sse_mulps_reg_reg(ins, X64_XMM15, X64_XMM13);
+ x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM10);
+ x64_sse_mulps_reg_reg(ins, X64_XMM6, X64_XMM12);
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM10);
+ x64_sse_mulps_reg_reg(ins, X64_XMM15, X64_XMM13);
/* load [input + 4 * input_stride] */
- x64_sse_movaps_reg_membase(ins, X64_XMM10, X64_RAX, 64);
+ x64_sse_movaps_reg_membase(ins, X64_XMM10, X64_RAX, 64);
- x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM5);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM13, X64_XMM13, 0xB1);
- x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM14);
- x64_sse_mulps_reg_reg(ins, X64_XMM14, X64_XMM13);
- x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM12);
- x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14);
+ x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM5);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM13, X64_XMM13, 0xB1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM14);
+ x64_sse_mulps_reg_reg(ins, X64_XMM14, X64_XMM13);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM12);
+ x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14);
- /* load [output + 5 * output_stride] */
- x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RCX, 0, X64_R10, 0);
+ /* load [output + 5 * output_stride] */
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RCX, 0, X64_R10, 0);
- x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM10);
+ x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM10);
- /* load [output + 7 * output_stride] */
- x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RCX, 0, X64_R11, 0);
+ /* load [output + 7 * output_stride] */
+ x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RCX, 0, X64_R11, 0);
- x64_sse_movaps_reg_reg(ins, X64_XMM12, X64_XMM6);
+ x64_sse_movaps_reg_reg(ins, X64_XMM12, X64_XMM6);
/* load [input + 5 * input_stride] */
- x64_sse_movaps_reg_membase(ins, X64_XMM9, X64_RAX, 80);
+ x64_sse_movaps_reg_membase(ins, X64_XMM9, X64_RAX, 80);
/* move input by 6 * input_stride */
- x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 0x60, 8);
-
- x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM7);
- x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM15);
- x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM15);
- x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM8);
- x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM12);
- x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM12);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM6, X64_XMM3);
-
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1);
- x64_sse_movaps_reg_reg(ins, X64_XMM12, X64_XMM2);
- x64_sse_mulps_reg_reg(ins, X64_XMM7, X64_XMM9);
- x64_sse_mulps_reg_reg(ins, X64_XMM9, X64_XMM8);
- x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM7);
- x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM9);
+ x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 0x60, 8);
+
+ x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM7);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM15);
+ x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM15);
+ x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM8);
+ x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM12);
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM12);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM6, X64_XMM3);
+
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM12, X64_XMM2);
+ x64_sse_mulps_reg_reg(ins, X64_XMM7, X64_XMM9);
+ x64_sse_mulps_reg_reg(ins, X64_XMM9, X64_XMM8);
+ x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM7);
+ x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM9);
/* load [output + 1 * output_stride] */
- x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RCX, 0, X64_RBX, 0);
-
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1);
- x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM4);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM6, 0xB1);
- x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM11);
- x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM11);
- x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM6);
- x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM6);
- x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM13);
- x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM4);
- x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM1);
- x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM10);
- x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM10);
-
- /* change sign */
- x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM3);
-
- x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM11);
- x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM11);
- x64_sse_shufps_reg_reg_imm(ins, X64_XMM13, X64_XMM13, 0xB1);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RCX, 0, X64_RBX, 0);
+
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM4);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM6, 0xB1);
+ x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM11);
+ x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM11);
+ x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM6);
+ x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM6);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM13);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM4);
+ x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM1);
+ x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM10);
+ x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM10);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM3);
+
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM11);
+ x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM11);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM13, X64_XMM13, 0xB1);
/* store [output + 0 * output_stride] */
- x64_sse_movaps_membase_reg(ins, X64_RCX, 0, X64_XMM5);
+ x64_sse_movaps_membase_reg(ins, X64_RCX, 0, X64_XMM5);
/* store [output + 1 * output_stride] */
- x64_sse_movaps_memindex_reg(ins, X64_RCX, 0, X64_RBX, 0, X64_XMM4);
+ x64_sse_movaps_memindex_reg(ins, X64_RCX, 0, X64_RBX, 0, X64_XMM4);
/* store [output + 2 * output_stride] */
- x64_sse_movaps_memindex_reg(ins, X64_RCX, 0, X64_RBX, 1, X64_XMM2);
+ x64_sse_movaps_memindex_reg(ins, X64_RCX, 0, X64_RBX, 1, X64_XMM2);
- x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM13);
- x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM13);
+ x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM13);
+ x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM13);
/* store [output + 3 * output_stride] */
- x64_sse_movaps_memindex_reg(ins, X64_RCX, 0, X64_RSI, 0, X64_XMM1);
+ x64_sse_movaps_memindex_reg(ins, X64_RCX, 0, X64_RSI, 0, X64_XMM1);
/* store [output + 4 * output_stride] */
- x64_sse_movaps_memindex_reg(ins, X64_RCX, 0, X64_RBX, 2, X64_XMM0);
+ x64_sse_movaps_memindex_reg(ins, X64_RCX, 0, X64_RBX, 2, X64_XMM0);
/* store [output + 5 * output_stride] */
- x64_sse_movaps_memindex_reg(ins, X64_RCX, 0, X64_R10, 0, X64_XMM14);
+ x64_sse_movaps_memindex_reg(ins, X64_RCX, 0, X64_R10, 0, X64_XMM14);
/* store [output + 6 * output_stride] */
- x64_sse_movaps_memindex_reg(ins, X64_RCX, 0, X64_RSI, 1, X64_XMM12);
+ x64_sse_movaps_memindex_reg(ins, X64_RCX, 0, X64_RSI, 1, X64_XMM12);
/* store [output + 7 * output_stride] */
- x64_sse_movaps_memindex_reg(ins, X64_RCX, 0, X64_R11, 0, X64_XMM6);
+ x64_sse_movaps_memindex_reg(ins, X64_RCX, 0, X64_R11, 0, X64_XMM6);
/* move output by 16 */
- x64_alu_reg_imm_size(ins, X86_ADD, X64_RCX, 16, 8);
+ x64_alu_reg_imm_size(ins, X86_ADD, X64_RCX, 16, 8);
/* loop condition */
- x64_alu_reg_reg_size(ins, X86_CMP, X64_RCX, X64_RDX, 8);
- x64_branch_size(ins, X86_CC_NE, x8_soft_loop, 0, 4);
+ x64_alu_reg_reg_size(ins, X86_CMP, X64_RCX, X64_RDX, 8);
+ x64_branch_size(ins, X86_CC_NE, x8_soft_loop, 0, 4);
x64_ret(ins);
#else
- /* copy function */
- assert((char*) x8_soft_end >= (char*) x8_soft);
- len = (char*) x8_soft_end - (char*) x8_soft;
- memcpy(ins, x8_soft, len);
- ins += len;
+ /* generate function */
+ x86_clear_reg(ins, X86_EAX);
+ x64_mov_reg_reg(ins, X64_RBX, X64_RDX, 8);
+ x64_mov_reg_reg(ins, X64_RSI, X64_R8, 8);
+
+ x64_lea_memindex(ins, X64_R9, X64_RDX, 0, X64_RCX, 2);
+ x64_lea_memindex(ins, X64_R10, X64_R9, 0, X64_RCX, 2);
+ x64_lea_memindex(ins, X64_R11, X64_R10, 0, X64_RCX, 2);
+ x64_lea_memindex(ins, X64_R12, X64_R11, 0, X64_RCX, 2);
+ x64_lea_memindex(ins, X64_R13, X64_R12, 0, X64_RCX, 2);
+ x64_lea_memindex(ins, X64_R14, X64_R13, 0, X64_RCX, 2);
+ x64_lea_memindex(ins, X64_R15, X64_R14, 0, X64_RCX, 2);
+
+ /* beginning of the loop (make sure it's 16 byte aligned) */
+ x8_soft_loop = ins;
+ assert(!(((uintptr_t) x8_soft_loop) & 0xF));
+
+ x64_sse_movaps_reg_membase(ins, X64_XMM9, X64_RSI, 0);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM6, X64_R10, 0, X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM9);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_R11, 0, X64_RAX, 2);
+ x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_RSI, 16);
+ x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM6);
+ x64_sse_mulps_reg_reg(ins, X64_XMM9, X64_XMM7);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM6, 0xB1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM6, X64_XMM8);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
+ x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM6);
+ x64_sse_mulps_reg_reg(ins, X64_XMM8, X64_XMM7);
+ x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM11);
+ x64_sse_addps_reg_reg(ins, X64_XMM9, X64_XMM8);
+ x64_sse_movaps_reg_membase(ins, X64_XMM15, X64_RSI, 32);
+ x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM9);
+ x64_sse_subps_reg_reg(ins, X64_XMM11, X64_XMM9);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM5, X64_RBX, 0, X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM15);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_R12, 0, X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM5);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM13, X64_R14, 0, X64_RAX, 2);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM11, X64_XMM3);
+
+ x64_sse_movaps_reg_membase(ins, X64_XMM14, X64_RSI, 48);
+ x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM10);
+ x64_sse_mulps_reg_reg(ins, X64_XMM6, X64_XMM12);
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM10);
+ x64_sse_mulps_reg_reg(ins, X64_XMM15, X64_XMM13);
+ x64_sse_movaps_reg_membase(ins, X64_XMM10, X64_RSI, 64);
+ x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM5);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM13, X64_XMM13, 0xB1);
+ x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM14);
+ x64_sse_mulps_reg_reg(ins, X64_XMM14, X64_XMM13);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM12);
+ x64_sse_addps_reg_reg(ins, X64_XMM15, X64_XMM14);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_R13, 0, X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM10);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_R15, 0, X64_RAX, 2);
+ x64_sse_movaps_reg_reg(ins, X64_XMM12, X64_XMM6);
+ x64_sse_movaps_reg_membase(ins, X64_XMM9, X64_RSI, 80);
+ x64_alu_reg_imm_size(ins, X86_ADD, X64_RSI, 0x60, 8);
+ x64_sse_mulps_reg_reg(ins, X64_XMM13, X64_XMM7);
+ x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM15);
+ x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM15);
+ x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM8);
+ x64_sse_subps_reg_reg(ins, X64_XMM0, X64_XMM12);
+ x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM12);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM6, X64_XMM3);
+
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM12, X64_XMM2);
+ x64_sse_mulps_reg_reg(ins, X64_XMM7, X64_XMM9);
+ x64_sse_mulps_reg_reg(ins, X64_XMM9, X64_XMM8);
+ x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM7);
+ x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM9);
+ x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_R9, 0, X64_RAX, 2);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1);
+ x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM4);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM6, 0xB1);
+ x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM11);
+ x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM11);
+ x64_sse_addps_reg_reg(ins, X64_XMM12, X64_XMM6);
+ x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM6);
+ x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM13);
+ x64_sse_movaps_reg_reg(ins, X64_XMM14, X64_XMM4);
+ x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM1);
+ x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM10);
+ x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM10);
+
+ /* change sign */
+ x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM3);
+
+ x64_sse_addps_reg_reg(ins, X64_XMM4, X64_XMM11);
+ x64_sse_subps_reg_reg(ins, X64_XMM14, X64_XMM11);
+ x64_sse_shufps_reg_reg_imm(ins, X64_XMM13, X64_XMM13, 0xB1);
+ x64_sse_movaps_memindex_reg(ins, X64_RBX, 0, X64_RAX, 2, X64_XMM5);
+ x64_sse_movaps_memindex_reg(ins, X64_R9, 0, X64_RAX, 2, X64_XMM4);
+ x64_sse_movaps_memindex_reg(ins, X64_R10, 0, X64_RAX, 2, X64_XMM2);
+ x64_sse_subps_reg_reg(ins, X64_XMM1, X64_XMM13);
+ x64_sse_addps_reg_reg(ins, X64_XMM6, X64_XMM13);
+ x64_sse_movaps_memindex_reg(ins, X64_R11, 0, X64_RAX, 2, X64_XMM1);
+ x64_sse_movaps_memindex_reg(ins, X64_R12, 0, X64_RAX, 2, X64_XMM0);
+ x64_sse_movaps_memindex_reg(ins, X64_R13, 0, X64_RAX, 2, X64_XMM14);
+ x64_sse_movaps_memindex_reg(ins, X64_R14, 0, X64_RAX, 2, X64_XMM12);
+ x64_sse_movaps_memindex_reg(ins, X64_R15, 0, X64_RAX, 2, X64_XMM6);
+ x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
+
+ /* loop condition */
+ x64_alu_reg_reg_size(ins, X86_CMP, X64_RCX, X64_RAX, 8);
+ x64_branch_size(ins, X86_CC_NE, x8_soft_loop, 0, 4);
+ x64_ret(ins);
#endif
- *fp = ins;
+ *fp = ins;
return x8_addr;
}
OpenPOWER on IntegriCloud