summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author: Jukka Ojanen <jukka.ojanen@linkotec.net> 2014-11-10 17:07:31 +0200
committer: Jukka Ojanen <jukka.ojanen@linkotec.net> 2014-11-10 17:07:31 +0200
commit20766e39cbc37bd5fabe1a144a270a99541955b2 (patch)
treeef7f0c17a11c2c39c833f19dbce4f950df027682 /src
parent0343c47c36b9cb0e1ea9c0bad14723d4872dccbc (diff)
downloadffts-20766e39cbc37bd5fabe1a144a270a99541955b2.zip
ffts-20766e39cbc37bd5fabe1a144a270a99541955b2.tar.gz
Replace movdqa with movaps, which is one byte shorter. Stop using the RDI register: R9 is a caller-saved (volatile) register, so using it instead means RDI no longer has to be saved and restored.
Diffstat (limited to 'src')
-rw-r--r-- src/codegen.c 31
-rw-r--r-- src/codegen_sse.h 83
-rw-r--r-- src/sse_win64.s 36
3 files changed, 71 insertions, 79 deletions
diff --git a/src/codegen.c b/src/codegen.c
index efa8e9a..6c6c887 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -150,19 +150,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
#ifdef __arm__
start = generate_prologue(&fp, p);
-#else
- start = generate_prologue(&fp, p);
-
- /* assign loop counter register */
- loop_count = 4 * p->i0;
-#ifdef _M_X64
- x86_mov_reg_imm(fp, X86_EBX, loop_count);
-#else
- x86_mov_reg_imm(fp, X86_ECX, loop_count);
-#endif
-#endif
-#ifdef __arm__
#ifdef HAVE_NEON
memcpy(fp, neon_ee, neon_oo - neon_ee);
if (sign < 0) {
@@ -201,24 +189,27 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
fp += (vfp_o - vfp_e) / 4;
#endif
#else
- //fprintf(stderr, "Body start address = %016p\n", start);
+ /* generate function */
+ start = generate_prologue(&fp, p);
+ loop_count = 4 * p->i0;
#ifdef _M_X64
- /* generate function */
+ /* set loop counter */
+ x86_mov_reg_imm(fp, X86_EBX, loop_count);
/* clear */
x86_clear_reg(fp, X86_EAX);
/* set "pointer" to offsets */
- x64_mov_reg_membase(fp, X64_RDI, X64_RCX, 0x0, 8);
+ x64_mov_reg_membase(fp, X64_R9, X64_RCX, 0x0, 8);
/* set "pointer" to constants */
x64_mov_reg_membase(fp, X64_RSI, X64_RCX, 0xE0, 8);
-
- /* align loop/jump destination */
- ffts_align_mem16(&fp, 8);
#else
- /* copy function */
+ /* set loop counter */
+ x86_mov_reg_imm(fp, X86_ECX, loop_count);
+
+ /* copy function */
assert((char*) leaf_ee > (char*) leaf_ee_init);
len = (char*) leaf_ee - (char*) leaf_ee_init;
memcpy(fp, leaf_ee_init, (size_t) len);
@@ -390,7 +381,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
int offset = (int) (ws_is - pLUT);
#ifdef _M_X64
- x64_alu_reg_imm_size(fp, X86_ADD, X64_RDI, offset, 8);
+ x64_alu_reg_imm_size(fp, X86_ADD, X64_R9, offset, 8);
#else
x64_alu_reg_imm_size(fp, X86_ADD, X64_R8, offset, 8);
#endif
diff --git a/src/codegen_sse.h b/src/codegen_sse.h
index fcab9f3..40bfa3f 100644
--- a/src/codegen_sse.h
+++ b/src/codegen_sse.h
@@ -162,24 +162,24 @@ static FFTS_INLINE void generate_epilogue(insns_t **fp)
{
#ifdef _M_X64
/* restore nonvolatile registers */
- x64_sse_movdqa_reg_membase(*fp, X64_XMM6, X64_RSP, 0);
- x64_sse_movdqa_reg_membase(*fp, X64_XMM7, X64_RSP, 16);
- x64_sse_movdqa_reg_membase(*fp, X64_XMM8, X64_RSP, 32);
- x64_sse_movdqa_reg_membase(*fp, X64_XMM9, X64_RSP, 48);
- x64_sse_movdqa_reg_membase(*fp, X64_XMM10, X64_RSP, 64);
- x64_sse_movdqa_reg_membase(*fp, X64_XMM11, X64_RSP, 80);
- x64_sse_movdqa_reg_membase(*fp, X64_XMM12, X64_RSP, 96);
- x64_sse_movdqa_reg_membase(*fp, X64_XMM13, X64_RSP, 112);
- x64_sse_movdqa_reg_membase(*fp, X64_XMM14, X64_RSP, 128);
- x64_sse_movdqa_reg_membase(*fp, X64_XMM15, X64_RSP, 144);
+ x64_mov_reg_membase(*fp, X64_RBX, X64_RSP, -64, 8);
+ x64_mov_reg_membase(*fp, X64_RSI, X64_RSP, -56, 8);
+
+ x64_sse_movaps_reg_membase(*fp, X64_XMM6, X64_RSP, -48);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM7, X64_RSP, -32);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM8, X64_RSP, -16);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM9, X64_RSP, 0);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM10, X64_RSP, 16);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM11, X64_RSP, 32);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM12, X64_RSP, 48);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM13, X64_RSP, 64);
+
+ /* restore the last 2 registers from the shadow space */
+ x64_sse_movaps_reg_membase(*fp, X64_XMM14, X64_RSP, 96);
+ x64_sse_movaps_reg_membase(*fp, X64_XMM15, X64_RSP, 112);
/* restore stack */
- x64_alu_reg_imm_size(*fp, X86_ADD, X64_RSP, 168, 8);
-
- /* restore the last 3 registers from the shadow space */
- x64_mov_reg_membase(*fp, X64_RBX, X64_RSP, 8, 8);
- x64_mov_reg_membase(*fp, X64_RSI, X64_RSP, 16, 8);
- x64_mov_reg_membase(*fp, X64_RDI, X64_RSP, 24, 8);
+ x64_alu_reg_imm_size(*fp, X86_ADD, X64_RSP, 88, 8);
#else
x64_pop_reg(*fp, X64_R15);
x64_pop_reg(*fp, X64_R14);
@@ -204,25 +204,24 @@ static FFTS_INLINE insns_t* generate_prologue(insns_t **fp, ffts_plan_t *p)
/* save nonvolatile registers */
#ifdef _M_X64
- /* use the shadow space to save first 3 registers */
- x64_mov_membase_reg(*fp, X64_RSP, 8, X64_RBX, 8);
- x64_mov_membase_reg(*fp, X64_RSP, 16, X64_RSI, 8);
- x64_mov_membase_reg(*fp, X64_RSP, 24, X64_RDI, 8);
-
- /* reserve space.. */
- x64_alu_reg_imm_size(*fp, X86_SUB, X64_RSP, 168, 8);
-
- /* to save XMM6-XMM15 registers */
- x64_sse_movdqa_membase_reg(*fp, X64_RSP, 0, X64_XMM6);
- x64_sse_movdqa_membase_reg(*fp, X64_RSP, 16, X64_XMM7);
- x64_sse_movdqa_membase_reg(*fp, X64_RSP, 32, X64_XMM8);
- x64_sse_movdqa_membase_reg(*fp, X64_RSP, 48, X64_XMM9);
- x64_sse_movdqa_membase_reg(*fp, X64_RSP, 64, X64_XMM10);
- x64_sse_movdqa_membase_reg(*fp, X64_RSP, 80, X64_XMM11);
- x64_sse_movdqa_membase_reg(*fp, X64_RSP, 96, X64_XMM12);
- x64_sse_movdqa_membase_reg(*fp, X64_RSP, 112, X64_XMM13);
- x64_sse_movdqa_membase_reg(*fp, X64_RSP, 128, X64_XMM14);
- x64_sse_movdqa_membase_reg(*fp, X64_RSP, 144, X64_XMM15);
+ /* reserve space to save XMM6-XMM15 registers */
+ x64_alu_reg_imm_size(*fp, X86_SUB, X64_RSP, 88, 8);
+
+ x64_mov_membase_reg(*fp, X64_RSP, -64, X64_RBX, 8);
+ x64_mov_membase_reg(*fp, X64_RSP, -56, X64_RSI, 8);
+
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, -48, X64_XMM6);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, -32, X64_XMM7);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, -16, X64_XMM8);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, 0, X64_XMM9);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, 16, X64_XMM10);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, 32, X64_XMM11);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, 48, X64_XMM12);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, 64, X64_XMM13);
+
+ /* use the shadow space to save last 2 registers */
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, 96, X64_XMM14);
+ x64_sse_movaps_membase_reg(*fp, X64_RSP, 112, X64_XMM15);
#else
x64_push_reg(*fp, X64_RBP);
x64_push_reg(*fp, X64_RBX);
@@ -244,7 +243,7 @@ static FFTS_INLINE void generate_transform_init(insns_t **fp)
x64_sse_movaps_reg_membase(*fp, X64_XMM3, X64_RSI, 0);
/* set "pointer" to twiddle factors */
- x64_mov_reg_membase(*fp, X64_RDI, X64_RCX, 0x20, 8);
+ x64_mov_reg_membase(*fp, X64_R9, X64_RCX, 0x20, 8);
#else
size_t len;
@@ -260,7 +259,9 @@ static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign)
{
insns_t *ins;
insns_t *x4_addr;
+#ifndef _M_X64
size_t len;
+#endif
/* to avoid deferring */
ins = *fp;
@@ -274,10 +275,10 @@ static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign)
x64_sse_movaps_reg_membase(ins, X64_XMM0, X64_R8, 64);
x64_sse_movaps_reg_membase(ins, X64_XMM1, X64_R8, 96);
x64_sse_movaps_reg_membase(ins, X64_XMM7, X64_R8, 0);
- x64_sse_movaps_reg_membase(ins, X64_XMM4, X64_RDI, 0);
+ x64_sse_movaps_reg_membase(ins, X64_XMM4, X64_R9, 0);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM7);
x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM4);
- x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_RDI, 16);
+ x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_R9, 16);
x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM6);
x64_sse_mulps_reg_reg(ins, X64_XMM1, X64_XMM4);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM0, 0xB1);
@@ -302,10 +303,10 @@ static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign)
x64_sse_movaps_membase_reg(ins, X64_R8, 32, X64_XMM8);
x64_sse_movaps_membase_reg(ins, X64_R8, 64, X64_XMM9);
x64_sse_movaps_membase_reg(ins, X64_R8, 96, X64_XMM10);
- x64_sse_movaps_reg_membase(ins, X64_XMM14, X64_RDI, 32);
+ x64_sse_movaps_reg_membase(ins, X64_XMM14, X64_R9, 32);
x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_R8, 80);
x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM14);
- x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_RDI, 48);
+ x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_R9, 48);
x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM0);
x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM14);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1);
@@ -370,7 +371,7 @@ static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign)
#ifdef _M_X64
/* input */
- x64_mov_reg_reg(ins, X64_RAX, X64_RDI, 8);
+ x64_mov_reg_reg(ins, X64_RAX, X64_R9, 8);
/* output */
x64_mov_reg_reg(ins, X64_RCX, X64_R8, 8);
diff --git a/src/sse_win64.s b/src/sse_win64.s
index c92358f..6b71a2f 100644
--- a/src/sse_win64.s
+++ b/src/sse_win64.s
@@ -58,12 +58,12 @@ leaf_ee_init:
# rdx is 'in' base pointer
# rbx is loop max count
# rsi is constants pointer
-# rdi is offsets pointer
+# r9 is offsets pointer
# r8 is 'out' base pointer
# scratch: rax r10 r11
xorl %eax, %eax
- movq (%rcx), %rdi
+ movq (%rcx), %r9
movq 0xe0(%rcx), %rsi
# _leaf_ee + 8 needs 16 byte alignment
@@ -105,7 +105,7 @@ LEAF_EE_const_7:
movaps %xmm3, %xmm15 #83.5
shufps $177, %xmm12, %xmm12 #83.5
movaps %xmm7, %xmm4 #83.5
- movslq (%rdi, %rax, 4), %r10 #83.44
+ movslq (%r9, %rax, 4), %r10 #83.44
subps %xmm13, %xmm10 #83.5
subps %xmm14, %xmm3 #83.5
addps %xmm11, %xmm5 #83.5
@@ -146,7 +146,7 @@ LEAF_EE_const_7:
movaps %xmm2, %xmm3 #83.5
shufps $177, %xmm12, %xmm12 #83.5
movaps %xmm6, %xmm9 #83.5
- movslq 8(%rdi, %rax, 4), %r11 #83.59
+ movslq 8(%r9, %rax, 4), %r11 #83.59
movlhps %xmm4, %xmm3 #83.5
addq $4, %rax
shufps $238, %xmm4, %xmm2 #83.5
@@ -205,7 +205,7 @@ LEAF_OO_const_6:
LEAF_OO_const_7:
movaps 0xFECA(%rdx,%rax,4), %xmm12 #93.5
movaps %xmm14, %xmm13 #93.5
- movslq (%rdi, %rax, 4), %r10 #83.44
+ movslq (%r9, %rax, 4), %r10 #83.44
subps %xmm8, %xmm10 #93.5
addps %xmm8, %xmm9 #93.5
addps %xmm11, %xmm2 #93.5
@@ -220,7 +220,7 @@ LEAF_OO_const_7:
movaps %xmm2, %xmm9 #93.5
shufps $177, %xmm14, %xmm14 #93.5
movaps %xmm6, %xmm7 #93.5
- movslq 8(%rdi, %rax, 4), %r11 #83.59
+ movslq 8(%r9, %rax, 4), %r11 #83.59
addq $4, %rax #92.18
addps %xmm10, %xmm4 #93.5
addps %xmm13, %xmm9 #93.5
@@ -281,9 +281,9 @@ LEAF_EO_const_1:
subps %xmm6, %xmm11 #88.5
subps %xmm7, %xmm8 #88.5
addps %xmm7, %xmm9 #88.5
- movslq 8(%rdi, %rax, 4), %r11 #83.59
+ movslq 8(%r9, %rax, 4), %r11 #83.59
movaps %xmm10, %xmm2 #88.5
- movslq (%rdi, %rax, 4), %r10 #83.44
+ movslq (%r9, %rax, 4), %r10 #83.44
movaps %xmm11, %xmm1 #88.5
shufps $238, %xmm8, %xmm10 #88.5
shufps $238, %xmm9, %xmm11 #88.5
@@ -370,7 +370,7 @@ LEAF_OE_const_0:
LEAF_OE_const_1:
movaps 0xFECA(%rdx,%rax,4), %xmm7 #70.5
movaps %xmm12, %xmm14 #70.5
- movslq (%rdi, %rax, 4), %r10 #83.44
+ movslq (%r9, %rax, 4), %r10 #83.44
addps %xmm8, %xmm9 #70.5
subps %xmm8, %xmm10 #70.5
addps %xmm7, %xmm14 #70.5
@@ -387,7 +387,7 @@ LEAF_OE_const_1:
subps %xmm9, %xmm14 #70.5
shufps $238, %xmm12, %xmm5 #70.5
addps %xmm10, %xmm12 #70.5
- movslq 8(%rdi, %rax, 4), %r11 #83.59
+ movslq 8(%r9, %rax, 4), %r11 #83.59
movlhps %xmm11, %xmm13 #70.5
movaps %xmm13, (%r8,%r10,4) #70.5
movaps 0x30(%rsi), %xmm13 #70.5
@@ -466,7 +466,7 @@ _x_init:
x_init:
#endif
movaps (%rsi), %xmm3 #34.3
- movq 0x20(%rcx), %rdi
+ movq 0x20(%rcx), %r9
#ifdef __APPLE__
.globl _x4
_x4:
@@ -477,10 +477,10 @@ x4:
movaps 64(%r8), %xmm0 #34.3
movaps 96(%r8), %xmm1 #34.3
movaps (%r8), %xmm7 #34.3
- movaps (%rdi), %xmm4 #const
+ movaps (%r9), %xmm4 #const
movaps %xmm7, %xmm9 #34.3
movaps %xmm4, %xmm6 #34.3
- movaps 16(%rdi), %xmm2 #const
+ movaps 16(%r9), %xmm2 #const
mulps %xmm0, %xmm6 #34.3
mulps %xmm1, %xmm4 #34.3
shufps $177, %xmm0, %xmm0 #34.3
@@ -505,10 +505,10 @@ x4:
movaps %xmm8, 32(%r8) #34.3
movaps %xmm9, 64(%r8) #34.3
movaps %xmm10, 96(%r8) #34.3
- movaps 32(%rdi), %xmm14 #const #34.3
+ movaps 32(%r9), %xmm14 #const #34.3
movaps 80(%r8), %xmm11 #34.3
movaps %xmm14, %xmm0 #34.3
- movaps 48(%rdi), %xmm13 #const #34.3
+ movaps 48(%r9), %xmm13 #const #34.3
mulps %xmm11, %xmm0 #34.3
mulps %xmm12, %xmm14 #34.3
shufps $177, %xmm11, %xmm11 #34.3
@@ -544,11 +544,11 @@ _x8_soft:
.globl x8_soft
x8_soft:
#endif
- # rax, rcx, rdx, r8, r10, r11 (r9 not used)
- # rbx, rdi, rsi
+ # rax, rcx, rdx, r8, r9, r10, r11
+ # rbx, rsi
# input
- movq %rdi, %rax
+ movq %r9, %rax
# output
movq %r8, %rcx
OpenPOWER on IntegriCloud