diff options
author | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2014-11-10 17:07:31 +0200 |
---|---|---|
committer | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2014-11-10 17:07:31 +0200 |
commit | 20766e39cbc37bd5fabe1a144a270a99541955b2 (patch) | |
tree | ef7f0c17a11c2c39c833f19dbce4f950df027682 /src | |
parent | 0343c47c36b9cb0e1ea9c0bad14723d4872dccbc (diff) | |
download | ffts-20766e39cbc37bd5fabe1a144a270a99541955b2.zip ffts-20766e39cbc37bd5fabe1a144a270a99541955b2.tar.gz |
Replace movdqa with movaps, which is one byte shorter. The RDI register is no longer needed, as R9 is caller-saved (volatile) and can be used instead.
Diffstat (limited to 'src')
-rw-r--r-- | src/codegen.c | 31 | ||||
-rw-r--r-- | src/codegen_sse.h | 83 | ||||
-rw-r--r-- | src/sse_win64.s | 36 |
3 files changed, 71 insertions, 79 deletions
diff --git a/src/codegen.c b/src/codegen.c index efa8e9a..6c6c887 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -150,19 +150,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N #ifdef __arm__ start = generate_prologue(&fp, p); -#else - start = generate_prologue(&fp, p); - - /* assign loop counter register */ - loop_count = 4 * p->i0; -#ifdef _M_X64 - x86_mov_reg_imm(fp, X86_EBX, loop_count); -#else - x86_mov_reg_imm(fp, X86_ECX, loop_count); -#endif -#endif -#ifdef __arm__ #ifdef HAVE_NEON memcpy(fp, neon_ee, neon_oo - neon_ee); if (sign < 0) { @@ -201,24 +189,27 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N fp += (vfp_o - vfp_e) / 4; #endif #else - //fprintf(stderr, "Body start address = %016p\n", start); + /* generate function */ + start = generate_prologue(&fp, p); + loop_count = 4 * p->i0; #ifdef _M_X64 - /* generate function */ + /* set loop counter */ + x86_mov_reg_imm(fp, X86_EBX, loop_count); /* clear */ x86_clear_reg(fp, X86_EAX); /* set "pointer" to offsets */ - x64_mov_reg_membase(fp, X64_RDI, X64_RCX, 0x0, 8); + x64_mov_reg_membase(fp, X64_R9, X64_RCX, 0x0, 8); /* set "pointer" to constants */ x64_mov_reg_membase(fp, X64_RSI, X64_RCX, 0xE0, 8); - - /* align loop/jump destination */ - ffts_align_mem16(&fp, 8); #else - /* copy function */ + /* set loop counter */ + x86_mov_reg_imm(fp, X86_ECX, loop_count); + + /* copy function */ assert((char*) leaf_ee > (char*) leaf_ee_init); len = (char*) leaf_ee - (char*) leaf_ee_init; memcpy(fp, leaf_ee_init, (size_t) len); @@ -390,7 +381,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N int offset = (int) (ws_is - pLUT); #ifdef _M_X64 - x64_alu_reg_imm_size(fp, X86_ADD, X64_RDI, offset, 8); + x64_alu_reg_imm_size(fp, X86_ADD, X64_R9, offset, 8); #else x64_alu_reg_imm_size(fp, X86_ADD, X64_R8, offset, 8); #endif diff --git a/src/codegen_sse.h b/src/codegen_sse.h index fcab9f3..40bfa3f 100644 --- 
a/src/codegen_sse.h +++ b/src/codegen_sse.h @@ -162,24 +162,24 @@ static FFTS_INLINE void generate_epilogue(insns_t **fp) { #ifdef _M_X64 /* restore nonvolatile registers */ - x64_sse_movdqa_reg_membase(*fp, X64_XMM6, X64_RSP, 0); - x64_sse_movdqa_reg_membase(*fp, X64_XMM7, X64_RSP, 16); - x64_sse_movdqa_reg_membase(*fp, X64_XMM8, X64_RSP, 32); - x64_sse_movdqa_reg_membase(*fp, X64_XMM9, X64_RSP, 48); - x64_sse_movdqa_reg_membase(*fp, X64_XMM10, X64_RSP, 64); - x64_sse_movdqa_reg_membase(*fp, X64_XMM11, X64_RSP, 80); - x64_sse_movdqa_reg_membase(*fp, X64_XMM12, X64_RSP, 96); - x64_sse_movdqa_reg_membase(*fp, X64_XMM13, X64_RSP, 112); - x64_sse_movdqa_reg_membase(*fp, X64_XMM14, X64_RSP, 128); - x64_sse_movdqa_reg_membase(*fp, X64_XMM15, X64_RSP, 144); + x64_mov_reg_membase(*fp, X64_RBX, X64_RSP, -64, 8); + x64_mov_reg_membase(*fp, X64_RSI, X64_RSP, -56, 8); + + x64_sse_movaps_reg_membase(*fp, X64_XMM6, X64_RSP, -48); + x64_sse_movaps_reg_membase(*fp, X64_XMM7, X64_RSP, -32); + x64_sse_movaps_reg_membase(*fp, X64_XMM8, X64_RSP, -16); + x64_sse_movaps_reg_membase(*fp, X64_XMM9, X64_RSP, 0); + x64_sse_movaps_reg_membase(*fp, X64_XMM10, X64_RSP, 16); + x64_sse_movaps_reg_membase(*fp, X64_XMM11, X64_RSP, 32); + x64_sse_movaps_reg_membase(*fp, X64_XMM12, X64_RSP, 48); + x64_sse_movaps_reg_membase(*fp, X64_XMM13, X64_RSP, 64); + + /* restore the last 2 registers from the shadow space */ + x64_sse_movaps_reg_membase(*fp, X64_XMM14, X64_RSP, 96); + x64_sse_movaps_reg_membase(*fp, X64_XMM15, X64_RSP, 112); /* restore stack */ - x64_alu_reg_imm_size(*fp, X86_ADD, X64_RSP, 168, 8); - - /* restore the last 3 registers from the shadow space */ - x64_mov_reg_membase(*fp, X64_RBX, X64_RSP, 8, 8); - x64_mov_reg_membase(*fp, X64_RSI, X64_RSP, 16, 8); - x64_mov_reg_membase(*fp, X64_RDI, X64_RSP, 24, 8); + x64_alu_reg_imm_size(*fp, X86_ADD, X64_RSP, 88, 8); #else x64_pop_reg(*fp, X64_R15); x64_pop_reg(*fp, X64_R14); @@ -204,25 +204,24 @@ static FFTS_INLINE insns_t* 
generate_prologue(insns_t **fp, ffts_plan_t *p) /* save nonvolatile registers */ #ifdef _M_X64 - /* use the shadow space to save first 3 registers */ - x64_mov_membase_reg(*fp, X64_RSP, 8, X64_RBX, 8); - x64_mov_membase_reg(*fp, X64_RSP, 16, X64_RSI, 8); - x64_mov_membase_reg(*fp, X64_RSP, 24, X64_RDI, 8); - - /* reserve space.. */ - x64_alu_reg_imm_size(*fp, X86_SUB, X64_RSP, 168, 8); - - /* to save XMM6-XMM15 registers */ - x64_sse_movdqa_membase_reg(*fp, X64_RSP, 0, X64_XMM6); - x64_sse_movdqa_membase_reg(*fp, X64_RSP, 16, X64_XMM7); - x64_sse_movdqa_membase_reg(*fp, X64_RSP, 32, X64_XMM8); - x64_sse_movdqa_membase_reg(*fp, X64_RSP, 48, X64_XMM9); - x64_sse_movdqa_membase_reg(*fp, X64_RSP, 64, X64_XMM10); - x64_sse_movdqa_membase_reg(*fp, X64_RSP, 80, X64_XMM11); - x64_sse_movdqa_membase_reg(*fp, X64_RSP, 96, X64_XMM12); - x64_sse_movdqa_membase_reg(*fp, X64_RSP, 112, X64_XMM13); - x64_sse_movdqa_membase_reg(*fp, X64_RSP, 128, X64_XMM14); - x64_sse_movdqa_membase_reg(*fp, X64_RSP, 144, X64_XMM15); + /* reserve space to save XMM6-XMM15 registers */ + x64_alu_reg_imm_size(*fp, X86_SUB, X64_RSP, 88, 8); + + x64_mov_membase_reg(*fp, X64_RSP, -64, X64_RBX, 8); + x64_mov_membase_reg(*fp, X64_RSP, -56, X64_RSI, 8); + + x64_sse_movaps_membase_reg(*fp, X64_RSP, -48, X64_XMM6); + x64_sse_movaps_membase_reg(*fp, X64_RSP, -32, X64_XMM7); + x64_sse_movaps_membase_reg(*fp, X64_RSP, -16, X64_XMM8); + x64_sse_movaps_membase_reg(*fp, X64_RSP, 0, X64_XMM9); + x64_sse_movaps_membase_reg(*fp, X64_RSP, 16, X64_XMM10); + x64_sse_movaps_membase_reg(*fp, X64_RSP, 32, X64_XMM11); + x64_sse_movaps_membase_reg(*fp, X64_RSP, 48, X64_XMM12); + x64_sse_movaps_membase_reg(*fp, X64_RSP, 64, X64_XMM13); + + /* use the shadow space to save last 2 registers */ + x64_sse_movaps_membase_reg(*fp, X64_RSP, 96, X64_XMM14); + x64_sse_movaps_membase_reg(*fp, X64_RSP, 112, X64_XMM15); #else x64_push_reg(*fp, X64_RBP); x64_push_reg(*fp, X64_RBX); @@ -244,7 +243,7 @@ static FFTS_INLINE void 
generate_transform_init(insns_t **fp) x64_sse_movaps_reg_membase(*fp, X64_XMM3, X64_RSI, 0); /* set "pointer" to twiddle factors */ - x64_mov_reg_membase(*fp, X64_RDI, X64_RCX, 0x20, 8); + x64_mov_reg_membase(*fp, X64_R9, X64_RCX, 0x20, 8); #else size_t len; @@ -260,7 +259,9 @@ static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign) { insns_t *ins; insns_t *x4_addr; +#ifndef _M_X64 size_t len; +#endif /* to avoid deferring */ ins = *fp; @@ -274,10 +275,10 @@ static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign) x64_sse_movaps_reg_membase(ins, X64_XMM0, X64_R8, 64); x64_sse_movaps_reg_membase(ins, X64_XMM1, X64_R8, 96); x64_sse_movaps_reg_membase(ins, X64_XMM7, X64_R8, 0); - x64_sse_movaps_reg_membase(ins, X64_XMM4, X64_RDI, 0); + x64_sse_movaps_reg_membase(ins, X64_XMM4, X64_R9, 0); x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM7); x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM4); - x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_RDI, 16); + x64_sse_movaps_reg_membase(ins, X64_XMM2, X64_R9, 16); x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM6); x64_sse_mulps_reg_reg(ins, X64_XMM1, X64_XMM4); x64_sse_shufps_reg_reg_imm(ins, X64_XMM0, X64_XMM0, 0xB1); @@ -302,10 +303,10 @@ static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign) x64_sse_movaps_membase_reg(ins, X64_R8, 32, X64_XMM8); x64_sse_movaps_membase_reg(ins, X64_R8, 64, X64_XMM9); x64_sse_movaps_membase_reg(ins, X64_R8, 96, X64_XMM10); - x64_sse_movaps_reg_membase(ins, X64_XMM14, X64_RDI, 32); + x64_sse_movaps_reg_membase(ins, X64_XMM14, X64_R9, 32); x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_R8, 80); x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM14); - x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_RDI, 48); + x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_R9, 48); x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM0); x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM14); x64_sse_shufps_reg_reg_imm(ins, X64_XMM11, X64_XMM11, 0xB1); @@ -370,7 +371,7 @@ static 
FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign) #ifdef _M_X64 /* input */ - x64_mov_reg_reg(ins, X64_RAX, X64_RDI, 8); + x64_mov_reg_reg(ins, X64_RAX, X64_R9, 8); /* output */ x64_mov_reg_reg(ins, X64_RCX, X64_R8, 8); diff --git a/src/sse_win64.s b/src/sse_win64.s index c92358f..6b71a2f 100644 --- a/src/sse_win64.s +++ b/src/sse_win64.s @@ -58,12 +58,12 @@ leaf_ee_init: # rdx is 'in' base pointer # rbx is loop max count # rsi is constants pointer -# rdi is offsets pointer +# r9 is offsets pointer # r8 is 'out' base pointer # scratch: rax r10 r11 xorl %eax, %eax - movq (%rcx), %rdi + movq (%rcx), %r9 movq 0xe0(%rcx), %rsi # _leaf_ee + 8 needs 16 byte alignment @@ -105,7 +105,7 @@ LEAF_EE_const_7: movaps %xmm3, %xmm15 #83.5 shufps $177, %xmm12, %xmm12 #83.5 movaps %xmm7, %xmm4 #83.5 - movslq (%rdi, %rax, 4), %r10 #83.44 + movslq (%r9, %rax, 4), %r10 #83.44 subps %xmm13, %xmm10 #83.5 subps %xmm14, %xmm3 #83.5 addps %xmm11, %xmm5 #83.5 @@ -146,7 +146,7 @@ LEAF_EE_const_7: movaps %xmm2, %xmm3 #83.5 shufps $177, %xmm12, %xmm12 #83.5 movaps %xmm6, %xmm9 #83.5 - movslq 8(%rdi, %rax, 4), %r11 #83.59 + movslq 8(%r9, %rax, 4), %r11 #83.59 movlhps %xmm4, %xmm3 #83.5 addq $4, %rax shufps $238, %xmm4, %xmm2 #83.5 @@ -205,7 +205,7 @@ LEAF_OO_const_6: LEAF_OO_const_7: movaps 0xFECA(%rdx,%rax,4), %xmm12 #93.5 movaps %xmm14, %xmm13 #93.5 - movslq (%rdi, %rax, 4), %r10 #83.44 + movslq (%r9, %rax, 4), %r10 #83.44 subps %xmm8, %xmm10 #93.5 addps %xmm8, %xmm9 #93.5 addps %xmm11, %xmm2 #93.5 @@ -220,7 +220,7 @@ LEAF_OO_const_7: movaps %xmm2, %xmm9 #93.5 shufps $177, %xmm14, %xmm14 #93.5 movaps %xmm6, %xmm7 #93.5 - movslq 8(%rdi, %rax, 4), %r11 #83.59 + movslq 8(%r9, %rax, 4), %r11 #83.59 addq $4, %rax #92.18 addps %xmm10, %xmm4 #93.5 addps %xmm13, %xmm9 #93.5 @@ -281,9 +281,9 @@ LEAF_EO_const_1: subps %xmm6, %xmm11 #88.5 subps %xmm7, %xmm8 #88.5 addps %xmm7, %xmm9 #88.5 - movslq 8(%rdi, %rax, 4), %r11 #83.59 + movslq 8(%r9, %rax, 4), %r11 #83.59 movaps %xmm10, %xmm2 
#88.5 - movslq (%rdi, %rax, 4), %r10 #83.44 + movslq (%r9, %rax, 4), %r10 #83.44 movaps %xmm11, %xmm1 #88.5 shufps $238, %xmm8, %xmm10 #88.5 shufps $238, %xmm9, %xmm11 #88.5 @@ -370,7 +370,7 @@ LEAF_OE_const_0: LEAF_OE_const_1: movaps 0xFECA(%rdx,%rax,4), %xmm7 #70.5 movaps %xmm12, %xmm14 #70.5 - movslq (%rdi, %rax, 4), %r10 #83.44 + movslq (%r9, %rax, 4), %r10 #83.44 addps %xmm8, %xmm9 #70.5 subps %xmm8, %xmm10 #70.5 addps %xmm7, %xmm14 #70.5 @@ -387,7 +387,7 @@ LEAF_OE_const_1: subps %xmm9, %xmm14 #70.5 shufps $238, %xmm12, %xmm5 #70.5 addps %xmm10, %xmm12 #70.5 - movslq 8(%rdi, %rax, 4), %r11 #83.59 + movslq 8(%r9, %rax, 4), %r11 #83.59 movlhps %xmm11, %xmm13 #70.5 movaps %xmm13, (%r8,%r10,4) #70.5 movaps 0x30(%rsi), %xmm13 #70.5 @@ -466,7 +466,7 @@ _x_init: x_init: #endif movaps (%rsi), %xmm3 #34.3 - movq 0x20(%rcx), %rdi + movq 0x20(%rcx), %r9 #ifdef __APPLE__ .globl _x4 _x4: @@ -477,10 +477,10 @@ x4: movaps 64(%r8), %xmm0 #34.3 movaps 96(%r8), %xmm1 #34.3 movaps (%r8), %xmm7 #34.3 - movaps (%rdi), %xmm4 #const + movaps (%r9), %xmm4 #const movaps %xmm7, %xmm9 #34.3 movaps %xmm4, %xmm6 #34.3 - movaps 16(%rdi), %xmm2 #const + movaps 16(%r9), %xmm2 #const mulps %xmm0, %xmm6 #34.3 mulps %xmm1, %xmm4 #34.3 shufps $177, %xmm0, %xmm0 #34.3 @@ -505,10 +505,10 @@ x4: movaps %xmm8, 32(%r8) #34.3 movaps %xmm9, 64(%r8) #34.3 movaps %xmm10, 96(%r8) #34.3 - movaps 32(%rdi), %xmm14 #const #34.3 + movaps 32(%r9), %xmm14 #const #34.3 movaps 80(%r8), %xmm11 #34.3 movaps %xmm14, %xmm0 #34.3 - movaps 48(%rdi), %xmm13 #const #34.3 + movaps 48(%r9), %xmm13 #const #34.3 mulps %xmm11, %xmm0 #34.3 mulps %xmm12, %xmm14 #34.3 shufps $177, %xmm11, %xmm11 #34.3 @@ -544,11 +544,11 @@ _x8_soft: .globl x8_soft x8_soft: #endif - # rax, rcx, rdx, r8, r10, r11 (r9 not used) - # rbx, rdi, rsi + # rax, rcx, rdx, r8, r9, r10, r11 + # rbx, rsi # input - movq %rdi, %rax + movq %r9, %rax # output movq %r8, %rcx |