summary | refs | log | tree | commit | diff | stats
path: root/src/codegen.c
diff options
context:
space:
mode:
author: Jukka Ojanen <jukka.ojanen@linkotec.net> 2014-11-10 17:07:31 +0200
committer: Jukka Ojanen <jukka.ojanen@linkotec.net> 2014-11-10 17:07:31 +0200
commit: 20766e39cbc37bd5fabe1a144a270a99541955b2 (patch)
tree: ef7f0c17a11c2c39c833f19dbce4f950df027682 /src/codegen.c
parent: 0343c47c36b9cb0e1ea9c0bad14723d4872dccbc (diff)
download: ffts-20766e39cbc37bd5fabe1a144a270a99541955b2.zip
download: ffts-20766e39cbc37bd5fabe1a144a270a99541955b2.tar.gz
Replace movdqa with movaps, which is one byte shorter. The RDI register is not needed, as R9 is saved by the caller.
Diffstat (limited to 'src/codegen.c')
-rw-r--r-- src/codegen.c 31
1 file changed, 11 insertions, 20 deletions
diff --git a/src/codegen.c b/src/codegen.c
index efa8e9a..6c6c887 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -150,19 +150,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
#ifdef __arm__
start = generate_prologue(&fp, p);
-#else
- start = generate_prologue(&fp, p);
-
- /* assign loop counter register */
- loop_count = 4 * p->i0;
-#ifdef _M_X64
- x86_mov_reg_imm(fp, X86_EBX, loop_count);
-#else
- x86_mov_reg_imm(fp, X86_ECX, loop_count);
-#endif
-#endif
-#ifdef __arm__
#ifdef HAVE_NEON
memcpy(fp, neon_ee, neon_oo - neon_ee);
if (sign < 0) {
@@ -201,24 +189,27 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
fp += (vfp_o - vfp_e) / 4;
#endif
#else
- //fprintf(stderr, "Body start address = %016p\n", start);
+ /* generate function */
+ start = generate_prologue(&fp, p);
+ loop_count = 4 * p->i0;
#ifdef _M_X64
- /* generate function */
+ /* set loop counter */
+ x86_mov_reg_imm(fp, X86_EBX, loop_count);
/* clear */
x86_clear_reg(fp, X86_EAX);
/* set "pointer" to offsets */
- x64_mov_reg_membase(fp, X64_RDI, X64_RCX, 0x0, 8);
+ x64_mov_reg_membase(fp, X64_R9, X64_RCX, 0x0, 8);
/* set "pointer" to constants */
x64_mov_reg_membase(fp, X64_RSI, X64_RCX, 0xE0, 8);
-
- /* align loop/jump destination */
- ffts_align_mem16(&fp, 8);
#else
- /* copy function */
+ /* set loop counter */
+ x86_mov_reg_imm(fp, X86_ECX, loop_count);
+
+ /* copy function */
assert((char*) leaf_ee > (char*) leaf_ee_init);
len = (char*) leaf_ee - (char*) leaf_ee_init;
memcpy(fp, leaf_ee_init, (size_t) len);
@@ -390,7 +381,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
int offset = (int) (ws_is - pLUT);
#ifdef _M_X64
- x64_alu_reg_imm_size(fp, X86_ADD, X64_RDI, offset, 8);
+ x64_alu_reg_imm_size(fp, X86_ADD, X64_R9, offset, 8);
#else
x64_alu_reg_imm_size(fp, X86_ADD, X64_R8, offset, 8);
#endif
OpenPOWER on IntegriCloud