diff options
author | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2014-11-01 11:16:56 +0200 |
---|---|---|
committer | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2014-11-01 11:16:56 +0200 |
commit | b4ec2061aab28f7cc626f36a3d8324eebeaab88a (patch) | |
tree | c8d7bd76bf34f0651d7b046fef23200dd22e0c9b /src/codegen.c | |
parent | 8011f28cf0cb592251f5e95accbd8cf21e28f0c7 (diff) | |
download | ffts-b4ec2061aab28f7cc626f36a3d8324eebeaab88a.zip ffts-b4ec2061aab28f7cc626f36a3d8324eebeaab88a.tar.gz |
XMM6-XMM15 are nonvolatile in the Windows x64 calling convention and must be preserved by the callee, so save and restore them in the generated code. http://msdn.microsoft.com/en-us/library/9z1stfyw(v=vs.80).aspx
Diffstat (limited to 'src/codegen.c')
-rw-r--r-- | src/codegen.c | 206 |
1 files changed, 200 insertions, 6 deletions
diff --git a/src/codegen.c b/src/codegen.c index 0cc3d24..af5deda 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -385,6 +385,104 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N *fp++ = 0x7C; *fp++ = 0x24; *fp++ = 0x18; + + /* reserve space for XMM6-XMM15 registers*/ + + /* sub rsp, 168 */ + SUBI(&fp, RSP, 168); + + /* movdqa [rsp], xmm6 */ + *fp++ = 0x66; + *fp++ = 0x0F; + *fp++ = 0x7F; + *fp++ = 0x34; + *fp++ = 0x24; + + /* movdqa [rsp + 16], xmm7 */ + *fp++ = 0x66; + *fp++ = 0x0F; + *fp++ = 0x7F; + *fp++ = 0x7C; + *fp++ = 0x24; + *fp++ = 0x10; + + /* movdqa [rsp + 32], xmm8 */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x7F; + *fp++ = 0x44; + *fp++ = 0x24; + *fp++ = 0x20; + + /* movdqa [rsp + 48], xmm9 */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x7F; + *fp++ = 0x4C; + *fp++ = 0x24; + *fp++ = 0x30; + + /* movdqa [rsp + 64], xmm10 */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x7F; + *fp++ = 0x54; + *fp++ = 0x24; + *fp++ = 0x40; + + /* movdqa [rsp + 80], xmm11 */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x7F; + *fp++ = 0x5C; + *fp++ = 0x24; + *fp++ = 0x50; + + /* movdqa [rsp + 96], xmm12 */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x7F; + *fp++ = 0x64; + *fp++ = 0x24; + *fp++ = 0x60; + + /* movdqa [rsp + 112], xmm13 */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x7F; + *fp++ = 0x6C; + *fp++ = 0x24; + *fp++ = 0x70; + + /* movdqa [rsp + 128], xmm14 */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x7F; + *fp++ = 0xB4; + *fp++ = 0x24; + *fp++ = 0x80; + *fp++ = 0x00; + *fp++ = 0x00; + *fp++ = 0x00; + + /* movdqa [rsp + 144], xmm15 */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x7F; + *fp++ = 0xBC; + *fp++ = 0x24; + *fp++ = 0x90; + *fp++ = 0x00; + *fp++ = 0x00; + *fp++ = 0x00; #else PUSH(&fp, RBP); PUSH(&fp, RBX); @@ -575,11 +673,10 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t 
leaf_N fp += len; } - //fprintf(stderr, "Body start address = %016p\n", fp); - //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p)); - memcpy(fp, x_init, (char*) x4 - (char*) x_init); - //IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp )); - fp += ((char*) x4 - (char*) x_init); + assert((char*) x4 > (char*) x_init); + len = (char*) x4 - (char*) x_init; + memcpy(fp, x_init, len); + fp += len; count = 2; while (pps[0]) { @@ -927,7 +1024,104 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N /* restore nonvolatile registers */ #ifdef _M_AMD64 - /* mov rbx, [rsp + 8] */ + + /* movdqa xmm6, [rsp] */ + *fp++ = 0x66; + *fp++ = 0x0F; + *fp++ = 0x6F; + *fp++ = 0x34; + *fp++ = 0x24; + + /* movdqa xmm7, [rsp + 16] */ + *fp++ = 0x66; + *fp++ = 0x0F; + *fp++ = 0x6F; + *fp++ = 0x7C; + *fp++ = 0x24; + *fp++ = 0x10; + + /* movdqa xmm8, [rsp + 32] */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x6F; + *fp++ = 0x44; + *fp++ = 0x24; + *fp++ = 0x20; + + /* movdqa xmm9, [rsp + 48] */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x6F; + *fp++ = 0x4C; + *fp++ = 0x24; + *fp++ = 0x30; + + /* movdqa xmm10, [rsp + 64] */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x6F; + *fp++ = 0x54; + *fp++ = 0x24; + *fp++ = 0x40; + + /* movdqa xmm11, [rsp + 80] */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x6F; + *fp++ = 0x5C; + *fp++ = 0x24; + *fp++ = 0x50; + + /* movdqa xmm12, [rsp + 96] */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x6F; + *fp++ = 0x64; + *fp++ = 0x24; + *fp++ = 0x60; + + /* movdqa xmm13 , [rsp + 112] */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x6F; + *fp++ = 0x6C; + *fp++ = 0x24; + *fp++ = 0x70; + + /* movdqa xmm14, [rsp + 128] */ + *fp++ = 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x6F; + *fp++ = 0xB4; + *fp++ = 0x24; + *fp++ = 0x80; + *fp++ = 0x00; + *fp++ = 0x00; + *fp++ = 0x00; + + /* movdqa xmm15, [rsp + 144] */ + *fp++ 
= 0x66; + *fp++ = 0x44; + *fp++ = 0x0F; + *fp++ = 0x6F; + *fp++ = 0xBC; + *fp++ = 0x24; + *fp++ = 0x90; + *fp++ = 0x00; + *fp++ = 0x00; + *fp++ = 0x00; + + /* add rsp, 168 */ + ADDI(&fp, RSP, 168); + + /* mov rbx, [rsp + 8] */ *fp++ = 0x48; *fp++ = 0x8B; *fp++ = 0x5C; |