summaryrefslogtreecommitdiffstats
path: root/src/codegen.c
diff options
context:
space:
mode:
authorJukka Ojanen <jukka.ojanen@linkotec.net>2014-11-01 11:16:56 +0200
committerJukka Ojanen <jukka.ojanen@linkotec.net>2014-11-01 11:16:56 +0200
commitb4ec2061aab28f7cc626f36a3d8324eebeaab88a (patch)
treec8d7bd76bf34f0651d7b046fef23200dd22e0c9b /src/codegen.c
parent8011f28cf0cb592251f5e95accbd8cf21e28f0c7 (diff)
downloadffts-b4ec2061aab28f7cc626f36a3d8324eebeaab88a.zip
ffts-b4ec2061aab28f7cc626f36a3d8324eebeaab88a.tar.gz
XMM6-XMM15 are nonvolatile in the Microsoft x64 calling convention and must be preserved as needed by the callee. See http://msdn.microsoft.com/en-us/library/9z1stfyw(v=vs.80).aspx
Diffstat (limited to 'src/codegen.c')
-rw-r--r--src/codegen.c206
1 files changed, 200 insertions, 6 deletions
diff --git a/src/codegen.c b/src/codegen.c
index 0cc3d24..af5deda 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -385,6 +385,104 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
*fp++ = 0x7C;
*fp++ = 0x24;
*fp++ = 0x18;
+
+ /* reserve stack space for the XMM6-XMM15 registers and save them */
+
+ /* sub rsp, 168 (10 * 16 bytes for XMM6-XMM15, plus 8 bytes, presumably to keep RSP 16-byte aligned for movdqa) */
+ SUBI(&fp, RSP, 168);
+
+ /* movdqa [rsp], xmm6 */
+ *fp++ = 0x66;
+ *fp++ = 0x0F;
+ *fp++ = 0x7F;
+ *fp++ = 0x34;
+ *fp++ = 0x24;
+
+ /* movdqa [rsp + 16], xmm7 */
+ *fp++ = 0x66;
+ *fp++ = 0x0F;
+ *fp++ = 0x7F;
+ *fp++ = 0x7C;
+ *fp++ = 0x24;
+ *fp++ = 0x10;
+
+ /* movdqa [rsp + 32], xmm8 */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x7F;
+ *fp++ = 0x44;
+ *fp++ = 0x24;
+ *fp++ = 0x20;
+
+ /* movdqa [rsp + 48], xmm9 */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x7F;
+ *fp++ = 0x4C;
+ *fp++ = 0x24;
+ *fp++ = 0x30;
+
+ /* movdqa [rsp + 64], xmm10 */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x7F;
+ *fp++ = 0x54;
+ *fp++ = 0x24;
+ *fp++ = 0x40;
+
+ /* movdqa [rsp + 80], xmm11 */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x7F;
+ *fp++ = 0x5C;
+ *fp++ = 0x24;
+ *fp++ = 0x50;
+
+ /* movdqa [rsp + 96], xmm12 */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x7F;
+ *fp++ = 0x64;
+ *fp++ = 0x24;
+ *fp++ = 0x60;
+
+ /* movdqa [rsp + 112], xmm13 */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x7F;
+ *fp++ = 0x6C;
+ *fp++ = 0x24;
+ *fp++ = 0x70;
+
+ /* movdqa [rsp + 128], xmm14 */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x7F;
+ *fp++ = 0xB4;
+ *fp++ = 0x24;
+ *fp++ = 0x80;
+ *fp++ = 0x00;
+ *fp++ = 0x00;
+ *fp++ = 0x00;
+
+ /* movdqa [rsp + 144], xmm15 */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x7F;
+ *fp++ = 0xBC;
+ *fp++ = 0x24;
+ *fp++ = 0x90;
+ *fp++ = 0x00;
+ *fp++ = 0x00;
+ *fp++ = 0x00;
#else
PUSH(&fp, RBP);
PUSH(&fp, RBX);
@@ -575,11 +673,10 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
fp += len;
}
- //fprintf(stderr, "Body start address = %016p\n", fp);
- //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p));
- memcpy(fp, x_init, (char*) x4 - (char*) x_init);
- //IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp ));
- fp += ((char*) x4 - (char*) x_init);
+ assert((char*) x4 > (char*) x_init);
+ len = (char*) x4 - (char*) x_init;
+ memcpy(fp, x_init, len);
+ fp += len;
count = 2;
while (pps[0]) {
@@ -927,7 +1024,104 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
/* restore nonvolatile registers */
#ifdef _M_AMD64
- /* mov rbx, [rsp + 8] */
+
+ /* movdqa xmm6, [rsp] */
+ *fp++ = 0x66;
+ *fp++ = 0x0F;
+ *fp++ = 0x6F;
+ *fp++ = 0x34;
+ *fp++ = 0x24;
+
+ /* movdqa xmm7, [rsp + 16] */
+ *fp++ = 0x66;
+ *fp++ = 0x0F;
+ *fp++ = 0x6F;
+ *fp++ = 0x7C;
+ *fp++ = 0x24;
+ *fp++ = 0x10;
+
+ /* movdqa xmm8, [rsp + 32] */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x6F;
+ *fp++ = 0x44;
+ *fp++ = 0x24;
+ *fp++ = 0x20;
+
+ /* movdqa xmm9, [rsp + 48] */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x6F;
+ *fp++ = 0x4C;
+ *fp++ = 0x24;
+ *fp++ = 0x30;
+
+ /* movdqa xmm10, [rsp + 64] */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x6F;
+ *fp++ = 0x54;
+ *fp++ = 0x24;
+ *fp++ = 0x40;
+
+ /* movdqa xmm11, [rsp + 80] */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x6F;
+ *fp++ = 0x5C;
+ *fp++ = 0x24;
+ *fp++ = 0x50;
+
+ /* movdqa xmm12, [rsp + 96] */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x6F;
+ *fp++ = 0x64;
+ *fp++ = 0x24;
+ *fp++ = 0x60;
+
+ /* movdqa xmm13, [rsp + 112] */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x6F;
+ *fp++ = 0x6C;
+ *fp++ = 0x24;
+ *fp++ = 0x70;
+
+ /* movdqa xmm14, [rsp + 128] */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x6F;
+ *fp++ = 0xB4;
+ *fp++ = 0x24;
+ *fp++ = 0x80;
+ *fp++ = 0x00;
+ *fp++ = 0x00;
+ *fp++ = 0x00;
+
+ /* movdqa xmm15, [rsp + 144] */
+ *fp++ = 0x66;
+ *fp++ = 0x44;
+ *fp++ = 0x0F;
+ *fp++ = 0x6F;
+ *fp++ = 0xBC;
+ *fp++ = 0x24;
+ *fp++ = 0x90;
+ *fp++ = 0x00;
+ *fp++ = 0x00;
+ *fp++ = 0x00;
+
+ /* add rsp, 168 (release the XMM6-XMM15 save area) */
+ ADDI(&fp, RSP, 168);
+
+ /* mov rbx, [rsp + 8] */
*fp++ = 0x48;
*fp++ = 0x8B;
*fp++ = 0x5C;
OpenPOWER on IntegriCloud