summary | refs | log | tree | commit | diff | stats
path: root/src/codegen.c
diff options
context:
space:
mode:
author: Jukka Ojanen <jukka.ojanen@linkotec.net> 2014-11-03 10:46:09 +0200
committer: Jukka Ojanen <jukka.ojanen@linkotec.net> 2014-11-03 10:46:09 +0200
commit: 14c88113349263fafc88a671a71facca7e177dc9 (patch)
tree: aa467450d9944c61a9b87bc788d9935a92aca1a6 /src/codegen.c
parent: b4ec2061aab28f7cc626f36a3d8324eebeaab88a (diff)
download: ffts-14c88113349263fafc88a671a71facca7e177dc9.zip
download: ffts-14c88113349263fafc88a671a71facca7e177dc9.tar.gz
MOVDQA "intrinsic", two operand MOVDQA2, three operand MOVDQA3 helpers
Diffstat (limited to 'src/codegen.c')
-rw-r--r-- src/codegen.c 217
1 files changed, 26 insertions, 191 deletions
diff --git a/src/codegen.c b/src/codegen.c
index af5deda..4e524ca 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -386,103 +386,19 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
*fp++ = 0x24;
*fp++ = 0x18;
- /* reserve space for XMM6-XMM15 registers*/
-
- /* sub rsp, 168 */
+ /* reserve space to save XMM6-XMM15 registers */
SUBI(&fp, RSP, 168);
- /* movdqa [rsp], xmm6 */
- *fp++ = 0x66;
- *fp++ = 0x0F;
- *fp++ = 0x7F;
- *fp++ = 0x34;
- *fp++ = 0x24;
-
- /* movdqa [rsp + 16], xmm7 */
- *fp++ = 0x66;
- *fp++ = 0x0F;
- *fp++ = 0x7F;
- *fp++ = 0x7C;
- *fp++ = 0x24;
- *fp++ = 0x10;
-
- /* movdqa [rsp + 32], xmm8 */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x7F;
- *fp++ = 0x44;
- *fp++ = 0x24;
- *fp++ = 0x20;
-
- /* movdqa [rsp + 48], xmm9 */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x7F;
- *fp++ = 0x4C;
- *fp++ = 0x24;
- *fp++ = 0x30;
-
- /* movdqa [rsp + 64], xmm10 */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x7F;
- *fp++ = 0x54;
- *fp++ = 0x24;
- *fp++ = 0x40;
-
- /* movdqa [rsp + 80], xmm11 */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x7F;
- *fp++ = 0x5C;
- *fp++ = 0x24;
- *fp++ = 0x50;
-
- /* movdqa [rsp + 96], xmm12 */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x7F;
- *fp++ = 0x64;
- *fp++ = 0x24;
- *fp++ = 0x60;
-
- /* movdqa [rsp + 112], xmm13 */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x7F;
- *fp++ = 0x6C;
- *fp++ = 0x24;
- *fp++ = 0x70;
-
- /* movdqa [rsp + 128], xmm14 */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x7F;
- *fp++ = 0xB4;
- *fp++ = 0x24;
- *fp++ = 0x80;
- *fp++ = 0x00;
- *fp++ = 0x00;
- *fp++ = 0x00;
-
- /* movdqa [rsp + 144], xmm15 */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x7F;
- *fp++ = 0xBC;
- *fp++ = 0x24;
- *fp++ = 0x90;
- *fp++ = 0x00;
- *fp++ = 0x00;
- *fp++ = 0x00;
+ MOVDQA3(&fp, RSP, 0, XMM6);
+ MOVDQA3(&fp, RSP, 16, XMM7);
+ MOVDQA3(&fp, RSP, 32, XMM8);
+ MOVDQA3(&fp, RSP, 48, XMM9);
+ MOVDQA3(&fp, RSP, 64, XMM10);
+ MOVDQA3(&fp, RSP, 80, XMM11);
+ MOVDQA3(&fp, RSP, 96, XMM12);
+ MOVDQA3(&fp, RSP, 112, XMM13);
+ MOVDQA3(&fp, RSP, 128, XMM14);
+ MOVDQA3(&fp, RSP, 144, XMM15);
#else
PUSH(&fp, RBP);
PUSH(&fp, RBX);
@@ -1022,105 +938,24 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
count++;
#else
- /* restore nonvolatile registers */
#ifdef _M_AMD64
-
- /* movdqa xmm6, [rsp] */
- *fp++ = 0x66;
- *fp++ = 0x0F;
- *fp++ = 0x6F;
- *fp++ = 0x34;
- *fp++ = 0x24;
-
- /* movdqa xmm7, [rsp + 16] */
- *fp++ = 0x66;
- *fp++ = 0x0F;
- *fp++ = 0x6F;
- *fp++ = 0x7C;
- *fp++ = 0x24;
- *fp++ = 0x10;
-
- /* movdqa xmm8, [rsp + 32] */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x6F;
- *fp++ = 0x44;
- *fp++ = 0x24;
- *fp++ = 0x20;
-
- /* movdqa xmm9, [rsp + 48] */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x6F;
- *fp++ = 0x4C;
- *fp++ = 0x24;
- *fp++ = 0x30;
-
- /* movdqa xmm10, [rsp + 64] */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x6F;
- *fp++ = 0x54;
- *fp++ = 0x24;
- *fp++ = 0x40;
-
- /* movdqa xmm11, [rsp + 80] */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x6F;
- *fp++ = 0x5C;
- *fp++ = 0x24;
- *fp++ = 0x50;
-
- /* movdqa xmm12, [rsp + 96] */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x6F;
- *fp++ = 0x64;
- *fp++ = 0x24;
- *fp++ = 0x60;
-
- /* movdqa xmm13 , [rsp + 112] */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x6F;
- *fp++ = 0x6C;
- *fp++ = 0x24;
- *fp++ = 0x70;
-
- /* movdqa xmm14, [rsp + 128] */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x6F;
- *fp++ = 0xB4;
- *fp++ = 0x24;
- *fp++ = 0x80;
- *fp++ = 0x00;
- *fp++ = 0x00;
- *fp++ = 0x00;
-
- /* movdqa xmm15, [rsp + 144] */
- *fp++ = 0x66;
- *fp++ = 0x44;
- *fp++ = 0x0F;
- *fp++ = 0x6F;
- *fp++ = 0xBC;
- *fp++ = 0x24;
- *fp++ = 0x90;
- *fp++ = 0x00;
- *fp++ = 0x00;
- *fp++ = 0x00;
-
- /* add rsp, 168 */
+ /* restore nonvolatile registers */
+ MOVDQA3(&fp, XMM6, RSP, 0);
+ MOVDQA3(&fp, XMM7, RSP, 16);
+ MOVDQA3(&fp, XMM8, RSP, 32);
+ MOVDQA3(&fp, XMM9, RSP, 48);
+ MOVDQA3(&fp, XMM10, RSP, 64);
+ MOVDQA3(&fp, XMM11, RSP, 80);
+ MOVDQA3(&fp, XMM12, RSP, 96);
+ MOVDQA3(&fp, XMM13, RSP, 112);
+ MOVDQA3(&fp, XMM14, RSP, 128);
+ MOVDQA3(&fp, XMM15, RSP, 144);
+
+ /* restore stack */
ADDI(&fp, RSP, 168);
+ /* restore the last 3 registers from the shadow space */
+
/* mov rbx, [rsp + 8] */
*fp++ = 0x48;
*fp++ = 0x8B;
OpenPOWER on IntegriCloud