diff options
author | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2014-11-09 13:55:22 +0200 |
---|---|---|
committer | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2014-11-09 13:55:22 +0200 |
commit | 7fd2a93b0ad374c1377b3504cf55023b90772a58 (patch) | |
tree | 50716c8329bd08023e8fde7d96c5f58038894eac | |
parent | a1ddbc888ab5d54bcd80cb7d5a7f35fad724c2a1 (diff) | |
download | ffts-7fd2a93b0ad374c1377b3504cf55023b90772a58.zip ffts-7fd2a93b0ad374c1377b3504cf55023b90772a58.tar.gz |
Replace "magic bytes" with various macros
-rw-r--r-- | src/codegen_sse.h | 270 |
1 file changed, 59 insertions, 211 deletions
diff --git a/src/codegen_sse.h b/src/codegen_sse.h index ec8b5ec..36e6fb0 100644 --- a/src/codegen_sse.h +++ b/src/codegen_sse.h @@ -331,40 +331,19 @@ static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign) x8_soft_loop = *fp; assert(!(((uintptr_t) x8_soft_loop) & 0xF)); - /* movaps xmm9, [rax] */ - /* input + 0 * input_stride */ - *(*fp)++ = 0x44; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0x08; + /* load [input + 0 * input_stride] */ + x64_sse_movaps_reg_membase(*fp, X64_XMM9, X64_RAX, 0); - /* movaps xmm6, [rcx + rbx*2] */ - /* output + 2 * output_stride */ - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0x34; - *(*fp)++ = 0x59; + /* load [output + 2 * output_stride] */ + x64_sse_movaps_reg_memindex(*fp, X64_XMM6, X64_RCX, 0, X64_RBX, 1); - /* movaps xmm11, xmm9 */ - *(*fp)++ = 0x45; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0xD9; + x64_sse_movaps_reg_reg(*fp, X64_XMM11, X64_XMM9); - /* movaps xmm7, [rcx + rsi] */ - /* output + 3 * output_stride */ - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0x3C; - *(*fp)++ = 0x31; + /* load [output + 3 * output_stride] */ + x64_sse_movaps_reg_memindex(*fp, X64_XMM7, X64_RCX, 0, X64_RSI, 0); - /* movaps xmm8, [rax + 0x10] */ - /* input + 1 * input_stride */ - *(*fp)++ = 0x44; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0x40; - *(*fp)++ = 0x10; + /* load [input + 1 * input_stride] */ + x64_sse_movaps_reg_membase(*fp, X64_XMM8, X64_RAX, 16); x64_sse_mulps_reg_reg(*fp, X64_XMM11, X64_XMM6); x64_sse_mulps_reg_reg(*fp, X64_XMM9, X64_XMM7); @@ -373,86 +352,42 @@ static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign) x64_sse_shufps_reg_reg_imm(*fp, X64_XMM7, X64_XMM7, 0xB1); x64_sse_subps_reg_reg(*fp, X64_XMM11, X64_XMM6); x64_sse_mulps_reg_reg(*fp, X64_XMM8, X64_XMM7); - - /* movaps xmm10, xmm11 */ - *(*fp)++ = 0x45; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0xD3; - + x64_sse_movaps_reg_reg(*fp, X64_XMM10, X64_XMM11); x64_sse_addps_reg_reg(*fp, 
X64_XMM9, X64_XMM8); - /* movaps xmm15, [rax + 0x20] */ - /* input + 2 * input_stride */ - *(*fp)++ = 0x44; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0x78; - *(*fp)++ = 0x20; + /* load [input + 2 * input_stride] */ + x64_sse_movaps_reg_membase(*fp, X64_XMM15, X64_RAX, 32); x64_sse_addps_reg_reg(*fp, X64_XMM10, X64_XMM9); x64_sse_subps_reg_reg(*fp, X64_XMM11, X64_XMM9); - /* movaps xmm5, [rcx] */ - /* output + 0 * output_stride */ - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0x29; + /* load [output + 0 * output_stride] */ + x64_sse_movaps_reg_membase(*fp, X64_XMM5, X64_RCX, 0); - /* movaps xmm6,xmm15 */ - *(*fp)++ = 0x41; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0xF7; + x64_sse_movaps_reg_reg(*fp, X64_XMM6, X64_XMM15); - /* movaps xmm12, [rcx + rbx*4] */ - /* output + 4 * output_stride */ - *(*fp)++ = 0x44; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0x24; - *(*fp)++ = 0x99; + /* load [output + 4 * output_stride] */ + x64_sse_movaps_reg_memindex(*fp, X64_XMM12, X64_RCX, 0, X64_RBX, 2); - /* movaps xmm2, xmm5 */ - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0xD5; + x64_sse_movaps_reg_reg(*fp, X64_XMM2, X64_XMM5); - /* movaps xmm13, [rcx + rsi*2] */ - /* output + 6 * output_stride */ - *(*fp)++ = 0x44; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0x2C; - *(*fp)++ = 0x71; + /* load [output + 6 * output_stride] */ + x64_sse_movaps_reg_memindex(*fp, X64_XMM13, X64_RCX, 0, X64_RSI, 1); x64_sse_xorps_reg_reg(*fp, X64_XMM11, X64_XMM3); - /* movaps xmm14, [rax + 0x30] */ - /* input + 3 * input_stride */ - *(*fp)++ = 0x44; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0x70; - *(*fp)++ = 0x30; + /* load [input + 3 * input_stride] */ + x64_sse_movaps_reg_membase(*fp, X64_XMM14, X64_RAX, 48); x64_sse_subps_reg_reg(*fp, X64_XMM2, X64_XMM10); x64_sse_mulps_reg_reg(*fp, X64_XMM6, X64_XMM12); x64_sse_addps_reg_reg(*fp, X64_XMM5, X64_XMM10); x64_sse_mulps_reg_reg(*fp, X64_XMM15, X64_XMM13); - /* movaps xmm10, [rax + 0x40] */ - *(*fp)++ 
= 0x44; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0x50; - *(*fp)++ = 0x40; - - /* movaps xmm0, xmm5 */ - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0xC5; + /* load [input + 4 * input_stride] */ + x64_sse_movaps_reg_membase(*fp, X64_XMM10, X64_RAX, 64); + x64_sse_movaps_reg_reg(*fp, X64_XMM0, X64_XMM5); x64_sse_shufps_reg_reg_imm(*fp, X64_XMM12, X64_XMM12, 0xB1); x64_sse_shufps_reg_reg_imm(*fp, X64_XMM13, X64_XMM13, 0xB1); x64_sse_mulps_reg_reg(*fp, X64_XMM12, X64_XMM14); @@ -460,40 +395,20 @@ static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign) x64_sse_subps_reg_reg(*fp, X64_XMM6, X64_XMM12); x64_sse_addps_reg_reg(*fp, X64_XMM15, X64_XMM14); - /* movaps xmm7, [rcx + r10] */ - *(*fp)++ = 0x42; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0x3C; - *(*fp)++ = 0x11; + /* load [output + 5 * output_stride] */ + x64_sse_movaps_reg_memindex(*fp, X64_XMM7, X64_RCX, 0, X64_R10, 0); - /* movaps xmm13, xmm10 */ - *(*fp)++ = 0x45; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0xEA; + x64_sse_movaps_reg_reg(*fp, X64_XMM13, X64_XMM10); - /* movaps xmm8, [rcx + r11] */ - *(*fp)++ = 0x46; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0x04; - *(*fp)++ = 0x19; + /* load [output + 7 * output_stride] */ + x64_sse_movaps_reg_memindex(*fp, X64_XMM8, X64_RCX, 0, X64_R11, 0); - /* movaps xmm12, xmm6 */ - *(*fp)++ = 0x44; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0xE6; + x64_sse_movaps_reg_reg(*fp, X64_XMM12, X64_XMM6); - /* movaps xmm9, [rax + 0x50] */ - *(*fp)++ = 0x44; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0x48; - *(*fp)++ = 0x50; + /* load [input + 5 * input_stride] */ + x64_sse_movaps_reg_membase(*fp, X64_XMM9, X64_RAX, 80); - /* input + 6 * input_stride */ + /* move input by 6 * input_stride */ x64_alu_reg_imm_size(*fp, X86_ADD, X64_RAX, 0x60, 8); x64_sse_mulps_reg_reg(*fp, X64_XMM13, X64_XMM7); @@ -505,55 +420,25 @@ static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign) 
x64_sse_shufps_reg_reg_imm(*fp, X64_XMM7, X64_XMM7, 0xB1); x64_sse_xorps_reg_reg(*fp, X64_XMM6, X64_XMM3); x64_sse_shufps_reg_reg_imm(*fp, X64_XMM8, X64_XMM8, 0xB1); - - /* movaps xmm12, xmm2 */ - *(*fp)++ = 0x44; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0xE2; - + x64_sse_movaps_reg_reg(*fp, X64_XMM12, X64_XMM2); x64_sse_mulps_reg_reg(*fp, X64_XMM7, X64_XMM9); x64_sse_mulps_reg_reg(*fp, X64_XMM9, X64_XMM8); x64_sse_subps_reg_reg(*fp, X64_XMM13, X64_XMM7); x64_sse_addps_reg_reg(*fp, X64_XMM10, X64_XMM9); - /* movaps xmm4, [rcx + rbx] */ - /* output + 1 * output_stride */ - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0x24; - *(*fp)++ = 0x19; + /* load [output + 1 * output_stride] */ + x64_sse_movaps_reg_memindex(*fp, X64_XMM4, X64_RCX, 0, X64_RBX, 0); x64_sse_shufps_reg_reg_imm(*fp, X64_XMM11, X64_XMM11, 0xB1); - - /* movaps xmm1, xmm4 */ - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0xCC; - + x64_sse_movaps_reg_reg(*fp, X64_XMM1, X64_XMM4); x64_sse_shufps_reg_reg_imm(*fp, X64_XMM6, X64_XMM6, 0xB1); x64_sse_addps_reg_reg(*fp, X64_XMM1, X64_XMM11); x64_sse_subps_reg_reg(*fp, X64_XMM4, X64_XMM11); x64_sse_addps_reg_reg(*fp, X64_XMM12, X64_XMM6); x64_sse_subps_reg_reg(*fp, X64_XMM2, X64_XMM6); - - /* movaps xmm11, xmm13 */ - *(*fp)++ = 0x45; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0xDD; - - /* movaps xmm14, xmm4 */ - *(*fp)++ = 0x44; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0xF4; - - /* movaps xmm6, xmm1 */ - *(*fp)++ = 0x0F; - *(*fp)++ = 0x28; - *(*fp)++ = 0xF1; - + x64_sse_movaps_reg_reg(*fp, X64_XMM11, X64_XMM13); + x64_sse_movaps_reg_reg(*fp, X64_XMM14, X64_XMM4); + x64_sse_movaps_reg_reg(*fp, X64_XMM6, X64_XMM1); x64_sse_subps_reg_reg(*fp, X64_XMM13, X64_XMM10); x64_sse_addps_reg_reg(*fp, X64_XMM11, X64_XMM10); x64_sse_xorps_reg_reg(*fp, X64_XMM13, X64_XMM3); @@ -561,72 +446,35 @@ static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign) x64_sse_subps_reg_reg(*fp, X64_XMM14, X64_XMM11); 
x64_sse_shufps_reg_reg_imm(*fp, X64_XMM13, X64_XMM13, 0xB1); - /* movaps [rcx], xmm5 */ - /* output + 0 * output_stride */ - *(*fp)++ = 0x0F; - *(*fp)++ = 0x29; - *(*fp)++ = 0x29; + /* store [output + 0 * output_stride] */ + x64_sse_movaps_membase_reg(*fp, X64_RCX, 0, X64_XMM5); - /* movaps [rcx + rbx], xmm4 */ - /* output + 1 * output_stride */ - *(*fp)++ = 0x0F; - *(*fp)++ = 0x29; - *(*fp)++ = 0x24; - *(*fp)++ = 0x19; + /* store [output + 1 * output_stride] */ + x64_sse_movaps_memindex_reg(*fp, X64_RCX, 0, X64_RBX, 0, X64_XMM4); - /* movaps [rcx + rbx*2], xmm2 */ - /* output + 2 * output_stride */ - *(*fp)++ = 0x0F; - *(*fp)++ = 0x29; - *(*fp)++ = 0x14; - *(*fp)++ = 0x59; + /* store [output + 2 * output_stride] */ + x64_sse_movaps_memindex_reg(*fp, X64_RCX, 0, X64_RBX, 1, X64_XMM2); x64_sse_subps_reg_reg(*fp, X64_XMM1, X64_XMM13); x64_sse_addps_reg_reg(*fp, X64_XMM6, X64_XMM13); - /* movaps [rcx + rsi], xmm1 */ - /* output + 3 * output_stride */ - *(*fp)++ = 0x0F; - *(*fp)++ = 0x29; - *(*fp)++ = 0x0C; - *(*fp)++ = 0x31; + /* store [output + 3 * output_stride] */ + x64_sse_movaps_memindex_reg(*fp, X64_RCX, 0, X64_RSI, 0, X64_XMM1); - /* movaps [rcx + rbx*4], xmm0 */ - /* output + 4 * output_stride */ - *(*fp)++ = 0x0F; - *(*fp)++ = 0x29; - *(*fp)++ = 0x04; - *(*fp)++ = 0x99; + /* store [output + 4 * output_stride] */ + x64_sse_movaps_memindex_reg(*fp, X64_RCX, 0, X64_RBX, 2, X64_XMM0); - /* movaps [rcx + r10], xmm14 */ - /* output + 5 * output_stride */ - *(*fp)++ = 0x46; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x29; - *(*fp)++ = 0x34; - *(*fp)++ = 0x11; + /* store [output + 5 * output_stride] */ + x64_sse_movaps_memindex_reg(*fp, X64_RCX, 0, X64_R10, 0, X64_XMM14); - /* movaps [rcx + rsi*2], xmm12 */ - /* output + 6 * output_stride */ - *(*fp)++ = 0x44; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x29; - *(*fp)++ = 0x24; - *(*fp)++ = 0x71; + /* store [output + 6 * output_stride] */ + x64_sse_movaps_memindex_reg(*fp, X64_RCX, 0, X64_RSI, 1, X64_XMM12); - /* movaps [rcx + r11], xmm6 
*/ - /* output + 7 * output_stride */ - *(*fp)++ = 0x42; - *(*fp)++ = 0x0F; - *(*fp)++ = 0x29; - *(*fp)++ = 0x34; - *(*fp)++ = 0x19; + /* store [output + 7 * output_stride] */ + x64_sse_movaps_memindex_reg(*fp, X64_RCX, 0, X64_R11, 0, X64_XMM6); - /* add rcx, 0x10 */ - *(*fp)++ = 0x48; - *(*fp)++ = 0x83; - *(*fp)++ = 0xC1; - *(*fp)++ = 0x10; + /* move output by 16 */ + x64_alu_reg_imm_size(*fp, X86_ADD, X64_RCX, 16, 8); /* cmp rcx, rdx */ *(*fp)++ = 0x48; |