diff options
-rw-r--r-- | src/codegen.c | 36 | ||||
-rw-r--r-- | src/codegen_sse.h | 123 |
2 files changed, 39 insertions, 120 deletions
diff --git a/src/codegen.c b/src/codegen.c index 7814b04..d08be0d 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -156,9 +156,9 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N /* assign loop counter register */ loop_count = 4 * p->i0; #ifdef _M_X64 - MOV_I(&fp, X86_EBX, loop_count); + x86_mov_reg_imm(fp, X86_EBX, loop_count); #else - MOV_I(&fp, X86_ECX, loop_count); + x86_mov_reg_imm(fp, X86_ECX, loop_count); #endif #endif @@ -245,10 +245,10 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N /* align loop/jump destination */ #ifdef _M_X64 - MOV_I(&fp, X86_EBX, loop_count); + x86_mov_reg_imm(fp, X86_EBX, loop_count); ffts_align_mem16(&fp, 3); #else - MOV_I(&fp, X86_ECX, loop_count); + x86_mov_reg_imm(fp, X86_ECX, loop_count); ffts_align_mem16(&fp, 4); #endif @@ -298,10 +298,10 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N /* align loop/jump destination */ #ifdef _M_X64 - MOV_I(&fp, X86_EBX, loop_count); + x86_mov_reg_imm(fp, X86_EBX, loop_count); ffts_align_mem16(&fp, 3); #else - MOV_I(&fp, X86_ECX, loop_count); + x86_mov_reg_imm(fp, X86_ECX, loop_count); ffts_align_mem16(&fp, 4); #endif @@ -325,10 +325,10 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N /* align loop/jump destination */ #ifdef _M_X64 - MOV_I(&fp, X86_EBX, loop_count); + x86_mov_reg_imm(fp, X86_EBX, loop_count); ffts_align_mem16(&fp, 8); #else - MOV_I(&fp, X86_ECX, loop_count); + x86_mov_reg_imm(fp, X86_ECX, loop_count); ffts_align_mem16(&fp, 9); #endif @@ -352,9 +352,9 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N if (!pN) { #ifdef _M_X64 - MOV_I(&fp, X86_EBX, pps[0]); + x86_mov_reg_imm(fp, X86_EBX, pps[0]); #else - MOV_I(&fp, X86_ECX, pps[0] / 4); + x86_mov_reg_imm(fp, X86_ECX, pps[0] / 4); #endif } else { int offset = (4 * pps[1]) - pAddr; @@ -370,9 +370,17 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N int factor = ffts_ctzl(pps[0]) - ffts_ctzl(pN); #ifdef _M_X64 - SHIFT(&fp, X86_EBX, factor); + if (factor > 0) { + x86_shift_reg_imm(fp, X86_SHL, X86_EBX, factor); + } else { + x86_shift_reg_imm(fp, X86_SHR, X86_EBX, -factor); + } #else - SHIFT(&fp, X86_ECX, factor); + if (factor > 0) { + x86_shift_reg_imm(fp, X86_SHL, X86_ECX, factor); + } else { + x86_shift_reg_imm(fp, X86_SHR, X86_ECX, -factor); + } #endif } } @@ -389,9 +397,9 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N } if (pps[0] == 2 * leaf_N) { - CALL(&fp, x_4_addr); + x64_call_imm(fp, (char*) x_4_addr - ((char*) fp + 4)); } else { - CALL(&fp, x_8_addr); + x64_call_imm(fp, (char*) x_8_addr - ((char*) fp + 4)); } pAddr = 4 * pps[1]; diff --git a/src/codegen_sse.h b/src/codegen_sse.h index 3c3a6ef..c7351fc 100644 --- a/src/codegen_sse.h +++ b/src/codegen_sse.h @@ -119,26 +119,11 @@ static void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) } } -static void CALL(uint8_t **p, uint8_t *func) -{ - *(*p)++ = 0xe8; - IMM32(p, func - *p - 4); -} - static void IMM8(uint8_t **p, int32_t imm) { *(*p)++ = (imm & 0xff); } -static void IMM16(uint8_t **p, int32_t imm) -{ - int i; - - for (i = 0; i < 2; i++) { - *(*p)++ = (imm & (0xff << (8 * i))) >> (8 * i); - } -} - static void IMM32(uint8_t **p, int32_t imm) { int i; @@ -368,33 +353,6 @@ static FFTS_INLINE void MOV_D(uint8_t **p, uint8_t reg1, uint8_t reg2, int32_t d } } -static void MOV_I(uint8_t **p, uint8_t dst, uint64_t imm) -{ - /* REX prefix */ - if (dst >= 8 || imm > UINT32_MAX) { - uint8_t val = 0x40; - - if (dst >= 8) { - val |= 1; - } - - if (imm > UINT32_MAX) { - val |= 8; - } - - *(*p)++ = val; - } - - /* opcode */ - *(*p)++ = 0xb8 | (dst & 0x7); - - if (imm > UINT32_MAX) { - IMM64(p, imm); - } else { - IMM32(p, imm); - } -} - static FFTS_INLINE void MOV_R(uint8_t **p, uint8_t reg1, uint8_t reg2, int is_store) { uint8_t r1 = (reg1 & 7); @@ -437,53 +395,6 @@ static FFTS_INLINE void MULPS(uint8_t **p, uint8_t reg2, uint8_t reg1) *(*p)++ = 0xC0 | r1 | (r2 << 3); } -static void POP(uint8_t **p, uint8_t reg) -{ - if (reg >= 8) { - *(*p)++ = 0x41; - } - - *(*p)++ = 0x58 | (reg & 7); -} - -static void PUSH(uint8_t **p, uint8_t reg) -{ - if (reg >= 8) { - *(*p)++ = 0x41; - } - - *(*p)++ = 0x50 | (reg & 7); -} - -static int32_t READ_IMM32(uint8_t *p) -{ - int32_t rval = 0; - int i; - - for (i = 0; i < 4; i++) { - rval |= *(p+i) << (8 * i); - } - - return rval; -} - -static void SHIFT(uint8_t **p, uint8_t reg, int shift) -{ - if (reg >= 8) { - *(*p)++ = 0x49; - } - - - *(*p)++ = 0xc1; - if (shift > 0) { - *(*p)++ = 0xe0 | (reg & 7); - *(*p)++ = (shift & 0xff); - } else { - *(*p)++ = 0xe8 | (reg & 7); - *(*p)++ = ((-shift) & 0xff); - } -} - static FFTS_INLINE void SHUFPS(uint8_t **p, uint8_t reg2, uint8_t reg1, const int select) { uint8_t r1 = (reg1 & 7); @@ -662,15 +573,15 @@ static FFTS_INLINE void generate_epilogue(insns_t **fp) MOV_D(fp, X64_RBX, X64_RSP, 8, 0); MOV_D(fp, X64_RSI, X64_RSP, 16, 0); MOV_D(fp, X64_RDI, X64_RSP, 24, 0); -#else - POP(fp, X64_R15); - POP(fp, X64_R14); - POP(fp, X64_R13); - POP(fp, X64_R12); - POP(fp, X64_R11); - POP(fp, X64_R10); - POP(fp, X64_RBX); - POP(fp, X64_RBP); +#else + x64_pop_reg(*fp, X64_R15); + x64_pop_reg(*fp, X64_R14); + x64_pop_reg(*fp, X64_R13); + x64_pop_reg(*fp, X64_R12); + x64_pop_reg(*fp, X64_R11); + x64_pop_reg(*fp, X64_R10); + x64_pop_reg(*fp, X64_RBX); + x64_pop_reg(*fp, X64_RBP); #endif x64_ret(*fp); @@ -706,14 +617,14 @@ static FFTS_INLINE insns_t* generate_prologue(insns_t **fp, ffts_plan_t *p) MOVDQA3(fp, X64_RSP, 128, XMM14); MOVDQA3(fp, X64_RSP, 144, XMM15); #else - PUSH(fp, X64_RBP); - PUSH(fp, X64_RBX); - PUSH(fp, X64_R10); - PUSH(fp, X64_R11); - PUSH(fp, X64_R12); - PUSH(fp, X64_R13); - PUSH(fp, X64_R14); - PUSH(fp, X64_R15); + x64_push_reg(*fp, X64_RBP); + x64_push_reg(*fp, X64_RBX); + x64_push_reg(*fp, X64_R10); + x64_push_reg(*fp, X64_R11); + x64_push_reg(*fp, X64_R12); + x64_push_reg(*fp, X64_R13); + x64_push_reg(*fp, X64_R14); + x64_push_reg(*fp, X64_R15); #endif return start; |