diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/codegen.c | 18 | ||||
-rw-r--r-- | src/codegen_sse.h | 48 | ||||
-rw-r--r-- | src/cp_sse.h | 1 | ||||
-rw-r--r-- | src/sse.s | 9 |
4 files changed, 59 insertions, 17 deletions
diff --git a/src/codegen.c b/src/codegen.c index 274bf50..b595cb5 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -88,6 +88,8 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { size_t *ps = malloc(count * 2 * sizeof(size_t)); size_t *pps = ps; + p->constants = sse_constants; + elaborate_tree(&pps, N, leafN, 0); pps[0] = 0; pps[1] = 0; @@ -193,7 +195,19 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { int i; memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init); - IMM32_NI(fp + 3, READ_IMM32(fp + 3) + ((void *)leaf_ee_init - (void *)fp )); + + fprintf(stderr, "Leaf start address = %016p\n", fp); + fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init); + fprintf(stderr, "Constants address = %016p\n", sse_constants); + fprintf(stderr, "Constants address = %016p\n", p->constants); + +//int32_t val = READ_IMM32(fp + 3); +//fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p)); + +//int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp ); +//fprintf(stderr, "IMM = 0x%llx\n", v2); + +//IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp )); fp += (leaf_ee - leaf_ee_init); memcpy(fp, leaf_ee, leaf_oo - leaf_ee); @@ -252,7 +266,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { fprintf(stderr, "Body start address = %016p\n", fp); //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p)); memcpy(fp, x_init, x4 - x_init); - IMM32_NI(fp + 3, READ_IMM32(fp + 3) + ((void *)x_init - (void *)fp )); +//IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp )); fp += (x4 - x_init); int32_t pAddr = 0; diff --git a/src/codegen_sse.h b/src/codegen_sse.h index 490826b..7316b50 100644 --- a/src/codegen_sse.h +++ b/src/codegen_sse.h @@ -19,6 +19,8 @@ void x4(); void x8_soft(); void x8_hard(); +void sse_constants(); + typedef uint8_t insns_t; extern const uint32_t sse_leaf_ee_offsets[8]; @@ -50,25 +52,31 @@ extern const uint32_t sse_leaf_oe_offsets[8]; #define R14 14 #define R15 15 -void IMM8(uint8_t **p, uint32_t imm) { +void IMM8(uint8_t **p, int32_t imm) { *(*p)++ = (imm & 0xff); } -void IMM32(uint8_t **p, uint32_t imm) { +void IMM16(uint8_t **p, int32_t imm) { + int i; + for(i=0;i<2;i++) { + *(*p)++ = (imm & (0xff << (i*8))) >> (i*8); + } +} +void IMM32(uint8_t **p, int32_t imm) { int i; for(i=0;i<4;i++) { *(*p)++ = (imm & (0xff << (i*8))) >> (i*8); } } -void IMM32_NI(uint8_t *p, uint32_t imm) { +void IMM32_NI(uint8_t *p, int32_t imm) { int i; for(i=0;i<4;i++) { *(p+i) = (imm & (0xff << (i*8))) >> (i*8); } } -uint32_t READ_IMM32(uint8_t *p) { - uint32_t rval = 0; +int32_t READ_IMM32(uint8_t *p) { + int32_t rval = 0; int i; for(i=0;i<4;i++) { rval |= *(p+i) << (i*8); @@ -77,14 +85,29 @@ uint32_t READ_IMM32(uint8_t *p) { } void MOVI(uint8_t **p, uint8_t dst, uint32_t imm) { - if(dst < 8) { - *(*p)++ = 0xb8 + dst; - }else{ - *(*p)++ = 0x49; - *(*p)++ = 0xc7; - *(*p)++ = 0xc0 | (dst - 8); - } +// if(imm < 65536) *(*p)++ = 0x66; + if(dst >= 8) *(*p)++ = 0x41; + + //if(imm < 65536 && imm >= 256) *(*p)++ = 0x66; + + //if(imm >= 256) + *(*p)++ = 0xb8 | (dst & 0x7); +// else *(*p)++ = 0xb0 | (dst & 0x7); + + // if(imm < 256) IMM8(p, imm); +// else +//if(imm < 65536) IMM16(p, imm); +//else IMM32(p, imm); + +//if(dst < 8) { +// *(*p)++ = 0xb8 + dst; +//}else{ +// *(*p)++ = 0x49; +// *(*p)++ = 0xc7; +// *(*p)++ = 0xc0 | (dst - 8); +//} +//IMM32(p, imm); } void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) { @@ -111,6 +134,7 @@ void RET(uint8_t **p) { } void ADDI(uint8_t **p, uint8_t dst, int32_t imm) { + if(dst >= 8) *(*p)++ = 0x49; else *(*p)++ = 0x48; diff --git a/src/cp_sse.h b/src/cp_sse.h index d8d7cac..ca7b558 100644 --- a/src/cp_sse.h +++ b/src/cp_sse.h @@ -35,6 +35,7 @@ struct _ffts_plan_t { void (*transform)(struct _ffts_plan_t * restrict, const float * restrict, float * restrict); void *transform_base; size_t transform_size; + void *constants; }; typedef struct _ffts_plan_t ffts_plan_t; @@ -15,9 +15,9 @@ _neon_x8_t: .globl _leaf_ee_init .align 4, 0x90 _leaf_ee_init: - lea L_sse_constants(%rip), %r9 + #lea L_sse_constants(%rip), %r9 + movq 0xe0(%rdi), %r9 xorl %eax, %eax - # eax is loop counter (init to 0) # rcx is loop max count # rsi is 'in' base pointer @@ -404,7 +404,8 @@ _leaf_end: .globl _x_init .align 4,0x90 _x_init: - movaps L_sse_constants(%rip), %xmm3 #34.3 + #movaps L_sse_constants(%rip), %xmm3 #34.3 + movaps (%r9), %xmm3 #34.3 movq 0x20(%rdi),%r8 .globl _x4 @@ -728,7 +729,9 @@ _sse_leaf_oe_offsets: .long LEAF_OE_const_7-_leaf_oe+0x4 .section __TEXT, __const + .globl _sse_constants .align 4 +_sse_constants: L_sse_constants: L_2il0floatpacket.719: .long 0x00000000,0x80000000,0x00000000,0x80000000 |