summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/codegen.c 18
-rw-r--r-- src/codegen_sse.h 48
-rw-r--r-- src/cp_sse.h 1
-rw-r--r-- src/sse.s 9
4 files changed, 59 insertions, 17 deletions
diff --git a/src/codegen.c b/src/codegen.c
index 274bf50..b595cb5 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -88,6 +88,8 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) {
size_t *ps = malloc(count * 2 * sizeof(size_t));
size_t *pps = ps;
+ p->constants = sse_constants;
+
elaborate_tree(&pps, N, leafN, 0);
pps[0] = 0;
pps[1] = 0;
@@ -193,7 +195,19 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) {
int i;
memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init);
- IMM32_NI(fp + 3, READ_IMM32(fp + 3) + ((void *)leaf_ee_init - (void *)fp ));
+
+ fprintf(stderr, "Leaf start address = %016p\n", fp);
+ fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init);
+ fprintf(stderr, "Constants address = %016p\n", sse_constants);
+ fprintf(stderr, "Constants address = %016p\n", p->constants);
+
+//int32_t val = READ_IMM32(fp + 3);
+//fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p));
+
+//int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp );
+//fprintf(stderr, "IMM = 0x%llx\n", v2);
+
+//IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp ));
fp += (leaf_ee - leaf_ee_init);
memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
@@ -252,7 +266,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) {
fprintf(stderr, "Body start address = %016p\n", fp);
//LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p));
memcpy(fp, x_init, x4 - x_init);
- IMM32_NI(fp + 3, READ_IMM32(fp + 3) + ((void *)x_init - (void *)fp ));
+//IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp ));
fp += (x4 - x_init);
int32_t pAddr = 0;
diff --git a/src/codegen_sse.h b/src/codegen_sse.h
index 490826b..7316b50 100644
--- a/src/codegen_sse.h
+++ b/src/codegen_sse.h
@@ -19,6 +19,8 @@ void x4();
void x8_soft();
void x8_hard();
+void sse_constants();
+
typedef uint8_t insns_t;
extern const uint32_t sse_leaf_ee_offsets[8];
@@ -50,25 +52,31 @@ extern const uint32_t sse_leaf_oe_offsets[8];
#define R14 14
#define R15 15
-void IMM8(uint8_t **p, uint32_t imm) {
+void IMM8(uint8_t **p, int32_t imm) { /* store the low byte of imm at *p, then advance *p by one */
*(*p)++ = (imm & 0xff);
}
-void IMM32(uint8_t **p, uint32_t imm) {
+void IMM16(uint8_t **p, int32_t imm) {
+ uint8_t *out = *p; /* emit the low 16 bits of imm, least-significant byte first */
+ out[0] = (uint8_t)(imm & 0xff);
+ out[1] = (uint8_t)((imm & 0xff00) >> 8);
+ *p = out + 2; /* leave the caller's cursor just past the two bytes written */
+}
+/* Append imm to the byte stream at *p as four bytes, least-significant
+ * byte first, and advance *p by four. */
+void IMM32(uint8_t **p, int32_t imm) {
int i;
for(i=0;i<4;i++) {
- *(*p)++ = (imm & (0xff << (i*8))) >> (i*8);
+ /* Shift an unsigned copy: 0xff << 24 overflows int, and right-shifting a
+  * negative int32_t is implementation-defined. */
+ *(*p)++ = (uint8_t)(((uint32_t)imm >> (i*8)) & 0xff);
}
}
-void IMM32_NI(uint8_t *p, uint32_t imm) {
+/* Overwrite the four bytes at p with imm, least-significant byte first.
+ * p is taken by value, so the caller's pointer is not advanced. */
+void IMM32_NI(uint8_t *p, int32_t imm) {
int i;
for(i=0;i<4;i++) {
- *(p+i) = (imm & (0xff << (i*8))) >> (i*8);
+ /* Shift an unsigned copy: 0xff << 24 overflows int, and right-shifting a
+  * negative int32_t is implementation-defined. */
+ *(p+i) = (uint8_t)(((uint32_t)imm >> (i*8)) & 0xff);
}
}
-uint32_t READ_IMM32(uint8_t *p) {
- uint32_t rval = 0;
+int32_t READ_IMM32(uint8_t *p) {
+ int32_t rval = 0;
int i;
for(i=0;i<4;i++) {
rval |= *(p+i) << (i*8);
@@ -77,14 +85,29 @@ uint32_t READ_IMM32(uint8_t *p) {
}
void MOVI(uint8_t **p, uint8_t dst, uint32_t imm) {
- if(dst < 8) {
- *(*p)++ = 0xb8 + dst;
- }else{
- *(*p)++ = 0x49;
- *(*p)++ = 0xc7;
- *(*p)++ = 0xc0 | (dst - 8);
- }
+ /* Emit `mov r32, imm32`: opcode 0xB8 plus the low three bits of dst,
+  * followed by the 32-bit immediate. Registers 8-15 need a REX prefix with
+  * the B bit set (0x41) to select the upper half of the register file.
+  * A 32-bit move zero-extends into the full 64-bit register, whereas the
+  * removed 0x49 0xC7 form for registers 8-15 sign-extended the immediate. */
+ if(dst >= 8) *(*p)++ = 0x41;
+ *(*p)++ = 0xb8 | (dst & 0x7);
IMM32(p, imm);
}
void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) {
@@ -111,6 +134,7 @@ void RET(uint8_t **p) {
}
void ADDI(uint8_t **p, uint8_t dst, int32_t imm) {
+
if(dst >= 8) *(*p)++ = 0x49;
else *(*p)++ = 0x48;
diff --git a/src/cp_sse.h b/src/cp_sse.h
index d8d7cac..ca7b558 100644
--- a/src/cp_sse.h
+++ b/src/cp_sse.h
@@ -35,6 +35,7 @@ struct _ffts_plan_t {
void (*transform)(struct _ffts_plan_t * restrict, const float * restrict, float * restrict);
void *transform_base;
size_t transform_size;
+ void *constants;
};
typedef struct _ffts_plan_t ffts_plan_t;
diff --git a/src/sse.s b/src/sse.s
index 9a1f937..3742844 100644
--- a/src/sse.s
+++ b/src/sse.s
@@ -15,9 +15,9 @@ _neon_x8_t:
.globl _leaf_ee_init
.align 4, 0x90
_leaf_ee_init:
- lea L_sse_constants(%rip), %r9
+ #lea L_sse_constants(%rip), %r9
+ movq 0xe0(%rdi), %r9
xorl %eax, %eax
-
# eax is loop counter (init to 0)
# rcx is loop max count
# rsi is 'in' base pointer
@@ -404,7 +404,8 @@ _leaf_end:
.globl _x_init
.align 4,0x90
_x_init:
- movaps L_sse_constants(%rip), %xmm3 #34.3
+ #movaps L_sse_constants(%rip), %xmm3 #34.3
+ movaps (%r9), %xmm3 #34.3
movq 0x20(%rdi),%r8
.globl _x4
@@ -728,7 +729,9 @@ _sse_leaf_oe_offsets:
.long LEAF_OE_const_7-_leaf_oe+0x4
.section __TEXT, __const
+ .globl _sse_constants
.align 4
+_sse_constants:
L_sse_constants:
L_2il0floatpacket.719:
.long 0x00000000,0x80000000,0x00000000,0x80000000
OpenPOWER on IntegriCloud