summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorJukka Ojanen <jukka.ojanen@linkotec.net>2014-11-06 11:38:55 +0200
committerJukka Ojanen <jukka.ojanen@linkotec.net>2014-11-06 11:38:55 +0200
commit18d19a9f8b4e409b4db46338c9040a61555f9c58 (patch)
treed086566753530cd86bafc68f03887a1eceb844fd /src
parenta0db4af6fe8f68a62cbf993871137d4cd341dfc5 (diff)
downloadffts-18d19a9f8b4e409b4db46338c9040a61555f9c58.zip
ffts-18d19a9f8b4e409b4db46338c9040a61555f9c58.tar.gz
Win64 actually "generate_size8_base_case" instead of copying
Diffstat (limited to 'src')
-rw-r--r--src/codegen.c74
-rw-r--r--src/codegen_sse.h1059
-rw-r--r--src/sequitur.h630
3 files changed, 1090 insertions, 673 deletions
diff --git a/src/codegen.c b/src/codegen.c
index 9d95519..4e70cb1 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -144,21 +144,21 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
fp = (insns_t*) p->transform_base;
- /* generate base cases */
- x_4_addr = generate_size4_base_case(&fp, sign);
- x_8_addr = generate_size8_base_case(&fp, sign);
+ /* generate base cases */
+ x_4_addr = generate_size4_base_case(&fp, sign);
+ x_8_addr = generate_size8_base_case(&fp, sign);
#ifdef __arm__
- start = generate_prologue(&fp, p);
+ start = generate_prologue(&fp, p);
#else
- start = generate_prologue(&fp, p);
+ start = generate_prologue(&fp, p);
/* assign loop counter register */
loop_count = 4 * p->i0;
#ifdef _M_X64
- MOVI(&fp, EBX, loop_count);
+ MOV_I(&fp, EBX, loop_count);
#else
- MOVI(&fp, ECX, loop_count);
+ MOV_I(&fp, ECX, loop_count);
#endif
#endif
@@ -204,18 +204,18 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
//fprintf(stderr, "Body start address = %016p\n", start);
#ifdef _M_X64
- /* generate function */
+ /* generate function */
- /* clear */
- XOR2(&fp, EAX, EAX);
-
- /* set "pointer" to offsets */
- MOV(&fp, RDI, RCX, 0, 0);
+ /* clear */
+ XOR2(&fp, EAX, EAX);
- /* set "pointer" to constants */
- MOV(&fp, RSI, RCX, 0xE0, 0);
+ /* set "pointer" to offsets */
+ MOV_D(&fp, RDI, RCX, 0, 0);
- /* align loop/jump destination */
+ /* set "pointer" to constants */
+ MOV_D(&fp, RSI, RCX, 0xE0, 0);
+
+ /* align loop/jump destination */
ffts_align_mem16(&fp, 8);
#else
/* copy function */
@@ -245,10 +245,10 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
/* align loop/jump destination */
#ifdef _M_X64
- MOVI(&fp, EBX, loop_count);
+ MOV_I(&fp, EBX, loop_count);
ffts_align_mem16(&fp, 3);
#else
- MOVI(&fp, ECX, loop_count);
+ MOV_I(&fp, ECX, loop_count);
ffts_align_mem16(&fp, 4);
#endif
@@ -298,10 +298,10 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
/* align loop/jump destination */
#ifdef _M_X64
- MOVI(&fp, EBX, loop_count);
+ MOV_I(&fp, EBX, loop_count);
ffts_align_mem16(&fp, 3);
#else
- MOVI(&fp, ECX, loop_count);
+ MOV_I(&fp, ECX, loop_count);
ffts_align_mem16(&fp, 4);
#endif
@@ -325,10 +325,10 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
/* align loop/jump destination */
#ifdef _M_X64
- MOVI(&fp, EBX, loop_count);
+ MOV_I(&fp, EBX, loop_count);
ffts_align_mem16(&fp, 8);
#else
- MOVI(&fp, ECX, loop_count);
+ MOV_I(&fp, ECX, loop_count);
ffts_align_mem16(&fp, 9);
#endif
@@ -343,38 +343,26 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
fp += len;
}
-#ifdef _M_X64
- /* generate function */
- MOVAPS2(&fp, XMM3, RSI);
-
- /* set "pointer" to twiddle factors */
- MOV(&fp, RDI, RCX, 0x20, 0);
-#else
- /* copy function */
- assert((char*) x4 > (char*) x_init);
- len = (char*) x4 - (char*) x_init;
- memcpy(fp, x_init, len);
- fp += len;
-#endif
+ generate_transform_init(&fp);
- /* generate subtransform calls */
+ /* generate subtransform calls */
count = 2;
while (pps[0]) {
size_t ws_is;
if (!pN) {
#ifdef _M_X64
- MOVI(&fp, EBX, pps[0]);
+ MOV_I(&fp, EBX, pps[0]);
#else
- MOVI(&fp, ECX, pps[0] / 4);
+ MOV_I(&fp, ECX, pps[0] / 4);
#endif
} else {
int offset = (4 * pps[1]) - pAddr;
if (offset) {
#ifdef _M_X64
- ADDI(&fp, R8, offset);
+ ADD_I(&fp, R8, offset);
#else
- ADDI(&fp, RDX, offset);
+ ADD_I(&fp, RDX, offset);
#endif
}
@@ -394,9 +382,9 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
int offset = (int) (ws_is - pLUT);
#ifdef _M_X64
- ADDI(&fp, RDI, offset);
+ ADD_I(&fp, RDI, offset);
#else
- ADDI(&fp, R8, offset);
+ ADD_I(&fp, R8, offset);
#endif
}
@@ -701,7 +689,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
*fp++ = POP_LR();
count++;
#else
- generate_epilogue(&fp);
+ generate_epilogue(&fp);
#endif
// *fp++ = B(14); count++;
diff --git a/src/codegen_sse.h b/src/codegen_sse.h
index abb5008..fa67a32 100644
--- a/src/codegen_sse.h
+++ b/src/codegen_sse.h
@@ -35,6 +35,7 @@
#define FFTS_CODEGEN_SSE_H
#include <assert.h>
+#include <string.h>
void neon_x4(float *, size_t, float *);
void neon_x8(float *, size_t, float *);
@@ -90,8 +91,8 @@ extern const uint32_t sse_leaf_oe_offsets[8];
#define XMM0 (XMM_REG | 0x0)
#define XMM1 (XMM_REG | 0x1)
-#define XMM2 (XMM_REG | 0x2)
-#define XMM3 (XMM_REG | 0x3)
+#define XMM2 (XMM_REG | 0x2)
+#define XMM3 (XMM_REG | 0x3)
#define XMM4 (XMM_REG | 0x4)
#define XMM5 (XMM_REG | 0x5)
#define XMM6 (XMM_REG | 0x6)
@@ -111,7 +112,26 @@ extern const uint32_t sse_leaf_oe_offsets[8];
static void IMM8(uint8_t **p, int32_t imm);
static void IMM32(uint8_t **p, int32_t imm);
-static void ADDI(uint8_t **p, uint8_t dst, int32_t imm)
+static FFTS_INLINE void ADDPS(uint8_t **p, uint8_t reg2, uint8_t reg1)
+{
+ uint8_t r1 = (reg1 & 7);
+ uint8_t r2 = (reg2 & 7);
+
+ /* REX prefix */
+ if ((reg1 & 8) || (reg2 & 8)) {
+ *(*p)++ = 0x40 | ((reg1 & 8) >> 3) | ((reg2 & 8) >> 1);
+ }
+
+ /* esacape opcode */
+ *(*p)++ = 0x0F;
+
+ /* opcode */
+ *(*p)++ = 0x58;
+ *(*p)++ = 0xC0 | r1 | (r2 << 3);
+}
+
+/* Immediate */
+static void ADD_I(uint8_t **p, uint8_t dst, int32_t imm)
{
if (dst >= 8) {
*(*p)++ = 0x49;
@@ -201,297 +221,268 @@ static void LEA(uint8_t **p, uint8_t dst, uint8_t base, int32_t disp)
ADDRMODE(p, dst, base, disp);
}
-static FFTS_INLINE void MOV(uint8_t **p, uint8_t reg1, uint8_t reg2, int32_t disp, int is_store)
+static FFTS_INLINE void MOVAPS(uint8_t **p, uint8_t reg1, uint8_t reg2, int32_t disp, int is_store)
{
- uint8_t r1 = (reg1 & 7);
- uint8_t r2 = (reg2 & 7);
-
- if ((reg1 & 8) || (reg2 & 8)) {
- *(*p)++ = 0x49;
- } else {
- *(*p)++ = 0x48;
- }
-
- if (is_store) {
- *(*p)++ = 0x89;
- } else {
- *(*p)++ = 0x8B;
- }
-
- if (disp == 0) {
- *(*p)++ = r2 | (r1 << 3);
+ uint8_t r1 = (reg1 & 7);
+ uint8_t r2 = (reg2 & 7);
+ uint8_t r;
- if (r2 == 4) {
- *(*p)++ = 0x24;
- }
- } else if (disp <= 127 && disp >= -128) {
- *(*p)++ = 0x40 | r2 | (r1 << 3);
+ /* REX prefix */
+ if ((reg1 & 8) || (reg2 & 8)) {
+ *(*p)++ = 0x40 | ((reg1 & 8) >> 3) | ((reg2 & 8) >> 1);
+ }
- if (r2 == 4) {
- *(*p)++ = 0x24;
- }
+ /* esacape opcode */
+ *(*p)++ = 0x0F;
- IMM8(p, disp);
+ /* opcode */
+ if (is_store) {
+ *(*p)++ = 0x29;
} else {
- *(*p)++ = 0x80 | r2 | (r1 << 3) | (r1 << 11);
-
- if (r2 == 4) {
- *(*p)++ = 0x24;
- }
-
- IMM32(p, disp);
+ *(*p)++ = 0x28;
}
-}
-
-static FFTS_INLINE void MOVAPS(uint8_t **p, uint8_t reg1, uint8_t reg2, int32_t disp, int is_store)
-{
- uint8_t r1 = (reg1 & 7);
- uint8_t r2 = (reg2 & 7);
- uint8_t r;
-
- /* REX prefix */
- if ((reg1 & 8) || (reg2 & 8)) {
- *(*p)++ = 0x40 | ((reg1 & 8) >> 3) | ((reg2 & 8) >> 1);
- }
-
- /* esacape opcode */
- *(*p)++ = 0x0F;
- /* opcode */
- if (is_store) {
- *(*p)++ = 0x29;
- } else {
- *(*p)++ = 0x28;
- }
+ r = r1 | (r2 << 3);
- r = r1 | (r2 << 3);
-
- if ((reg1 & XMM_REG) && (reg2 & XMM_REG)) {
- assert(disp == 0);
- *(*p)++ = 0xC0 | r;
- } else {
- assert((reg1 & XMM_REG) || (reg2 & XMM_REG));
+ if ((reg1 & XMM_REG) && (reg2 & XMM_REG)) {
+ assert(disp == 0);
+ *(*p)++ = 0xC0 | r;
+ } else {
+ assert((reg1 & XMM_REG) || (reg2 & XMM_REG));
- if (disp == 0 && r1 != 5) {
- *(*p)++ = r;
+ if (disp == 0 && r1 != 5) {
+ *(*p)++ = r;
- if (r1 == 4) {
- *(*p)++ = 0x24;
- }
- } else {
- if (disp <= 127 && disp >= -128) {
- *(*p)++ = 0x40 | r;
+ if (r1 == 4) {
+ *(*p)++ = 0x24;
+ }
+ } else {
+ if (disp <= 127 && disp >= -128) {
+ *(*p)++ = 0x40 | r;
- if (r1 == 4) {
- *(*p)++ = 0x24;
- }
+ if (r1 == 4) {
+ *(*p)++ = 0x24;
+ }
- IMM8(p, disp);
- } else {
- *(*p)++ = 0x80 | r;
+ IMM8(p, disp);
+ } else {
+ *(*p)++ = 0x80 | r;
- if (r1 == 4) {
- *(*p)++ = 0x24;
- }
+ if (r1 == 4) {
+ *(*p)++ = 0x24;
+ }
- IMM32(p, disp);
- }
- }
- }
+ IMM32(p, disp);
+ }
+ }
+ }
}
static FFTS_INLINE void MOVAPS2(uint8_t **p, uint8_t reg1, uint8_t reg2)
{
- if (reg1 & XMM_REG) {
- MOVAPS(p, reg2, reg1, 0, 0);
- } else {
- MOVAPS(p, reg1, reg2, 0, 1);
- }
+ if (reg1 & XMM_REG) {
+ MOVAPS(p, reg2, reg1, 0, 0);
+ } else {
+ MOVAPS(p, reg1, reg2, 0, 1);
+ }
}
static FFTS_INLINE void MOVAPS3(uint8_t **p, uint8_t reg1, int32_t op2, int32_t op3)
{
- if (reg1 & XMM_REG) {
- MOVAPS(p, (uint8_t) op2, reg1, op3, 0);
- } else {
- MOVAPS(p, reg1, (uint8_t) op3, op2, 1);
- }
+ if (reg1 & XMM_REG) {
+ MOVAPS(p, (uint8_t) op2, reg1, op3, 0);
+ } else {
+ MOVAPS(p, reg1, (uint8_t) op3, op2, 1);
+ }
}
static FFTS_INLINE void MOVDQA(uint8_t **p, uint8_t reg1, uint8_t reg2, int32_t disp, int is_store)
{
- uint8_t r1 = (reg1 & 7);
- uint8_t r2 = (reg2 & 7);
- uint8_t r;
+ uint8_t r1 = (reg1 & 7);
+ uint8_t r2 = (reg2 & 7);
+ uint8_t r;
- /* mandatory prefix */
- *(*p)++ = 0x66;
+ /* mandatory prefix */
+ *(*p)++ = 0x66;
- /* REX prefix */
- if ((reg1 & 8) || (reg2 & 8)) {
- *(*p)++ = 0x40 | ((reg1 & 8) >> 3) | ((reg2 & 8) >> 1);
- }
+ /* REX prefix */
+ if ((reg1 & 8) || (reg2 & 8)) {
+ *(*p)++ = 0x40 | ((reg1 & 8) >> 3) | ((reg2 & 8) >> 1);
+ }
- /* esacape opcode */
- *(*p)++ = 0x0F;
+ /* esacape opcode */
+ *(*p)++ = 0x0F;
- /* opcode */
- if (is_store) {
- *(*p)++ = 0x7F;
- } else {
- *(*p)++ = 0x6F;
- }
+ /* opcode */
+ if (is_store) {
+ *(*p)++ = 0x7F;
+ } else {
+ *(*p)++ = 0x6F;
+ }
- r = r1 | (r2 << 3);
+ r = r1 | (r2 << 3);
- if ((reg1 & XMM_REG) && (reg2 & XMM_REG)) {
- assert(disp == 0);
- *(*p)++ = 0xC0 | r;
- } else {
- assert((reg1 & XMM_REG) || (reg2 & XMM_REG));
+ if ((reg1 & XMM_REG) && (reg2 & XMM_REG)) {
+ assert(disp == 0);
+ *(*p)++ = 0xC0 | r;
+ } else {
+ assert((reg1 & XMM_REG) || (reg2 & XMM_REG));
- if (disp == 0 && r1 != 5) {
- *(*p)++ = r;
+ if (disp == 0 && r1 != 5) {
+ *(*p)++ = r;
- if (r1 == 4) {
- *(*p)++ = 0x24;
- }
- } else {
- if (disp <= 127 && disp >= -128) {
- *(*p)++ = 0x40 | r;
+ if (r1 == 4) {
+ *(*p)++ = 0x24;
+ }
+ } else {
+ if (disp <= 127 && disp >= -128) {
+ *(*p)++ = 0x40 | r;
- if (r1 == 4) {
- *(*p)++ = 0x24;
- }
+ if (r1 == 4) {
+ *(*p)++ = 0x24;
+ }
- IMM8(p, disp);
- } else {
- *(*p)++ = 0x80 | r;
+ IMM8(p, disp);
+ } else {
+ *(*p)++ = 0x80 | r;
- if (r1 == 4) {
- *(*p)++ = 0x24;
- }
+ if (r1 == 4) {
+ *(*p)++ = 0x24;
+ }
- IMM32(p, disp);
- }
- }
- }
+ IMM32(p, disp);
+ }
+ }
+ }
}
static FFTS_INLINE void MOVDQA2(uint8_t **p, uint8_t reg1, uint8_t reg2)
{
- if (reg1 & XMM_REG) {
- MOVDQA(p, reg2, reg1, 0, 0);
- } else {
- MOVDQA(p, reg1, reg2, 0, 1);
- }
+ if (reg1 & XMM_REG) {
+ MOVDQA(p, reg2, reg1, 0, 0);
+ } else {
+ MOVDQA(p, reg1, reg2, 0, 1);
+ }
}
static FFTS_INLINE void MOVDQA3(uint8_t **p, uint8_t reg1, int32_t op2, int32_t op3)
{
- if (reg1 & XMM_REG) {
- MOVDQA(p, (uint8_t) op2, reg1, op3, 0);
- } else {
- MOVDQA(p, reg1, (uint8_t) op3, op2, 1);
- }
+ if (reg1 & XMM_REG) {
+ MOVDQA(p, (uint8_t) op2, reg1, op3, 0);
+ } else {
+ MOVDQA(p, reg1, (uint8_t) op3, op2, 1);
+ }
}
-static void MOVI(uint8_t **p, uint8_t dst, uint64_t imm)
+static FFTS_INLINE void MOV_D(uint8_t **p, uint8_t reg1, uint8_t reg2, int32_t disp, int is_store)
{
- /* REX prefix */
- if (dst >= 8 || imm > UINT32_MAX) {
- uint8_t val = 0x40;
-
- if (dst >= 8) {
- val |= 1;
- }
+ uint8_t r1 = (reg1 & 7);
+ uint8_t r2 = (reg2 & 7);
- if (imm > UINT32_MAX) {
- val |= 8;
- }
+ if ((reg1 & 8) || (reg2 & 8)) {
+ *(*p)++ = 0x49;
+ } else {
+ *(*p)++ = 0x48;
+ }
- *(*p)++ = val;
+ if (is_store) {
+ *(*p)++ = 0x89;
+ } else {
+ *(*p)++ = 0x8B;
}
- /* opcode */
- *(*p)++ = 0xb8 | (dst & 0x7);
-
- if (imm > UINT32_MAX) {
- IMM64(p, imm);
- } else {
- IMM32(p, imm);
- }
-}
+ if (disp == 0) {
+ *(*p)++ = r2 | (r1 << 3);
-static FFTS_INLINE void MULPS(uint8_t **p, uint8_t reg1, uint8_t reg2, int32_t disp, int is_store)
-{
- uint8_t r1 = (reg1 & 7);
- uint8_t r2 = (reg2 & 7);
- uint8_t r;
+ if (r2 == 4) {
+ *(*p)++ = 0x24;
+ }
+ } else if (disp <= 127 && disp >= -128) {
+ *(*p)++ = 0x40 | r2 | (r1 << 3);
- /* REX prefix */
- if ((reg1 & 8) || (reg2 & 8)) {
- *(*p)++ = 0x40 | ((reg1 & 8) >> 3) | ((reg2 & 8) >> 1);
- }
+ if (r2 == 4) {
+ *(*p)++ = 0x24;
+ }
- /* esacape opcode */
- *(*p)++ = 0x0F;
-
- /* opcode */
- *(*p)++ = 0x59;
-
- r = r1 | (r2 << 3);
+ IMM8(p, disp);
+ } else {
+ *(*p)++ = 0x80 | r2 | (r1 << 3) | (r1 << 11);
- if ((reg1 & XMM_REG) && (reg2 & XMM_REG)) {
- assert(disp == 0);
- *(*p)++ = 0xC0 | r;
- } else {
- assert((reg1 & XMM_REG) || (reg2 & XMM_REG));
+ if (r2 == 4) {
+ *(*p)++ = 0x24;
+ }
- if (disp == 0 && r1 != 5) {
- *(*p)++ = r;
+ IMM32(p, disp);
+ }
+}
- if (r1 == 4) {
- *(*p)++ = 0x24;
- }
- } else {
- if (disp <= 127 && disp >= -128) {
- *(*p)++ = 0x40 | r;
+static void MOV_I(uint8_t **p, uint8_t dst, uint64_t imm)
+{
+ /* REX prefix */
+ if (dst >= 8 || imm > UINT32_MAX) {
+ uint8_t val = 0x40;
- if (r1 == 4) {
- *(*p)++ = 0x24;
- }
+ if (dst >= 8) {
+ val |= 1;
+ }
- IMM8(p, disp);
- } else {
- *(*p)++ = 0x80 | r;
+ if (imm > UINT32_MAX) {
+ val |= 8;
+ }
- if (r1 == 4) {
- *(*p)++ = 0x24;
- }
+ *(*p)++ = val;
+ }
- IMM32(p, disp);
- }
- }
- }
+ /* opcode */
+ *(*p)++ = 0xb8 | (dst & 0x7);
+
+ if (imm > UINT32_MAX) {
+ IMM64(p, imm);
+ } else {
+ IMM32(p, imm);
+ }
}
-static FFTS_INLINE void MULPS2(uint8_t **p, uint8_t reg1, uint8_t reg2)
+static FFTS_INLINE void MOV_R(uint8_t **p, uint8_t reg1, uint8_t reg2, int is_store)
{
- if (reg1 & XMM_REG) {
- MULPS(p, reg2, reg1, 0, 0);
- } else {
- MULPS(p, reg1, reg2, 0, 1);
- }
+ uint8_t r1 = (reg1 & 7);
+ uint8_t r2 = (reg2 & 7);
+
+ if ((reg1 & 8) || (reg2 & 8)) {
+ *(*p)++ = 0x48 | ((reg2 & 8) >> 3) | ((reg1 & 8) >> 1);
+ } else {
+ *(*p)++ = 0x48;
+ }
+
+ if (is_store) {
+ *(*p)++ = 0x89;
+ } else {
+ *(*p)++ = 0x8B;
+ }
+
+ *(*p)++ = 0xC0 | r2 | (r1 << 3);
+
+ if (r2 == 4) {
+ *(*p)++ = 0x24;
+ }
}
-static FFTS_INLINE void MULPS3(uint8_t **p, uint8_t reg1, int32_t op2, int32_t op3)
+static FFTS_INLINE void MULPS(uint8_t **p, uint8_t reg2, uint8_t reg1)
{
- if (reg1 & XMM_REG) {
- MULPS(p, (uint8_t) op2, reg1, op3, 0);
- } else {
- MULPS(p, reg1, (uint8_t) op3, op2, 1);
- }
+ uint8_t r1 = (reg1 & 7);
+ uint8_t r2 = (reg2 & 7);
+
+ /* REX prefix */
+ if ((reg1 & 8) || (reg2 & 8)) {
+ *(*p)++ = 0x40 | ((reg1 & 8) >> 3) | ((reg2 & 8) >> 1);
+ }
+
+ /* esacape opcode */
+ *(*p)++ = 0x0F;
+
+ /* opcode */
+ *(*p)++ = 0x59;
+ *(*p)++ = 0xC0 | r1 | (r2 << 3);
}
static void POP(uint8_t **p, uint8_t reg)
@@ -546,7 +537,48 @@ static void SHIFT(uint8_t **p, uint8_t reg, int shift)
}
}
-static void SUBI(uint8_t **p, uint8_t dst, int32_t imm)
+static FFTS_INLINE void SHUFPS(uint8_t **p, uint8_t reg2, uint8_t reg1, const int select)
+{
+ uint8_t r1 = (reg1 & 7);
+ uint8_t r2 = (reg2 & 7);
+ uint8_t r;
+
+ /* REX prefix */
+ if ((reg1 & 8) || (reg2 & 8)) {
+ *(*p)++ = 0x40 | ((reg1 & 8) >> 3) | ((reg2 & 8) >> 1);
+ }
+
+ /* esacape opcode */
+ *(*p)++ = 0x0F;
+
+ /* opcode */
+ *(*p)++ = 0xC6;
+
+ r = r1 | (r2 << 3);
+
+ *(*p)++ = 0xC0 | r;
+ *(*p)++ = (select & 0xFF);
+}
+
+static FFTS_INLINE void SUBPS(uint8_t **p, uint8_t reg2, uint8_t reg1)
+{
+ uint8_t r1 = (reg1 & 7);
+ uint8_t r2 = (reg2 & 7);
+
+ /* REX prefix */
+ if ((reg1 & 8) || (reg2 & 8)) {
+ *(*p)++ = 0x40 | ((reg1 & 8) >> 3) | ((reg2 & 8) >> 1);
+ }
+
+ /* esacape opcode */
+ *(*p)++ = 0x0F;
+
+ /* opcode */
+ *(*p)++ = 0x5C;
+ *(*p)++ = 0xC0 | r1 | (r2 << 3);
+}
+
+static void SUB_I(uint8_t **p, uint8_t dst, int32_t imm)
{
if (dst >= 8) {
*(*p)++ = 0x49;
@@ -571,12 +603,34 @@ static void SUBI(uint8_t **p, uint8_t dst, int32_t imm)
static FFTS_INLINE void XOR2(uint8_t **p, uint8_t reg1, uint8_t reg2)
{
- if ((reg1 & 8) || (reg2 & 8)) {
- *(*p)++ = 0x40 | ((reg1 & 8) >> 3) | ((reg2 & 8) >> 1);
- }
+ uint8_t r1 = (reg1 & 7);
+ uint8_t r2 = (reg2 & 7);
+
+ /* REX prefix */
+ if ((reg1 & 8) || (reg2 & 8)) {
+ *(*p)++ = 0x40 | ((reg1 & 8) >> 3) | ((reg2 & 8) >> 1);
+ }
+
+ *(*p)++ = 0x31;
+ *(*p)++ = 0xC0 | r2 | (r1 << 3);
+}
+
+static FFTS_INLINE void XORPS(uint8_t **p, uint8_t reg2, uint8_t reg1)
+{
+ uint8_t r1 = (reg1 & 7);
+ uint8_t r2 = (reg2 & 7);
+
+ /* REX prefix */
+ if ((reg1 & 8) || (reg2 & 8)) {
+ *(*p)++ = 0x40 | ((reg1 & 8) >> 3) | ((reg2 & 8) >> 1);
+ }
+
+ /* esacape opcode */
+ *(*p)++ = 0x0F;
- *(*p)++ = 0x31;
- *(*p)++ = 0xC0 | (reg2 & 7) | ((reg1 & 7) << 3);
+ /* opcode */
+ *(*p)++ = 0x57;
+ *(*p)++ = 0xC0 | r1 | (r2 << 3);
}
static FFTS_INLINE void ffts_insert_nops(uint8_t **p, uint32_t count)
@@ -662,78 +716,71 @@ static FFTS_INLINE void ffts_align_mem16(uint8_t **p, uint32_t offset)
ffts_insert_nops(p, r);
}
-static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign)
-{
- insns_t *x_4_addr;
- size_t len;
-
- /* align call destination */
- ffts_align_mem16(fp, 0);
- x_4_addr = *fp;
-
- /* copy function */
- assert((char*) x8_soft > (char*) x4);
- len = (char*) x8_soft - (char*) x4;
- memcpy(*fp, x4, len);
- *fp += len;
-
- return x_4_addr;
-}
-
-static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign)
+static FFTS_INLINE void generate_epilogue(insns_t **fp)
{
- insns_t *x_8_addr;
- size_t len;
-
- /* align call destination */
- ffts_align_mem16(fp, 0);
- x_8_addr = *fp;
-
- /* align loop/jump destination */
#ifdef _M_X64
- ffts_align_mem16(fp, 6);
+ /* restore nonvolatile registers */
+ MOVDQA3(fp, XMM6, RSP, 0);
+ MOVDQA3(fp, XMM7, RSP, 16);
+ MOVDQA3(fp, XMM8, RSP, 32);
+ MOVDQA3(fp, XMM9, RSP, 48);
+ MOVDQA3(fp, XMM10, RSP, 64);
+ MOVDQA3(fp, XMM11, RSP, 80);
+ MOVDQA3(fp, XMM12, RSP, 96);
+ MOVDQA3(fp, XMM13, RSP, 112);
+ MOVDQA3(fp, XMM14, RSP, 128);
+ MOVDQA3(fp, XMM15, RSP, 144);
+
+ /* restore stack */
+ ADD_I(fp, RSP, 168);
+
+ /* restore the last 3 registers from the shadow space */
+ MOV_D(fp, RBX, RSP, 8, 0);
+ MOV_D(fp, RSI, RSP, 16, 0);
+ MOV_D(fp, RDI, RSP, 24, 0);
#else
- ffts_align_mem16(fp, 5);
+ POP(fp, R15);
+ POP(fp, R14);
+ POP(fp, R13);
+ POP(fp, R12);
+ POP(fp, R11);
+ POP(fp, R10);
+ POP(fp, RBX);
+ POP(fp, RBP);
#endif
- /* copy function */
- assert((char*) x8_soft_end > (char*) x8_soft);
- len = (char*) x8_soft_end - (char*) x8_soft;
- memcpy(*fp, x8_soft, len);
- *fp += len;
-
- return x_8_addr;
+ RET(fp);
}
static FFTS_INLINE insns_t* generate_prologue(insns_t **fp, ffts_plan_t *p)
{
- insns_t *start;
+ insns_t *start;
- /* align call destination */
- ffts_align_mem16(fp, 0);
- start = *fp;
+ /* align call destination */
+ ffts_align_mem16(fp, 0);
+ start = *fp;
/* save nonvolatile registers */
#ifdef _M_X64
/* use the shadow space to save first 3 registers */
- MOV(fp, RBX, RSP, 8, 1);
- MOV(fp, RSI, RSP, 16, 1);
- MOV(fp, RDI, RSP, 24, 1);
+ MOV_D(fp, RBX, RSP, 8, 1);
+ MOV_D(fp, RSI, RSP, 16, 1);
+ MOV_D(fp, RDI, RSP, 24, 1);
/* reserve space.. */
- SUBI(fp, RSP, 168);
-
- /* to save XMM6-XMM15 registers */
- MOVDQA3(fp, RSP, 0, XMM6);
- MOVDQA3(fp, RSP, 16, XMM7);
- MOVDQA3(fp, RSP, 32, XMM8);
- MOVDQA3(fp, RSP, 48, XMM9);
- MOVDQA3(fp, RSP, 64, XMM10);
- MOVDQA3(fp, RSP, 80, XMM11);
- MOVDQA3(fp, RSP, 96, XMM12);
- MOVDQA3(fp, RSP, 112, XMM13);
- MOVDQA3(fp, RSP, 128, XMM14);
- MOVDQA3(fp, RSP, 144, XMM15);
+ SUB_I(fp, RSP, 168);
+
+ /* to save XMM6-XMM15 registers */
+ MOVDQA3(fp, RSP, 0, XMM6);
+ MOVDQA3(fp, RSP, 16, XMM7);
+ MOVDQA3(fp, RSP, 32, XMM8);
+ MOVDQA3(fp, RSP, 48, XMM9);
+ MOVDQA3(fp, RSP, 64, XMM10);
+ MOVDQA3(fp, RSP, 80, XMM11);
+ MOVDQA3(fp, RSP, 96, XMM12);
+ MOVDQA3(fp, RSP, 112, XMM13);
+ MOVDQA3(fp, RSP, 128, XMM14);
+ MOVDQA3(fp, RSP, 144, XMM15);
#else
PUSH(fp, RBP);
PUSH(fp, RBX);
@@ -745,43 +792,425 @@ static FFTS_INLINE insns_t* generate_prologue(insns_t **fp, ffts_plan_t *p)
PUSH(fp, R15);
#endif
- return start;
+ return start;
}
-static FFTS_INLINE void generate_epilogue(insns_t **fp)
+static FFTS_INLINE void generate_transform_init(insns_t **fp)
{
#ifdef _M_X64
- /* restore nonvolatile registers */
- MOVDQA3(fp, XMM6, RSP, 0);
- MOVDQA3(fp, XMM7, RSP, 16);
- MOVDQA3(fp, XMM8, RSP, 32);
- MOVDQA3(fp, XMM9, RSP, 48);
- MOVDQA3(fp, XMM10, RSP, 64);
- MOVDQA3(fp, XMM11, RSP, 80);
- MOVDQA3(fp, XMM12, RSP, 96);
- MOVDQA3(fp, XMM13, RSP, 112);
- MOVDQA3(fp, XMM14, RSP, 128);
- MOVDQA3(fp, XMM15, RSP, 144);
-
- /* restore stack */
- ADDI(fp, RSP, 168);
+ /* generate function */
+ MOVAPS2(fp, XMM3, RSI);
- /* restore the last 3 registers from the shadow space */
- MOV(fp, RBX, RSP, 8, 0);
- MOV(fp, RSI, RSP, 16, 0);
- MOV(fp, RDI, RSP, 24, 0);
+ /* set "pointer" to twiddle factors */
+ MOV_D(fp, RDI, RCX, 0x20, 0);
#else
- POP(fp, R15);
- POP(fp, R14);
- POP(fp, R13);
- POP(fp, R12);
- POP(fp, R11);
- POP(fp, R10);
- POP(fp, RBX);
- POP(fp, RBP);
+ size_t len;
+
+ /* copy function */
+ assert((char*) x4 > (char*) x_init);
+ len = (char*) x4 - (char*) x_init;
+ memcpy(*fp, x_init, len);
+ *fp += len;
#endif
+}
+
+static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign)
+{
+ insns_t *x_4_addr;
+ size_t len;
+
+ /* align call destination */
+ ffts_align_mem16(fp, 0);
+ x_4_addr = *fp;
+ /* copy function */
+ assert((char*) x8_soft > (char*) x4);
+ len = (char*) x8_soft - (char*) x4;
+ memcpy(*fp, x4, len);
+ *fp += len;
+
+ return x_4_addr;
+}
+
+static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign)
+{
+ insns_t *x_8_addr;
+#ifdef _M_X64
+ insns_t *x8_soft_loop;
+#else
+ size_t len;
+#endif
+
+ /* align call destination */
+ ffts_align_mem16(fp, 0);
+ x_8_addr = *fp;
+
+ /* align loop/jump destination */
+#ifdef _M_X64
+ ffts_align_mem16(fp, 6);
+#else
+ ffts_align_mem16(fp, 5);
+#endif
+
+#ifdef _M_X64
+ /* input */
+ MOV_R(fp, RDI, RAX, 1);
+
+ /* output */
+ MOV_R(fp, R8, RCX, 1);
+
+ /* lea rdx, [r8 + rbx] */
+ /* loop stop (output + output_stride) */
+ *(*fp)++ = 0x49;
+ *(*fp)++ = 0x8D;
+ *(*fp)++ = 0x14;
+ *(*fp)++ = 0x18;
+
+ /* lea rsi, [rbx + rbx*2] */
+ /* 3 * output_stride */
+ *(*fp)++ = 0x48;
+ *(*fp)++ = 0x8D;
+ *(*fp)++ = 0x34;
+ *(*fp)++ = 0x5B;
+
+ /* lea r10, [rbx + rbx*4] */
+ /* 5 * output_stride */
+ *(*fp)++ = 0x4C;
+ *(*fp)++ = 0x8D;
+ *(*fp)++ = 0x14;
+ *(*fp)++ = 0x9B;
+
+ /* lea r11, [rsi + rbx*4] */
+ /* 7 * output_stride */
+ *(*fp)++ = 0x4C;
+ *(*fp)++ = 0x8D;
+ *(*fp)++ = 0x1C;
+ *(*fp)++ = 0x9E;
+
+ x8_soft_loop = *fp;
+ assert(!(((uintptr_t) x8_soft_loop) & 0xF));
+
+ /* movaps xmm9, [rax] */
+ /* input + 0 * input_stride */
+ *(*fp)++ = 0x44;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0x08;
+
+ /* movaps xmm6, [rcx + rbx*2] */
+ /* output + 2 * output_stride */
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0x34;
+ *(*fp)++ = 0x59;
+
+ /* movaps xmm11, xmm9 */
+ *(*fp)++ = 0x45;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0xD9;
+
+ /* movaps xmm7, [rcx + rsi] */
+ /* output + 3 * output_stride */
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0x3C;
+ *(*fp)++ = 0x31;
+
+ /* movaps xmm8, [rax + 0x10] */
+ /* input + 1 * input_stride */
+ *(*fp)++ = 0x44;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0x40;
+ *(*fp)++ = 0x10;
+
+ MULPS(fp, XMM11, XMM6);
+ MULPS(fp, XMM9, XMM7);
+ SHUFPS(fp, XMM6, XMM6, 0xB1);
+ MULPS(fp, XMM6, XMM8);
+ SHUFPS(fp, XMM7, XMM7, 0xB1);
+ SUBPS(fp, XMM11, XMM6);
+ MULPS(fp, XMM8, XMM7);
+
+ /* movaps xmm10, xmm11 */
+ *(*fp)++ = 0x45;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0xD3;
+
+ ADDPS(fp, XMM9, XMM8);
+
+ /* movaps xmm15, [rax + 0x20] */
+ /* input + 2 * input_stride */
+ *(*fp)++ = 0x44;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0x78;
+ *(*fp)++ = 0x20;
+
+ ADDPS(fp, XMM10, XMM9);
+ SUBPS(fp, XMM11, XMM9);
+
+ /* movaps xmm5, [rcx] */
+ /* output + 0 * output_stride */
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0x29;
+
+ /* movaps xmm6,xmm15 */
+ *(*fp)++ = 0x41;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0xF7;
+
+ /* movaps xmm12, [rcx + rbx*4] */
+ /* output + 4 * output_stride */
+ *(*fp)++ = 0x44;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0x24;
+ *(*fp)++ = 0x99;
+
+ /* movaps xmm2, xmm5 */
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0xD5;
+
+ /* movaps xmm13, [rcx + rsi*2] */
+ /* output + 6 * output_stride */
+ *(*fp)++ = 0x44;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0x2C;
+ *(*fp)++ = 0x71;
+
+ XORPS(fp, XMM11, XMM3);
+
+ /* movaps xmm14, [rax + 0x30] */
+ /* input + 3 * input_stride */
+ *(*fp)++ = 0x44;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0x70;
+ *(*fp)++ = 0x30;
+
+ SUBPS(fp, XMM2, XMM10);
+ MULPS(fp, XMM6, XMM12);
+ ADDPS(fp, XMM5, XMM10);
+ MULPS(fp, XMM15, XMM13);
+
+ /* movaps xmm10, [rax + 0x40] */
+ *(*fp)++ = 0x44;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0x50;
+ *(*fp)++ = 0x40;
+
+ /* movaps xmm0, xmm5 */
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0xC5;
+
+ SHUFPS(fp, XMM12, XMM12, 0xB1);
+ SHUFPS(fp, XMM13, XMM13, 0xB1);
+ MULPS(fp, XMM12, XMM14);
+ MULPS(fp, XMM14, XMM13);
+ SUBPS(fp, XMM6, XMM12);
+ ADDPS(fp, XMM15, XMM14);
+
+ /* movaps xmm7, [rcx + r10] */
+ *(*fp)++ = 0x42;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0x3C;
+ *(*fp)++ = 0x11;
+
+ /* movaps xmm13, xmm10 */
+ *(*fp)++ = 0x45;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0xEA;
+
+ /* movaps xmm8, [rcx + r11] */
+ *(*fp)++ = 0x46;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0x04;
+ *(*fp)++ = 0x19;
+
+ /* movaps xmm12, xmm6 */
+ *(*fp)++ = 0x44;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0xE6;
+
+ /* movaps xmm9, [rax + 0x50] */
+ *(*fp)++ = 0x44;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0x48;
+ *(*fp)++ = 0x50;
+
+ /* input + 6 * input_stride */
+ ADD_I(fp, RAX, 0x60);
+
+ MULPS(fp, XMM13, XMM7);
+ SUBPS(fp, XMM6, XMM15);
+ ADDPS(fp, XMM12, XMM15);
+ MULPS(fp, XMM10, XMM8);
+ SUBPS(fp, XMM0, XMM12);
+ ADDPS(fp, XMM5, XMM12);
+ SHUFPS(fp, XMM7, XMM7, 0xB1);
+ XORPS(fp, XMM6, XMM3);
+ SHUFPS(fp, XMM8, XMM8, 0xB1);
+
+ /* movaps xmm12, xmm2 */
+ *(*fp)++ = 0x44;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0xE2;
+
+ MULPS(fp, XMM7, XMM9);
+ MULPS(fp, XMM9, XMM8);
+ SUBPS(fp, XMM13, XMM7);
+ ADDPS(fp, XMM10, XMM9);
+
+ /* movaps xmm4, [rcx + rbx] */
+ /* output + 1 * output_stride */
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0x24;
+ *(*fp)++ = 0x19;
+
+ SHUFPS(fp, XMM11, XMM11, 0xB1);
+
+ /* movaps xmm1, xmm4 */
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0xCC;
+
+ SHUFPS(fp, XMM6, XMM6, 0xB1);
+ ADDPS(fp, XMM1, XMM11);
+ SUBPS(fp, XMM4, XMM11);
+ ADDPS(fp, XMM12, XMM6);
+ SUBPS(fp, XMM2, XMM6);
+
+ /* movaps xmm11, xmm13 */
+ *(*fp)++ = 0x45;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0xDD;
+
+ /* movaps xmm14, xmm4 */
+ *(*fp)++ = 0x44;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0xF4;
+
+ /* movaps xmm6, xmm1 */
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x28;
+ *(*fp)++ = 0xF1;
+
+ SUBPS(fp, XMM13, XMM10);
+ ADDPS(fp, XMM11, XMM10);
+ XORPS(fp, XMM13, XMM3);
+ ADDPS(fp, XMM4, XMM11);
+ SUBPS(fp, XMM14, XMM11);
+ SHUFPS(fp, XMM13, XMM13, 0xB1);
+
+ /* movaps [rcx], xmm5 */
+ /* output + 0 * output_stride */
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x29;
+ *(*fp)++ = 0x29;
+
+ /* movaps [rcx + rbx], xmm4 */
+ /* output + 1 * output_stride */
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x29;
+ *(*fp)++ = 0x24;
+ *(*fp)++ = 0x19;
+
+ /* movaps [rcx + rbx*2], xmm2 */
+ /* output + 2 * output_stride */
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x29;
+ *(*fp)++ = 0x14;
+ *(*fp)++ = 0x59;
+
+ SUBPS(fp, XMM1, XMM13);
+ ADDPS(fp, XMM6, XMM13);
+
+ /* movaps [rcx + rsi], xmm1 */
+ /* output + 3 * output_stride */
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x29;
+ *(*fp)++ = 0x0C;
+ *(*fp)++ = 0x31;
+
+ /* movaps [rcx + rbx*4], xmm0 */
+ /* output + 4 * output_stride */
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x29;
+ *(*fp)++ = 0x04;
+ *(*fp)++ = 0x99;
+
+ /* movaps [rcx + r10], xmm14 */
+ /* output + 5 * output_stride */
+ *(*fp)++ = 0x46;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x29;
+ *(*fp)++ = 0x34;
+ *(*fp)++ = 0x11;
+
+ /* movaps [rcx + rsi*2], xmm12 */
+ /* output + 6 * output_stride */
+ *(*fp)++ = 0x44;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x29;
+ *(*fp)++ = 0x24;
+ *(*fp)++ = 0x71;
+
+ /* movaps [rcx + r11], xmm6 */
+ /* output + 7 * output_stride */
+ *(*fp)++ = 0x42;
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x29;
+ *(*fp)++ = 0x34;
+ *(*fp)++ = 0x19;
+
+ /* add rcx, 0x10 */
+ *(*fp)++ = 0x48;
+ *(*fp)++ = 0x83;
+ *(*fp)++ = 0xC1;
+ *(*fp)++ = 0x10;
+
+ /* cmp rcx, rdx */
+ *(*fp)++ = 0x48;
+ *(*fp)++ = 0x39;
+ *(*fp)++ = 0xD1;
+
+ /* jne [x8_soft_loop] */
+ *(*fp)++ = 0x0F;
+ *(*fp)++ = 0x85;
+ *(*fp)++ = 0x9E;
+ *(*fp)++ = 0xFE;
+ *(*fp)++ = 0xFF;
+ *(*fp)++ = 0xFF;
+
+ /* ret */
RET(fp);
+#else
+ /* copy function */
+ assert((char*) x8_soft_end >= (char*) x8_soft);
+ len = (char*) x8_soft_end - (char*) x8_soft;
+ memcpy(*fp, x8_soft, len);
+ *fp += len;
+#endif
+
+ return x_8_addr;
}
#endif /* FFTS_CODEGEN_SSE_H */ \ No newline at end of file
diff --git a/src/sequitur.h b/src/sequitur.h
index 7429115..d459c56 100644
--- a/src/sequitur.h
+++ b/src/sequitur.h
@@ -32,417 +32,417 @@
*/
typedef struct _sym_t {
- int c;
- struct _sym_t *pPrev, *pNext;
- struct _seq_rule_t *r;
- int offset;
+ int c;
+ struct _sym_t *pPrev, *pNext;
+ struct _seq_rule_t *r;
+ int offset;
} sym_t;
typedef struct _seq_rule_t {
- int c;
- sym_t *ss;
- struct _seq_rule_t *pPrev, *pNext;
- int count;
- int length;
+ int c;
+ sym_t *ss;
+ struct _seq_rule_t *pPrev, *pNext;
+ int count;
+ int length;
} seq_rule_t;
void sym_tail_insert(sym_t **ss, sym_t *s)
{
- if (!*ss) {
- *ss = s;
- s->pPrev = s->pNext = NULL;
- } else {
- while (*ss) {
- s->pPrev = *ss;
- ss = &(*ss)->pNext;
- }
-
- *ss = s;
- }
+ if (!*ss) {
+ *ss = s;
+ s->pPrev = s->pNext = NULL;
+ } else {
+ while (*ss) {
+ s->pPrev = *ss;
+ ss = &(*ss)->pNext;
+ }
+
+ *ss = s;
+ }
}
sym_t* sym_init(int c)
{
- sym_t *s;
-
- s = (sym_t*) malloc(sizeof(*s));
- if (!s) {
- return NULL;
- }
-
- s->c = c;
- s->pPrev = s->pNext = NULL;
- s->r = NULL;
-
- return s;
+ sym_t *s;
+
+ s = (sym_t*) malloc(sizeof(*s));
+ if (!s) {
+ return NULL;
+ }
+
+ s->c = c;
+ s->pPrev = s->pNext = NULL;
+ s->r = NULL;
+
+ return s;
}
sym_t* sym_init_from_sym(sym_t *s2)
{
- sym_t *s;
+ sym_t *s;
- s = (sym_t*) malloc(sizeof(*s));
- if (!s) {
- return NULL;
- }
+ s = (sym_t*) malloc(sizeof(*s));
+ if (!s) {
+ return NULL;
+ }
- s->c = s2->c;
- s->pPrev = s->pNext = NULL;
- s->r = s2->r;
- s->offset = s2->offset;
+ s->c = s2->c;
+ s->pPrev = s->pNext = NULL;
+ s->r = s2->r;
+ s->offset = s2->offset;
- return s;
+ return s;
}
seq_rule_t* seq_init_rule(int c)
{
- seq_rule_t *G;
-
- G = (seq_rule_t *)malloc(sizeof(*G));
- if (!G) {
- return NULL;
- }
-
- G->c = c;
- G->count = 2;
- G->ss = NULL;
- G->pPrev = NULL;
- G->pNext = NULL;
-
- return G;
+ seq_rule_t *G;
+
+ G = (seq_rule_t *)malloc(sizeof(*G));
+ if (!G) {
+ return NULL;
+ }
+
+ G->c = c;
+ G->count = 2;
+ G->ss = NULL;
+ G->pPrev = NULL;
+ G->pNext = NULL;
+
+ return G;
}
seq_rule_t* seq_grammer_insert_new_rule(seq_rule_t *G, char r, sym_t *a, sym_t *b)
{
- sym_t *sa, *sb;
+ sym_t *sa, *sb;
- while (G->pNext) {
- G = G->pNext;
- }
+ while (G->pNext) {
+ G = G->pNext;
+ }
- G->pNext = seq_init_rule(r);
- if (!G->pNext) {
- return NULL;
- }
+ G->pNext = seq_init_rule(r);
+ if (!G->pNext) {
+ return NULL;
+ }
- sa = sym_init_from_sym(a);
- if (!sa) {
- goto cleanup_pnext;
- }
+ sa = sym_init_from_sym(a);
+ if (!sa) {
+ goto cleanup_pnext;
+ }
- sb = sym_init_from_sym(b);
- if (!sb) {
- goto cleanup_sa;
- }
+ sb = sym_init_from_sym(b);
+ if (!sb) {
+ goto cleanup_sa;
+ }
- sb->offset = sb->offset - sa->offset;
- sa->offset = 0;
- sym_tail_insert(&G->pNext->ss, sa);
- sym_tail_insert(&G->pNext->ss, sb);
- return G->pNext;
+ sb->offset = sb->offset - sa->offset;
+ sa->offset = 0;
+ sym_tail_insert(&G->pNext->ss, sa);
+ sym_tail_insert(&G->pNext->ss, sb);
+ return G->pNext;
cleanup_sa:
- free(sa);
+ free(sa);
cleanup_pnext:
- free(G->pNext);
- G->pNext = NULL;
+ free(G->pNext);
+ G->pNext = NULL;
- return NULL;
+ return NULL;
}
sym_t* sym_match_digram(sym_t *s, sym_t *term, sym_t *a, sym_t *b)
{
- while (s != term) {
- if (s->c == a->c && s->pNext->c == b->c &&
- s->pNext->offset - s->offset == b->offset-a->offset) {
- return s;
- }
+ while (s != term) {
+ if (s->c == a->c && s->pNext->c == b->c &&
+ s->pNext->offset - s->offset == b->offset-a->offset) {
+ return s;
+ }
- s = s->pNext;
- }
+ s = s->pNext;
+ }
- return NULL;
+ return NULL;
}
seq_rule_t* seq_match_digram(seq_rule_t *R, sym_t *a, sym_t *b)
{
- while (R) {
- if (R->ss->c == a->c && R->ss->pNext->c == b->c &&
- R->ss->pNext->offset - R->ss->offset == b->offset - a->offset) {
- return R;
- }
+ while (R) {
+ if (R->ss->c == a->c && R->ss->pNext->c == b->c &&
+ R->ss->pNext->offset - R->ss->offset == b->offset - a->offset) {
+ return R;
+ }
- R = R->pNext;
- }
+ R = R->pNext;
+ }
- return NULL;
+ return NULL;
}
sym_t* sym_tail(sym_t *s)
{
- while (s->pNext) {
- s = s->pNext;
- }
+ while (s->pNext) {
+ s = s->pNext;
+ }
- return s;
+ return s;
}
int sym_count(sym_t *s)
{
- int count = 0;
+ int count = 0;
- while (s) {
- count++;
- s = s->pNext;
- }
+ while (s) {
+ count++;
+ s = s->pNext;
+ }
- return count;
+ return count;
}
sym_t* sym_copylist(sym_t *s)
{
- sym_t *head = NULL;
- sym_t *prev = head;
-
- while (s) {
- sym_t *copy = sym_init_from_sym(s);
- if (!copy) {
- return NULL;
- }
-
- copy->pPrev = prev;
-
- if (prev) {
- prev->pNext = copy;
- }
-
- if (!head) {
- head = copy;
- }
-
- prev = copy;
- s = s->pNext;
- }
-
- return head;
+ sym_t *head = NULL;
+ sym_t *prev = head;
+
+ while (s) {
+ sym_t *copy = sym_init_from_sym(s);
+ if (!copy) {
+ return NULL;
+ }
+
+ copy->pPrev = prev;
+
+ if (prev) {
+ prev->pNext = copy;
+ }
+
+ if (!head) {
+ head = copy;
+ }
+
+ prev = copy;
+ s = s->pNext;
+ }
+
+ return head;
}
void seq_enforce_uniqueness(seq_rule_t *G)
{
- seq_rule_t *R = G;//->pNext;
- seq_rule_t **ppr = &G->pNext;
+ seq_rule_t *R = G;//->pNext;
+ seq_rule_t **ppr = &G->pNext;
- while (R) {
- if (R == G || R->count > 1) {
- sym_t *s = R->ss;
- sym_t **pp = &R->ss;
+ while (R) {
+ if (R == G || R->count > 1) {
+ sym_t *s = R->ss;
+ sym_t **pp = &R->ss;
- while (s) {
- if (s->r && s->r->count == 1) {
- sym_t *temp_itr;
+ while (s) {
+ if (s->r && s->r->count == 1) {
+ sym_t *temp_itr;
+
+ *pp = s->r->ss;
- *pp = s->r->ss;
-
temp_itr = s->r->ss;
while (temp_itr) {
temp_itr->offset += s->offset;
temp_itr = temp_itr->pNext;
}
-
- s->r->ss->pPrev = s->pPrev;
- if (s->pNext) {
- s->pNext->pPrev = sym_tail(s->r->ss);
- }
-
- sym_tail(s->r->ss)->pNext = s->pNext;
- s = s->r->ss;
- continue;
- }
-
- pp = &s->pNext;
- s = s->pNext;
- }
-
- ppr = &R->pNext;
- } else {
- *ppr = R->pNext;
- }
-
- R = R->pNext;
- }
+
+ s->r->ss->pPrev = s->pPrev;
+ if (s->pNext) {
+ s->pNext->pPrev = sym_tail(s->r->ss);
+ }
+
+ sym_tail(s->r->ss)->pNext = s->pNext;
+ s = s->r->ss;
+ continue;
+ }
+
+ pp = &s->pNext;
+ s = s->pNext;
+ }
+
+ ppr = &R->pNext;
+ } else {
+ *ppr = R->pNext;
+ }
+
+ R = R->pNext;
+ }
}
void seq_merge_small_rules(seq_rule_t *G, int thresh)
{
- seq_rule_t *R = G;
+ seq_rule_t *R = G;
- while (R) {
- if (sym_count(R->ss) <= thresh) {
+ while (R) {
+ if (sym_count(R->ss) <= thresh) {
//printf("count %d > %d for %d\n", sym_count(R->ss), thresh, R->c);
- sym_t *s = R->ss;
- sym_t **pp = &R->ss;
-
- while (s) {
- if (s->r) {
- sym_t *copylist;
- sym_t *copylist_itr;
-
- s->r->count--;
-
- copylist = sym_copylist(s->r->ss);
- if (!copylist) {
- return;
- }
+ sym_t *s = R->ss;
+ sym_t **pp = &R->ss;
+
+ while (s) {
+ if (s->r) {
+ sym_t *copylist;
+ sym_t *copylist_itr;
+
+ s->r->count--;
+
+ copylist = sym_copylist(s->r->ss);
+ if (!copylist) {
+ return;
+ }
copylist_itr = copylist;
while (copylist_itr) {
copylist_itr->offset += s->offset;
copylist_itr = copylist_itr->pNext;
}
-
- *pp = copylist;
- copylist->pPrev = s->pPrev;
- if (s->pNext) {
- s->pNext->pPrev = sym_tail(copylist);
- }
-
- sym_tail(copylist)->pNext = s->pNext;
- pp = &(sym_tail(copylist)->pNext);
- s = sym_tail(copylist)->pNext;
- continue;
- }
-
- pp = &s->pNext;
- s = s->pNext;
- }
- }
-
- R = R->pNext;
- }
+
+ *pp = copylist;
+ copylist->pPrev = s->pPrev;
+ if (s->pNext) {
+ s->pNext->pPrev = sym_tail(copylist);
+ }
+
+ sym_tail(copylist)->pNext = s->pNext;
+ pp = &(sym_tail(copylist)->pNext);
+ s = sym_tail(copylist)->pNext;
+ continue;
+ }
+
+ pp = &s->pNext;
+ s = s->pNext;
+ }
+ }
+
+ R = R->pNext;
+ }
seq_enforce_uniqueness(G);
}
void seq_extract_hierarchy(seq_rule_t *G)
{
- int next_rule = -2;
- sym_t *cursym = G->ss;
-
- while (cursym) {
- sym_t *m = NULL;
- seq_rule_t *mr = NULL;
-
- if (cursym->pPrev && cursym->pPrev->pPrev) {
- mr = seq_match_digram(G->pNext, cursym->pPrev, cursym);
- if (mr) {
- if (cursym->pPrev->r) {
- cursym->pPrev->r->count--;
- }
-
- if(cursym->r) {
- cursym->r->count--;
- }
-
- mr->count++;
-
- cursym->pPrev->r = mr;
- cursym->pPrev->c = mr->c;
- cursym->pPrev->pNext = cursym->pNext;
- cursym->pNext->pPrev = cursym->pPrev;
- cursym = cursym->pPrev;
- }
-
- m = sym_match_digram(G->ss, cursym->pPrev->pPrev, cursym->pPrev, cursym);
- if (m) {
- seq_rule_t *newr;
-
- if (cursym->pPrev->r) {
- cursym->pPrev->r->count--;
- }
-
- if (cursym->r) {
- cursym->r->count--;
- }
-
- newr = seq_grammer_insert_new_rule(G, next_rule, m, m->pNext);
- if (!newr) {
- return;
- }
-
- m->r = newr;
- m->c = next_rule;
- m->pNext = m->pNext->pNext;
- m->pNext->pPrev = m;
-
- cursym->pPrev->r = newr;
- cursym->pPrev->c = next_rule;
- cursym->pPrev->pNext = cursym->pNext;
- cursym->pNext->pPrev = cursym->pPrev;
- cursym = cursym->pPrev;
-
- next_rule--;
- }
- }
-
- if (!m && !mr) {
- cursym = cursym->pNext;
- }
- }
-
- seq_enforce_uniqueness(G);
- seq_merge_small_rules(G, 2);
+ int next_rule = -2;
+ sym_t *cursym = G->ss;
+
+ while (cursym) {
+ sym_t *m = NULL;
+ seq_rule_t *mr = NULL;
+
+ if (cursym->pPrev && cursym->pPrev->pPrev) {
+ mr = seq_match_digram(G->pNext, cursym->pPrev, cursym);
+ if (mr) {
+ if (cursym->pPrev->r) {
+ cursym->pPrev->r->count--;
+ }
+
+ if(cursym->r) {
+ cursym->r->count--;
+ }
+
+ mr->count++;
+
+ cursym->pPrev->r = mr;
+ cursym->pPrev->c = mr->c;
+ cursym->pPrev->pNext = cursym->pNext;
+ cursym->pNext->pPrev = cursym->pPrev;
+ cursym = cursym->pPrev;
+ }
+
+ m = sym_match_digram(G->ss, cursym->pPrev->pPrev, cursym->pPrev, cursym);
+ if (m) {
+ seq_rule_t *newr;
+
+ if (cursym->pPrev->r) {
+ cursym->pPrev->r->count--;
+ }
+
+ if (cursym->r) {
+ cursym->r->count--;
+ }
+
+ newr = seq_grammer_insert_new_rule(G, next_rule, m, m->pNext);
+ if (!newr) {
+ return;
+ }
+
+ m->r = newr;
+ m->c = next_rule;
+ m->pNext = m->pNext->pNext;
+ m->pNext->pPrev = m;
+
+ cursym->pPrev->r = newr;
+ cursym->pPrev->c = next_rule;
+ cursym->pPrev->pNext = cursym->pNext;
+ cursym->pNext->pPrev = cursym->pPrev;
+ cursym = cursym->pPrev;
+
+ next_rule--;
+ }
+ }
+
+ if (!m && !mr) {
+ cursym = cursym->pNext;
+ }
+ }
+
+ seq_enforce_uniqueness(G);
+ seq_merge_small_rules(G, 2);
// seq_enforce_uniqueness(G);
}
void seq_compute_lengths(seq_rule_t *G)
{
- seq_rule_t *R = G->pNext;
- sym_t *s;
- int sum;
-
- while (R) {
- sum = 0;
- s = R->ss;
-
- while (s) {
- if (s->c >= 0) {
- if (s->offset + s->c > sum) {
- sum = s->offset + s->c;
- }
- }
-
- if (s->c < 0) {
- if (s->offset + s->r->length > sum) {
- sum = s->offset + s->r->length;
- }
- }
-
- s = s->pNext;
- }
-
- R->length = sum;
- R = R->pNext;
- }
-
- sum = 0;
- s = G->ss;
-
- while (s) {
- if (s->c >= 0) {
- if (s->offset + s->c > sum) {
- sum = s->offset + s->c;
- }
- }
-
- if (s->c < 0) {
- if (s->offset + s->r->length > sum) {
- sum = s->offset + s->r->length;
- }
- }
-
- s = s->pNext;
- }
-
- G->length = sum;
+ seq_rule_t *R = G->pNext;
+ sym_t *s;
+ int sum;
+
+ while (R) {
+ sum = 0;
+ s = R->ss;
+
+ while (s) {
+ if (s->c >= 0) {
+ if (s->offset + s->c > sum) {
+ sum = s->offset + s->c;
+ }
+ }
+
+ if (s->c < 0) {
+ if (s->offset + s->r->length > sum) {
+ sum = s->offset + s->r->length;
+ }
+ }
+
+ s = s->pNext;
+ }
+
+ R->length = sum;
+ R = R->pNext;
+ }
+
+ sum = 0;
+ s = G->ss;
+
+ while (s) {
+ if (s->c >= 0) {
+ if (s->offset + s->c > sum) {
+ sum = s->offset + s->c;
+ }
+ }
+
+ if (s->c < 0) {
+ if (s->offset + s->r->length > sum) {
+ sum = s->offset + s->r->length;
+ }
+ }
+
+ s = s->pNext;
+ }
+
+ G->length = sum;
} \ No newline at end of file
OpenPOWER on IntegriCloud