summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/codegen.c 34
-rw-r--r-- src/codegen_sse.h 28
2 files changed, 35 insertions, 27 deletions
diff --git a/src/codegen.c b/src/codegen.c
index fc407cb..c4e19e6 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -9,14 +9,14 @@
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the organization nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
@@ -199,7 +199,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
generate_leaf_init(&fp, loop_count);
if (ffts_ctzl(N) & 1) {
- generate_leaf_ee(&fp, offsets, p->i1 ? 6 : 0);
+ generate_leaf_ee(&fp, offsets, p->i1 ? 6 : 0);
if (p->i1) {
loop_count += 4 * p->i1;
@@ -209,14 +209,14 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
loop_count += 4;
generate_leaf_oe(&fp, offsets_o);
} else {
- generate_leaf_ee(&fp, offsets, N >= 256 ? 2 : 8);
+ generate_leaf_ee(&fp, offsets, N >= 256 ? 2 : 8);
loop_count += 4;
generate_leaf_eo(&fp, offsets);
if (p->i1) {
loop_count += 4 * p->i1;
- generate_leaf_oo(&fp, loop_count, offsets_o, N >= 256 ? 4 : 7);
+ generate_leaf_oo(&fp, loop_count, offsets_o, N >= 256 ? 4 : 7);
}
}
@@ -309,7 +309,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
#ifdef __arm__
#ifdef HAVE_NEON
- if(__builtin_ctzl(N) & 1) {
+ if (ffts_ctzl(N) & 1) {
ADDI(&fp, 2, 7, 0);
ADDI(&fp, 7, 9, 0);
ADDI(&fp, 9, 2, 0);
@@ -535,9 +535,9 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
}
- if(p->ws_is[__builtin_ctzl(pps[0]/leaf_N)-1]*8 - pLUT)
- ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leaf_N)-1]*8 - pLUT);
-
+ if (p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8 - pLUT) {
+ ADDI(&fp, 2, 2, p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8 - pLUT);
+ }
if(pps[0] == 2 * leaf_N) {
*fp = BL(fp+2, x_4_addr);
@@ -574,7 +574,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
pAddr = pps[1] * 4;
pN = pps[0];
- pLUT = p->ws_is[__builtin_ctzl(pps[0]/leaf_N)-1]*8;//LUT_offset(pps[0], leafN);
+ pLUT = p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8;//LUT_offset(pps[0], leafN);
// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
count += 4;
pps += 2;
@@ -594,7 +594,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
//fprintf(stderr, "\n");
//for(int i=0;i<count;i++)
- fprintf(stderr, "size of transform %u = %d\n", N, (fp - x_8_addr) * sizeof(*fp));
+ //fprintf(stderr, "size of transform %u = %d\n", N, (fp - x_8_addr) * sizeof(*fp));
free(ps);
diff --git a/src/codegen_sse.h b/src/codegen_sse.h
index c0a34fe..d15b316 100644
--- a/src/codegen_sse.h
+++ b/src/codegen_sse.h
@@ -192,7 +192,7 @@ generate_epilogue(insns_t **fp)
static FFTS_INLINE insns_t*
generate_prologue(insns_t **fp, ffts_plan_t *p)
{
- insns_t *start;
+ insns_t *start;
/* unreferenced parameter */
(void) p;
@@ -301,12 +301,14 @@ generate_size4_base_case(insns_t **fp, int sign)
x64_sse_addps_reg_reg(ins, X64_XMM7, X64_XMM5);
x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM6);
x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM6);
+
x64_sse_movaps_membase_reg(ins, X64_R8, 0, X64_XMM7);
x64_sse_movaps_membase_reg(ins, X64_R8, 32, X64_XMM8);
x64_sse_movaps_membase_reg(ins, X64_R8, 64, X64_XMM9);
x64_sse_movaps_membase_reg(ins, X64_R8, 96, X64_XMM10);
+
x64_sse_movaps_reg_membase(ins, X64_XMM14, X64_R9, 32);
- x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_R8, 80);
+ x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_R8, 80);
x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM14);
x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_R9, 48);
x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM11);
@@ -333,10 +335,12 @@ generate_size4_base_case(insns_t **fp, int sign)
x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM0);
x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM15);
x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM0);
+
x64_sse_movaps_membase_reg(ins, X64_R8, 16, X64_XMM1);
x64_sse_movaps_membase_reg(ins, X64_R8, 48, X64_XMM2);
x64_sse_movaps_membase_reg(ins, X64_R8, 80, X64_XMM4);
x64_sse_movaps_membase_reg(ins, X64_R8, 112, X64_XMM5);
+
x64_ret(ins);
#else
/* generate function */
@@ -359,10 +363,10 @@ generate_size4_base_case(insns_t **fp, int sign)
x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM4);
x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM4);
x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_RDX, 32);
-
+
/* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM6, X64_XMM3);
-
+
x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM6, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM8);
x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_RDX, 112);
@@ -370,12 +374,14 @@ generate_size4_base_case(insns_t **fp, int sign)
x64_sse_addps_reg_reg(ins, X64_XMM7, X64_XMM5);
x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM6);
x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM6);
+
x64_sse_movaps_membase_reg(ins, X64_RDX, 0, X64_XMM7);
x64_sse_movaps_membase_reg(ins, X64_RDX, 32, X64_XMM8);
x64_sse_movaps_membase_reg(ins, X64_RDX, 64, X64_XMM9);
x64_sse_movaps_membase_reg(ins, X64_RDX, 96, X64_XMM10);
+
x64_sse_movaps_reg_membase(ins, X64_XMM14, X64_R8, 32);
- x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_RDX, 80);
+ x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_RDX, 80);
x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM14);
x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_R8, 48);
x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM11);
@@ -402,10 +408,12 @@ generate_size4_base_case(insns_t **fp, int sign)
x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM0);
x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM15);
x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM0);
+
x64_sse_movaps_membase_reg(ins, X64_RDX, 16, X64_XMM1);
x64_sse_movaps_membase_reg(ins, X64_RDX, 48, X64_XMM2);
x64_sse_movaps_membase_reg(ins, X64_RDX, 80, X64_XMM4);
x64_sse_movaps_membase_reg(ins, X64_RDX, 112, X64_XMM5);
+
x64_ret(ins);
#endif
@@ -416,7 +424,7 @@ generate_size4_base_case(insns_t **fp, int sign)
static FFTS_INLINE void
generate_leaf_init(insns_t **fp, uint32_t loop_count)
{
- /* to avoid deferring */
+ /* to avoid deferring */
insns_t *ins = *fp;
#ifdef _M_X64
@@ -995,7 +1003,7 @@ generate_leaf_oe(insns_t **fp, uint32_t *offsets)
/* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM3);
-
+
x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM0);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
@@ -1189,7 +1197,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
/* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM14, X64_XMM3);
-
+
x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
@@ -1289,7 +1297,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
/* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM14, X64_XMM5);
-
+
x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
@@ -1500,7 +1508,7 @@ generate_size8_base_case(insns_t **fp, int sign)
x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM1);
x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM10);
x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM10);
-
+
/* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM3);
OpenPOWER on IntegriCloud