summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Jukka Ojanen <jukka.ojanen@linkotec.net> 2015-08-28 11:55:01 +0300
committer Jukka Ojanen <jukka.ojanen@linkotec.net> 2015-08-28 11:55:01 +0300
commit 5f0db6e851fc5d0b3db83b140e81cd7b0d4733f0 (patch)
tree 69dd3599936a0993fe0a3fda995658989ad8f0b2
parent cdf9015f3f150b56fde42015868c3eeb65aaf486 (diff)
download ffts-5f0db6e851fc5d0b3db83b140e81cd7b0d4733f0.zip
ffts-5f0db6e851fc5d0b3db83b140e81cd7b0d4733f0.tar.gz
No need to display the size of transform
-rw-r--r-- src/codegen.c 34
-rw-r--r-- src/codegen_sse.h 28
2 files changed, 35 insertions, 27 deletions
diff --git a/src/codegen.c b/src/codegen.c
index fc407cb..c4e19e6 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -9,14 +9,14 @@
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the organization nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
@@ -199,7 +199,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
generate_leaf_init(&fp, loop_count);
if (ffts_ctzl(N) & 1) {
- generate_leaf_ee(&fp, offsets, p->i1 ? 6 : 0);
+ generate_leaf_ee(&fp, offsets, p->i1 ? 6 : 0);
if (p->i1) {
loop_count += 4 * p->i1;
@@ -209,14 +209,14 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
loop_count += 4;
generate_leaf_oe(&fp, offsets_o);
} else {
- generate_leaf_ee(&fp, offsets, N >= 256 ? 2 : 8);
+ generate_leaf_ee(&fp, offsets, N >= 256 ? 2 : 8);
loop_count += 4;
generate_leaf_eo(&fp, offsets);
if (p->i1) {
loop_count += 4 * p->i1;
- generate_leaf_oo(&fp, loop_count, offsets_o, N >= 256 ? 4 : 7);
+ generate_leaf_oo(&fp, loop_count, offsets_o, N >= 256 ? 4 : 7);
}
}
@@ -309,7 +309,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
#ifdef __arm__
#ifdef HAVE_NEON
- if(__builtin_ctzl(N) & 1) {
+ if (ffts_ctzl(N) & 1) {
ADDI(&fp, 2, 7, 0);
ADDI(&fp, 7, 9, 0);
ADDI(&fp, 9, 2, 0);
@@ -535,9 +535,9 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
}
- if(p->ws_is[__builtin_ctzl(pps[0]/leaf_N)-1]*8 - pLUT)
- ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leaf_N)-1]*8 - pLUT);
-
+ if (p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8 - pLUT) {
+ ADDI(&fp, 2, 2, p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8 - pLUT);
+ }
if(pps[0] == 2 * leaf_N) {
*fp = BL(fp+2, x_4_addr);
@@ -574,7 +574,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
pAddr = pps[1] * 4;
pN = pps[0];
- pLUT = p->ws_is[__builtin_ctzl(pps[0]/leaf_N)-1]*8;//LUT_offset(pps[0], leafN);
+ pLUT = p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8;//LUT_offset(pps[0], leafN);
// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
count += 4;
pps += 2;
@@ -594,7 +594,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
//fprintf(stderr, "\n");
//for(int i=0;i<count;i++)
- fprintf(stderr, "size of transform %u = %d\n", N, (fp - x_8_addr) * sizeof(*fp));
+ //fprintf(stderr, "size of transform %u = %d\n", N, (fp - x_8_addr) * sizeof(*fp));
free(ps);
diff --git a/src/codegen_sse.h b/src/codegen_sse.h
index c0a34fe..d15b316 100644
--- a/src/codegen_sse.h
+++ b/src/codegen_sse.h
@@ -192,7 +192,7 @@ generate_epilogue(insns_t **fp)
static FFTS_INLINE insns_t*
generate_prologue(insns_t **fp, ffts_plan_t *p)
{
- insns_t *start;
+ insns_t *start;
/* unreferenced parameter */
(void) p;
@@ -301,12 +301,14 @@ generate_size4_base_case(insns_t **fp, int sign)
x64_sse_addps_reg_reg(ins, X64_XMM7, X64_XMM5);
x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM6);
x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM6);
+
x64_sse_movaps_membase_reg(ins, X64_R8, 0, X64_XMM7);
x64_sse_movaps_membase_reg(ins, X64_R8, 32, X64_XMM8);
x64_sse_movaps_membase_reg(ins, X64_R8, 64, X64_XMM9);
x64_sse_movaps_membase_reg(ins, X64_R8, 96, X64_XMM10);
+
x64_sse_movaps_reg_membase(ins, X64_XMM14, X64_R9, 32);
- x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_R8, 80);
+ x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_R8, 80);
x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM14);
x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_R9, 48);
x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM11);
@@ -333,10 +335,12 @@ generate_size4_base_case(insns_t **fp, int sign)
x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM0);
x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM15);
x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM0);
+
x64_sse_movaps_membase_reg(ins, X64_R8, 16, X64_XMM1);
x64_sse_movaps_membase_reg(ins, X64_R8, 48, X64_XMM2);
x64_sse_movaps_membase_reg(ins, X64_R8, 80, X64_XMM4);
x64_sse_movaps_membase_reg(ins, X64_R8, 112, X64_XMM5);
+
x64_ret(ins);
#else
/* generate function */
@@ -359,10 +363,10 @@ generate_size4_base_case(insns_t **fp, int sign)
x64_sse_subps_reg_reg(ins, X64_XMM6, X64_XMM4);
x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM4);
x64_sse_movaps_reg_membase(ins, X64_XMM8, X64_RDX, 32);
-
+
/* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM6, X64_XMM3);
-
+
x64_sse_shufps_reg_reg_imm(ins, X64_XMM6, X64_XMM6, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM10, X64_XMM8);
x64_sse_movaps_reg_membase(ins, X64_XMM12, X64_RDX, 112);
@@ -370,12 +374,14 @@ generate_size4_base_case(insns_t **fp, int sign)
x64_sse_addps_reg_reg(ins, X64_XMM7, X64_XMM5);
x64_sse_addps_reg_reg(ins, X64_XMM10, X64_XMM6);
x64_sse_subps_reg_reg(ins, X64_XMM8, X64_XMM6);
+
x64_sse_movaps_membase_reg(ins, X64_RDX, 0, X64_XMM7);
x64_sse_movaps_membase_reg(ins, X64_RDX, 32, X64_XMM8);
x64_sse_movaps_membase_reg(ins, X64_RDX, 64, X64_XMM9);
x64_sse_movaps_membase_reg(ins, X64_RDX, 96, X64_XMM10);
+
x64_sse_movaps_reg_membase(ins, X64_XMM14, X64_R8, 32);
- x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_RDX, 80);
+ x64_sse_movaps_reg_membase(ins, X64_XMM11, X64_RDX, 80);
x64_sse_movaps_reg_reg(ins, X64_XMM0, X64_XMM14);
x64_sse_movaps_reg_membase(ins, X64_XMM13, X64_R8, 48);
x64_sse_mulps_reg_reg(ins, X64_XMM0, X64_XMM11);
@@ -402,10 +408,12 @@ generate_size4_base_case(insns_t **fp, int sign)
x64_sse_subps_reg_reg(ins, X64_XMM2, X64_XMM0);
x64_sse_subps_reg_reg(ins, X64_XMM4, X64_XMM15);
x64_sse_addps_reg_reg(ins, X64_XMM5, X64_XMM0);
+
x64_sse_movaps_membase_reg(ins, X64_RDX, 16, X64_XMM1);
x64_sse_movaps_membase_reg(ins, X64_RDX, 48, X64_XMM2);
x64_sse_movaps_membase_reg(ins, X64_RDX, 80, X64_XMM4);
x64_sse_movaps_membase_reg(ins, X64_RDX, 112, X64_XMM5);
+
x64_ret(ins);
#endif
@@ -416,7 +424,7 @@ generate_size4_base_case(insns_t **fp, int sign)
static FFTS_INLINE void
generate_leaf_init(insns_t **fp, uint32_t loop_count)
{
- /* to avoid deferring */
+ /* to avoid deferring */
insns_t *ins = *fp;
#ifdef _M_X64
@@ -995,7 +1003,7 @@ generate_leaf_oe(insns_t **fp, uint32_t *offsets)
/* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM7, X64_XMM3);
-
+
x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
x64_sse_movaps_reg_reg(ins, X64_XMM2, X64_XMM0);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM7, X64_XMM7, 0xB1);
@@ -1189,7 +1197,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
/* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM14, X64_XMM3);
-
+
x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
@@ -1289,7 +1297,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
/* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM14, X64_XMM5);
-
+
x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
@@ -1500,7 +1508,7 @@ generate_size8_base_case(insns_t **fp, int sign)
x64_sse_movaps_reg_reg(ins, X64_XMM6, X64_XMM1);
x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM10);
x64_sse_addps_reg_reg(ins, X64_XMM11, X64_XMM10);
-
+
/* change sign */
x64_sse_xorps_reg_reg(ins, X64_XMM13, X64_XMM3);
OpenPOWER on IntegriCloud