summary | refs | log | tree | commit | diff | stats
path: root/src/codegen.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/codegen.c')
-rw-r--r-- src/codegen.c 88
1 files changed, 77 insertions, 11 deletions
diff --git a/src/codegen.c b/src/codegen.c
index 4e0b633..21f8be0 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -46,6 +46,7 @@
#include "codegen_neon.h"
// #include "neon_float.h"
#include "neon.h"
+ #include "vfp.h"
#else
#include "codegen_sse.h"
#include "sse_float.h"
@@ -201,6 +202,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
}
insns_t *x_8_addr = fp;
+#ifdef __arm__
#ifdef __ARM_NEON__
memcpy(fp, neon_x8, neon_x8_t - neon_x8);
if(sign < 0) {
@@ -210,6 +212,10 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
}
fp += (neon_x8_t - neon_x8) / 4;
#else
+ memcpy(fp, vfp_x8, vfp_end - vfp_x8);
+ fp += (vfp_end - vfp_x8) / 4;
+#endif
+#else
align_mem16(&fp, 0);
x_8_addr = fp;
align_mem16(&fp, 5);
@@ -221,6 +227,8 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
//memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
//fp += (neon_end - neon_x8_t) / 4;
insns_t *x_4_addr = fp;
+#ifdef __arm__
+
#ifdef __ARM_NEON__
memcpy(fp, neon_x4, neon_x8 - neon_x4);
if(sign < 0) {
@@ -228,6 +236,10 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
}
fp += (neon_x8 - neon_x4) / 4;
#else
+ memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
+ fp += (vfp_x8 - vfp_x4) / 4;
+#endif
+#else
align_mem16(&fp, 0);
x_4_addr = fp;
memcpy(fp, x4, x8_soft - x4);
@@ -257,9 +269,14 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
#endif
-#ifdef __ARM_NEON__
- *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
- MOVI(&fp, 11, p->i0);
+#ifdef __arm__
+ *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
+ #ifdef __ARM_NEON__
+ MOVI(&fp, 11, p->i0);
+ #else
+ MOVI(&fp, 11, p->i0);
+ #endif
+
#else
align_mem16(&fp, 0);
start = fp;
@@ -273,15 +290,20 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
//LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p));
#endif
//fp++;
+#ifdef __arm__
#ifdef __ARM_NEON__
memcpy(fp, neon_ee, neon_oo - neon_ee);
- if(sign < 0) {
- fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
- fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
- fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
- }
+ if(sign < 0) {
+ fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
+ fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
+ fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
+ }
fp += (neon_oo - neon_ee) / 4;
#else
+ memcpy(fp, vfp_e, vfp_o - vfp_e);
+ fp += (vfp_o - vfp_e) / 4;
+#endif
+#else
//fprintf(stderr, "Body start address = %016p\n", start);
PUSH(&fp, RBP);
@@ -403,14 +425,14 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
if(pps[0] == 2*leafN) {
- CALL(&fp, x_4_addr);
+ // CALL(&fp, x_4_addr);
// }else if(!pps[2]){
// //uint32_t *x_8_t_addr = fp;
// memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
// fp += (neon_ee - neon_x8_t) / 4;
// //*fp++ = BL(fp+2, x_8_t_addr);
}else{
- CALL(&fp, x_8_addr);
+ // CALL(&fp, x_8_addr);
}
pAddr = pps[1] * 4;
@@ -422,6 +444,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
pps += 2;
}
#endif
+#ifdef __arm__
#ifdef __ARM_NEON__
if(__builtin_ctzl(N) & 1){
ADDI(&fp, 2, 7, 0);
@@ -519,7 +542,45 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
fp += (neon_oo - neon_ee) / 4;
}
+#else
+ ADDI(&fp, 2, 7, 0);
+ ADDI(&fp, 7, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 8, 0);
+ ADDI(&fp, 8, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+ MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
+ memcpy(fp, vfp_o, vfp_x4 - vfp_o);
+ fp += (vfp_x4 - vfp_o) / 4;
+
+ ADDI(&fp, 2, 3, 0);
+ ADDI(&fp, 3, 7, 0);
+ ADDI(&fp, 7, 2, 0);
+
+ ADDI(&fp, 2, 4, 0);
+ ADDI(&fp, 4, 8, 0);
+ ADDI(&fp, 8, 2, 0);
+
+ ADDI(&fp, 2, 5, 0);
+ ADDI(&fp, 5, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 6, 0);
+ ADDI(&fp, 6, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ ADDI(&fp, 2, 9, 0);
+ ADDI(&fp, 9, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
+ MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
+ memcpy(fp, vfp_e, vfp_o - vfp_e);
+ fp += (vfp_o - vfp_e) / 4;
+
+#endif
*fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12
//ADDI(&fp, 2, 1, 0);
MOVI(&fp, 1, 0);
@@ -551,6 +612,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
*fp = BL(fp+2, x_4_addr); fp++;
}else if(!pps[2]){
//uint32_t *x_8_t_addr = fp;
+#ifdef __ARM_NEON__
memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
if(sign < 0) {
fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
@@ -559,6 +621,10 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
}
fp += (neon_ee - neon_x8_t) / 4;
//*fp++ = BL(fp+2, x_8_t_addr);
+
+#else
+ *fp = BL(fp+2, x_8_addr); fp++;
+#endif
}else{
*fp = BL(fp+2, x_8_addr); fp++;
}
@@ -612,7 +678,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
exit(1);
}
#ifdef __APPLE__
-// sys_icache_invalidate(func, p->transform_size);
+ sys_icache_invalidate(func, p->transform_size);
#elif __ANDROID__
cacheflush((long)(func), (long)(func) + p->transform_size, 0);
#elif __linux__
OpenPOWER on IntegriCloud