diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/codegen.c | 58 | ||||
-rw-r--r-- | src/cp_sse.c | 3 | ||||
-rw-r--r-- | src/patterns.c | 6 |
3 files changed, 36 insertions, 31 deletions
diff --git a/src/codegen.c b/src/codegen.c index 1a12f99..5bc8fb6 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -96,21 +96,25 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { pps = ps; - if(N < 8192) p->transform_size = 16384; +#ifdef __ARM_NEON__ + if(N < 8192) p->transform_size = 8192; else p->transform_size = N; +#else + if(N < 2048) p->transform_size = 16384; + else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N); +#endif - p->transform_base = valloc(p->transform_size);//mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0); + p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0); /* if(p->transform_base == MAP_FAILED) { fprintf(stderr, "MAP FAILED\n"); exit(1); }*/ - insns_t *func = p->transform_base;//valloc(8192); insns_t *fp = func; - fprintf(stderr, "Allocating %d bytes \n", p->transform_size); - fprintf(stderr, "Base address = %016p\n", func); +//fprintf(stderr, "Allocating %d bytes \n", p->transform_size); +//fprintf(stderr, "Base address = %016p\n", func); if(!func) { fprintf(stderr, "NOMEM\n"); @@ -118,7 +122,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { } insns_t *x_8_addr = fp; - fprintf(stderr, "X8 start address = %016p\n", fp); +//fprintf(stderr, "X8 start address = %016p\n", fp); #ifdef __ARM_NEON__ memcpy(fp, neon_x8, neon_x8_t - neon_x8); fp += (neon_x8_t - neon_x8) / 4; @@ -130,7 +134,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { //memcpy(fp, neon_x8_t, neon_end - neon_x8_t); //fp += (neon_end - neon_x8_t) / 4; insns_t *x_4_addr = fp; - fprintf(stderr, "X4 start address = %016p\n", fp); +//fprintf(stderr, "X4 start address = %016p\n", fp); #ifdef __ARM_NEON__ memcpy(fp, neon_x4, neon_x8 - neon_x4); fp += (neon_x8 - neon_x4) / 4; @@ -182,7 +186,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { memcpy(fp, leaf_ee, leaf_oo - leaf_ee); fp += (neon_oo - leaf_ee) / 4; #else - fprintf(stderr, "Leaf start address = %016p\n", fp); +//fprintf(stderr, "Leaf start address = %016p\n", fp); PUSH(&fp, RBP); PUSH(&fp, RBX); @@ -192,14 +196,17 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { PUSH(&fp, R13); PUSH(&fp, R14); PUSH(&fp, R15); + PUSH(&fp, R9); + PUSH(&fp, R8); + PUSH(&fp, RCX); int i; memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init); - fprintf(stderr, "Leaf start address = %016p\n", fp); - fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init); - fprintf(stderr, "Constants address = %016p\n", sse_constants); - fprintf(stderr, "Constants address = %016p\n", p->constants); +//fprintf(stderr, "Leaf start address = %016p\n", fp); +//fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init); +//fprintf(stderr, "Constants address = %016p\n", sse_constants); +//fprintf(stderr, "Constants address = %016p\n", p->constants); //int32_t val = READ_IMM32(fp + 3); //fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p)); @@ -263,7 +270,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { } - fprintf(stderr, "Body start address = %016p\n", fp); +//fprintf(stderr, "Body start address = %016p\n", fp); //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p)); memcpy(fp, x_init, x4 - x_init); //IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp )); @@ -275,7 +282,6 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { count = 2; while(pps[0]) { -// fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr); if(!pN) { MOVI(&fp, RCX, pps[0] / 4); }else{ @@ -292,11 +298,6 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { *fp++ = 0xe9; *fp++ = ((-diff) & 0xff); } - - - fprintf(stderr, "%d -> %d = shl %d\n", pps[0], pN, diff); - // ADDI(&fp, RCX, (pps[0] - pN) / 4); - } } @@ -445,6 +446,9 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { *fp++ = POP_LR(); count++; #else + POP(&fp, RCX); + POP(&fp, R8); + POP(&fp, R9); POP(&fp, R15); POP(&fp, R14); POP(&fp, R13); @@ -456,14 +460,14 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { RET(&fp); - uint8_t *pp = func; - int counter = 0; - do{ - printf("%02x ", *pp); - if(counter++ % 16 == 15) printf("\n"); - } while(++pp < fp); +//uint8_t *pp = func; +//int counter = 0; +//do{ +// printf("%02x ", *pp); +// if(counter++ % 16 == 15) printf("\n"); +//} while(++pp < fp); - printf("\n"); +//printf("\n"); #endif @@ -486,7 +490,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { sys_icache_invalidate(func, p->transform_size); - fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4); +//fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4); p->transform = start; } diff --git a/src/cp_sse.c b/src/cp_sse.c index f9a5c8f..7eccb2f 100644 --- a/src/cp_sse.c +++ b/src/cp_sse.c @@ -33,7 +33,8 @@ void ffts_free(ffts_plan_t *p) { perror("Couldn't mprotect"); exit(errno); } - free(p->transform_base); + munmap(p->transform_base, p->transform_size); + //free(p->transform_base); } free(p); } diff --git a/src/patterns.c b/src/patterns.c index 29fa5ae..664f20e 100644 --- a/src/patterns.c +++ b/src/patterns.c @@ -114,9 +114,9 @@ void ffts_init_offsets(ffts_plan_t *p, int N, int leafN) { for(i=0;i<N/leafN;i++) { p->offsets[i] = offsets[i*2+1]*2; } - for(i=0;i<N/leafN;i++) { - printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N))); - } +//for(i=0;i<N/leafN;i++) { +// printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N))); +//} free(offsets); |