diff options
author | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2014-10-31 17:55:21 +0200 |
---|---|---|
committer | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2014-10-31 17:55:21 +0200 |
commit | 196fb0c0c1541cf1ec1b5e9ff8ac0e8109fde29c (patch) | |
tree | a35307258c8cd76cf4c41af630938943c2c47e09 /src/codegen.c | |
parent | 7b999686ec4c732d28efd344065606fccba84ae4 (diff) | |
download | ffts-196fb0c0c1541cf1ec1b5e9ff8ac0e8109fde29c.zip ffts-196fb0c0c1541cf1ec1b5e9ff8ac0e8109fde29c.tar.gz |
Add CMake as an alternative build system
Add support for Windows x64 (requires YASM)
Diffstat (limited to 'src/codegen.c')
-rw-r--r-- | src/codegen.c | 1501 |
1 files changed, 877 insertions, 624 deletions
diff --git a/src/codegen.c b/src/codegen.c index 79aaca6..0cc3d24 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -1,10 +1,10 @@ /* - + This file is part of FFTS -- The Fastest Fourier Transform in the South - + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - + Copyright (c) 2012, The University of Waikato + All rights reserved. Redistribution and use in source and binary forms, with or without @@ -35,698 +35,951 @@ #include "macros.h" #include "ffts.h" -#ifdef __APPLE__ - #include <libkern/OSCacheControl.h> -#endif - -#include <sys/types.h> -#include <sys/mman.h> - #ifdef HAVE_NEON - #include "codegen_arm.h" - #include "neon.h" +#include "codegen_arm.h" +#include "neon.h" #elif HAVE_VFP - #include "codegen_arm.h" - #include "vfp.h" +#include "codegen_arm.h" +#include "vfp.h" #else - #include "codegen_sse.h" - #include "macros-sse.h" +#include "codegen_sse.h" +#include "macros-sse.h" #endif +#include <assert.h> +#include <errno.h> +#include <stddef.h> +/* #include <stdio.h> */ +#include <stdlib.h> +#include <string.h> + #ifdef __ANDROID__ - #include <unistd.h> +#include <unistd.h> #endif -int tree_count(int N, int leafN, int offset) { - - if(N <= leafN) return 0; - int count = 0; - count += tree_count(N/4, leafN, offset); - count += tree_count(N/8, leafN, offset + N/4); - count += tree_count(N/8, leafN, offset + N/4 + N/8); - count += tree_count(N/4, leafN, offset + N/2); - count += tree_count(N/4, leafN, offset + 3*N/4); - - return 1 + count; -} - -void elaborate_tree(size_t **p, int N, int leafN, int offset) { - - if(N <= leafN) return; - elaborate_tree(p, N/4, leafN, offset); - elaborate_tree(p, N/8, leafN, offset + N/4); - elaborate_tree(p, N/8, leafN, offset + N/4 + N/8); - elaborate_tree(p, N/4, leafN, offset + N/2); - elaborate_tree(p, N/4, leafN, offset + 3*N/4); - - (*p)[0] = N; - (*p)[1] = offset*2; +#ifdef __arm__ +typedef uint32_t insns_t; +#else +typedef uint8_t insns_t; +#endif - (*p)+=2; -} +#define P(x) (*(*p)++ = x) +static int ffts_tree_count(int N, int leaf_N, int offset) +{ + int count; + if (N <= leaf_N) { + return 0; + } + count = ffts_tree_count(N/4, leaf_N, offset); + count += ffts_tree_count(N/8, leaf_N, offset + N/4); + count += ffts_tree_count(N/8, leaf_N, offset + N/4 + N/8); + count += ffts_tree_count(N/4, leaf_N, offset + N/2); + count += ffts_tree_count(N/4, leaf_N, offset + 3*N/4); -uint32_t LUT_offset(size_t N, size_t leafN) { - int i; - size_t p_lut_size = 0; - size_t lut_size = 0; - int hardcoded = 0; - size_t n_luts = __builtin_ctzl(N/leafN); - int n = leafN*2; - //if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } - - for(i=0;i<n_luts-1;i++) { - p_lut_size = lut_size; - if(!i || hardcoded) { - #ifdef __arm__ - if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t); - else lut_size += n/4 * sizeof(cdata_t); - #else - lut_size += n/4 * 2 * sizeof(cdata_t); - #endif - // n *= 2; - } else { - #ifdef __arm__ - lut_size += n/8 * 3 * sizeof(cdata_t); - #else - lut_size += n/8 * 3 * 2 * sizeof(cdata_t); - #endif - } - n *= 2; - } - return lut_size; + return 1 + count; } -#ifdef __arm__ - typedef uint32_t insns_t; -#else - typedef uint8_t insns_t; -#endif +static void ffts_elaborate_tree(size_t **p, int N, int leaf_N, int offset) +{ + if (N <= leaf_N) { + return; + } -#define P(x) (*(*p)++ = x) + ffts_elaborate_tree(p, N/4, leaf_N, offset); + ffts_elaborate_tree(p, N/8, leaf_N, offset + N/4); + ffts_elaborate_tree(p, N/8, leaf_N, offset + N/4 + N/8); + ffts_elaborate_tree(p, N/4, leaf_N, offset + N/2); + ffts_elaborate_tree(p, N/4, leaf_N, offset + 3*N/4); + + (*p)[0] = N; + (*p)[1] = 2 * offset; -void insert_nops(uint8_t **p, uint32_t count) { - switch(count) { - case 0: break; - case 2: P(0x66); - case 1: P(0x90); break; - case 3: P(0x0F); P(0x1F); P(0x00); break; - case 4: P(0x0F); P(0x1F); P(0x40); P(0x00); break; - case 5: P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break; - case 6: P(0x66); P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break; - case 7: P(0x0F); P(0x1F); P(0x80); P(0x00); P(0x00); P(0x00); P(0x00); break; - case 8: P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break; - case 9: P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break; - default: - P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); - insert_nops(p, count-9); - break; - } + (*p) += 2; } +static void ffts_insert_nops(uint8_t **p, uint32_t count) +{ + if (count >= 9) { + P(0x66); + P(0x0F); + P(0x1F); + P(0x84); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + + if (count > 9) { + ffts_insert_nops(p, count - 9); + } + } else { + switch(count) { + case 0: + break; + case 2: + P(0x66); + /* fall through */ + case 1: + P(0x90); + break; + case 3: + P(0x0F); + P(0x1F); + P(0x00); + break; + case 4: + P(0x0F); + P(0x1F); + P(0x40); + P(0x00); + break; + case 5: + P(0x0F); + P(0x1F); + P(0x44); + P(0x00); + P(0x00); + break; + case 6: + P(0x66); + P(0x0F); + P(0x1F); + P(0x44); + P(0x00); + P(0x00); + break; + case 7: + P(0x0F); + P(0x1F); + P(0x80); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + break; + case 8: + default: + P(0x0F); + P(0x1F); + P(0x84); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + break; + } + } +} -void align_mem16(uint8_t **p, uint32_t offset) { +static void ffts_align_mem16(uint8_t **p, uint32_t offset) +{ #ifdef __x86_64__ - int r = (16 - (offset & 0xf)) - ((uint32_t)(*p) & 0xf); - r = (16 + r) & 0xf; - insert_nops(p, r); + int r = (16 - (offset & 0xf)) - ((uintptr_t)(*p) & 0xf); + r = (16 + r) & 0xf; + ffts_insert_nops(p, r); #endif } -void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) { - int count = tree_count(N, leafN, 0) + 1; - size_t *ps = malloc(count * 2 * sizeof(size_t)); - size_t *pps = ps; +transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) +{ + uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4}; + uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4}; + + int32_t pAddr = 0; + int32_t pN = 0; + int32_t pLUT = 0; + + insns_t *fp; + insns_t *start; + insns_t *x_4_addr; + insns_t *x_8_addr; + uint32_t loop_count; + + int count; + int i; + ptrdiff_t len; + + size_t *ps; + size_t *pps; + + count = ffts_tree_count(N, leaf_N, 0) + 1; + + ps = pps = malloc(2 * count * sizeof(*ps)); + if (!ps) { + return NULL; + } + + ffts_elaborate_tree(&pps, N, leaf_N, 0); + + pps[0] = 0; + pps[1] = 0; + + pps = ps; #ifdef __x86_64__ - if(sign < 0) p->constants = sse_constants; - else p->constants = sse_constants_inv; + if (sign < 0) { + p->constants = sse_constants; + } else { + p->constants = sse_constants_inv; + } #endif - elaborate_tree(&pps, N, leafN, 0); - pps[0] = 0; - pps[1] = 0; + fp = (insns_t*) p->transform_base; - pps = ps; +#ifdef __arm__ +#ifdef HAVE_NEON + memcpy(fp, neon_x8, neon_x8_t - neon_x8); + /* + * Changes adds to subtracts and vice versa to allow the computation + * of both the IFFT and FFT + */ + if(sign < 0) { + fp[31] ^= 0x00200000; + fp[32] ^= 0x00200000; + fp[33] ^= 0x00200000; + fp[34] ^= 0x00200000; + fp[65] ^= 0x00200000; + fp[66] ^= 0x00200000; + fp[70] ^= 0x00200000; + fp[74] ^= 0x00200000; + fp[97] ^= 0x00200000; + fp[98] ^= 0x00200000; + fp[102] ^= 0x00200000; + fp[104] ^= 0x00200000; + } + fp += (neon_x8_t - neon_x8) / 4; +#else + memcpy(fp, vfp_x8, vfp_end - vfp_x8); + if(sign > 0) { + fp[65] ^= 0x00000040; + fp[66] ^= 0x00000040; + fp[68] ^= 0x00000040; + fp[70] ^= 0x00000040; + fp[103] ^= 0x00000040; + fp[104] ^= 0x00000040; + fp[105] ^= 0x00000040; + fp[108] ^= 0x00000040; + fp[113] ^= 0x00000040; + fp[114] ^= 0x00000040; + fp[117] ^= 0x00000040; + fp[118] ^= 0x00000040; + } + fp += (vfp_end - vfp_x8) / 4; +#endif +#else + /* align call destination */ + ffts_align_mem16(&fp, 0); + x_8_addr = fp; -#ifdef __arm__ - if(N < 8192) p->transform_size = 8192; - else p->transform_size = N; + /* align loop/jump destination */ +#ifdef _M_AMD64 + ffts_align_mem16(&fp, 6); #else - if(N < 2048) p->transform_size = 16384; - else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N); + ffts_align_mem16(&fp, 5); +#endif + + /* copy function */ + assert((char*) x8_soft_end > (char*) x8_soft); + len = (char*) x8_soft_end - (char*) x8_soft; + memcpy(fp, x8_soft, (size_t) len); + fp += len; #endif + //uint32_t *x_8_t_addr = fp; + //memcpy(fp, neon_x8_t, neon_end - neon_x8_t); + //fp += (neon_end - neon_x8_t) / 4; -#ifdef __APPLE__ - p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0); +#ifdef __arm__ +#ifdef HAVE_NEON + memcpy(fp, neon_x4, neon_x8 - neon_x4); + if(sign < 0) { + fp[26] ^= 0x00200000; + fp[28] ^= 0x00200000; + fp[31] ^= 0x00200000; + fp[32] ^= 0x00200000; + } + fp += (neon_x8 - neon_x4) / 4; #else -#define MAP_ANONYMOUS 0x20 - p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + memcpy(fp, vfp_x4, vfp_x8 - vfp_x4); + if(sign > 0) { + fp[36] ^= 0x00000040; + fp[38] ^= 0x00000040; + fp[43] ^= 0x00000040; + fp[44] ^= 0x00000040; + } + fp += (vfp_x8 - vfp_x4) / 4; +#endif +#else + /* align call destination */ + ffts_align_mem16(&fp, 0); + x_4_addr = fp; + + /* copy function */ + assert((char*) x8_soft > (char*) x4); + len = (char*) x8_soft - (char*) x4; + memcpy(fp, x4, (size_t) len); + fp += len; #endif -/* - if(p->transform_base == MAP_FAILED) { - fprintf(stderr, "MAP FAILED\n"); - exit(1); - }*/ - insns_t *func = p->transform_base;//valloc(8192); - insns_t *fp = func; - -//fprintf(stderr, "Allocating %d bytes \n", p->transform_size); -//fprintf(stderr, "Base address = %016p\n", func); - - if(!func) { - fprintf(stderr, "NOMEM\n"); - exit(1); - } - - insns_t *x_8_addr = fp; #ifdef __arm__ + start = fp; + + *fp = PUSH_LR(); + fp++; + *fp = 0xed2d8b10; + fp++; + + ADDI(&fp, 3, 1, 0); + ADDI(&fp, 7, 1, N); + ADDI(&fp, 5, 1, 2*N); + ADDI(&fp, 10, 7, 2*N); + ADDI(&fp, 4, 5, 2*N); + ADDI(&fp, 8, 10, 2*N); + ADDI(&fp, 6, 4, 2*N); + ADDI(&fp, 9, 8, 2*N); + + *fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); + fp++; // load offsets into r12 + // *fp++ = LDRI(1, 0, 4); // load ws into r1 + ADDI(&fp, 1, 0, 0); + + ADDI(&fp, 0, 2, 0), // mov out into r0 + *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); + fp++; + #ifdef HAVE_NEON - memcpy(fp, neon_x8, neon_x8_t - neon_x8); - /* - * Changes adds to subtracts and vice versa to allow the computation - * of both the IFFT and FFT - */ - if(sign < 0) { - fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; - fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000; - fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000; - } - fp += (neon_x8_t - neon_x8) / 4; + MOVI(&fp, 11, p->i0); +#else + MOVI(&fp, 11, p->i0); +#endif +#else + /* align call destination */ + ffts_align_mem16(&fp, 0); + start = fp; + + /* save nonvolatile registers */ +#ifdef _M_AMD64 + /* use the shadow space to save first 3 registers */ + + /* mov [rsp + 8], rbx */ + *fp++ = 0x48; + *fp++ = 0x89; + *fp++ = 0x5C; + *fp++ = 0x24; + *fp++ = 0x08; + + /* mov [rsp + 16], rsi */ + *fp++ = 0x48; + *fp++ = 0x89; + *fp++ = 0x74; + *fp++ = 0x24; + *fp++ = 0x10; + + /* mov [rsp + 24], rdi */ + *fp++ = 0x48; + *fp++ = 0x89; + *fp++ = 0x7C; + *fp++ = 0x24; + *fp++ = 0x18; #else - memcpy(fp, vfp_x8, vfp_end - vfp_x8); - if(sign > 0) { - fp[65] ^= 0x00000040; - fp[66] ^= 0x00000040; - fp[68] ^= 0x00000040; - fp[70] ^= 0x00000040; - fp[103] ^= 0x00000040; - fp[104] ^= 0x00000040; - fp[105] ^= 0x00000040; - fp[108] ^= 0x00000040; - fp[113] ^= 0x00000040; - fp[114] ^= 0x00000040; - fp[117] ^= 0x00000040; - fp[118] ^= 0x00000040; - } - fp += (vfp_end - vfp_x8) / 4; + PUSH(&fp, RBP); + PUSH(&fp, RBX); + PUSH(&fp, R10); + PUSH(&fp, R11); + PUSH(&fp, R12); + PUSH(&fp, R13); + PUSH(&fp, R14); + PUSH(&fp, R15); #endif + + /* assign loop counter register */ + loop_count = p->i0 * 4; +#ifdef _M_AMD64 + MOVI(&fp, EBX, loop_count); #else - align_mem16(&fp, 0); - x_8_addr = fp; - align_mem16(&fp, 5); - memcpy(fp, x8_soft, x8_hard - x8_soft); - fp += (x8_hard - x8_soft); -//fprintf(stderr, "X8 start address = %016p\n", x_8_addr); + MOVI(&fp, ECX, loop_count); +#endif #endif -//uint32_t *x_8_t_addr = fp; -//memcpy(fp, neon_x8_t, neon_end - neon_x8_t); -//fp += (neon_end - neon_x8_t) / 4; - insns_t *x_4_addr = fp; + #ifdef __arm__ - #ifdef HAVE_NEON - memcpy(fp, neon_x4, neon_x8 - neon_x4); - if(sign < 0) { - fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; - } - fp += (neon_x8 - neon_x4) / 4; - #else - memcpy(fp, vfp_x4, vfp_x8 - vfp_x4); - if(sign > 0) { - fp[36] ^= 0x00000040; - fp[38] ^= 0x00000040; - fp[43] ^= 0x00000040; - fp[44] ^= 0x00000040; - } - fp += (vfp_x8 - vfp_x4) / 4; - #endif +#ifdef HAVE_NEON + memcpy(fp, neon_ee, neon_oo - neon_ee); + if (sign < 0) { + fp[33] ^= 0x00200000; + fp[37] ^= 0x00200000; + fp[38] ^= 0x00200000; + fp[39] ^= 0x00200000; + fp[40] ^= 0x00200000; + fp[41] ^= 0x00200000; + fp[44] ^= 0x00200000; + fp[45] ^= 0x00200000; + fp[46] ^= 0x00200000; + fp[47] ^= 0x00200000; + fp[48] ^= 0x00200000; + fp[57] ^= 0x00200000; + } + + fp += (neon_oo - neon_ee) / 4; +#else + memcpy(fp, vfp_e, vfp_o - vfp_e); + + if (sign > 0) { + fp[64] ^= 0x00000040; + fp[65] ^= 0x00000040; + fp[68] ^= 0x00000040; + fp[75] ^= 0x00000040; + fp[76] ^= 0x00000040; + fp[79] ^= 0x00000040; + fp[80] ^= 0x00000040; + fp[83] ^= 0x00000040; + fp[84] ^= 0x00000040; + fp[87] ^= 0x00000040; + fp[91] ^= 0x00000040; + fp[93] ^= 0x00000040; + } + fp += (vfp_o - vfp_e) / 4; +#endif #else - align_mem16(&fp, 0); - x_4_addr = fp; - memcpy(fp, x4, x8_soft - x4); - fp += (x8_soft - x4); + //fprintf(stderr, "Body start address = %016p\n", start); + /* copy function */ + assert((char*) leaf_ee > (char*) leaf_ee_init); + len = (char*) leaf_ee - (char*) leaf_ee_init; + memcpy(fp, leaf_ee_init, (size_t) len); + fp += len; + + /* align loop/jump destination */ +#ifdef _M_AMD64 + ffts_align_mem16(&fp, 8); +#else + ffts_align_mem16(&fp, 9); #endif - insns_t *start = fp; - -#ifdef __arm__ - *fp = PUSH_LR(); fp++; - *fp = 0xed2d8b10; fp++; - - ADDI(&fp, 3, 1, 0); - ADDI(&fp, 7, 1, N); - ADDI(&fp, 5, 1, 2*N); - ADDI(&fp, 10, 7, 2*N); - ADDI(&fp, 4, 5, 2*N); - ADDI(&fp, 8, 10, 2*N); - ADDI(&fp, 6, 4, 2*N); - ADDI(&fp, 9, 8, 2*N); - - *fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); fp++; // load offsets into r12 -// *fp++ = LDRI(1, 0, 4); // load ws into r1 - ADDI(&fp, 1, 0, 0); - - ADDI(&fp, 0, 2, 0), // mov out into r0 + + /* copy function */ + assert((char*) leaf_oo > (char*) leaf_ee); + len = (char*) leaf_oo - (char*) leaf_ee; + memcpy(fp, leaf_ee, (size_t) len); + + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_ee_offsets[i], 4 * offsets[i]); + } + + fp += len; + + if (ffts_ctzl(N) & 1) { + if (p->i1) { + loop_count += 4 * p->i1; + + /* align loop/jump destination */ +#ifdef _M_AMD64 + MOVI(&fp, EBX, loop_count); + ffts_align_mem16(&fp, 3); +#else + MOVI(&fp, ECX, loop_count); + ffts_align_mem16(&fp, 4); #endif + /* copy function */ + assert((char*) leaf_eo > (char*) leaf_oo); + len = (char*) leaf_eo - (char*) leaf_oo; + memcpy(fp, leaf_oo, len); -#ifdef __arm__ - *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; - #ifdef HAVE_NEON - MOVI(&fp, 11, p->i0); - #else - MOVI(&fp, 11, p->i0); - #endif + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_oo_offsets[i], 4 * offsets_o[i]); + } + + fp += len; + } + + loop_count += 4; + + /* copy function */ + assert((char*) leaf_end > (char*) leaf_oe); + len = (char*) leaf_end - (char*) leaf_oe; + memcpy(fp, leaf_oe, len); + + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_oe_offsets[i], 4 * offsets_o[i]); + } + fp += len; + } else { + loop_count += 4; + + /* copy function */ + assert((char*) leaf_oe > (char*) leaf_eo); + len = (char*) leaf_oe - (char*) leaf_eo; + memcpy(fp, leaf_eo, len); + + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_eo_offsets[i], 4 * offsets[i]); + } + + fp += len; + + if (p->i1) { + loop_count += 4 * p->i1; + + /* align loop/jump destination */ +#ifdef _M_AMD64 + MOVI(&fp, EBX, loop_count); + ffts_align_mem16(&fp, 3); #else - align_mem16(&fp, 0); - start = fp; - - *fp++ = 0x4c; - *fp++ = 0x8b; - *fp++ = 0x07; - uint32_t lp_cnt = p->i0 * 4; - MOVI(&fp, RCX, lp_cnt); - - //LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p)); + MOVI(&fp, ECX, loop_count); + ffts_align_mem16(&fp, 4); #endif - //fp++; -#ifdef __arm__ -#ifdef HAVE_NEON - memcpy(fp, neon_ee, neon_oo - neon_ee); - if(sign < 0) { - fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000; - fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000; - fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; - } - fp += (neon_oo - neon_ee) / 4; + + /* copy function */ + assert((char*) leaf_eo > (char*) leaf_oo); + len = (char*) leaf_eo - (char*) leaf_oo; + memcpy(fp, leaf_oo, len); + + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_oo_offsets[i], 4 * offsets_o[i]); + } + + fp += len; + } + } + + if (p->i1) { + uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2}; + + loop_count += 4 * p->i1; + + /* align loop/jump destination */ +#ifdef _M_AMD64 + MOVI(&fp, EBX, loop_count); + ffts_align_mem16(&fp, 8); #else - memcpy(fp, vfp_e, vfp_o - vfp_e); - if(sign > 0) { - fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040; - fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040; - fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040; - } - fp += (vfp_o - vfp_e) / 4; + MOVI(&fp, ECX, loop_count); + ffts_align_mem16(&fp, 9); #endif + + assert((char*) leaf_oo > (char*) leaf_ee); + len = (char*) leaf_oo - (char*) leaf_ee; + memcpy(fp, leaf_ee, len); + + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_ee_offsets[i], 4 * offsets_oe[i]); + } + + fp += len; + } + + //fprintf(stderr, "Body start address = %016p\n", fp); + //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p)); + memcpy(fp, x_init, (char*) x4 - (char*) x_init); + //IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp )); + fp += ((char*) x4 - (char*) x_init); + + count = 2; + while (pps[0]) { + size_t ws_is; + + if (!pN) { +#ifdef _M_AMD64 + MOVI(&fp, EBX, pps[0]); #else -//fprintf(stderr, "Body start address = %016p\n", start); - - PUSH(&fp, RBP); - PUSH(&fp, RBX); - PUSH(&fp, R10); - PUSH(&fp, R11); - PUSH(&fp, R12); - PUSH(&fp, R13); - PUSH(&fp, R14); - PUSH(&fp, R15); - - int i; - memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init); - -//fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init); -//fprintf(stderr, "Constants address = %016p\n", sse_constants); -//fprintf(stderr, "Constants address = %016p\n", p->constants); - -//int32_t val = READ_IMM32(fp + 3); -//fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p)); - -//int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp ); -//fprintf(stderr, "IMM = 0x%llx\n", v2); - -//IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp )); - fp += (leaf_ee - leaf_ee_init); - -//fprintf(stderr, "Leaf start address = %016p\n", fp); - align_mem16(&fp, 9); - memcpy(fp, leaf_ee, leaf_oo - leaf_ee); - - - uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4}; - uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4}; - uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2}; - - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4); - - fp += (leaf_oo - leaf_ee); - - if(__builtin_ctzl(N) & 1){ - - if(p->i1) { - lp_cnt += p->i1 * 4; - MOVI(&fp, RCX, lp_cnt); - align_mem16(&fp, 4); - memcpy(fp, leaf_oo, leaf_eo - leaf_oo); - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); - fp += (leaf_eo - leaf_oo); - } - - - memcpy(fp, leaf_oe, leaf_end - leaf_oe); - lp_cnt += 4; - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oe_offsets[i], offsets_o[i]*4); - fp += (leaf_end - leaf_oe); - - }else{ - - - memcpy(fp, leaf_eo, leaf_oe - leaf_eo); - lp_cnt += 4; - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_eo_offsets[i], offsets[i]*4); - fp += (leaf_oe - leaf_eo); - - if(p->i1) { - lp_cnt += p->i1 * 4; - MOVI(&fp, RCX, lp_cnt); - align_mem16(&fp, 4); - memcpy(fp, leaf_oo, leaf_eo - leaf_oo); - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); - fp += (leaf_eo - leaf_oo); - } - - } - if(p->i1) { - lp_cnt += p->i1 * 4; - MOVI(&fp, RCX, lp_cnt); - align_mem16(&fp, 9); - memcpy(fp, leaf_ee, leaf_oo - leaf_ee); - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets_oe[i]*4); - fp += (leaf_oo - leaf_ee); - - } - -//fprintf(stderr, "Body start address = %016p\n", fp); - //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p)); - memcpy(fp, x_init, x4 - x_init); -//IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp )); - fp += (x4 - x_init); - - int32_t pAddr = 0; - int32_t pN = 0; - int32_t pLUT = 0; - count = 2; - while(pps[0]) { - - if(!pN) { - MOVI(&fp, RCX, pps[0] / 4); - }else{ - if((pps[1]*4)-pAddr) ADDI(&fp, RDX, (pps[1] * 4)- pAddr); - if(pps[0] > leafN && pps[0] - pN) { - - int diff = __builtin_ctzl(pps[0]) - __builtin_ctzl(pN); - *fp++ = 0xc1; - - if(diff > 0) { - *fp++ = 0xe1; - *fp++ = (diff & 0xff); - }else{ - *fp++ = 0xe9; - *fp++ = ((-diff) & 0xff); - } - } - } - - if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT) - ADDI(&fp, R8, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); - - - if(pps[0] == 2*leafN) { - CALL(&fp, x_4_addr); - // }else if(!pps[2]){ - // //uint32_t *x_8_t_addr = fp; - // memcpy(fp, neon_x8_t, neon_ee - neon_x8_t); - // fp += (neon_ee - neon_x8_t) / 4; - // //*fp++ = BL(fp+2, x_8_t_addr); - }else{ - CALL(&fp, x_8_addr); - } - - pAddr = pps[1] * 4; - if(pps[0] > leafN) - pN = pps[0]; - pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN); -// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); - count += 4; - pps += 2; - } + MOVI(&fp, ECX, pps[0] / 4); #endif + } else { + int offset = (4 * pps[1]) - pAddr; + if (offset) { +#ifdef _M_AMD64 + ADDI(&fp, R8, offset); +#else + ADDI(&fp, RDX, offset); +#endif + } + + if (pps[0] > leaf_N && pps[0] - pN) { + int factor = ffts_ctzl(pps[0]) - ffts_ctzl(pN); + +#ifdef _M_AMD64 + SHIFT(&fp, EBX, factor); +#else + SHIFT(&fp, ECX, factor); +#endif + } + } + + ws_is = 8 * p->ws_is[ffts_ctzl(pps[0] / leaf_N) - 1]; + if (ws_is != pLUT) { + int offset = (int) (ws_is - pLUT); + +#ifdef _M_AMD64 + ADDI(&fp, RDI, offset); +#else + ADDI(&fp, R8, offset); +#endif + } + + if (pps[0] == 2 * leaf_N) { + CALL(&fp, x_4_addr); + } else { + CALL(&fp, x_8_addr); + } + + pAddr = 4 * pps[1]; + if (pps[0] > leaf_N) { + pN = pps[0]; + } + + pLUT = ws_is;//LUT_offset(pps[0], leafN); + //fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); + count += 4; + pps += 2; + } +#endif + #ifdef __arm__ #ifdef HAVE_NEON - if(__builtin_ctzl(N) & 1){ - ADDI(&fp, 2, 7, 0); - ADDI(&fp, 7, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 8, 0); - ADDI(&fp, 8, 10, 0); - ADDI(&fp, 10, 2, 0); - - if(p->i1) { - MOVI(&fp, 11, p->i1); - memcpy(fp, neon_oo, neon_eo - neon_oo); - if(sign < 0) { - fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000; - fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000; - fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; - } - fp += (neon_eo - neon_oo) / 4; - } - - *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++; - - memcpy(fp, neon_oe, neon_end - neon_oe); - if(sign < 0) { - fp[19] ^= 0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000; - fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; - fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000; - } - fp += (neon_end - neon_oe) / 4; - - }else{ - - *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++; - - memcpy(fp, neon_eo, neon_oe - neon_eo); - if(sign < 0) { - fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; - fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000; - fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000; - } - fp += (neon_oe - neon_eo) / 4; - - ADDI(&fp, 2, 7, 0); - ADDI(&fp, 7, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 8, 0); - ADDI(&fp, 8, 10, 0); - ADDI(&fp, 10, 2, 0); - - if(p->i1) { - MOVI(&fp, 11, p->i1); - memcpy(fp, neon_oo, neon_eo - neon_oo); - if(sign < 0) { - fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000; - fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000; - fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; - } - fp += (neon_eo - neon_oo) / 4; - } - - } - - - if(p->i1) { - ADDI(&fp, 2, 3, 0); - ADDI(&fp, 3, 7, 0); - ADDI(&fp, 7, 2, 0); - - ADDI(&fp, 2, 4, 0); - ADDI(&fp, 4, 8, 0); - ADDI(&fp, 8, 2, 0); - - ADDI(&fp, 2, 5, 0); - ADDI(&fp, 5, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 6, 0); - ADDI(&fp, 6, 10, 0); - ADDI(&fp, 10, 2, 0); - - ADDI(&fp, 2, 9, 0); - ADDI(&fp, 9, 10, 0); - ADDI(&fp, 10, 2, 0); - - *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; - MOVI(&fp, 11, p->i1); - memcpy(fp, neon_ee, neon_oo - neon_ee); - if(sign < 0) { - fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000; - fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000; - fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; - } - fp += (neon_oo - neon_ee) / 4; - - } + if(__builtin_ctzl(N) & 1) { + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); + + if(p->i1) { + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_oo, neon_eo - neon_oo); + if(sign < 0) { + fp[12] ^= 0x00200000; + fp[13] ^= 0x00200000; + fp[14] ^= 0x00200000; + fp[15] ^= 0x00200000; + fp[27] ^= 0x00200000; + fp[29] ^= 0x00200000; + fp[30] ^= 0x00200000; + fp[31] ^= 0x00200000; + fp[46] ^= 0x00200000; + fp[47] ^= 0x00200000; + fp[48] ^= 0x00200000; + fp[57] ^= 0x00200000; + } + fp += (neon_eo - neon_oo) / 4; + } + + *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); + fp++; + + memcpy(fp, neon_oe, neon_end - neon_oe); + if(sign < 0) { + fp[19] ^= 0x00200000; + fp[20] ^= 0x00200000; + fp[22] ^= 0x00200000; + fp[23] ^= 0x00200000; + fp[37] ^= 0x00200000; + fp[38] ^= 0x00200000; + fp[40] ^= 0x00200000; + fp[41] ^= 0x00200000; + fp[64] ^= 0x00200000; + fp[65] ^= 0x00200000; + fp[66] ^= 0x00200000; + fp[67] ^= 0x00200000; + } + fp += (neon_end - neon_oe) / 4; + + } else { + + *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); + fp++; + + memcpy(fp, neon_eo, neon_oe - neon_eo); + if(sign < 0) { + fp[10] ^= 0x00200000; + fp[11] ^= 0x00200000; + fp[13] ^= 0x00200000; + fp[14] ^= 0x00200000; + fp[31] ^= 0x00200000; + fp[33] ^= 0x00200000; + fp[34] ^= 0x00200000; + fp[35] ^= 0x00200000; + fp[59] ^= 0x00200000; + fp[60] ^= 0x00200000; + fp[61] ^= 0x00200000; + fp[62] ^= 0x00200000; + } + fp += (neon_oe - neon_eo) / 4; + + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); + + if(p->i1) { + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_oo, neon_eo - neon_oo); + if(sign < 0) { + fp[12] ^= 0x00200000; + fp[13] ^= 0x00200000; + fp[14] ^= 0x00200000; + fp[15] ^= 0x00200000; + fp[27] ^= 0x00200000; + fp[29] ^= 0x00200000; + fp[30] ^= 0x00200000; + fp[31] ^= 0x00200000; + fp[46] ^= 0x00200000; + fp[47] ^= 0x00200000; + fp[48] ^= 0x00200000; + fp[57] ^= 0x00200000; + } + fp += (neon_eo - neon_oo) / 4; + } + + } + + + if(p->i1) { + ADDI(&fp, 2, 3, 0); + ADDI(&fp, 3, 7, 0); + ADDI(&fp, 7, 2, 0); + + ADDI(&fp, 2, 4, 0); + ADDI(&fp, 4, 8, 0); + ADDI(&fp, 8, 2, 0); + + ADDI(&fp, 2, 5, 0); + ADDI(&fp, 5, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 6, 0); + ADDI(&fp, 6, 10, 0); + ADDI(&fp, 10, 2, 0); + + ADDI(&fp, 2, 9, 0); + ADDI(&fp, 9, 10, 0); + ADDI(&fp, 10, 2, 0); + + *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); + fp++; + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_ee, neon_oo - neon_ee); + if(sign < 0) { + fp[33] ^= 0x00200000; + fp[37] ^= 0x00200000; + fp[38] ^= 0x00200000; + fp[39] ^= 0x00200000; + fp[40] ^= 0x00200000; + fp[41] ^= 0x00200000; + fp[44] ^= 0x00200000; + fp[45] ^= 0x00200000; + fp[46] ^= 0x00200000; + fp[47] ^= 0x00200000; + fp[48] ^= 0x00200000; + fp[57] ^= 0x00200000; + } + fp += (neon_oo - neon_ee) / 4; + + } #else - ADDI(&fp, 2, 7, 0); - ADDI(&fp, 7, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 8, 0); - ADDI(&fp, 8, 10, 0); - ADDI(&fp, 10, 2, 0); - - MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1); - memcpy(fp, vfp_o, vfp_x4 - vfp_o); - if(sign > 0) { - fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040; - fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040; - } - fp += (vfp_x4 - vfp_o) / 4; - - ADDI(&fp, 2, 3, 0); - ADDI(&fp, 3, 7, 0); - ADDI(&fp, 7, 2, 0); - - ADDI(&fp, 2, 4, 0); - ADDI(&fp, 4, 8, 0); - ADDI(&fp, 8, 2, 0); - - ADDI(&fp, 2, 5, 0); - ADDI(&fp, 5, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 6, 0); - ADDI(&fp, 6, 10, 0); - ADDI(&fp, 10, 2, 0); - - ADDI(&fp, 2, 9, 0); - ADDI(&fp, 9, 10, 0); - ADDI(&fp, 10, 2, 0); - - *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; - MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1); - memcpy(fp, vfp_e, vfp_o - vfp_e); - if(sign > 0) { - fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040; - fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040; - fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040; - } - fp += (vfp_o - vfp_e) / 4; + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); + + MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1); + memcpy(fp, vfp_o, vfp_x4 - vfp_o); + if(sign > 0) { + fp[22] ^= 0x00000040; + fp[24] ^= 0x00000040; + fp[25] ^= 0x00000040; + fp[26] ^= 0x00000040; + fp[62] ^= 0x00000040; + fp[64] ^= 0x00000040; + fp[65] ^= 0x00000040; + fp[66] ^= 0x00000040; + } + fp += (vfp_x4 - vfp_o) / 4; + + ADDI(&fp, 2, 3, 0); + ADDI(&fp, 3, 7, 0); + ADDI(&fp, 7, 2, 0); + + ADDI(&fp, 2, 4, 0); + ADDI(&fp, 4, 8, 0); + ADDI(&fp, 8, 2, 0); + + ADDI(&fp, 2, 5, 0); + ADDI(&fp, 5, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 6, 0); + ADDI(&fp, 6, 10, 0); + ADDI(&fp, 10, 2, 0); + + ADDI(&fp, 2, 9, 0); + ADDI(&fp, 9, 10, 0); + ADDI(&fp, 10, 2, 0); + + *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); + fp++; + MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1); + memcpy(fp, vfp_e, vfp_o - vfp_e); + if(sign > 0) { + fp[64] ^= 0x00000040; + fp[65] ^= 0x00000040; + fp[68] ^= 0x00000040; + fp[75] ^= 0x00000040; + fp[76] ^= 0x00000040; + fp[79] ^= 0x00000040; + fp[80] ^= 0x00000040; + fp[83] ^= 0x00000040; + fp[84] ^= 0x00000040; + fp[87] ^= 0x00000040; + fp[91] ^= 0x00000040; + fp[93] ^= 0x00000040; + } + fp += (vfp_o - vfp_e) / 4; #endif - *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12 - //ADDI(&fp, 2, 1, 0); - MOVI(&fp, 1, 0); - - // args: r0 - out - // r1 - N - // r2 - ws -// ADDI(&fp, 3, 1, 0); // put N into r3 for counter - - int32_t pAddr = 0; - int32_t pN = 0; - int32_t pLUT = 0; - count = 2; - while(pps[0]) { - -// fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr); - if(!pN) { - MOVI(&fp, 1, pps[0]); - }else{ - if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr); - if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN); - } - - if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT) - ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); - - - if(pps[0] == 2*leafN) { - *fp = BL(fp+2, x_4_addr); fp++; - }else if(!pps[2]){ - //uint32_t *x_8_t_addr = fp; + *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); + fp++; // load offsets into r12 + //ADDI(&fp, 2, 1, 0); + MOVI(&fp, 1, 0); + + // args: r0 - out + // r1 - N + // r2 - ws + // ADDI(&fp, 3, 1, 0); // put N into r3 for counter + + int32_t pAddr = 0; + int32_t pN = 0; + int32_t pLUT = 0; + count = 2; + while(pps[0]) { + + // fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr); + if(!pN) { + MOVI(&fp, 1, pps[0]); + } else { + if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr); + if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN); + } + + if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT) + ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); + + + if(pps[0] == 2*leafN) { + *fp = BL(fp+2, x_4_addr); + fp++; + } else if(!pps[2]) { + //uint32_t *x_8_t_addr = fp; #ifdef HAVE_NEON - memcpy(fp, neon_x8_t, neon_ee - neon_x8_t); - if(sign < 0) { - fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; - fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000; - fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000; - } - fp += (neon_ee - neon_x8_t) / 4; - //*fp++ = BL(fp+2, x_8_t_addr); + memcpy(fp, neon_x8_t, neon_ee - neon_x8_t); + if(sign < 0) { + fp[31] ^= 0x00200000; + fp[32] ^= 0x00200000; + fp[33] ^= 0x00200000; + fp[34] ^= 0x00200000; + fp[65] ^= 0x00200000; + fp[66] ^= 0x00200000; + fp[70] ^= 0x00200000; + fp[74] ^= 0x00200000; + fp[97] ^= 0x00200000; + fp[98] ^= 0x00200000; + fp[102] ^= 0x00200000; + fp[104] ^= 0x00200000; + } + fp += (neon_ee - neon_x8_t) / 4; + //*fp++ = BL(fp+2, x_8_t_addr); #else - *fp = BL(fp+2, x_8_addr); fp++; + *fp = BL(fp+2, x_8_addr); + fp++; #endif - }else{ - *fp = BL(fp+2, x_8_addr); fp++; - } - - pAddr = pps[1] * 4; - pN = pps[0]; - pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN); -// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); - count += 4; - pps += 2; - } - - *fp++ = 0xecbd8b10; - *fp++ = POP_LR(); count++; + } else { + *fp = BL(fp+2, x_8_addr); + fp++; + } + + pAddr = pps[1] * 4; + pN = pps[0]; + pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN); + // fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); + count += 4; + pps += 2; + } + + *fp++ = 0xecbd8b10; + *fp++ = POP_LR(); + count++; #else - POP(&fp, R15); - POP(&fp, R14); - POP(&fp, R13); - POP(&fp, R12); - POP(&fp, R11); - POP(&fp, R10); - POP(&fp, RBX); - POP(&fp, RBP); - RET(&fp); - -//uint8_t *pp = func; -//int counter = 0; -//do{ -// printf("%02x ", *pp); -// if(counter++ % 16 == 15) printf("\n"); -//} while(++pp < fp); + /* restore nonvolatile registers */ +#ifdef _M_AMD64 + /* mov rbx, [rsp + 8] */ + *fp++ = 0x48; + *fp++ = 0x8B; + *fp++ = 0x5C; + *fp++ = 0x24; + *fp++ = 0x08; + + /* mov rsi, [rsp + 16] */ + *fp++ = 0x48; + *fp++ = 0x8B; + *fp++ = 0x74; + *fp++ = 0x24; + *fp++ = 0x10; + + /* mov rdi, [rsp + 24] */ + *fp++ = 0x48; + *fp++ = 0x8B; + *fp++ = 0x7C; + *fp++ = 0x24; + *fp++ = 0x18; +#else + POP(&fp, R15); + POP(&fp, R14); + POP(&fp, R13); + POP(&fp, R12); + POP(&fp, R11); + POP(&fp, R10); + POP(&fp, RBX); + POP(&fp, RBP); +#endif -//printf("\n"); + RET(&fp); + //uint8_t *pp = func; + //int counter = 0; + //do{ + // printf("%02x ", *pp); + // if(counter++ % 16 == 15) printf("\n"); + //} while(++pp < fp); + //printf("\n"); #endif + // *fp++ = B(14); count++; -// *fp++ = B(14); count++; - -//for(int i=0;i<(neon_x8 - neon_x4)/4;i++) -// fprintf(stderr, "%08x\n", x_4_addr[i]); -//fprintf(stderr, "\n"); -//for(int i=0;i<count;i++) - - free(ps); - - if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) { - perror("Couldn't mprotect"); - exit(1); - } -#ifdef __APPLE__ - sys_icache_invalidate(func, p->transform_size); -#elif __ANDROID__ - cacheflush((long)(func), (long)(func) + p->transform_size, 0); -#elif __linux__ -#ifdef __GNUC__ - __clear_cache((long)(func), (long)(func) + p->transform_size); -#endif -#endif + //for(int i=0;i<(neon_x8 - neon_x4)/4;i++) + // fprintf(stderr, "%08x\n", x_4_addr[i]); + //fprintf(stderr, "\n"); + //for(int i=0;i<count;i++) -//fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4); + fprintf(stderr, "size of transform %u = %d\n", N, (fp - x_8_addr) * sizeof(*fp)); - p->transform = (void *) (start); -} -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: + free(ps); + + return (transform_func_t) start; +}
\ No newline at end of file |