diff options
author | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2014-10-31 17:55:21 +0200 |
---|---|---|
committer | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2014-10-31 17:55:21 +0200 |
commit | 196fb0c0c1541cf1ec1b5e9ff8ac0e8109fde29c (patch) | |
tree | a35307258c8cd76cf4c41af630938943c2c47e09 | |
parent | 7b999686ec4c732d28efd344065606fccba84ae4 (diff) | |
download | ffts-196fb0c0c1541cf1ec1b5e9ff8ac0e8109fde29c.zip ffts-196fb0c0c1541cf1ec1b5e9ff8ac0e8109fde29c.tar.gz |
Add CMake as an alternative build system
Add support for Windows x64 (requires YASM)
-rw-r--r-- | CMakeLists.txt | 158 | ||||
-rw-r--r-- | src/codegen.c | 1501 | ||||
-rw-r--r-- | src/codegen.h | 27 | ||||
-rw-r--r-- | src/codegen_sse.h | 240 | ||||
-rw-r--r-- | src/ffts.c | 940 | ||||
-rw-r--r-- | src/ffts.h | 273 | ||||
-rw-r--r-- | src/macros-sse.h | 44 | ||||
-rw-r--r-- | src/macros.h | 167 | ||||
-rw-r--r-- | src/sse.s | 13 | ||||
-rw-r--r-- | src/sse_win64.s | 840 |
10 files changed, 2878 insertions, 1325 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..365ec32 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,158 @@ +cmake_minimum_required(VERSION 2.8) + +project(ffts C ASM) + +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) +set_property(GLOBAL PROPERTY USE_FOLDERS ON) + +# default build type is Debug which means no optimization +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release") +endif(NOT CMAKE_BUILD_TYPE) + +# common options +option(ENABLE_SSE + "Enables the use of SSE instructions." ON +) + +option(ENABLE_NEON + "Enables the use of NEON instructions." OFF +) + +option(ENABLE_VFP + "Enables the use of VFP instructions." OFF +) + +option(DISABLE_DYNAMIC_CODE + "Disables the use of dynamic machine code generation." OFF +) + +option(ENABLE_SHARED + "Enable building a shared library." OFF +) + +#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pedantic -pipe -Wall") +add_definitions(-DFFTS_CMAKE_GENERATED) + +include(CheckIncludeFile) +include(CheckLibraryExists) + +if(MSVC) + add_definitions(-D_USE_MATH_DEFINES) +else() + # some systems need libm for some of the math functions to work + check_library_exists(m pow "" HAVE_LIBM) + if(HAVE_LIBM) + list(APPEND CMAKE_REQUIRED_LIBRARIES m) + list(APPEND FFTS_EXTRA_LIBRARIES m) + endif(HAVE_LIBM) +endif(MSVC) + +include_directories(src) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +set(FFTS_SOURCES + src/ffts_attributes.h + src/ffts.c + src/ffts.h + src/ffts_nd.c + src/ffts_nd.h + src/ffts_real.h + src/ffts_real.c + src/ffts_real_nd.c + src/ffts_real_nd.h + src/ffts_small.c + src/macros.h + src/patterns.c + src/patterns.h + src/types.h +) + +if(ENABLE_SSE) + list(APPEND FFTS_SOURCES + src/macros-sse.h + ) + + if(MSVC) + set(CMAKE_ASM-ATT_COMPILER yasm) + enable_language(ASM-ATT) + + add_custom_command( + OUTPUT sse_win64.obj + COMMAND ${CMAKE_ASM-ATT_COMPILER} -f win64 -m amd64 + -o ${CMAKE_CURRENT_BINARY_DIR}/sse_win64.obj -p gas + ${CMAKE_CURRENT_SOURCE_DIR}/src/sse_win64.s + 
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/sse_win64.s + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "Generating sse_win64.obj" + ) + + list(APPEND FFTS_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/sse_win64.obj + src/sse_win64.s + ) + else() + list(APPEND FFTS_SOURCES + src/sse.s + ) + endif(MSVC) + + add_definitions(-D_USE_MATH_DEFINES) + add_definitions(-D__x86_64__) + add_definitions(-DHAVE_SSE -msse2) +endif() + +if(ENABLE_NEON) + if(DISABLE_DYNAMIC_CODE) + list(APPEND FFTS_SOURCES + source/neon_static_f.s + source/neon_static_i.s + ) + else() + list(APPEND FFTS_SOURCES + source/neon.s + source/arch/neon.c + ) + endif() + + add_definitions(-DHAVE_NEON) +endif() + +if(ENABLE_VFP) + list(APPEND FFTS_SOURCES + source/vfp.s + source/arch/vfp.c + ) + + add_definitions(-DHAVE_VFP) +endif() + +if(ENABLE_SINGLE) + add_definitions(-DHAVE_SINGLE) +endif() + +if(DISABLE_DYNAMIC_CODE) + list(APPEND FFTS_SOURCES + src/ffts_static.c + ) + + add_definitions(-DDYNAMIC_DISABLED) +else() + list(APPEND FFTS_SOURCES + src/codegen.c + src/codegen.h + ) +endif() + +add_library(ffts_static + ${FFTS_SOURCES} +) + +add_executable(ffts_test + tests/test.c +) + +target_link_libraries(ffts_test + ffts_static + ${FFTS_EXTRA_LIBRARIES} +)
\ No newline at end of file diff --git a/src/codegen.c b/src/codegen.c index 79aaca6..0cc3d24 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -1,10 +1,10 @@ /* - + This file is part of FFTS -- The Fastest Fourier Transform in the South - + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - + Copyright (c) 2012, The University of Waikato + All rights reserved. Redistribution and use in source and binary forms, with or without @@ -35,698 +35,951 @@ #include "macros.h" #include "ffts.h" -#ifdef __APPLE__ - #include <libkern/OSCacheControl.h> -#endif - -#include <sys/types.h> -#include <sys/mman.h> - #ifdef HAVE_NEON - #include "codegen_arm.h" - #include "neon.h" +#include "codegen_arm.h" +#include "neon.h" #elif HAVE_VFP - #include "codegen_arm.h" - #include "vfp.h" +#include "codegen_arm.h" +#include "vfp.h" #else - #include "codegen_sse.h" - #include "macros-sse.h" +#include "codegen_sse.h" +#include "macros-sse.h" #endif +#include <assert.h> +#include <errno.h> +#include <stddef.h> +/* #include <stdio.h> */ +#include <stdlib.h> +#include <string.h> + #ifdef __ANDROID__ - #include <unistd.h> +#include <unistd.h> #endif -int tree_count(int N, int leafN, int offset) { - - if(N <= leafN) return 0; - int count = 0; - count += tree_count(N/4, leafN, offset); - count += tree_count(N/8, leafN, offset + N/4); - count += tree_count(N/8, leafN, offset + N/4 + N/8); - count += tree_count(N/4, leafN, offset + N/2); - count += tree_count(N/4, leafN, offset + 3*N/4); - - return 1 + count; -} - -void elaborate_tree(size_t **p, int N, int leafN, int offset) { - - if(N <= leafN) return; - elaborate_tree(p, N/4, leafN, offset); - elaborate_tree(p, N/8, leafN, offset + N/4); - elaborate_tree(p, N/8, leafN, offset + N/4 + N/8); - elaborate_tree(p, N/4, leafN, offset + N/2); - elaborate_tree(p, N/4, leafN, offset + 3*N/4); - - (*p)[0] = N; - (*p)[1] = offset*2; +#ifdef __arm__ +typedef uint32_t insns_t; +#else +typedef uint8_t 
insns_t; +#endif - (*p)+=2; -} +#define P(x) (*(*p)++ = x) +static int ffts_tree_count(int N, int leaf_N, int offset) +{ + int count; + if (N <= leaf_N) { + return 0; + } + count = ffts_tree_count(N/4, leaf_N, offset); + count += ffts_tree_count(N/8, leaf_N, offset + N/4); + count += ffts_tree_count(N/8, leaf_N, offset + N/4 + N/8); + count += ffts_tree_count(N/4, leaf_N, offset + N/2); + count += ffts_tree_count(N/4, leaf_N, offset + 3*N/4); -uint32_t LUT_offset(size_t N, size_t leafN) { - int i; - size_t p_lut_size = 0; - size_t lut_size = 0; - int hardcoded = 0; - size_t n_luts = __builtin_ctzl(N/leafN); - int n = leafN*2; - //if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } - - for(i=0;i<n_luts-1;i++) { - p_lut_size = lut_size; - if(!i || hardcoded) { - #ifdef __arm__ - if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t); - else lut_size += n/4 * sizeof(cdata_t); - #else - lut_size += n/4 * 2 * sizeof(cdata_t); - #endif - // n *= 2; - } else { - #ifdef __arm__ - lut_size += n/8 * 3 * sizeof(cdata_t); - #else - lut_size += n/8 * 3 * 2 * sizeof(cdata_t); - #endif - } - n *= 2; - } - return lut_size; + return 1 + count; } -#ifdef __arm__ - typedef uint32_t insns_t; -#else - typedef uint8_t insns_t; -#endif +static void ffts_elaborate_tree(size_t **p, int N, int leaf_N, int offset) +{ + if (N <= leaf_N) { + return; + } -#define P(x) (*(*p)++ = x) + ffts_elaborate_tree(p, N/4, leaf_N, offset); + ffts_elaborate_tree(p, N/8, leaf_N, offset + N/4); + ffts_elaborate_tree(p, N/8, leaf_N, offset + N/4 + N/8); + ffts_elaborate_tree(p, N/4, leaf_N, offset + N/2); + ffts_elaborate_tree(p, N/4, leaf_N, offset + 3*N/4); + + (*p)[0] = N; + (*p)[1] = 2 * offset; -void insert_nops(uint8_t **p, uint32_t count) { - switch(count) { - case 0: break; - case 2: P(0x66); - case 1: P(0x90); break; - case 3: P(0x0F); P(0x1F); P(0x00); break; - case 4: P(0x0F); P(0x1F); P(0x40); P(0x00); break; - case 5: P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break; - case 6: P(0x66); 
P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break; - case 7: P(0x0F); P(0x1F); P(0x80); P(0x00); P(0x00); P(0x00); P(0x00); break; - case 8: P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break; - case 9: P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break; - default: - P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); - insert_nops(p, count-9); - break; - } + (*p) += 2; } +static void ffts_insert_nops(uint8_t **p, uint32_t count) +{ + if (count >= 9) { + P(0x66); + P(0x0F); + P(0x1F); + P(0x84); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + + if (count > 9) { + ffts_insert_nops(p, count - 9); + } + } else { + switch(count) { + case 0: + break; + case 2: + P(0x66); + /* fall through */ + case 1: + P(0x90); + break; + case 3: + P(0x0F); + P(0x1F); + P(0x00); + break; + case 4: + P(0x0F); + P(0x1F); + P(0x40); + P(0x00); + break; + case 5: + P(0x0F); + P(0x1F); + P(0x44); + P(0x00); + P(0x00); + break; + case 6: + P(0x66); + P(0x0F); + P(0x1F); + P(0x44); + P(0x00); + P(0x00); + break; + case 7: + P(0x0F); + P(0x1F); + P(0x80); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + break; + case 8: + default: + P(0x0F); + P(0x1F); + P(0x84); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + break; + } + } +} -void align_mem16(uint8_t **p, uint32_t offset) { +static void ffts_align_mem16(uint8_t **p, uint32_t offset) +{ #ifdef __x86_64__ - int r = (16 - (offset & 0xf)) - ((uint32_t)(*p) & 0xf); - r = (16 + r) & 0xf; - insert_nops(p, r); + int r = (16 - (offset & 0xf)) - ((uintptr_t)(*p) & 0xf); + r = (16 + r) & 0xf; + ffts_insert_nops(p, r); #endif } -void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) { - int count = tree_count(N, leafN, 0) + 1; - size_t *ps = malloc(count * 2 * sizeof(size_t)); - size_t *pps = ps; +transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) +{ + uint32_t offsets[8] = {0, N, 
N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4}; + uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4}; + + int32_t pAddr = 0; + int32_t pN = 0; + int32_t pLUT = 0; + + insns_t *fp; + insns_t *start; + insns_t *x_4_addr; + insns_t *x_8_addr; + uint32_t loop_count; + + int count; + int i; + ptrdiff_t len; + + size_t *ps; + size_t *pps; + + count = ffts_tree_count(N, leaf_N, 0) + 1; + + ps = pps = malloc(2 * count * sizeof(*ps)); + if (!ps) { + return NULL; + } + + ffts_elaborate_tree(&pps, N, leaf_N, 0); + + pps[0] = 0; + pps[1] = 0; + + pps = ps; #ifdef __x86_64__ - if(sign < 0) p->constants = sse_constants; - else p->constants = sse_constants_inv; + if (sign < 0) { + p->constants = sse_constants; + } else { + p->constants = sse_constants_inv; + } #endif - elaborate_tree(&pps, N, leafN, 0); - pps[0] = 0; - pps[1] = 0; + fp = (insns_t*) p->transform_base; - pps = ps; +#ifdef __arm__ +#ifdef HAVE_NEON + memcpy(fp, neon_x8, neon_x8_t - neon_x8); + /* + * Changes adds to subtracts and vice versa to allow the computation + * of both the IFFT and FFT + */ + if(sign < 0) { + fp[31] ^= 0x00200000; + fp[32] ^= 0x00200000; + fp[33] ^= 0x00200000; + fp[34] ^= 0x00200000; + fp[65] ^= 0x00200000; + fp[66] ^= 0x00200000; + fp[70] ^= 0x00200000; + fp[74] ^= 0x00200000; + fp[97] ^= 0x00200000; + fp[98] ^= 0x00200000; + fp[102] ^= 0x00200000; + fp[104] ^= 0x00200000; + } + fp += (neon_x8_t - neon_x8) / 4; +#else + memcpy(fp, vfp_x8, vfp_end - vfp_x8); + if(sign > 0) { + fp[65] ^= 0x00000040; + fp[66] ^= 0x00000040; + fp[68] ^= 0x00000040; + fp[70] ^= 0x00000040; + fp[103] ^= 0x00000040; + fp[104] ^= 0x00000040; + fp[105] ^= 0x00000040; + fp[108] ^= 0x00000040; + fp[113] ^= 0x00000040; + fp[114] ^= 0x00000040; + fp[117] ^= 0x00000040; + fp[118] ^= 0x00000040; + } + fp += (vfp_end - vfp_x8) / 4; +#endif +#else + /* align call destination */ + ffts_align_mem16(&fp, 0); + x_8_addr = fp; -#ifdef __arm__ - if(N < 8192) p->transform_size = 8192; - else p->transform_size = N; + /* 
align loop/jump destination */ +#ifdef _M_AMD64 + ffts_align_mem16(&fp, 6); #else - if(N < 2048) p->transform_size = 16384; - else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N); + ffts_align_mem16(&fp, 5); +#endif + + /* copy function */ + assert((char*) x8_soft_end > (char*) x8_soft); + len = (char*) x8_soft_end - (char*) x8_soft; + memcpy(fp, x8_soft, (size_t) len); + fp += len; #endif + //uint32_t *x_8_t_addr = fp; + //memcpy(fp, neon_x8_t, neon_end - neon_x8_t); + //fp += (neon_end - neon_x8_t) / 4; -#ifdef __APPLE__ - p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0); +#ifdef __arm__ +#ifdef HAVE_NEON + memcpy(fp, neon_x4, neon_x8 - neon_x4); + if(sign < 0) { + fp[26] ^= 0x00200000; + fp[28] ^= 0x00200000; + fp[31] ^= 0x00200000; + fp[32] ^= 0x00200000; + } + fp += (neon_x8 - neon_x4) / 4; #else -#define MAP_ANONYMOUS 0x20 - p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + memcpy(fp, vfp_x4, vfp_x8 - vfp_x4); + if(sign > 0) { + fp[36] ^= 0x00000040; + fp[38] ^= 0x00000040; + fp[43] ^= 0x00000040; + fp[44] ^= 0x00000040; + } + fp += (vfp_x8 - vfp_x4) / 4; +#endif +#else + /* align call destination */ + ffts_align_mem16(&fp, 0); + x_4_addr = fp; + + /* copy function */ + assert((char*) x8_soft > (char*) x4); + len = (char*) x8_soft - (char*) x4; + memcpy(fp, x4, (size_t) len); + fp += len; #endif -/* - if(p->transform_base == MAP_FAILED) { - fprintf(stderr, "MAP FAILED\n"); - exit(1); - }*/ - insns_t *func = p->transform_base;//valloc(8192); - insns_t *fp = func; - -//fprintf(stderr, "Allocating %d bytes \n", p->transform_size); -//fprintf(stderr, "Base address = %016p\n", func); - - if(!func) { - fprintf(stderr, "NOMEM\n"); - exit(1); - } - - insns_t *x_8_addr = fp; #ifdef __arm__ + start = fp; + + *fp = PUSH_LR(); + fp++; + *fp = 0xed2d8b10; + fp++; + + ADDI(&fp, 3, 1, 0); + ADDI(&fp, 7, 1, N); + ADDI(&fp, 5, 1, 2*N); + ADDI(&fp, 10, 7, 
2*N); + ADDI(&fp, 4, 5, 2*N); + ADDI(&fp, 8, 10, 2*N); + ADDI(&fp, 6, 4, 2*N); + ADDI(&fp, 9, 8, 2*N); + + *fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); + fp++; // load offsets into r12 + // *fp++ = LDRI(1, 0, 4); // load ws into r1 + ADDI(&fp, 1, 0, 0); + + ADDI(&fp, 0, 2, 0), // mov out into r0 + *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); + fp++; + #ifdef HAVE_NEON - memcpy(fp, neon_x8, neon_x8_t - neon_x8); - /* - * Changes adds to subtracts and vice versa to allow the computation - * of both the IFFT and FFT - */ - if(sign < 0) { - fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; - fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000; - fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000; - } - fp += (neon_x8_t - neon_x8) / 4; + MOVI(&fp, 11, p->i0); +#else + MOVI(&fp, 11, p->i0); +#endif +#else + /* align call destination */ + ffts_align_mem16(&fp, 0); + start = fp; + + /* save nonvolatile registers */ +#ifdef _M_AMD64 + /* use the shadow space to save first 3 registers */ + + /* mov [rsp + 8], rbx */ + *fp++ = 0x48; + *fp++ = 0x89; + *fp++ = 0x5C; + *fp++ = 0x24; + *fp++ = 0x08; + + /* mov [rsp + 16], rsi */ + *fp++ = 0x48; + *fp++ = 0x89; + *fp++ = 0x74; + *fp++ = 0x24; + *fp++ = 0x10; + + /* mov [rsp + 24], rdi */ + *fp++ = 0x48; + *fp++ = 0x89; + *fp++ = 0x7C; + *fp++ = 0x24; + *fp++ = 0x18; #else - memcpy(fp, vfp_x8, vfp_end - vfp_x8); - if(sign > 0) { - fp[65] ^= 0x00000040; - fp[66] ^= 0x00000040; - fp[68] ^= 0x00000040; - fp[70] ^= 0x00000040; - fp[103] ^= 0x00000040; - fp[104] ^= 0x00000040; - fp[105] ^= 0x00000040; - fp[108] ^= 0x00000040; - fp[113] ^= 0x00000040; - fp[114] ^= 0x00000040; - fp[117] ^= 0x00000040; - fp[118] ^= 0x00000040; - } - fp += (vfp_end - vfp_x8) / 4; + PUSH(&fp, RBP); + PUSH(&fp, RBX); + PUSH(&fp, R10); + PUSH(&fp, R11); + PUSH(&fp, R12); + PUSH(&fp, R13); + PUSH(&fp, R14); + PUSH(&fp, R15); 
#endif + + /* assign loop counter register */ + loop_count = p->i0 * 4; +#ifdef _M_AMD64 + MOVI(&fp, EBX, loop_count); #else - align_mem16(&fp, 0); - x_8_addr = fp; - align_mem16(&fp, 5); - memcpy(fp, x8_soft, x8_hard - x8_soft); - fp += (x8_hard - x8_soft); -//fprintf(stderr, "X8 start address = %016p\n", x_8_addr); + MOVI(&fp, ECX, loop_count); +#endif #endif -//uint32_t *x_8_t_addr = fp; -//memcpy(fp, neon_x8_t, neon_end - neon_x8_t); -//fp += (neon_end - neon_x8_t) / 4; - insns_t *x_4_addr = fp; + #ifdef __arm__ - #ifdef HAVE_NEON - memcpy(fp, neon_x4, neon_x8 - neon_x4); - if(sign < 0) { - fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; - } - fp += (neon_x8 - neon_x4) / 4; - #else - memcpy(fp, vfp_x4, vfp_x8 - vfp_x4); - if(sign > 0) { - fp[36] ^= 0x00000040; - fp[38] ^= 0x00000040; - fp[43] ^= 0x00000040; - fp[44] ^= 0x00000040; - } - fp += (vfp_x8 - vfp_x4) / 4; - #endif +#ifdef HAVE_NEON + memcpy(fp, neon_ee, neon_oo - neon_ee); + if (sign < 0) { + fp[33] ^= 0x00200000; + fp[37] ^= 0x00200000; + fp[38] ^= 0x00200000; + fp[39] ^= 0x00200000; + fp[40] ^= 0x00200000; + fp[41] ^= 0x00200000; + fp[44] ^= 0x00200000; + fp[45] ^= 0x00200000; + fp[46] ^= 0x00200000; + fp[47] ^= 0x00200000; + fp[48] ^= 0x00200000; + fp[57] ^= 0x00200000; + } + + fp += (neon_oo - neon_ee) / 4; +#else + memcpy(fp, vfp_e, vfp_o - vfp_e); + + if (sign > 0) { + fp[64] ^= 0x00000040; + fp[65] ^= 0x00000040; + fp[68] ^= 0x00000040; + fp[75] ^= 0x00000040; + fp[76] ^= 0x00000040; + fp[79] ^= 0x00000040; + fp[80] ^= 0x00000040; + fp[83] ^= 0x00000040; + fp[84] ^= 0x00000040; + fp[87] ^= 0x00000040; + fp[91] ^= 0x00000040; + fp[93] ^= 0x00000040; + } + fp += (vfp_o - vfp_e) / 4; +#endif #else - align_mem16(&fp, 0); - x_4_addr = fp; - memcpy(fp, x4, x8_soft - x4); - fp += (x8_soft - x4); + //fprintf(stderr, "Body start address = %016p\n", start); + /* copy function */ + assert((char*) leaf_ee > (char*) leaf_ee_init); + len = (char*) leaf_ee - (char*) 
leaf_ee_init; + memcpy(fp, leaf_ee_init, (size_t) len); + fp += len; + + /* align loop/jump destination */ +#ifdef _M_AMD64 + ffts_align_mem16(&fp, 8); +#else + ffts_align_mem16(&fp, 9); #endif - insns_t *start = fp; - -#ifdef __arm__ - *fp = PUSH_LR(); fp++; - *fp = 0xed2d8b10; fp++; - - ADDI(&fp, 3, 1, 0); - ADDI(&fp, 7, 1, N); - ADDI(&fp, 5, 1, 2*N); - ADDI(&fp, 10, 7, 2*N); - ADDI(&fp, 4, 5, 2*N); - ADDI(&fp, 8, 10, 2*N); - ADDI(&fp, 6, 4, 2*N); - ADDI(&fp, 9, 8, 2*N); - - *fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); fp++; // load offsets into r12 -// *fp++ = LDRI(1, 0, 4); // load ws into r1 - ADDI(&fp, 1, 0, 0); - - ADDI(&fp, 0, 2, 0), // mov out into r0 + + /* copy function */ + assert((char*) leaf_oo > (char*) leaf_ee); + len = (char*) leaf_oo - (char*) leaf_ee; + memcpy(fp, leaf_ee, (size_t) len); + + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_ee_offsets[i], 4 * offsets[i]); + } + + fp += len; + + if (ffts_ctzl(N) & 1) { + if (p->i1) { + loop_count += 4 * p->i1; + + /* align loop/jump destination */ +#ifdef _M_AMD64 + MOVI(&fp, EBX, loop_count); + ffts_align_mem16(&fp, 3); +#else + MOVI(&fp, ECX, loop_count); + ffts_align_mem16(&fp, 4); #endif + /* copy function */ + assert((char*) leaf_eo > (char*) leaf_oo); + len = (char*) leaf_eo - (char*) leaf_oo; + memcpy(fp, leaf_oo, len); -#ifdef __arm__ - *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; - #ifdef HAVE_NEON - MOVI(&fp, 11, p->i0); - #else - MOVI(&fp, 11, p->i0); - #endif + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_oo_offsets[i], 4 * offsets_o[i]); + } + + fp += len; + } + + loop_count += 4; + + /* copy function */ + assert((char*) leaf_end > (char*) leaf_oe); + len = (char*) leaf_end - (char*) leaf_oe; + memcpy(fp, leaf_oe, len); + + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_oe_offsets[i], 4 * offsets_o[i]); + } + fp += len; + } else { + loop_count += 4; + + /* copy function */ + 
assert((char*) leaf_oe > (char*) leaf_eo); + len = (char*) leaf_oe - (char*) leaf_eo; + memcpy(fp, leaf_eo, len); + + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_eo_offsets[i], 4 * offsets[i]); + } + + fp += len; + + if (p->i1) { + loop_count += 4 * p->i1; + + /* align loop/jump destination */ +#ifdef _M_AMD64 + MOVI(&fp, EBX, loop_count); + ffts_align_mem16(&fp, 3); #else - align_mem16(&fp, 0); - start = fp; - - *fp++ = 0x4c; - *fp++ = 0x8b; - *fp++ = 0x07; - uint32_t lp_cnt = p->i0 * 4; - MOVI(&fp, RCX, lp_cnt); - - //LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p)); + MOVI(&fp, ECX, loop_count); + ffts_align_mem16(&fp, 4); #endif - //fp++; -#ifdef __arm__ -#ifdef HAVE_NEON - memcpy(fp, neon_ee, neon_oo - neon_ee); - if(sign < 0) { - fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000; - fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000; - fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; - } - fp += (neon_oo - neon_ee) / 4; + + /* copy function */ + assert((char*) leaf_eo > (char*) leaf_oo); + len = (char*) leaf_eo - (char*) leaf_oo; + memcpy(fp, leaf_oo, len); + + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_oo_offsets[i], 4 * offsets_o[i]); + } + + fp += len; + } + } + + if (p->i1) { + uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2}; + + loop_count += 4 * p->i1; + + /* align loop/jump destination */ +#ifdef _M_AMD64 + MOVI(&fp, EBX, loop_count); + ffts_align_mem16(&fp, 8); #else - memcpy(fp, vfp_e, vfp_o - vfp_e); - if(sign > 0) { - fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040; - fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040; - fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040; - } - fp += (vfp_o - vfp_e) / 4; + MOVI(&fp, ECX, loop_count); + ffts_align_mem16(&fp, 9); 
#endif + + assert((char*) leaf_oo > (char*) leaf_ee); + len = (char*) leaf_oo - (char*) leaf_ee; + memcpy(fp, leaf_ee, len); + + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_ee_offsets[i], 4 * offsets_oe[i]); + } + + fp += len; + } + + //fprintf(stderr, "Body start address = %016p\n", fp); + //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p)); + memcpy(fp, x_init, (char*) x4 - (char*) x_init); + //IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp )); + fp += ((char*) x4 - (char*) x_init); + + count = 2; + while (pps[0]) { + size_t ws_is; + + if (!pN) { +#ifdef _M_AMD64 + MOVI(&fp, EBX, pps[0]); #else -//fprintf(stderr, "Body start address = %016p\n", start); - - PUSH(&fp, RBP); - PUSH(&fp, RBX); - PUSH(&fp, R10); - PUSH(&fp, R11); - PUSH(&fp, R12); - PUSH(&fp, R13); - PUSH(&fp, R14); - PUSH(&fp, R15); - - int i; - memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init); - -//fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init); -//fprintf(stderr, "Constants address = %016p\n", sse_constants); -//fprintf(stderr, "Constants address = %016p\n", p->constants); - -//int32_t val = READ_IMM32(fp + 3); -//fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p)); - -//int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp ); -//fprintf(stderr, "IMM = 0x%llx\n", v2); - -//IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp )); - fp += (leaf_ee - leaf_ee_init); - -//fprintf(stderr, "Leaf start address = %016p\n", fp); - align_mem16(&fp, 9); - memcpy(fp, leaf_ee, leaf_oo - leaf_ee); - - - uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4}; - uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4}; - uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2}; - - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4); - - fp += (leaf_oo - leaf_ee); - - if(__builtin_ctzl(N) & 1){ - - if(p->i1) { - lp_cnt += p->i1 * 4; - 
MOVI(&fp, RCX, lp_cnt); - align_mem16(&fp, 4); - memcpy(fp, leaf_oo, leaf_eo - leaf_oo); - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); - fp += (leaf_eo - leaf_oo); - } - - - memcpy(fp, leaf_oe, leaf_end - leaf_oe); - lp_cnt += 4; - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oe_offsets[i], offsets_o[i]*4); - fp += (leaf_end - leaf_oe); - - }else{ - - - memcpy(fp, leaf_eo, leaf_oe - leaf_eo); - lp_cnt += 4; - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_eo_offsets[i], offsets[i]*4); - fp += (leaf_oe - leaf_eo); - - if(p->i1) { - lp_cnt += p->i1 * 4; - MOVI(&fp, RCX, lp_cnt); - align_mem16(&fp, 4); - memcpy(fp, leaf_oo, leaf_eo - leaf_oo); - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); - fp += (leaf_eo - leaf_oo); - } - - } - if(p->i1) { - lp_cnt += p->i1 * 4; - MOVI(&fp, RCX, lp_cnt); - align_mem16(&fp, 9); - memcpy(fp, leaf_ee, leaf_oo - leaf_ee); - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets_oe[i]*4); - fp += (leaf_oo - leaf_ee); - - } - -//fprintf(stderr, "Body start address = %016p\n", fp); - //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p)); - memcpy(fp, x_init, x4 - x_init); -//IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp )); - fp += (x4 - x_init); - - int32_t pAddr = 0; - int32_t pN = 0; - int32_t pLUT = 0; - count = 2; - while(pps[0]) { - - if(!pN) { - MOVI(&fp, RCX, pps[0] / 4); - }else{ - if((pps[1]*4)-pAddr) ADDI(&fp, RDX, (pps[1] * 4)- pAddr); - if(pps[0] > leafN && pps[0] - pN) { - - int diff = __builtin_ctzl(pps[0]) - __builtin_ctzl(pN); - *fp++ = 0xc1; - - if(diff > 0) { - *fp++ = 0xe1; - *fp++ = (diff & 0xff); - }else{ - *fp++ = 0xe9; - *fp++ = ((-diff) & 0xff); - } - } - } - - if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT) - ADDI(&fp, R8, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); - - - if(pps[0] == 2*leafN) { - CALL(&fp, x_4_addr); - // }else if(!pps[2]){ - // //uint32_t *x_8_t_addr = fp; - // memcpy(fp, neon_x8_t, neon_ee - 
neon_x8_t); - // fp += (neon_ee - neon_x8_t) / 4; - // //*fp++ = BL(fp+2, x_8_t_addr); - }else{ - CALL(&fp, x_8_addr); - } - - pAddr = pps[1] * 4; - if(pps[0] > leafN) - pN = pps[0]; - pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN); -// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); - count += 4; - pps += 2; - } + MOVI(&fp, ECX, pps[0] / 4); #endif + } else { + int offset = (4 * pps[1]) - pAddr; + if (offset) { +#ifdef _M_AMD64 + ADDI(&fp, R8, offset); +#else + ADDI(&fp, RDX, offset); +#endif + } + + if (pps[0] > leaf_N && pps[0] - pN) { + int factor = ffts_ctzl(pps[0]) - ffts_ctzl(pN); + +#ifdef _M_AMD64 + SHIFT(&fp, EBX, factor); +#else + SHIFT(&fp, ECX, factor); +#endif + } + } + + ws_is = 8 * p->ws_is[ffts_ctzl(pps[0] / leaf_N) - 1]; + if (ws_is != pLUT) { + int offset = (int) (ws_is - pLUT); + +#ifdef _M_AMD64 + ADDI(&fp, RDI, offset); +#else + ADDI(&fp, R8, offset); +#endif + } + + if (pps[0] == 2 * leaf_N) { + CALL(&fp, x_4_addr); + } else { + CALL(&fp, x_8_addr); + } + + pAddr = 4 * pps[1]; + if (pps[0] > leaf_N) { + pN = pps[0]; + } + + pLUT = ws_is;//LUT_offset(pps[0], leafN); + //fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); + count += 4; + pps += 2; + } +#endif + #ifdef __arm__ #ifdef HAVE_NEON - if(__builtin_ctzl(N) & 1){ - ADDI(&fp, 2, 7, 0); - ADDI(&fp, 7, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 8, 0); - ADDI(&fp, 8, 10, 0); - ADDI(&fp, 10, 2, 0); - - if(p->i1) { - MOVI(&fp, 11, p->i1); - memcpy(fp, neon_oo, neon_eo - neon_oo); - if(sign < 0) { - fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000; - fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000; - fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; - } - fp += (neon_eo - neon_oo) / 4; - } - - *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++; - - memcpy(fp, neon_oe, neon_end - neon_oe); - if(sign < 0) { - fp[19] ^= 
0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000; - fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; - fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000; - } - fp += (neon_end - neon_oe) / 4; - - }else{ - - *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++; - - memcpy(fp, neon_eo, neon_oe - neon_eo); - if(sign < 0) { - fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; - fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000; - fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000; - } - fp += (neon_oe - neon_eo) / 4; - - ADDI(&fp, 2, 7, 0); - ADDI(&fp, 7, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 8, 0); - ADDI(&fp, 8, 10, 0); - ADDI(&fp, 10, 2, 0); - - if(p->i1) { - MOVI(&fp, 11, p->i1); - memcpy(fp, neon_oo, neon_eo - neon_oo); - if(sign < 0) { - fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000; - fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000; - fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; - } - fp += (neon_eo - neon_oo) / 4; - } - - } - - - if(p->i1) { - ADDI(&fp, 2, 3, 0); - ADDI(&fp, 3, 7, 0); - ADDI(&fp, 7, 2, 0); - - ADDI(&fp, 2, 4, 0); - ADDI(&fp, 4, 8, 0); - ADDI(&fp, 8, 2, 0); - - ADDI(&fp, 2, 5, 0); - ADDI(&fp, 5, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 6, 0); - ADDI(&fp, 6, 10, 0); - ADDI(&fp, 10, 2, 0); - - ADDI(&fp, 2, 9, 0); - ADDI(&fp, 9, 10, 0); - ADDI(&fp, 10, 2, 0); - - *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; - MOVI(&fp, 11, p->i1); - memcpy(fp, neon_ee, neon_oo - neon_ee); - if(sign < 0) { - fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000; - fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000; - fp[46] ^= 
0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; - } - fp += (neon_oo - neon_ee) / 4; - - } + if(__builtin_ctzl(N) & 1) { + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); + + if(p->i1) { + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_oo, neon_eo - neon_oo); + if(sign < 0) { + fp[12] ^= 0x00200000; + fp[13] ^= 0x00200000; + fp[14] ^= 0x00200000; + fp[15] ^= 0x00200000; + fp[27] ^= 0x00200000; + fp[29] ^= 0x00200000; + fp[30] ^= 0x00200000; + fp[31] ^= 0x00200000; + fp[46] ^= 0x00200000; + fp[47] ^= 0x00200000; + fp[48] ^= 0x00200000; + fp[57] ^= 0x00200000; + } + fp += (neon_eo - neon_oo) / 4; + } + + *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); + fp++; + + memcpy(fp, neon_oe, neon_end - neon_oe); + if(sign < 0) { + fp[19] ^= 0x00200000; + fp[20] ^= 0x00200000; + fp[22] ^= 0x00200000; + fp[23] ^= 0x00200000; + fp[37] ^= 0x00200000; + fp[38] ^= 0x00200000; + fp[40] ^= 0x00200000; + fp[41] ^= 0x00200000; + fp[64] ^= 0x00200000; + fp[65] ^= 0x00200000; + fp[66] ^= 0x00200000; + fp[67] ^= 0x00200000; + } + fp += (neon_end - neon_oe) / 4; + + } else { + + *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); + fp++; + + memcpy(fp, neon_eo, neon_oe - neon_eo); + if(sign < 0) { + fp[10] ^= 0x00200000; + fp[11] ^= 0x00200000; + fp[13] ^= 0x00200000; + fp[14] ^= 0x00200000; + fp[31] ^= 0x00200000; + fp[33] ^= 0x00200000; + fp[34] ^= 0x00200000; + fp[35] ^= 0x00200000; + fp[59] ^= 0x00200000; + fp[60] ^= 0x00200000; + fp[61] ^= 0x00200000; + fp[62] ^= 0x00200000; + } + fp += (neon_oe - neon_eo) / 4; + + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); + + if(p->i1) { + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_oo, neon_eo - neon_oo); + if(sign < 0) { + fp[12] ^= 0x00200000; + fp[13] ^= 0x00200000; + fp[14] ^= 0x00200000; + fp[15] ^= 0x00200000; + fp[27] ^= 0x00200000; 
+ fp[29] ^= 0x00200000; + fp[30] ^= 0x00200000; + fp[31] ^= 0x00200000; + fp[46] ^= 0x00200000; + fp[47] ^= 0x00200000; + fp[48] ^= 0x00200000; + fp[57] ^= 0x00200000; + } + fp += (neon_eo - neon_oo) / 4; + } + + } + + + if(p->i1) { + ADDI(&fp, 2, 3, 0); + ADDI(&fp, 3, 7, 0); + ADDI(&fp, 7, 2, 0); + + ADDI(&fp, 2, 4, 0); + ADDI(&fp, 4, 8, 0); + ADDI(&fp, 8, 2, 0); + + ADDI(&fp, 2, 5, 0); + ADDI(&fp, 5, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 6, 0); + ADDI(&fp, 6, 10, 0); + ADDI(&fp, 10, 2, 0); + + ADDI(&fp, 2, 9, 0); + ADDI(&fp, 9, 10, 0); + ADDI(&fp, 10, 2, 0); + + *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); + fp++; + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_ee, neon_oo - neon_ee); + if(sign < 0) { + fp[33] ^= 0x00200000; + fp[37] ^= 0x00200000; + fp[38] ^= 0x00200000; + fp[39] ^= 0x00200000; + fp[40] ^= 0x00200000; + fp[41] ^= 0x00200000; + fp[44] ^= 0x00200000; + fp[45] ^= 0x00200000; + fp[46] ^= 0x00200000; + fp[47] ^= 0x00200000; + fp[48] ^= 0x00200000; + fp[57] ^= 0x00200000; + } + fp += (neon_oo - neon_ee) / 4; + + } #else - ADDI(&fp, 2, 7, 0); - ADDI(&fp, 7, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 8, 0); - ADDI(&fp, 8, 10, 0); - ADDI(&fp, 10, 2, 0); - - MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1); - memcpy(fp, vfp_o, vfp_x4 - vfp_o); - if(sign > 0) { - fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040; - fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040; - } - fp += (vfp_x4 - vfp_o) / 4; - - ADDI(&fp, 2, 3, 0); - ADDI(&fp, 3, 7, 0); - ADDI(&fp, 7, 2, 0); - - ADDI(&fp, 2, 4, 0); - ADDI(&fp, 4, 8, 0); - ADDI(&fp, 8, 2, 0); - - ADDI(&fp, 2, 5, 0); - ADDI(&fp, 5, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 6, 0); - ADDI(&fp, 6, 10, 0); - ADDI(&fp, 10, 2, 0); - - ADDI(&fp, 2, 9, 0); - ADDI(&fp, 9, 10, 0); - ADDI(&fp, 10, 2, 0); - - *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; - MOVI(&fp, 11, (p->i2>0) ? 
p->i2 : 1); - memcpy(fp, vfp_e, vfp_o - vfp_e); - if(sign > 0) { - fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040; - fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040; - fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040; - } - fp += (vfp_o - vfp_e) / 4; + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); + + MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1); + memcpy(fp, vfp_o, vfp_x4 - vfp_o); + if(sign > 0) { + fp[22] ^= 0x00000040; + fp[24] ^= 0x00000040; + fp[25] ^= 0x00000040; + fp[26] ^= 0x00000040; + fp[62] ^= 0x00000040; + fp[64] ^= 0x00000040; + fp[65] ^= 0x00000040; + fp[66] ^= 0x00000040; + } + fp += (vfp_x4 - vfp_o) / 4; + + ADDI(&fp, 2, 3, 0); + ADDI(&fp, 3, 7, 0); + ADDI(&fp, 7, 2, 0); + + ADDI(&fp, 2, 4, 0); + ADDI(&fp, 4, 8, 0); + ADDI(&fp, 8, 2, 0); + + ADDI(&fp, 2, 5, 0); + ADDI(&fp, 5, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 6, 0); + ADDI(&fp, 6, 10, 0); + ADDI(&fp, 10, 2, 0); + + ADDI(&fp, 2, 9, 0); + ADDI(&fp, 9, 10, 0); + ADDI(&fp, 10, 2, 0); + + *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); + fp++; + MOVI(&fp, 11, (p->i2>0) ? 
p->i2 : 1); + memcpy(fp, vfp_e, vfp_o - vfp_e); + if(sign > 0) { + fp[64] ^= 0x00000040; + fp[65] ^= 0x00000040; + fp[68] ^= 0x00000040; + fp[75] ^= 0x00000040; + fp[76] ^= 0x00000040; + fp[79] ^= 0x00000040; + fp[80] ^= 0x00000040; + fp[83] ^= 0x00000040; + fp[84] ^= 0x00000040; + fp[87] ^= 0x00000040; + fp[91] ^= 0x00000040; + fp[93] ^= 0x00000040; + } + fp += (vfp_o - vfp_e) / 4; #endif - *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12 - //ADDI(&fp, 2, 1, 0); - MOVI(&fp, 1, 0); - - // args: r0 - out - // r1 - N - // r2 - ws -// ADDI(&fp, 3, 1, 0); // put N into r3 for counter - - int32_t pAddr = 0; - int32_t pN = 0; - int32_t pLUT = 0; - count = 2; - while(pps[0]) { - -// fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr); - if(!pN) { - MOVI(&fp, 1, pps[0]); - }else{ - if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr); - if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN); - } - - if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT) - ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); - - - if(pps[0] == 2*leafN) { - *fp = BL(fp+2, x_4_addr); fp++; - }else if(!pps[2]){ - //uint32_t *x_8_t_addr = fp; + *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); + fp++; // load offsets into r12 + //ADDI(&fp, 2, 1, 0); + MOVI(&fp, 1, 0); + + // args: r0 - out + // r1 - N + // r2 - ws + // ADDI(&fp, 3, 1, 0); // put N into r3 for counter + + int32_t pAddr = 0; + int32_t pN = 0; + int32_t pLUT = 0; + count = 2; + while(pps[0]) { + + // fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr); + if(!pN) { + MOVI(&fp, 1, pps[0]); + } else { + if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr); + if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN); + } + + if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT) + ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); + + + if(pps[0] == 2*leafN) { + *fp = BL(fp+2, x_4_addr); + fp++; + } else 
if(!pps[2]) { + //uint32_t *x_8_t_addr = fp; #ifdef HAVE_NEON - memcpy(fp, neon_x8_t, neon_ee - neon_x8_t); - if(sign < 0) { - fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; - fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000; - fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000; - } - fp += (neon_ee - neon_x8_t) / 4; - //*fp++ = BL(fp+2, x_8_t_addr); + memcpy(fp, neon_x8_t, neon_ee - neon_x8_t); + if(sign < 0) { + fp[31] ^= 0x00200000; + fp[32] ^= 0x00200000; + fp[33] ^= 0x00200000; + fp[34] ^= 0x00200000; + fp[65] ^= 0x00200000; + fp[66] ^= 0x00200000; + fp[70] ^= 0x00200000; + fp[74] ^= 0x00200000; + fp[97] ^= 0x00200000; + fp[98] ^= 0x00200000; + fp[102] ^= 0x00200000; + fp[104] ^= 0x00200000; + } + fp += (neon_ee - neon_x8_t) / 4; + //*fp++ = BL(fp+2, x_8_t_addr); #else - *fp = BL(fp+2, x_8_addr); fp++; + *fp = BL(fp+2, x_8_addr); + fp++; #endif - }else{ - *fp = BL(fp+2, x_8_addr); fp++; - } - - pAddr = pps[1] * 4; - pN = pps[0]; - pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN); -// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); - count += 4; - pps += 2; - } - - *fp++ = 0xecbd8b10; - *fp++ = POP_LR(); count++; + } else { + *fp = BL(fp+2, x_8_addr); + fp++; + } + + pAddr = pps[1] * 4; + pN = pps[0]; + pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN); + // fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); + count += 4; + pps += 2; + } + + *fp++ = 0xecbd8b10; + *fp++ = POP_LR(); + count++; #else - POP(&fp, R15); - POP(&fp, R14); - POP(&fp, R13); - POP(&fp, R12); - POP(&fp, R11); - POP(&fp, R10); - POP(&fp, RBX); - POP(&fp, RBP); - RET(&fp); - -//uint8_t *pp = func; -//int counter = 0; -//do{ -// printf("%02x ", *pp); -// if(counter++ % 16 == 15) printf("\n"); -//} while(++pp < fp); + /* restore nonvolatile registers */ +#ifdef _M_AMD64 + /* mov rbx, [rsp + 8] */ + *fp++ = 
0x48; + *fp++ = 0x8B; + *fp++ = 0x5C; + *fp++ = 0x24; + *fp++ = 0x08; + + /* mov rsi, [rsp + 16] */ + *fp++ = 0x48; + *fp++ = 0x8B; + *fp++ = 0x74; + *fp++ = 0x24; + *fp++ = 0x10; + + /* mov rdi, [rsp + 24] */ + *fp++ = 0x48; + *fp++ = 0x8B; + *fp++ = 0x7C; + *fp++ = 0x24; + *fp++ = 0x18; +#else + POP(&fp, R15); + POP(&fp, R14); + POP(&fp, R13); + POP(&fp, R12); + POP(&fp, R11); + POP(&fp, R10); + POP(&fp, RBX); + POP(&fp, RBP); +#endif -//printf("\n"); + RET(&fp); + //uint8_t *pp = func; + //int counter = 0; + //do{ + // printf("%02x ", *pp); + // if(counter++ % 16 == 15) printf("\n"); + //} while(++pp < fp); + //printf("\n"); #endif + // *fp++ = B(14); count++; -// *fp++ = B(14); count++; - -//for(int i=0;i<(neon_x8 - neon_x4)/4;i++) -// fprintf(stderr, "%08x\n", x_4_addr[i]); -//fprintf(stderr, "\n"); -//for(int i=0;i<count;i++) - - free(ps); - - if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) { - perror("Couldn't mprotect"); - exit(1); - } -#ifdef __APPLE__ - sys_icache_invalidate(func, p->transform_size); -#elif __ANDROID__ - cacheflush((long)(func), (long)(func) + p->transform_size, 0); -#elif __linux__ -#ifdef __GNUC__ - __clear_cache((long)(func), (long)(func) + p->transform_size); -#endif -#endif + //for(int i=0;i<(neon_x8 - neon_x4)/4;i++) + // fprintf(stderr, "%08x\n", x_4_addr[i]); + //fprintf(stderr, "\n"); + //for(int i=0;i<count;i++) -//fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4); + fprintf(stderr, "size of transform %u = %d\n", N, (fp - x_8_addr) * sizeof(*fp)); - p->transform = (void *) (start); -} -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: + free(ps); + + return (transform_func_t) start; +}
\ No newline at end of file diff --git a/src/codegen.h b/src/codegen.h index c07144f..e3c2381 100644 --- a/src/codegen.h +++ b/src/codegen.h @@ -1,10 +1,10 @@ /* - + This file is part of FFTS -- The Fastest Fourier Transform in the South - + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - + Copyright (c) 2012, The University of Waikato + All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,20 +31,15 @@ */ -#ifndef __CODEGEN_H__ -#define __CODEGEN_H__ +#ifndef FFTS_CODEGEN_H +#define FFTS_CODEGEN_H -#include <stddef.h> -#include <stdio.h> -#include <stdlib.h> -#include <errno.h> -#include <sys/mman.h> -#include <string.h> -#include <limits.h> /* for PAGESIZE */ +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif #include "ffts.h" -void ffts_generate_func_code(ffts_plan_t *, size_t N, size_t leafN, int sign); +transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N, int sign); -#endif -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: +#endif /* FFTS_CODEGEN_H */ diff --git a/src/codegen_sse.h b/src/codegen_sse.h index 6a38671..a63d21d 100644 --- a/src/codegen_sse.h +++ b/src/codegen_sse.h @@ -1,10 +1,10 @@ /* - + This file is part of FFTS -- The Fastest Fourier Transform in the South - + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - + Copyright (c) 2012, The University of Waikato + All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -32,8 +32,8 @@ */ -#ifndef __CODEGEN_SSE_H__ -#define __CODEGEN_SSE_H__ +#ifndef FFTS_CODEGEN_SSE_H +#define FFTS_CODEGEN_SSE_H void neon_x4(float *, size_t, float *); void neon_x8(float *, size_t, float *); @@ -47,7 +47,7 @@ void leaf_end(); void x_init(); void x4(); void x8_soft(); -void x8_hard(); +void x8_soft_end(); void sse_constants(); void sse_constants_inv(); @@ -74,123 +74,157 @@ extern const uint32_t sse_leaf_oe_offsets[8]; #define RSI 6 #define RDI 7 #define RBP 5 -#define R8 8 -#define R9 9 -#define R10 10 -#define R11 11 -#define R12 12 -#define R13 13 -#define R14 14 -#define R15 15 - -void IMM8(uint8_t **p, int32_t imm) { - *(*p)++ = (imm & 0xff); +#define R8 8 +#define R9 9 +#define R10 10 +#define R11 11 +#define R12 12 +#define R13 13 +#define R14 14 +#define R15 15 + +void IMM8(uint8_t **p, int32_t imm) +{ + *(*p)++ = (imm & 0xff); } -void IMM16(uint8_t **p, int32_t imm) { - int i; - for(i=0;i<2;i++) { - *(*p)++ = (imm & (0xff << (i*8))) >> (i*8); - } +void IMM16(uint8_t **p, int32_t imm) +{ + int i; + + for (i = 0; i < 2; i++) { + *(*p)++ = (imm & (0xff << (8 * i))) >> (8 * i); + } } -void IMM32(uint8_t **p, int32_t imm) { - int i; - for(i=0;i<4;i++) { - *(*p)++ = (imm & (0xff << (i*8))) >> (i*8); - } + +void IMM32(uint8_t **p, int32_t imm) +{ + int i; + + for (i = 0; i < 4; i++) { + *(*p)++ = (imm & (0xff << (8 * i))) >> (8 * i); + } } -void IMM32_NI(uint8_t *p, int32_t imm) { - int i; - for(i=0;i<4;i++) { - *(p+i) = (imm & (0xff << (i*8))) >> (i*8); - } + +void IMM32_NI(uint8_t *p, int32_t imm) +{ + int i; + + for (i = 0; i < 4; i++) { + *(p+i) = (imm & (0xff << (8 * i))) >> (8 * i); + } } -int32_t READ_IMM32(uint8_t *p) { - int32_t rval = 0; - int i; - for(i=0;i<4;i++) { - rval |= *(p+i) << (i*8); - } - return rval; +int32_t READ_IMM32(uint8_t *p) +{ + int32_t rval = 0; + int i; + + for (i = 0; i < 4; i++) { + rval |= *(p+i) << (8 * i); + } + + return rval; } -void 
MOVI(uint8_t **p, uint8_t dst, uint32_t imm) { -// if(imm < 65536) *(*p)++ = 0x66; - if(dst >= 8) *(*p)++ = 0x41; - - //if(imm < 65536 && imm >= 256) *(*p)++ = 0x66; - - //if(imm >= 256) - *(*p)++ = 0xb8 | (dst & 0x7); -// else *(*p)++ = 0xb0 | (dst & 0x7); - - // if(imm < 256) IMM8(p, imm); -// else -//if(imm < 65536) IMM16(p, imm); -//else - IMM32(p, imm); - -//if(dst < 8) { -// *(*p)++ = 0xb8 + dst; -//}else{ -// *(*p)++ = 0x49; -// *(*p)++ = 0xc7; -// *(*p)++ = 0xc0 | (dst - 8); -//} -//IMM32(p, imm); +void MOVI(uint8_t **p, uint8_t dst, uint32_t imm) +{ + if (dst >= 8) { + *(*p)++ = 0x41; + } + + *(*p)++ = 0xb8 | (dst & 0x7); + IMM32(p, imm); } -void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) { - if(disp == 0) { - *(*p)++ = (rm & 7) | ((reg & 7) << 3); - }else if(disp <= 127 || disp >= -128) { - *(*p)++ = 0x40 | (rm & 7) | ((reg & 7) << 3); - IMM8(p, disp); - }else{ - *(*p)++ = 0x80 | (rm & 7) | ((reg & 7) << 3); - IMM32(p, disp); - } +void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) +{ + if (disp == 0) { + *(*p)++ = (rm & 7) | ((reg & 7) << 3); + } else if (disp <= 127 || disp >= -128) { + *(*p)++ = 0x40 | (rm & 7) | ((reg & 7) << 3); + IMM8(p, disp); + } else { + *(*p)++ = 0x80 | (rm & 7) | ((reg & 7) << 3); + IMM32(p, disp); + } } -void LEA(uint8_t **p, uint8_t dst, uint8_t base, int32_t disp) { +void LEA(uint8_t **p, uint8_t dst, uint8_t base, int32_t disp) +{ + *(*p)++ = 0x48 | ((base & 0x8) >> 3) | ((dst & 0x8) >> 1); + *(*p)++ = 0x8d; + ADDRMODE(p, dst, base, disp); +} - *(*p)++ = 0x48 | ((base & 0x8) >> 3) | ((dst & 0x8) >> 1); - *(*p)++ = 0x8d; - ADDRMODE(p, dst, base, disp); +void RET(uint8_t **p) +{ + *(*p)++ = 0xc3; } -void RET(uint8_t **p) { - *(*p)++ = 0xc3; +void ADDI(uint8_t **p, uint8_t dst, int32_t imm) +{ + if (dst >= 8) { + *(*p)++ = 0x49; + } else { + *(*p)++ = 0x48; + } + + if (imm > 127 || imm <= -128) { + *(*p)++ = 0x81; + } else { + *(*p)++ = 0x83; + } + + *(*p)++ = 0xc0 | (dst & 0x7); + + if (imm > 
127 || imm <= -128) { + IMM32(p, imm); + } else { + IMM8(p, imm); + } } -void ADDI(uint8_t **p, uint8_t dst, int32_t imm) { - - if(dst >= 8) *(*p)++ = 0x49; - else *(*p)++ = 0x48; - - if(imm > 127 || imm <= -128) *(*p)++ = 0x81; - else *(*p)++ = 0x83; - - *(*p)++ = 0xc0 | (dst & 0x7); - - if(imm > 127 || imm <= -128) IMM32(p, imm); - else IMM8(p, imm); +void CALL(uint8_t **p, uint8_t *func) +{ + *(*p)++ = 0xe8; + IMM32(p, func - *p - 4); } -void CALL(uint8_t **p, uint8_t *func) { - *(*p)++ = 0xe8; - IMM32(p, ((void *)func) - (void *)(*p) - 4); +void PUSH(uint8_t **p, uint8_t reg) +{ + if (reg >= 8) { + *(*p)++ = 0x41; + } + + *(*p)++ = 0x50 | (reg & 7); } -void PUSH(uint8_t **p, uint8_t reg) { - if(reg >= 8) *(*p)++ = 0x41; - *(*p)++ = 0x50 | (reg & 7); +void POP(uint8_t **p, uint8_t reg) +{ + if (reg >= 8) { + *(*p)++ = 0x41; + } + + *(*p)++ = 0x58 | (reg & 7); } -void POP(uint8_t **p, uint8_t reg) { - if(reg >= 8) *(*p)++ = 0x41; - *(*p)++ = 0x58 | (reg & 7); + +void SHIFT(uint8_t **p, uint8_t reg, int shift) +{ + if (reg >= 8) { + *(*p)++ = 0x49; + } + + + *(*p)++ = 0xc1; + if (shift > 0) { + *(*p)++ = 0xe0 | (reg & 7); + *(*p)++ = (shift & 0xff); + } else { + *(*p)++ = 0xe8 | (reg & 7); + *(*p)++ = ((-shift) & 0xff); + } } -#endif -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: + +#endif /* FFTS_CODEGEN_SSE_H */
\ No newline at end of file @@ -1,418 +1,618 @@ /* - - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> +Copyright (c) 2012, The University of Waikato + +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ + #include "ffts.h" #include "macros.h" -//#include "mini_macros.h" #include "patterns.h" #include "ffts_small.h" #ifdef DYNAMIC_DISABLED - #include "ffts_static.h" +#include "ffts_static.h" #else - #include "codegen.h" +#include "codegen.h" #endif -#include <errno.h> - #include <sys/mman.h> - #include <string.h> - #include <limits.h> /* for PAGESIZE */ - #if __APPLE__ - #include <libkern/OSCacheControl.h> +#include <libkern/OSCacheControl.h> +#endif + +#if _WIN32 +#include <windows.h> #else +#include <sys/mman.h> +#endif + +#ifdef __arm__ +static const FFTS_ALIGN(64) float w_data[16] = { + 0.70710678118654757273731092936941f, + 0.70710678118654746171500846685376f, + -0.70710678118654757273731092936941f, + -0.70710678118654746171500846685376f, + 1.0f, + 0.70710678118654757273731092936941f, + -0.0f, + -0.70710678118654746171500846685376f, + 0.70710678118654757273731092936941f, + 0.70710678118654746171500846685376f, + 0.70710678118654757273731092936941f, + 0.70710678118654746171500846685376f, + 1.0f, + 0.70710678118654757273731092936941f, + 0.0f, + 0.70710678118654746171500846685376f +}; #endif -void ffts_execute(ffts_plan_t *p, const void * in, void * out) { +static FFTS_INLINE int ffts_allow_execute(void *start, size_t len) +{ + int result; -//TODO: Define NEEDS_ALIGNED properly instead -#if defined(HAVE_SSE) || defined(HAVE_NEON) - if(((int)in % 16) != 0) { - LOG("ffts_execute: input buffer needs to be aligned to a 128bit boundary\n"); - } +#ifdef _WIN32 + DWORD old_protect; + result = !VirtualProtect(start, len, PAGE_EXECUTE_READ, &old_protect); +#else + result = mprotect(start, len, PROT_READ | PROT_EXEC); +#endif + + return result; +} + +static FFTS_INLINE int ffts_deny_execute(void *start, size_t len) +{ + int result; + +#ifdef _WIN32 + DWORD old_protect; + result = (int) VirtualProtect(start, len, PAGE_READWRITE, &old_protect); +#else + result = mprotect(start, len, PROT_READ | PROT_WRITE); +#endif + + return result; +} + +static FFTS_INLINE int 
ffts_flush_instruction_cache(void *start, size_t length) +{ +#ifdef __APPLE__ + sys_icache_invalidate(start, length); +#elif __ANDROID__ + cacheflush((long) start, (long) start + length, 0); +#elif __linux__ +#if GCC_VERSION_AT_LEAST(4,3) + __builtin___clear_cache(start, (char*) start + length); +#elif __GNUC__ + __clear_cache((long) start, (long) start + length); +#endif +#elif _WIN32 + return !FlushInstructionCache(GetCurrentProcess(), start, length); +#endif + return 0; +} - if(((int)out % 16) != 0) { - LOG("ffts_execute: output buffer needs to be aligned to a 128bit boundary\n"); - } +static FFTS_INLINE void *ffts_vmem_alloc(size_t length) +{ +#if __APPLE__ + return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0); +#elif _WIN32 + return VirtualAlloc(NULL, length, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); +#else +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS 0x20 #endif - p->transform(p, (const float *)in, (float *)out); + return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); +#endif } -void ffts_free(ffts_plan_t *p) { - p->destroy(p); +static FFTS_INLINE void ffts_vmem_free(void *addr, size_t length) +{ +#ifdef _WIN32 + (void) length; + VirtualFree(addr, 0, MEM_RELEASE); +#else + munmap(addr, length); +#endif } -void ffts_free_1d(ffts_plan_t *p) { - - size_t i; +void ffts_execute(ffts_plan_t *p, const void *in, void *out) +{ + /* TODO: Define NEEDS_ALIGNED properly instead */ +#if defined(HAVE_SSE) || defined(HAVE_NEON) + if (((int) in % 16) != 0) { + LOG("ffts_execute: input buffer needs to be aligned to a 128bit boundary\n"); + } + + if (((int) out % 16) != 0) { + LOG("ffts_execute: output buffer needs to be aligned to a 128bit boundary\n"); + } +#endif + + p->transform(p, (const float*) in, (float*) out); +} - if(p->ws) { - FFTS_FREE(p->ws); - } - if(p->is) free(p->is); - if(p->ws_is) free(p->ws_is); - if(p->offsets) free(p->offsets); - //free(p->transforms); - if(p->transforms) free(p->transforms); 
+void ffts_free(ffts_plan_t *p) +{ + if (p) { + p->destroy(p); + } +} +void ffts_free_1d(ffts_plan_t *p) +{ #if !defined(DYNAMIC_DISABLED) - if(p->transform_base) { - if (mprotect(p->transform_base, p->transform_size, PROT_READ | PROT_WRITE)) { - perror("Couldn't mprotect"); - exit(errno); - } - munmap(p->transform_base, p->transform_size); - //free(p->transform_base); - } + if (p->transform_base) { + ffts_deny_execute(p->transform_base, p->transform_size); + ffts_vmem_free(p->transform_base, p->transform_size); + } #endif - free(p); + + if (p->ws_is) { + free(p->ws_is); + } + + if (p->ws) { + FFTS_FREE(p->ws); + } + + if (p->transforms) { + free(p->transforms); + } + + if (p->is) { + free(p->is); + } + + if (p->offsets) { + free(p->offsets); + } + + free(p); } -ffts_plan_t *ffts_init_1d(size_t N, int sign) { - if(N == 0 || (N & (N - 1)) != 0){ - LOG("FFT size must be a power of two\n"); - return NULL; - } +static int ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) +{ + V MULI_SIGN; + int hardcoded; + size_t lut_size; + size_t n_luts; + float *tmp; + cdata_t *w; + size_t i; + int n; + +#ifdef __arm__ + /* #ifdef HAVE_NEON */ + V MULI_SIGN; + + if(sign < 0) { + MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); + } else { + MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); + } + + /* #endif */ +#else + if (sign < 0) { + MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); + } else { + MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); + } +#endif + + /* LUTS */ + n_luts = ffts_ctzl(N / leaf_N); + if (N < 32) { + n_luts = ffts_ctzl(N / 4); + hardcoded = 1; + } else { + hardcoded = 0; + } + + if (n_luts >= 32) { + n_luts = 0; + } - ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); - size_t leafN = 8; - size_t i; + /* fprintf(stderr, "n_luts = %zu\n", n_luts); */ + n = leaf_N * 2; + if (hardcoded) { + n = 8; + } + + lut_size = 0; + + for (i = 0; i < n_luts; i++) { + if (!i || hardcoded) { +#ifdef __arm__ + if (N <= 32) { + lut_size += n/4 * 2 * sizeof(cdata_t); + } else { + 
lut_size += n/4 * sizeof(cdata_t); + } +#else + lut_size += n/4 * 2 * sizeof(cdata_t); +#endif + n *= 2; + } else { #ifdef __arm__ -//#ifdef HAVE_NEON - V MULI_SIGN; - - if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); - else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); -//#endif + lut_size += n/8 * 3 * sizeof(cdata_t); #else - V MULI_SIGN; - - if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); - else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); + lut_size += n/8 * 3 * 2 * sizeof(cdata_t); +#endif + } + n *= 2; + } + + /* lut_size *= 16; */ + + /* fprintf(stderr, "lut size = %zu\n", lut_size); */ + + if (n_luts) { + p->ws = FFTS_MALLOC(lut_size, 32); + if (!p->ws) { + goto cleanup; + } + + p->ws_is = malloc(n_luts * sizeof(*p->ws_is)); + if (!p->ws_is) { + goto cleanup; + } + } + + w = p->ws; + + n = leaf_N * 2; + if (hardcoded) { + n = 8; + } + +#ifdef HAVE_NEON + V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f); #endif - p->transform = NULL; - p->transform_base = NULL; - p->transforms = NULL; - p->is = NULL; - p->ws_is = NULL; - p->ws = NULL; - p->offsets = NULL; - p->destroy = ffts_free_1d; - - if(N >= 32) { - ffts_init_offsets(p, N, leafN); + for (i = 0; i < n_luts; i++) { + p->ws_is[i] = w - (cdata_t *)p->ws; + //fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]); + + if(!i || hardcoded) { + cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); + float *fw0 = (float*) w0; + float *fw = (float *)w; + + size_t j; + for (j = 0; j < n/4; j++) { + w0[j][0] = W_re(n,j); + w0[j][1] = W_im(n,j); + } + #ifdef __arm__ + if (N < 32) { + // w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); + float *fw = (float *)w; + V temp0, temp1, temp2; + for (j=0; j<n/4; j+=2) { + // #ifdef HAVE_NEON + temp0 = VLD(fw0 + j*2); + V re, im; + re = VDUPRE(temp0); + im = VDUPIM(temp0); +#ifdef HAVE_NEON + im = VXOR(im, MULI_SIGN); + //im = IMULI(sign>0, im); +#else + im = MULI(sign>0, im); +#endif + VST(fw + j*4 , re); 
+ VST(fw + j*4+4, im); + // #endif + } + w += n/4 * 2; + } else { + //w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); + float *fw = (float *)w; #ifdef HAVE_NEON - ffts_init_is(p, N, leafN, 1); + VS temp0, temp1, temp2; + for (j=0; j<n/4; j+=4) { + temp0 = VLD2(fw0 + j*2); + temp0.val[1] = VXOR(temp0.val[1], neg); + STORESPR(fw + j*2, temp0); + } #else - ffts_init_is(p, N, leafN, 1); + for (j=0; j<n/4; j+=1) { + fw[j*2] = fw0[j*2]; + fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1]; + } #endif + w += n/4; + } #else - ffts_init_is(p, N, leafN, 1); + //w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); + for (j = 0; j < n/4; j += 2) { + V re, im, temp0; + temp0 = VLD(fw0 + j*2); + re = VDUPRE(temp0); + im = VDUPIM(temp0); + im = VXOR(im, MULI_SIGN); + VST(fw + j*4 + 0, re); + VST(fw + j*4 + 4, im); + } + + w += n/4 * 2; #endif - - p->i0 = N/leafN/3+1; - p->i1 = N/leafN/3; - if((N/leafN) % 3 > 1) p->i1++; - p->i2 = N/leafN/3; - - #ifdef __arm__ - #ifdef HAVE_NEON - p->i0/=2; - p->i1/=2; - #endif - #else - p->i0/=2; - p->i1/=2; - #endif - - }else{ - p->transforms = malloc(2 * sizeof(transform_index_t)); - p->transforms[0] = 0; - p->transforms[1] = 1; - if(N == 2) p->transform = &firstpass_2; - else if(N == 4 && sign == -1) p->transform = &firstpass_4_f; - else if(N == 4 && sign == 1) p->transform = &firstpass_4_b; - else if(N == 8 && sign == -1) p->transform = &firstpass_8_f; - else if(N == 8 && sign == 1) p->transform = &firstpass_8_b; - else if(N == 16 && sign == -1) p->transform = &firstpass_16_f; - else if(N == 16 && sign == 1) p->transform = &firstpass_16_b; - - p->is = NULL; - p->offsets = NULL; - } - - int hardcoded = 0; - - /* LUTS */ - size_t n_luts = __builtin_ctzl(N/leafN); - if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } - - if(n_luts >= 32) n_luts = 0; - -// fprintf(stderr, "n_luts = %zu\n", n_luts); - - cdata_t *w; - - int n = leafN*2; - if(hardcoded) n = 8; - - size_t lut_size = 0; - - for(i=0;i<n_luts;i++) { - if(!i || hardcoded) { - #ifdef 
__arm__ - if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t); - else lut_size += n/4 * sizeof(cdata_t); - #else - lut_size += n/4 * 2 * sizeof(cdata_t); - #endif - n *= 2; - } else { - #ifdef __arm__ - lut_size += n/8 * 3 * sizeof(cdata_t); - #else - lut_size += n/8 * 3 * 2 * sizeof(cdata_t); - #endif - } - n *= 2; - } - -// lut_size *= 16; - - // fprintf(stderr, "lut size = %zu\n", lut_size); - if(n_luts) { - p->ws = FFTS_MALLOC(lut_size,32); - p->ws_is = malloc(n_luts * sizeof(size_t)); - }else{ - p->ws = NULL; - p->ws_is = NULL; - } - w = p->ws; - - n = leafN*2; - if(hardcoded) n = 8; - - #ifdef HAVE_NEON - V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f); - #endif - - for(i=0;i<n_luts;i++) { - p->ws_is[i] = w - (cdata_t *)p->ws; - //fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]); - - if(!i || hardcoded) { - cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); - - size_t j; - for(j=0;j<n/4;j++) { - w0[j][0] = W_re(n,j); - w0[j][1] = W_im(n,j); - } - - - float *fw0 = (float *)w0; - #ifdef __arm__ - if(N < 32) { - //w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); - float *fw = (float *)w; - V temp0, temp1, temp2; - for(j=0;j<n/4;j+=2) { - // #ifdef HAVE_NEON - temp0 = VLD(fw0 + j*2); - V re, im; - re = VDUPRE(temp0); - im = VDUPIM(temp0); - #ifdef HAVE_NEON - im = VXOR(im, MULI_SIGN); - //im = IMULI(sign>0, im); - #else - im = MULI(sign>0, im); - #endif - VST(fw + j*4 , re); - VST(fw + j*4+4, im); - // #endif - } - w += n/4 * 2; - }else{ - //w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); - float *fw = (float *)w; - #ifdef HAVE_NEON - VS temp0, temp1, temp2; - for(j=0;j<n/4;j+=4) { - temp0 = VLD2(fw0 + j*2); - temp0.val[1] = VXOR(temp0.val[1], neg); - STORESPR(fw + j*2, temp0); - } - #else - for(j=0;j<n/4;j+=1) { - fw[j*2] = fw0[j*2]; - fw[j*2+1] = (sign < 0) ? 
fw0[j*2+1] : -fw0[j*2+1]; - } - #endif - w += n/4; - } - #else - //w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); - float *fw = (float *)w; - V temp0, temp1, temp2; - for(j=0;j<n/4;j+=2) { - temp0 = VLD(fw0 + j*2); - V re, im; - re = VDUPRE(temp0); - im = VDUPIM(temp0); - im = VXOR(im, MULI_SIGN); - VST(fw + j*4 , re); - VST(fw + j*4+4, im); - } - w += n/4 * 2; - #endif - - FFTS_FREE(w0); - }else{ - - cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); - cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); - cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); - - size_t j; - for(j=0;j<n/8;j++) { - w0[j][0] = W_re(n,j*2); - w0[j][1] = W_im(n,j*2); - w1[j][0] = W_re(n,j); - w1[j][1] = W_im(n,j); - w2[j][0] = W_re(n,j + (n/8)); - w2[j][1] = W_im(n,j + (n/8)); - - } - - float *fw0 = (float *)w0; - float *fw1 = (float *)w1; - float *fw2 = (float *)w2; - #ifdef __arm__ - //w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32); - float *fw = (float *)w; - #ifdef HAVE_NEON - VS temp0, temp1, temp2; - for(j=0;j<n/8;j+=4) { - temp0 = VLD2(fw0 + j*2); - temp0.val[1] = VXOR(temp0.val[1], neg); - STORESPR(fw + j*2*3, temp0); - temp1 = VLD2(fw1 + j*2); - temp1.val[1] = VXOR(temp1.val[1], neg); - STORESPR(fw + j*2*3 + 8, temp1); - temp2 = VLD2(fw2 + j*2); - temp2.val[1] = VXOR(temp2.val[1], neg); - STORESPR(fw + j*2*3 + 16, temp2); - } - #else - for(j=0;j<n/8;j+=1) { - fw[j*6] = fw0[j*2]; - fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1]; - fw[j*6+2] = fw1[j*2+0]; - fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1]; - fw[j*6+4] = fw2[j*2+0]; - fw[j*6+5] = (sign < 0) ? 
fw2[j*2+1] : -fw2[j*2+1]; - } - #endif - w += n/8 * 3; - #else - //w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32); - float *fw = (float *)w; - V temp0, temp1, temp2, re, im; - for(j=0;j<n/8;j+=2) { - temp0 = VLD(fw0 + j*2); - re = VDUPRE(temp0); - im = VDUPIM(temp0); - im = VXOR(im, MULI_SIGN); - VST(fw + j*2*6 , re); - VST(fw + j*2*6+4, im); - - temp1 = VLD(fw1 + j*2); - re = VDUPRE(temp1); - im = VDUPIM(temp1); - im = VXOR(im, MULI_SIGN); - VST(fw + j*2*6+8 , re); - VST(fw + j*2*6+12, im); - - temp2 = VLD(fw2 + j*2); - re = VDUPRE(temp2); - im = VDUPIM(temp2); - im = VXOR(im, MULI_SIGN); - VST(fw + j*2*6+16, re); - VST(fw + j*2*6+20, im); - } - w += n/8 * 3 * 2; - #endif - - FFTS_FREE(w0); - FFTS_FREE(w1); - FFTS_FREE(w2); - } - ///p->ws[i] = w; - - n *= 2; - } - - float *tmp = (float *)p->ws; - - if(sign < 0) { - p->oe_ws = (void *)(&w_data[4]); - p->ee_ws = (void *)(w_data); - p->eo_ws = (void *)(&w_data[4]); - }else{ - p->oe_ws = (void *)(w_data + 12); - p->ee_ws = (void *)(w_data + 8); - p->eo_ws = (void *)(w_data + 12); - } - - p->N = N; - p->lastlut = w; - p->n_luts = n_luts; -#ifdef DYNAMIC_DISABLED - if(sign < 0) { - if(N >= 32) p->transform = ffts_static_transform_f; - }else{ - if(N >= 32) p->transform = ffts_static_transform_i; - } + FFTS_FREE(w0); + } else { + cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); + cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); + cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); + + float *fw0 = (float*) w0; + float *fw1 = (float*) w1; + float *fw2 = (float*) w2; + + float *fw = (float *)w; + V temp0, temp1, temp2, re, im; + + size_t j; + for (j = 0; j < n/8; j++) { + w0[j][0] = W_re((float) n, (float) 2*j); + w0[j][1] = W_im((float) n, (float) 2*j); + w1[j][0] = W_re((float) n, (float) j); + w1[j][1] = W_im((float) n, (float) j); + w2[j][0] = W_re((float) n, (float) (j + (n/8))); + w2[j][1] = W_im((float) n, (float) (j + (n/8))); + } + +#ifdef __arm__ + //w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32); 
+ float *fw = (float *)w; +#ifdef HAVE_NEON + VS temp0, temp1, temp2; + for (j = 0; j < n/8; j += 4) { + temp0 = VLD2(fw0 + j*2); + temp0.val[1] = VXOR(temp0.val[1], neg); + STORESPR(fw + j*2*3, temp0); + temp1 = VLD2(fw1 + j*2); + temp1.val[1] = VXOR(temp1.val[1], neg); + STORESPR(fw + j*2*3 + 8, temp1); + temp2 = VLD2(fw2 + j*2); + temp2.val[1] = VXOR(temp2.val[1], neg); + STORESPR(fw + j*2*3 + 16, temp2); + } #else - if(N>=32) ffts_generate_func_code(p, N, leafN, sign); + for (j = 0; j < n/8; j += 1) { + fw[j*6] = fw0[j*2]; + fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1]; + fw[j*6+2] = fw1[j*2+0]; + fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1]; + fw[j*6+4] = fw2[j*2+0]; + fw[j*6+5] = (sign < 0) ? fw2[j*2+1] : -fw2[j*2+1]; + } #endif + w += n/8 * 3; +#else + //w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32); + for (j = 0; j < n/8; j += 2) { + temp0 = VLD(fw0 + j*2); + re = VDUPRE(temp0); + im = VDUPIM(temp0); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6 , re); + VST(fw + j*2*6+4, im); + + temp1 = VLD(fw1 + j*2); + re = VDUPRE(temp1); + im = VDUPIM(temp1); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6+8 , re); + VST(fw + j*2*6+12, im); + + temp2 = VLD(fw2 + j*2); + re = VDUPRE(temp2); + im = VDUPIM(temp2); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6+16, re); + VST(fw + j*2*6+20, im); + } + + w += n/8 * 3 * 2; +#endif + + FFTS_FREE(w0); + FFTS_FREE(w1); + FFTS_FREE(w2); + } + ///p->ws[i] = w; + + n *= 2; + } + + tmp = (float *)p->ws; + +#ifdef __arm__ + if (sign < 0) { + p->oe_ws = (void*)(&w_data[4]); + p->ee_ws = (void*)(w_data); + p->eo_ws = (void*)(&w_data[4]); + } else { + p->oe_ws = (void*)(w_data + 12); + p->ee_ws = (void*)(w_data + 8); + p->eo_ws = (void*)(w_data + 12); + } +#endif + + p->lastlut = w; + p->n_luts = n_luts; + return 0; - return p; +cleanup: + return -1; } -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: +ffts_plan_t *ffts_init_1d(size_t N, int sign) +{ + const size_t leaf_N = 8; + ffts_plan_t *p; + + if (N < 2 || (N & 
(N - 1)) != 0) { + LOG("FFT size must be a power of two\n"); + return NULL; + } + + p = calloc(1, sizeof(*p)); + if (!p) { + return NULL; + } + + p->destroy = ffts_free_1d; + p->N = N; + + if (ffts_generate_luts(p, N, leaf_N, sign)) { + goto cleanup; + } + + if (N >= 32) { + p->offsets = ffts_init_offsets(N, leaf_N); + if (!p->offsets) { + goto cleanup; + } + + p->is = ffts_init_is(N, leaf_N, 1); + if (!p->is) { + goto cleanup; + } + + p->i0 = N/leaf_N/3 + 1; + p->i1 = p->i2 = N/leaf_N/3; + if ((N/leaf_N) % 3 > 1) { + p->i1++; + } + +#ifdef __arm__ +#ifdef HAVE_NEON + p->i0 /= 2; + p->i1 /= 2; +#endif +#else + p->i0 /= 2; + p->i1 /= 2; +#endif + +#ifdef DYNAMIC_DISABLED + if (sign < 0) { + p->transform = ffts_static_transform_f; + } else { + p->transform = ffts_static_transform_i; + } +#else + /* determinate transform size */ +#ifdef __arm__ + if (N < 8192) { + p->transform_size = 8192; + } else { + p->transform_size = N; + } +#else + if (N < 2048) { + p->transform_size = 16384; + } else { + p->transform_size = 16384 + 2*N/8 * ffts_ctzl(N); + } +#endif + + /* allocate code/function buffer */ + p->transform_base = ffts_vmem_alloc(p->transform_size); + if (!p->transform_base) { + goto cleanup; + } + + /* generate code */ + p->transform = ffts_generate_func_code(p, N, leaf_N, sign); + if (!p->transform) { + goto cleanup; + } + + /* enable execution with read access for the block */ + if (ffts_allow_execute(p->transform_base, p->transform_size)) { + goto cleanup; + } + + /* flush from the instruction cache */ + if (ffts_flush_instruction_cache(p->transform_base, p->transform_size)) { + goto cleanup; + } +#endif + } else { + p->transforms = malloc(2 * sizeof(*p->transforms)); + if (!p->transforms) { + goto cleanup; + } + + p->transforms[0] = 0; + p->transforms[1] = 1; + + switch (N) { + case 2: + p->transform = &ffts_firstpass_2; + break; + case 4: + if (sign == -1) { + p->transform = &ffts_firstpass_4_f; + } else if (sign == 1) { + p->transform = &ffts_firstpass_4_b; + 
} + break; + case 8: + if (sign == -1) { + p->transform = &ffts_firstpass_8_f; + } else if (sign == 1) { + p->transform = &ffts_firstpass_8_b; + } + break; + case 16: + default: + if (sign == -1) { + p->transform = &ffts_firstpass_16_f; + } else { + p->transform = &ffts_firstpass_16_b; + } + break; + } + } + + return p; + +cleanup: + ffts_free_1d(p); + return NULL; +}
\ No newline at end of file @@ -30,21 +30,21 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + #ifndef __CP_SSE_H__ #define __CP_SSE_H__ -#include "config.h" +//#include "config.h" +//#include "codegen.h" +#include "ffts_attributes.h" +#include "types.h" -#include <stdio.h> -#include <stdlib.h> +#include <malloc.h> #include <math.h> #include <stddef.h> #include <stdint.h> -//#include <stdalign.h> - -//#include "codegen.h" -#include "types.h" - +#include <stdlib.h> +#include <stdio.h> #ifdef __ANDROID__ #include <android/log.h> @@ -53,29 +53,14 @@ #define LOG(s) fprintf(stderr, s) #endif -#define PI 3.1415926535897932384626433832795028841971693993751058209 - -static const __attribute__ ((aligned(64))) float w_data[16] = { - 0.70710678118654757273731092936941, 0.70710678118654746171500846685376, - -0.70710678118654757273731092936941, -0.70710678118654746171500846685376, - 1.0f, 0.70710678118654757273731092936941f, - -0.0f, -0.70710678118654746171500846685376, - 0.70710678118654757273731092936941, 0.70710678118654746171500846685376, - 0.70710678118654757273731092936941, 0.70710678118654746171500846685376, - 1.0f, 0.70710678118654757273731092936941f, - 0.0f, 0.70710678118654746171500846685376 -}; - -__INLINE float W_re(float N, float k) { return cos(-2.0f * PI * k / N); } -__INLINE float W_im(float N, float k) { return sin(-2.0f * PI * k / N); } - -typedef size_t transform_index_t; - -//typedef void (*transform_func_t)(float *data, size_t N, float *LUT); -typedef void (*transform_func_t)(float *data, size_t N, float *LUT); +#ifndef M_PI +#define M_PI 3.1415926535897932384626433832795028841971693993751058209 +#endif typedef struct _ffts_plan_t ffts_plan_t; +typedef void (*transform_func_t)(ffts_plan_t *p, const void *in, void *out); + /** * Contains all the Information need to perform FFT * @@ -86,101 +71,153 @@ typedef struct _ffts_plan_t ffts_plan_t; */ struct _ffts_plan_t { - /** - * - */ - ptrdiff_t *offsets; + /** + * + */ + ptrdiff_t *offsets; 
#ifdef DYNAMIC_DISABLED - /** - * Twiddle factors - */ - void *ws; - /** - * ee - 2 size x size8 - * oo - 2 x size4 in parallel - * oe - - */ - void *oe_ws, *eo_ws, *ee_ws; + /** + * Twiddle factors + */ + void *ws; + + /** + * ee - 2 size x size8 + * oo - 2 x size4 in parallel + * oe - + */ + void *oe_ws, *eo_ws, *ee_ws; #else - void __attribute__((aligned(32))) *ws; - void __attribute__((aligned(32))) *oe_ws, *eo_ws, *ee_ws; + void FFTS_ALIGN(32) *ws; + void FFTS_ALIGN(32) *oe_ws, *eo_ws, *ee_ws; #endif - /** - * Pointer into an array of precomputed indexes for the input data array - */ - ptrdiff_t *is; - - /** - * Twiddle Factor Indexes - */ - size_t *ws_is; - - /** - * Size of the loops for the base cases - */ - size_t i0, i1, n_luts; - - /** - * Size fo the Transform - */ - size_t N; - void *lastlut; - /** - * Used in multidimensional Code ?? - */ - transform_index_t *transforms; - //transform_func_t transform; - - /** - * Pointer to the dynamically generated function - * that will execute the FFT - */ - void (*transform)(ffts_plan_t * , const void * , void * ); - - /** - * Pointer to the base memory address of - * of the transform function - */ - void *transform_base; - - /** - * Size of the memory block contain the - * generated code - */ - size_t transform_size; - - /** - * Points to the cosnant variables used by - * the Assembly Code - */ - void *constants; - - // multi-dimensional stuff: - struct _ffts_plan_t **plans; - int rank; - size_t *Ns, *Ms; - void *buf; - - void *transpose_buf; - - /** - * Pointer to the destroy function - * to clean up the plan after use - * (differs for real and multi dimension transforms - */ - void (*destroy)(ffts_plan_t *); - - /** - * Coefficiants for the real valued transforms - */ - float *A, *B; - - size_t i2; + + /** + * Pointer into an array of precomputed indexes for the input data array + */ + ptrdiff_t *is; + + /** + * Twiddle Factor Indexes + */ + size_t *ws_is; + + /** + * Size of the loops for the base cases + */ 
+ size_t i0, i1, n_luts; + + /** + * Size fo the Transform + */ + size_t N; + void *lastlut; + + /** + * Used in multidimensional Code ?? + */ + size_t *transforms; + + /** + * Pointer to the dynamically generated function + * that will execute the FFT + */ + transform_func_t transform; + + /** + * Pointer to the base memory address of + * of the transform function + */ + void *transform_base; + + /** + * Size of the memory block contain the + * generated code + */ + size_t transform_size; + + /** + * Points to the cosnant variables used by + * the Assembly Code + */ + void *constants; + + // multi-dimensional stuff: + struct _ffts_plan_t **plans; + int rank; + size_t *Ns, *Ms; + void *buf; + + void *transpose_buf; + + /** + * Pointer to the destroy function + * to clean up the plan after use + * (differs for real and multi dimension transforms + */ + void (*destroy)(ffts_plan_t *); + + /** + * Coefficiants for the real valued transforms + */ + float *A, *B; + + size_t i2; }; +static FFTS_INLINE void *ffts_aligned_malloc(size_t size) +{ +#if defined(_MSC_VER) + return _aligned_malloc(size, 32); +#else + return valloc(size); +#endif +} -void ffts_free(ffts_plan_t *); +static FFTS_INLINE void ffts_aligned_free(void *p) +{ +#if defined(_MSC_VER) + _aligned_free(p); +#else + free(p); +#endif +} + +#if GCC_VERSION_AT_LEAST(3,3) +#define ffts_ctzl __builtin_ctzl +#elif defined(_MSC_VER) +#include <intrin.h> +#ifdef _M_AMD64 +#pragma intrinsic(_BitScanForward64) +static __inline unsigned long ffts_ctzl(size_t N) +{ + unsigned long count; + _BitScanForward64((unsigned long*) &count, N); + return count; +} +#else +#pragma intrinsic(_BitScanForward) +static __inline unsigned long ffts_ctzl(size_t N) +{ + unsigned long count; + _BitScanForward((unsigned long*) &count, N); + return count; +} +#endif /* _WIN64 */ +#endif /* _MSC_VER */ + +static FFTS_ALWAYS_INLINE float W_re(float N, float k) +{ + return cos(-2.0 * M_PI * k / N); +} + +static FFTS_ALWAYS_INLINE float W_im(float 
N, float k) +{ + return sin(-2.0 * M_PI * k / N); +} + +void ffts_free(ffts_plan_t *); +void ffts_execute(ffts_plan_t *, const void *, void *); ffts_plan_t *ffts_init_1d(size_t N, int sign); -void ffts_execute(ffts_plan_t *, const void *, void *); + #endif -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: diff --git a/src/macros-sse.h b/src/macros-sse.h index d845734..85cd02d 100644 --- a/src/macros-sse.h +++ b/src/macros-sse.h @@ -1,10 +1,10 @@ /* - + This file is part of FFTS -- The Fastest Fourier Transform in the South - + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - + Copyright (c) 2012, The University of Waikato + All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,8 +31,8 @@ */ -#ifndef __SSE_FLOAT_H__ -#define __SSE_FLOAT_H__ +#ifndef FFTS_MACROS_SSE_H +#define FFTS_MACROS_SSE_H #include <xmmintrin.h> @@ -63,23 +63,27 @@ typedef __m128 V; #define FFTS_MALLOC(d,a) (_mm_malloc(d,a)) #define FFTS_FREE(d) (_mm_free(d)) -__INLINE V IMULI(int inv, V a) { - if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f))); - else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f))); +static FFTS_ALWAYS_INLINE V IMULI(int inv, V a) +{ + if (inv) { + return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f))); + } else { + return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f))); + } } - -__INLINE V IMUL(V d, V re, V im) { - re = VMUL(re, d); - im = VMUL(im, VSWAPPAIRS(d)); - return VSUB(re, im); +static FFTS_ALWAYS_INLINE V IMUL(V d, V re, V im) +{ + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VSUB(re, im); } -__INLINE V IMULJ(V d, V re, V im) { - re = VMUL(re, d); - im = VMUL(im, VSWAPPAIRS(d)); - return VADD(re, im); +static FFTS_ALWAYS_INLINE V IMULJ(V d, V re, V im) +{ + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VADD(re, im); } -#endif -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: +#endif /* 
FFTS_MACROS_SSE_H */ diff --git a/src/macros.h b/src/macros.h index 08029a3..12c52c6 100644 --- a/src/macros.h +++ b/src/macros.h @@ -1,38 +1,42 @@ /* - - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> - Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> +Copyright (c) 2012, 2013, Anthony M. 
Blake <amb@anthonix.com> + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __MACROS_H__ -#define __MACROS_H__ +#ifndef FFTS_MACROS_H +#define FFTS_MACROS_H + +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif #ifdef HAVE_NEON #include "macros-neon.h" @@ -44,119 +48,138 @@ #include "macros-altivec.h" #endif #endif - #endif - #ifdef HAVE_VFP #include "macros-alpha.h" #endif + #ifdef HAVE_SSE - #include "macros-sse.h" +#include "macros-sse.h" #endif -static inline void TX2(V *a, V *b) +static FFTS_INLINE void TX2(V *a, V *b) { V TX2_t0 = VUNPACKLO(*a, *b); V TX2_t1 = VUNPACKHI(*a, *b); - *a = TX2_t0; *b = TX2_t1; + *a = TX2_t0; + *b = TX2_t1; } -static inline void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3) +static FFTS_INLINE void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3) { V uk, uk2, zk_p, zk_n, zk, zk_d; - uk = *r0; uk2 = *r1; + + uk = *r0; + uk2 = *r1; + zk_p = IMUL(*r2, re, im); zk_n = IMULJ(*r3, re, im); - zk = VADD(zk_p, zk_n); + zk = VADD(zk_p, zk_n); zk_d = IMULI(inv, VSUB(zk_p, zk_n)); - + *r2 = VSUB(uk, zk); *r0 = VADD(uk, zk); *r3 = VADD(uk2, zk_d); *r1 = VSUB(uk2, zk_d); } - -static inline void S_4(V r0, V r1, V r2, V r3, - data_t * restrict o0, data_t * restrict o1, - data_t * restrict o2, data_t * restrict o3) -{ - VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3); -} - - -static inline void L_2_4(int inv, - const data_t * restrict i0, const data_t * restrict i1, - const data_t * restrict i2, const data_t * restrict i3, - V *r0, V *r1, V *r2, V *r3) +static FFTS_INLINE void L_2_4(int inv, const data_t* FFTS_RESTRICT i0, const data_t* FFTS_RESTRICT i1, + const data_t* FFTS_RESTRICT i2, const data_t* FFTS_RESTRICT i3, + V *r0, V *r1, V *r2, V *r3) { V t0, t1, t2, t3, t4, t5, t6, t7; - t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); + t0 = VLD(i0); + t1 = VLD(i1); + t2 = VLD(i2); + t3 = VLD(i3); + t4 = VADD(t0, t1); t5 = VSUB(t0, t1); t6 = VADD(t2, t3); t7 = VSUB(t2, t3); + *r0 = VUNPACKLO(t4, t5); *r1 = VUNPACKLO(t6, t7); + t5 = IMULI(inv, t5); + t0 = VADD(t6, t4); t2 = 
VSUB(t6, t4); t1 = VSUB(t7, t5); t3 = VADD(t7, t5); + *r3 = VUNPACKHI(t0, t1); *r2 = VUNPACKHI(t2, t3); } - -static inline void L_4_4(int inv, - const data_t * restrict i0, const data_t * restrict i1, - const data_t * restrict i2, const data_t * restrict i3, - V *r0, V *r1, V *r2, V *r3) +static FFTS_INLINE void L_4_4(int inv, const data_t* FFTS_RESTRICT i0, const data_t* FFTS_RESTRICT i1, + const data_t* FFTS_RESTRICT i2, const data_t* FFTS_RESTRICT i3, + V *r0, V *r1, V *r2, V *r3) { V t0, t1, t2, t3, t4, t5, t6, t7; - - t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); + + t0 = VLD(i0); + t1 = VLD(i1); + t2 = VLD(i2); + t3 = VLD(i3); + t4 = VADD(t0, t1); t5 = VSUB(t0, t1); t6 = VADD(t2, t3); + t7 = IMULI(inv, VSUB(t2, t3)); + t0 = VADD(t4, t6); t2 = VSUB(t4, t6); t1 = VSUB(t5, t7); t3 = VADD(t5, t7); + TX2(&t0, &t1); TX2(&t2, &t3); - *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3; -} - + *r0 = t0; + *r2 = t1; + *r1 = t2; + *r3 = t3; +} -static inline void L_4_2(int inv, - const data_t * restrict i0, const data_t * restrict i1, - const data_t * restrict i2, const data_t * restrict i3, - V *r0, V *r1, V *r2, V *r3) +static FFTS_INLINE void L_4_2(int inv, const data_t * FFTS_RESTRICT i0, const data_t * FFTS_RESTRICT i1, + const data_t * FFTS_RESTRICT i2, const data_t * FFTS_RESTRICT i3, + V *r0, V *r1, V *r2, V *r3) { V t0, t1, t2, t3, t4, t5, t6, t7; - t0 = VLD(i0); t1 = VLD(i1); t6 = VLD(i2); t7 = VLD(i3); + t0 = VLD(i0); + t1 = VLD(i1); + t6 = VLD(i2); + t7 = VLD(i3); + t2 = VBLEND(t6, t7); t3 = VBLEND(t7, t6); + t4 = VADD(t0, t1); t5 = VSUB(t0, t1); t6 = VADD(t2, t3); t7 = VSUB(t2, t3); + *r2 = VUNPACKHI(t4, t5); - *r3 = VUNPACKHI(t6, t7); + *r3 = VUNPACKHI(t6, t7); + t7 = IMULI(inv, t7); + t0 = VADD(t4, t6); t2 = VSUB(t4, t6); t1 = VSUB(t5, t7); t3 = VADD(t5, t7); + *r0 = VUNPACKLO(t0, t1); *r1 = VUNPACKLO(t2, t3); } -#endif -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: + +#define S_4(r0, r1, r2, r3, o0, o1, o2, o3) \ + VST(o0, r0); VST(o1, r1); 
VST(o2, r2); VST(o3, r3); + +#endif /* FFTS_MACROS_H */
\ No newline at end of file @@ -54,6 +54,7 @@ _leaf_ee_init: leaf_ee_init: #endif #lea L_sse_constants(%rip), %r9 + movq (%rdi), %r8 movq 0xe0(%rdi), %r9 xorl %eax, %eax @@ -559,9 +560,9 @@ x8_soft: leaq (%r14,%rcx,4), %r15 X8_soft_loop: movaps (%rsi), %xmm9 - movaps (%r10,%rax,4), %xmm6 + movaps (%r10,%rax,4), %xmm6 movaps %xmm9, %xmm11 - movaps (%r11,%rax,4), %xmm7 + movaps (%r11,%rax,4), %xmm7 movaps 16(%rsi), %xmm8 mulps %xmm6, %xmm11 mulps %xmm7, %xmm9 @@ -647,6 +648,14 @@ X8_soft_loop: ret #ifdef __APPLE__ + .globl _x8_soft_end +_x8_soft_end: +#else + .globl x8_soft_end +x8_soft_end: +#endif + +#ifdef __APPLE__ .globl _x8_hard _x8_hard: #else diff --git a/src/sse_win64.s b/src/sse_win64.s new file mode 100644 index 0000000..6b75391 --- /dev/null +++ b/src/sse_win64.s @@ -0,0 +1,840 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. 
BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + .code64 + + .globl _neon_x4 + .align 4 +_neon_x4: + + .globl _neon_x8 + .align 4 +_neon_x8: + + .globl _neon_x8_t + .align 4 +_neon_x8_t: + +#ifdef __APPLE__ + .globl _leaf_ee_init +_leaf_ee_init: +#else + .globl leaf_ee_init +leaf_ee_init: +#endif + +# rcx is a pointer to the ffts_plan +# eax is loop counter (init to 0) +# rbx is loop max count +# rdx is 'in' base pointer +# r8 is 'out' base pointer +# rdi is offsets pointer +# rsi is constants pointer +# scratch: rax r10 r11 + + xorl %eax, %eax + movq (%rcx), %rdi + movq 0xe0(%rcx), %rsi + +# _leaf_ee + 8 needs 16 byte alignment +#ifdef __APPLE__ + .globl _leaf_ee +_leaf_ee: +#else + .globl leaf_ee +leaf_ee: +#endif + movaps 32(%rsi), %xmm0 #83.5 + movaps (%rsi), %xmm8 #83.5 +LEAF_EE_1: +LEAF_EE_const_0: + movaps 0xFECA(%rdx,%rax,4), %xmm7 #83.5 +LEAF_EE_const_2: + movaps 0xFECA(%rdx,%rax,4), %xmm12 #83.5 + movaps %xmm7, %xmm6 #83.5 +LEAF_EE_const_3: + movaps 0xFECA(%rdx,%rax,4), %xmm10 #83.5 + movaps %xmm12, %xmm11 #83.5 + subps %xmm10, %xmm12 #83.5 + addps %xmm10, %xmm11 #83.5 + xorps %xmm8, %xmm12 #83.5 +LEAF_EE_const_1: + movaps 0xFECA(%rdx,%rax,4), %xmm9 #83.5 +LEAF_EE_const_4: + movaps 0xFECA(%rdx,%rax,4), %xmm10 #83.5 + addps %xmm9, %xmm6 #83.5 + subps %xmm9, %xmm7 #83.5 +LEAF_EE_const_5: + movaps 0xFECA(%rdx,%rax,4), %xmm13 #83.5 + movaps %xmm10, %xmm9 #83.5 +LEAF_EE_const_6: + movaps 0xFECA(%rdx,%rax,4), %xmm3 #83.5 + movaps %xmm6, %xmm5 #83.5 +LEAF_EE_const_7: + movaps 0xFECA(%rdx,%rax,4), %xmm14 #83.5 + movaps %xmm3, %xmm15 
#83.5 + shufps $177, %xmm12, %xmm12 #83.5 + movaps %xmm7, %xmm4 #83.5 + movslq (%rdi, %rax, 4), %r10 #83.44 + subps %xmm13, %xmm10 #83.5 + subps %xmm14, %xmm3 #83.5 + addps %xmm11, %xmm5 #83.5 + subps %xmm11, %xmm6 #83.5 + subps %xmm12, %xmm4 #83.5 + addps %xmm12, %xmm7 #83.5 + addps %xmm13, %xmm9 #83.5 + addps %xmm14, %xmm15 #83.5 + movaps 16(%rsi), %xmm12 #83.5 + movaps %xmm9, %xmm1 #83.5 + movaps 16(%rsi), %xmm11 #83.5 + movaps %xmm5, %xmm2 #83.5 + mulps %xmm10, %xmm12 #83.5 + subps %xmm15, %xmm9 #83.5 + addps %xmm15, %xmm1 #83.5 + mulps %xmm3, %xmm11 #83.5 + addps %xmm1, %xmm2 #83.5 + subps %xmm1, %xmm5 #83.5 + shufps $177, %xmm10, %xmm10 #83.5 + xorps %xmm8, %xmm9 #83.5 + shufps $177, %xmm3, %xmm3 #83.5 + movaps %xmm6, %xmm1 #83.5 + mulps %xmm0, %xmm10 #83.5 + movaps %xmm4, %xmm13 #83.5 + mulps %xmm0, %xmm3 #83.5 + subps %xmm10, %xmm12 #83.5 + addps %xmm3, %xmm11 #83.5 + movaps %xmm12, %xmm3 #83.5 + movaps %xmm7, %xmm14 #83.5 + shufps $177, %xmm9, %xmm9 #83.5 + subps %xmm11, %xmm12 #83.5 + addps %xmm11, %xmm3 #83.5 + subps %xmm9, %xmm1 #83.5 + addps %xmm9, %xmm6 #83.5 + addps %xmm3, %xmm4 #83.5 + subps %xmm3, %xmm13 #83.5 + xorps %xmm8, %xmm12 #83.5 + movaps %xmm2, %xmm3 #83.5 + shufps $177, %xmm12, %xmm12 #83.5 + movaps %xmm6, %xmm9 #83.5 + movslq 8(%rdi, %rax, 4), %r11 #83.59 + movlhps %xmm4, %xmm3 #83.5 + addq $4, %rax + shufps $238, %xmm4, %xmm2 #83.5 + movaps %xmm1, %xmm4 #83.5 + subps %xmm12, %xmm7 #83.5 + addps %xmm12, %xmm14 #83.5 + movlhps %xmm7, %xmm4 #83.5 + shufps $238, %xmm7, %xmm1 #83.5 + movaps %xmm5, %xmm7 #83.5 + movlhps %xmm13, %xmm7 #83.5 + movlhps %xmm14, %xmm9 #83.5 + shufps $238, %xmm13, %xmm5 #83.5 + shufps $238, %xmm14, %xmm6 #83.5 + movaps %xmm3, (%r8,%r10,4) #83.5 + movaps %xmm4, 16(%r8,%r10,4) #83.5 + movaps %xmm7, 32(%r8,%r10,4) #83.5 + movaps %xmm9, 48(%r8,%r10,4) #83.5 + movaps %xmm2, (%r8,%r11,4) #83.5 + movaps %xmm1, 16(%r8,%r11,4) #83.5 + movaps %xmm5, 32(%r8,%r11,4) #83.5 + movaps %xmm6, 48(%r8,%r11,4) #83.5 + cmpq %rbx, %rax 
+ jne LEAF_EE_1 + +# _leaf_oo + 3 needs to be 16 byte aligned +#ifdef __APPLE__ + .globl _leaf_oo +_leaf_oo: +#else + .globl leaf_oo +leaf_oo: +#endif + movaps (%rsi), %xmm5 #92.7 +LEAF_OO_1: +LEAF_OO_const_0: + movaps 0xFECA(%rdx,%rax,4), %xmm4 #93.5 + movaps %xmm4, %xmm6 #93.5 +LEAF_OO_const_1: + movaps 0xFECA(%rdx,%rax,4), %xmm7 #93.5 +LEAF_OO_const_2: + movaps 0xFECA(%rdx,%rax,4), %xmm10 #93.5 + addps %xmm7, %xmm6 #93.5 + subps %xmm7, %xmm4 #93.5 +LEAF_OO_const_3: + movaps 0xFECA(%rdx,%rax,4), %xmm8 #93.5 + movaps %xmm10, %xmm9 #93.5 +LEAF_OO_const_4: + movaps 0xFECA(%rdx,%rax,4), %xmm1 #93.5 + movaps %xmm6, %xmm3 #93.5 +LEAF_OO_const_5: + movaps 0xFECA(%rdx,%rax,4), %xmm11 #93.5 + movaps %xmm1, %xmm2 #93.5 +LEAF_OO_const_6: + movaps 0xFECA(%rdx,%rax,4), %xmm14 #93.5 + movaps %xmm4, %xmm15 #93.5 +LEAF_OO_const_7: + movaps 0xFECA(%rdx,%rax,4), %xmm12 #93.5 + movaps %xmm14, %xmm13 #93.5 + movslq (%rdi, %rax, 4), %r10 #83.44 + subps %xmm8, %xmm10 #93.5 + addps %xmm8, %xmm9 #93.5 + addps %xmm11, %xmm2 #93.5 + subps %xmm12, %xmm14 #93.5 + subps %xmm11, %xmm1 #93.5 + addps %xmm12, %xmm13 #93.5 + addps %xmm9, %xmm3 #93.5 + subps %xmm9, %xmm6 #93.5 + xorps %xmm5, %xmm10 #93.5 + xorps %xmm5, %xmm14 #93.5 + shufps $177, %xmm10, %xmm10 #93.5 + movaps %xmm2, %xmm9 #93.5 + shufps $177, %xmm14, %xmm14 #93.5 + movaps %xmm6, %xmm7 #93.5 + movslq 8(%rdi, %rax, 4), %r11 #83.59 + addq $4, %rax #92.18 + addps %xmm10, %xmm4 #93.5 + addps %xmm13, %xmm9 #93.5 + subps %xmm13, %xmm2 #93.5 + subps %xmm10, %xmm15 #93.5 + movaps %xmm1, %xmm13 #93.5 + movaps %xmm2, %xmm8 #93.5 + movlhps %xmm4, %xmm7 #93.5 + subps %xmm14, %xmm13 #93.5 + addps %xmm14, %xmm1 #93.5 + shufps $238, %xmm4, %xmm6 #93.5 + movaps %xmm3, %xmm14 #93.5 + movaps %xmm9, %xmm4 #93.5 + movlhps %xmm15, %xmm14 #93.5 + movlhps %xmm13, %xmm4 #93.5 + movlhps %xmm1, %xmm8 #93.5 + shufps $238, %xmm15, %xmm3 #93.5 + shufps $238, %xmm13, %xmm9 #93.5 + shufps $238, %xmm1, %xmm2 #93.5 + movaps %xmm14, (%r8,%r10,4) #93.5 + movaps 
%xmm7, 16(%r8,%r10,4) #93.5
    movaps %xmm4, 32(%r8,%r10,4) #93.5
    movaps %xmm8, 48(%r8,%r10,4) #93.5
    movaps %xmm3, (%r8,%r11,4) #93.5
    movaps %xmm6, 16(%r8,%r11,4) #93.5
    movaps %xmm9, 32(%r8,%r11,4) #93.5
    movaps %xmm2, 48(%r8,%r11,4) #93.5
    cmpq %rbx, %rax
    jne LEAF_OO_1 # Prob 95% #92.14

 # leaf_eo: odd-size leaf kernel. The 0xFECA displacements are placeholders;
 # NOTE(review): presumably patched at runtime by the code generator using the
 # sse_leaf_eo_offsets table below — confirm against codegen_sse.h. Do not
 # change any instruction here: the table encodes exact byte offsets of the
 # disp32 fields within these movaps instructions.
#ifdef __APPLE__
    .globl _leaf_eo
_leaf_eo:
#else
    .globl leaf_eo
leaf_eo:
#endif
LEAF_EO_const_0:
    movaps 0xFECA(%rdx,%rax,4), %xmm9 #88.5
LEAF_EO_const_2:
    movaps 0xFECA(%rdx,%rax,4), %xmm7 #88.5
    movaps %xmm9, %xmm11 #88.5
LEAF_EO_const_3:
    movaps 0xFECA(%rdx,%rax,4), %xmm5 #88.5
    movaps %xmm7, %xmm6 #88.5
LEAF_EO_const_1:
    movaps 0xFECA(%rdx,%rax,4), %xmm4 #88.5
    subps %xmm5, %xmm7 #88.5
    addps %xmm4, %xmm11 #88.5
    subps %xmm4, %xmm9 #88.5
    addps %xmm5, %xmm6 #88.5
    movaps (%rsi), %xmm3 #88.5
    movaps %xmm11, %xmm10 #88.5
    xorps %xmm3, %xmm7 #88.5
    movaps %xmm9, %xmm8 #88.5
    shufps $177, %xmm7, %xmm7 #88.5
    addps %xmm6, %xmm10 #88.5
    subps %xmm6, %xmm11 #88.5
    subps %xmm7, %xmm8 #88.5
    addps %xmm7, %xmm9 #88.5
    movslq 8(%rdi, %rax, 4), %r11 #83.59
    movaps %xmm10, %xmm2 #88.5
    movslq (%rdi, %rax, 4), %r10 #83.44
    movaps %xmm11, %xmm1 #88.5
    shufps $238, %xmm8, %xmm10 #88.5
    shufps $238, %xmm9, %xmm11 #88.5
    movaps %xmm10, (%r8,%r11,4) #88.5
    movaps %xmm11, 16(%r8,%r11,4) #88.5
LEAF_EO_const_4:
    movaps 0xFECA(%rdx,%rax,4), %xmm15 #88.5
LEAF_EO_const_5:
    movaps 0xFECA(%rdx,%rax,4), %xmm12 #88.5
    movaps %xmm15, %xmm14 #88.5
LEAF_EO_const_6:
    movaps 0xFECA(%rdx,%rax,4), %xmm4 #88.5
    addps %xmm12, %xmm14 #88.5
    subps %xmm12, %xmm15 #88.5
LEAF_EO_const_7:
    movaps 0xFECA(%rdx,%rax,4), %xmm13 #88.5
    movaps %xmm4, %xmm5 #88.5
    movaps %xmm14, %xmm7 #88.5
    addps %xmm13, %xmm5 #88.5
    subps %xmm13, %xmm4 #88.5
    movlhps %xmm8, %xmm2 #88.5
    movaps %xmm5, %xmm8 #88.5
    movlhps %xmm15, %xmm7 #88.5
    xorps %xmm3, %xmm15 #88.5
    movaps %xmm5, %xmm6 #88.5
    subps %xmm14, %xmm5 #88.5
    addps %xmm14, %xmm6 #88.5
    movlhps %xmm9, %xmm1 #88.5
    movaps %xmm4, %xmm14 #88.5
    movlhps %xmm4, %xmm8 #88.5
    movaps %xmm1, %xmm12 #88.5
    shufps $177, %xmm15, %xmm15 #88.5
    movaps 0x30(%rsi), %xmm11 #88.5
    addq $4, %rax #90.5
    subps %xmm15, %xmm14 #88.5
    mulps %xmm7, %xmm11 #88.5
    addps %xmm15, %xmm4 #88.5
    movaps 0x30(%rsi), %xmm9 #88.5
    movaps 0x40(%rsi), %xmm15 #88.5
    shufps $177, %xmm7, %xmm7 #88.5
    mulps %xmm8, %xmm9 #88.5
    mulps %xmm15, %xmm7 #88.5
    shufps $177, %xmm8, %xmm8 #88.5
    subps %xmm7, %xmm11 #88.5
    mulps %xmm15, %xmm8 #88.5
    movaps %xmm11, %xmm10 #88.5
    addps %xmm8, %xmm9 #88.5
    shufps $238, %xmm14, %xmm6 #88.5
    subps %xmm9, %xmm11 #88.5
    addps %xmm9, %xmm10 #88.5
    xorps %xmm3, %xmm11 #88.5
    movaps %xmm2, %xmm3 #88.5
    shufps $177, %xmm11, %xmm11 #88.5
    subps %xmm10, %xmm3 #88.5
    addps %xmm10, %xmm2 #88.5
    addps %xmm11, %xmm12 #88.5
    subps %xmm11, %xmm1 #88.5
    shufps $238, %xmm4, %xmm5 #88.5
    movaps %xmm5, 48(%r8,%r11,4) #88.5
    movaps %xmm6, 32(%r8,%r11,4) #88.5
    movaps %xmm2, (%r8,%r10,4) #88.5
    movaps %xmm1, 16(%r8,%r10,4) #88.5
    movaps %xmm3, 32(%r8,%r10,4) #88.5
    movaps %xmm12, 48(%r8,%r10,4) #88.5

 # leaf_oe: companion leaf kernel; same placeholder-patching scheme as leaf_eo
 # (byte offsets published in sse_leaf_oe_offsets below).
#ifdef __APPLE__
    .globl _leaf_oe
_leaf_oe:
#else
    .globl leaf_oe
leaf_oe:
#endif
    movaps (%rsi), %xmm0 #59.5
LEAF_OE_const_2:
    movaps 0xFECA(%rdx,%rax,4), %xmm6 #70.5
LEAF_OE_const_3:
    movaps 0xFECA(%rdx,%rax,4), %xmm8 #70.5
    movaps %xmm6, %xmm10 #70.5
    shufps $228, %xmm8, %xmm10 #70.5
    movaps %xmm10, %xmm9 #70.5
    shufps $228, %xmm6, %xmm8 #70.5
LEAF_OE_const_0:
    movaps 0xFECA(%rdx,%rax,4), %xmm12 #70.5
LEAF_OE_const_1:
    movaps 0xFECA(%rdx,%rax,4), %xmm7 #70.5
    movaps %xmm12, %xmm14 #70.5
    movslq (%rdi, %rax, 4), %r10 #83.44
    addps %xmm8, %xmm9 #70.5
    subps %xmm8, %xmm10 #70.5
    addps %xmm7, %xmm14 #70.5
    subps %xmm7, %xmm12 #70.5
    movaps %xmm9, %xmm4 #70.5
    movaps %xmm14, %xmm13 #70.5
    shufps $238, %xmm10, %xmm4 #70.5
    xorps %xmm0, %xmm10 #70.5
    shufps $177, %xmm10, %xmm10 #70.5
    movaps %xmm12, %xmm11 #70.5
    movaps %xmm14, %xmm5 #70.5
    addps %xmm9, %xmm13 #70.5
    subps %xmm10, %xmm11 #70.5
    subps %xmm9, %xmm14 #70.5
    shufps $238, %xmm12, %xmm5 #70.5
    addps %xmm10, %xmm12 #70.5
    movslq 8(%rdi, %rax, 4), %r11 #83.59
    movlhps %xmm11, %xmm13 #70.5
    movaps %xmm13, (%r8,%r10,4) #70.5
    movaps 0x30(%rsi), %xmm13 #70.5
    movlhps %xmm12, %xmm14 #70.5
    movaps 0x40(%rsi), %xmm12 #70.5
    mulps %xmm5, %xmm13 #70.5
    shufps $177, %xmm5, %xmm5 #70.5
    mulps %xmm12, %xmm5 #70.5
    movaps %xmm14, 16(%r8,%r10,4) #70.5
    subps %xmm5, %xmm13 #70.5
    movaps 0x30(%rsi), %xmm5 #70.5
    mulps %xmm4, %xmm5 #70.5
    shufps $177, %xmm4, %xmm4 #70.5
    mulps %xmm12, %xmm4 #70.5
LEAF_OE_const_4:
    movaps 0xFECA(%rdx,%rax,4), %xmm9 #70.5
    addps %xmm4, %xmm5 #70.5
LEAF_OE_const_6:
    movaps 0xFECA(%rdx,%rax,4), %xmm7 #70.5
    movaps %xmm9, %xmm3 #70.5
LEAF_OE_const_7:
    movaps 0xFECA(%rdx,%rax,4), %xmm2 #70.5
    movaps %xmm7, %xmm6 #70.5
LEAF_OE_const_5:
    movaps 0xFECA(%rdx,%rax,4), %xmm15 #70.5
    movaps %xmm13, %xmm4 #70.5
    subps %xmm2, %xmm7 #70.5
    addps %xmm15, %xmm3 #70.5
    subps %xmm15, %xmm9 #70.5
    addps %xmm2, %xmm6 #70.5
    subps %xmm5, %xmm13 #70.5
    addps %xmm5, %xmm4 #70.5
    xorps %xmm0, %xmm7 #70.5
    addq $4, %rax #72.5
    movaps %xmm3, %xmm2 #70.5
    shufps $177, %xmm7, %xmm7 #70.5
    movaps %xmm9, %xmm8 #70.5
    xorps %xmm0, %xmm13 #70.5
    addps %xmm6, %xmm2 #70.5
    subps %xmm7, %xmm8 #70.5
    subps %xmm6, %xmm3 #70.5
    addps %xmm7, %xmm9 #70.5
    movaps %xmm2, %xmm10 #70.5
    movaps %xmm3, %xmm11 #70.5
    shufps $238, %xmm8, %xmm2 #70.5
    shufps $238, %xmm9, %xmm3 #70.5
    movaps %xmm2, %xmm14 #70.5
    shufps $177, %xmm13, %xmm13 #70.5
    subps %xmm4, %xmm14 #70.5
    addps %xmm4, %xmm2 #70.5
    movaps %xmm3, %xmm4 #70.5
    subps %xmm13, %xmm3 #70.5
    addps %xmm13, %xmm4 #70.5
    movlhps %xmm8, %xmm10 #70.5
    movlhps %xmm9, %xmm11 #70.5
    movaps %xmm10, 32(%r8,%r10,4) #70.5
    movaps %xmm11, 48(%r8,%r10,4) #70.5
    movaps %xmm2, (%r8,%r11,4) #70.5
    movaps %xmm3, 16(%r8,%r11,4) #70.5
    movaps %xmm14, 32(%r8,%r11,4) #70.5
    movaps %xmm4, 48(%r8,%r11,4) #70.5

 # leaf_end: marker label only (no code) — used to measure the extent of the
 # leaf kernels when copying them into generated code.
#ifdef __APPLE__
    .globl _leaf_end
_leaf_end:
#else
    .globl leaf_end
leaf_end:
#endif

 # x_init: loads the sign-mask constant block (%rsi) and the twiddle-table
 # pointer from 0x20(%rcx), then falls through into x4.
#ifdef __APPLE__
    .globl _x_init
_x_init:
#else
    .globl x_init
x_init:
#endif
    movaps (%rsi), %xmm3 #34.3
    movq 0x20(%rcx), %rdi
 # x4: in-place radix-4 butterfly over a 128-byte block at (%r8), using two
 # twiddle pairs at (%rdi)..48(%rdi).
#ifdef __APPLE__
    .globl _x4
_x4:
#else
    .globl x4
x4:
#endif
    movaps 64(%r8), %xmm0 #34.3
    movaps 96(%r8), %xmm1 #34.3
    movaps (%r8), %xmm7 #34.3
    movaps (%rdi), %xmm4 #const
    movaps %xmm7, %xmm9 #34.3
    movaps %xmm4, %xmm6 #34.3
    movaps 16(%rdi), %xmm2 #const
    mulps %xmm0, %xmm6 #34.3
    mulps %xmm1, %xmm4 #34.3
    shufps $177, %xmm0, %xmm0 #34.3
    shufps $177, %xmm1, %xmm1 #34.3
    mulps %xmm2, %xmm0 #34.3
    mulps %xmm1, %xmm2 #34.3
    subps %xmm0, %xmm6 #34.3
    addps %xmm2, %xmm4 #34.3
    movaps %xmm6, %xmm5 #34.3
    subps %xmm4, %xmm6 #34.3
    addps %xmm4, %xmm5 #34.3
    movaps 32(%r8), %xmm8 #34.3
    xorps %xmm3, %xmm6 #34.3
    shufps $177, %xmm6, %xmm6 #34.3
    movaps %xmm8, %xmm10 #34.3
    movaps 112(%r8), %xmm12 #34.3
    subps %xmm5, %xmm9 #34.3
    addps %xmm5, %xmm7 #34.3
    addps %xmm6, %xmm10 #34.3
    subps %xmm6, %xmm8 #34.3
    movaps %xmm7, (%r8) #34.3
    movaps %xmm8, 32(%r8) #34.3
    movaps %xmm9, 64(%r8) #34.3
    movaps %xmm10, 96(%r8) #34.3
    movaps 32(%rdi), %xmm14 #const #34.3
    movaps 80(%r8), %xmm11 #34.3
    movaps %xmm14, %xmm0 #34.3
    movaps 48(%rdi), %xmm13 #const #34.3
    mulps %xmm11, %xmm0 #34.3
    mulps %xmm12, %xmm14 #34.3
    shufps $177, %xmm11, %xmm11 #34.3
    shufps $177, %xmm12, %xmm12 #34.3
    mulps %xmm13, %xmm11 #34.3
    mulps %xmm12, %xmm13 #34.3
    subps %xmm11, %xmm0 #34.3
    addps %xmm13, %xmm14 #34.3
    movaps %xmm0, %xmm15 #34.3
    subps %xmm14, %xmm0 #34.3
    addps %xmm14, %xmm15 #34.3
    xorps %xmm3, %xmm0 #34.3
    movaps 16(%r8), %xmm1 #34.3
    movaps 48(%r8), %xmm2 #34.3
    movaps %xmm1, %xmm4 #34.3
    shufps $177, %xmm0, %xmm0 #34.3
    movaps %xmm2, %xmm5 #34.3
    addps %xmm15, %xmm1 #34.3
    subps %xmm0, %xmm2 #34.3
    subps %xmm15, %xmm4 #34.3
    addps %xmm0, %xmm5 #34.3
    movaps %xmm1, 16(%r8) #34.3
    movaps %xmm2, 48(%r8) #34.3
    movaps %xmm4, 80(%r8) #34.3
    movaps %xmm5, 112(%r8) #34.3
    ret

 # x8_soft: radix-8 pass over 8 strided output rows, one 16-byte column per
 # iteration. Input: %rdi = twiddles, %r8 = output base, %rbx = output stride.
# _x8_soft + 6 needs to be 16 byte aligned
#ifdef __APPLE__
    .globl _x8_soft
_x8_soft:
#else
    .globl x8_soft
x8_soft:
#endif
    # rax, rcx, rdx, r8, r10, r11 (r9 not used)
    # rbx, rdi, rsi

    # input
    movq %rdi, %rax

    # output
    movq %r8, %rcx

    # loop stop (output + output_stride)
    leaq (%r8, %rbx), %rdx

    # 3 * output_stride
    leaq (%rbx, %rbx, 2), %rsi

    # 5 * output_stride
    leaq (%rbx, %rbx, 4), %r10

    # 7 * output_stride
    leaq (%rsi, %rbx, 4), %r11

X8_soft_loop:
    # input + 0 * input_stride
    movaps (%rax), %xmm9

    # output + 2 * output_stride
    movaps (%rcx, %rbx, 2), %xmm6

    movaps %xmm9, %xmm11

    # output + 3 * output_stride
    movaps (%rcx, %rsi), %xmm7

    # input + 1 * input_stride
    movaps 16(%rax), %xmm8

    mulps %xmm6, %xmm11
    mulps %xmm7, %xmm9
    shufps $177, %xmm6, %xmm6
    mulps %xmm8, %xmm6
    shufps $177, %xmm7, %xmm7
    subps %xmm6, %xmm11
    mulps %xmm7, %xmm8
    movaps %xmm11, %xmm10
    addps %xmm8, %xmm9

    # input + 2 * input_stride
    movaps 32(%rax), %xmm15

    addps %xmm9, %xmm10
    subps %xmm9, %xmm11

    # output + 0 * output_stride
    movaps (%rcx), %xmm5

    movaps %xmm15, %xmm6

    # output + 4 * output_stride
    movaps (%rcx, %rbx, 4), %xmm12

    movaps %xmm5, %xmm2

    # output + 6 * output_stride
    movaps (%rcx, %rsi, 2), %xmm13

    xorps %xmm3, %xmm11 #const

    # input + 3 * input_stride
    movaps 48(%rax), %xmm14

    subps %xmm10, %xmm2
    mulps %xmm12, %xmm6
    addps %xmm10, %xmm5
    mulps %xmm13, %xmm15

    # input + 4 * input_stride
    movaps 64(%rax), %xmm10

    movaps %xmm5, %xmm0
    shufps $177, %xmm12, %xmm12
    shufps $177, %xmm13, %xmm13
    mulps %xmm14, %xmm12
    mulps %xmm13, %xmm14
    subps %xmm12, %xmm6
    addps %xmm14, %xmm15

    # output + 5 * output_stride
    movaps (%rcx, %r10), %xmm7

    movaps %xmm10, %xmm13

    # output + 7 * output_stride
    movaps (%rcx, %r11), %xmm8

    movaps %xmm6, %xmm12

    # input + 5 * input_stride
    movaps 80(%rax), %xmm9

    # input + 6 * input_stride
    addq $96, %rax

    mulps %xmm7, %xmm13
    subps %xmm15, %xmm6
    addps %xmm15, %xmm12
    mulps %xmm8, %xmm10
    subps %xmm12, %xmm0
    addps %xmm12, %xmm5
    shufps $177, %xmm7, %xmm7
    xorps %xmm3, %xmm6 #const
    shufps $177, %xmm8, %xmm8
    movaps %xmm2, %xmm12
    mulps %xmm9, %xmm7
    mulps %xmm8, %xmm9
    subps %xmm7, %xmm13
    addps %xmm9, %xmm10

    # output + 1 * output_stride
    movaps (%rcx, %rbx), %xmm4

    shufps $177, %xmm11, %xmm11
    movaps %xmm4, %xmm1
    shufps $177, %xmm6, %xmm6
    addps %xmm11, %xmm1
    subps %xmm11, %xmm4
    addps %xmm6, %xmm12
    subps %xmm6, %xmm2
    movaps %xmm13, %xmm11
    movaps %xmm4, %xmm14
    movaps %xmm1, %xmm6
    subps %xmm10, %xmm13
    addps %xmm10, %xmm11
    xorps %xmm3, %xmm13 #const
    addps %xmm11, %xmm4
    subps %xmm11, %xmm14
    shufps $177, %xmm13, %xmm13

    # output + 0 * output_stride
    movaps %xmm5, (%rcx)

    # output + 1 * output_stride
    movaps %xmm4, (%rcx, %rbx)

    # output + 2 * output_stride
    movaps %xmm2, (%rcx, %rbx, 2)

    subps %xmm13, %xmm1
    addps %xmm13, %xmm6

    # output + 3 * output_stride
    movaps %xmm1, (%rcx, %rsi)

    # output + 4 * output_stride
    movaps %xmm0, (%rcx, %rbx, 4)

    # output + 5 * output_stride
    movaps %xmm14, (%rcx, %r10)

    # output + 6 * output_stride
    movaps %xmm12, (%rcx, %rsi, 2)

    # output + 7 * output_stride
    movaps %xmm6, (%rcx, %r11)

    # output + 8 * output_stride
    addq $16, %rcx

    cmpq %rdx, %rcx
    jne X8_soft_loop
    ret

 # x8_soft_end: marker label delimiting x8_soft for the code generator.
#ifdef __APPLE__
    .globl _x8_soft_end
_x8_soft_end:
#else
    .globl x8_soft_end
x8_soft_end:
 # FIX(review): this #endif was missing. Without it the following
 # "#ifdef __APPLE__" nests inside this block's #else branch, so on Apple the
 # whole offsets/constants tail below is skipped and the conditional is left
 # unterminated at end of file.
#endif

 # Patch-offset tables: each entry is the byte offset, within the kernel, of
 # the 0xFECA placeholder displacement of the correspondingly-labelled movaps.
 # NOTE(review): the +0x4 / +0x5 appear to account for 4- vs 5-byte instruction
 # prefixes before the disp32 field — confirm against codegen_sse.h.
#ifdef __APPLE__
    .globl _sse_leaf_ee_offsets
    .globl _sse_leaf_oo_offsets
    .globl _sse_leaf_eo_offsets
    .globl _sse_leaf_oe_offsets
    .align 4
_sse_leaf_ee_offsets:
    .long LEAF_EE_const_0-_leaf_ee+0x4
    .long LEAF_EE_const_1-_leaf_ee+0x5
    .long LEAF_EE_const_2-_leaf_ee+0x5
    .long LEAF_EE_const_3-_leaf_ee+0x5
    .long LEAF_EE_const_4-_leaf_ee+0x5
    .long LEAF_EE_const_5-_leaf_ee+0x5
    .long LEAF_EE_const_6-_leaf_ee+0x4
    .long LEAF_EE_const_7-_leaf_ee+0x5
_sse_leaf_oo_offsets:
    .long LEAF_OO_const_0-_leaf_oo+0x4
    .long LEAF_OO_const_1-_leaf_oo+0x4
    .long LEAF_OO_const_2-_leaf_oo+0x5
    .long LEAF_OO_const_3-_leaf_oo+0x5
    .long LEAF_OO_const_4-_leaf_oo+0x4
    .long LEAF_OO_const_5-_leaf_oo+0x5
    .long LEAF_OO_const_6-_leaf_oo+0x5
    .long LEAF_OO_const_7-_leaf_oo+0x5
_sse_leaf_eo_offsets:
    .long LEAF_EO_const_0-_leaf_eo+0x5
    .long LEAF_EO_const_1-_leaf_eo+0x4
    .long LEAF_EO_const_2-_leaf_eo+0x4
    .long LEAF_EO_const_3-_leaf_eo+0x4
    .long LEAF_EO_const_4-_leaf_eo+0x5
    .long LEAF_EO_const_5-_leaf_eo+0x5
    .long LEAF_EO_const_6-_leaf_eo+0x4
    .long LEAF_EO_const_7-_leaf_eo+0x5
_sse_leaf_oe_offsets:
    .long LEAF_OE_const_0-_leaf_oe+0x5
    .long LEAF_OE_const_1-_leaf_oe+0x4
    .long LEAF_OE_const_2-_leaf_oe+0x4
    .long LEAF_OE_const_3-_leaf_oe+0x5
    .long LEAF_OE_const_4-_leaf_oe+0x5
    .long LEAF_OE_const_5-_leaf_oe+0x5
    .long LEAF_OE_const_6-_leaf_oe+0x4
    .long LEAF_OE_const_7-_leaf_oe+0x4
#else
    .globl sse_leaf_ee_offsets
    .globl sse_leaf_oo_offsets
    .globl sse_leaf_eo_offsets
    .globl sse_leaf_oe_offsets
    .align 4
sse_leaf_ee_offsets:
    .long LEAF_EE_const_0-leaf_ee+0x4
    .long LEAF_EE_const_1-leaf_ee+0x5
    .long LEAF_EE_const_2-leaf_ee+0x5
    .long LEAF_EE_const_3-leaf_ee+0x5
    .long LEAF_EE_const_4-leaf_ee+0x5
    .long LEAF_EE_const_5-leaf_ee+0x5
    .long LEAF_EE_const_6-leaf_ee+0x4
    .long LEAF_EE_const_7-leaf_ee+0x5
sse_leaf_oo_offsets:
    .long LEAF_OO_const_0-leaf_oo+0x4
    .long LEAF_OO_const_1-leaf_oo+0x4
    .long LEAF_OO_const_2-leaf_oo+0x5
    .long LEAF_OO_const_3-leaf_oo+0x5
    .long LEAF_OO_const_4-leaf_oo+0x4
    .long LEAF_OO_const_5-leaf_oo+0x5
    .long LEAF_OO_const_6-leaf_oo+0x5
    .long LEAF_OO_const_7-leaf_oo+0x5
sse_leaf_eo_offsets:
    .long LEAF_EO_const_0-leaf_eo+0x5
    .long LEAF_EO_const_1-leaf_eo+0x4
    .long LEAF_EO_const_2-leaf_eo+0x4
    .long LEAF_EO_const_3-leaf_eo+0x4
    .long LEAF_EO_const_4-leaf_eo+0x5
    .long LEAF_EO_const_5-leaf_eo+0x5
    .long LEAF_EO_const_6-leaf_eo+0x4
    .long LEAF_EO_const_7-leaf_eo+0x5
sse_leaf_oe_offsets:
    .long LEAF_OE_const_0-leaf_oe+0x5
    .long LEAF_OE_const_1-leaf_oe+0x4
    .long LEAF_OE_const_2-leaf_oe+0x4
    .long LEAF_OE_const_3-leaf_oe+0x5
    .long LEAF_OE_const_4-leaf_oe+0x5
    .long LEAF_OE_const_5-leaf_oe+0x5
    .long LEAF_OE_const_6-leaf_oe+0x4
    .long LEAF_OE_const_7-leaf_oe+0x4
#endif

 # Constant blocks, 16-byte aligned. Row 1: sign mask flipping alternate
 # lanes (0x80000000 = IEEE-754 sign bit). Rows 2-5 hold 1/sqrt(2)
 # (0x3f3504f3 = 0.70710678f, negated as 0xbf3504f3) and 1.0f (0x3f800000).
 # sse_constants_inv is the same table with the sign pattern mirrored for the
 # inverse transform.
#ifdef __APPLE__
    .data
#else
    .section .data
#endif
    .p2align 4
#ifdef __APPLE__
    .globl _sse_constants
_sse_constants:
#else
    .globl sse_constants
sse_constants:
#endif
    .long 0x00000000,0x80000000,0x00000000,0x80000000
    .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
    .long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
    .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
    .long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
#ifdef __APPLE__
    .globl _sse_constants_inv
_sse_constants_inv:
#else
    .globl sse_constants_inv
sse_constants_inv:
#endif
    .long 0x80000000,0x00000000,0x80000000,0x00000000
    .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
    .long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3
    .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
    .long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3